mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Temporarily remove TBB (#8255)
This commit is contained in:
4
.gitmodules
vendored
4
.gitmodules
vendored
@ -1,7 +1,3 @@
|
|||||||
[submodule "third_party/tbb"]
|
|
||||||
path = third_party/tbb
|
|
||||||
url = https://github.com/01org/tbb
|
|
||||||
branch = tbb_2018
|
|
||||||
[submodule "third_party/catch"]
|
[submodule "third_party/catch"]
|
||||||
path = third_party/catch
|
path = third_party/catch
|
||||||
url = https://github.com/catchorg/Catch2.git
|
url = https://github.com/catchorg/Catch2.git
|
||||||
|
@ -89,24 +89,6 @@ IF(NOT AT_LINK_STYLE)
|
|||||||
SET(AT_LINK_STYLE SHARED)
|
SET(AT_LINK_STYLE SHARED)
|
||||||
ENDIF()
|
ENDIF()
|
||||||
|
|
||||||
# Unset our restrictive C++ flags here and reset them later.
|
|
||||||
# Remove this once we use proper target_compile_options.
|
|
||||||
set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
|
|
||||||
set(CMAKE_CXX_FLAGS)
|
|
||||||
|
|
||||||
set(TBB_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/tbb")
|
|
||||||
set(TBB_BUILD_STATIC ON CACHE BOOL " " FORCE)
|
|
||||||
set(TBB_BUILD_SHARED OFF CACHE BOOL " " FORCE)
|
|
||||||
set(TBB_BUILD_TBBMALLOC OFF CACHE BOOL " " FORCE)
|
|
||||||
set(TBB_BUILD_TBBMALLOC_PROXY OFF CACHE BOOL " " FORCE)
|
|
||||||
set(TBB_BUILD_TESTS OFF CACHE BOOL " " FORCE)
|
|
||||||
add_subdirectory(cpu/tbb)
|
|
||||||
set_property(TARGET tbb_static tbb_def_files PROPERTY FOLDER "dependencies")
|
|
||||||
list(APPEND ATen_THIRD_PARTY_INCLUDE ${TBB_ROOT_DIR}/include)
|
|
||||||
list(APPEND ATen_CPU_DEPENDENCY_LIBS tbb_static)
|
|
||||||
|
|
||||||
set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS})
|
|
||||||
|
|
||||||
IF(BLAS_FOUND)
|
IF(BLAS_FOUND)
|
||||||
IF ($ENV{TH_BINARY_BUILD})
|
IF ($ENV{TH_BINARY_BUILD})
|
||||||
MESSAGE(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.")
|
MESSAGE(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.")
|
||||||
|
@ -149,7 +149,6 @@ inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
|
|||||||
for (auto& t : tensors)
|
for (auto& t : tensors)
|
||||||
if (t.sizes().equals({0}))
|
if (t.sizes().equals({0}))
|
||||||
return false;
|
return false;
|
||||||
internal::init_tbb_num_threads();
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -351,7 +350,7 @@ template <typename scalar1, typename Op>
|
|||||||
inline void CPU_tensor_parallel_apply1(
|
inline void CPU_tensor_parallel_apply1(
|
||||||
Tensor tensor1,
|
Tensor tensor1,
|
||||||
const Op op,
|
const Op op,
|
||||||
int64_t grain_size = internal::TBB_GRAIN_SIZE) {
|
int64_t grain_size = internal::GRAIN_SIZE) {
|
||||||
if (!_apply_preamble({tensor1}))
|
if (!_apply_preamble({tensor1}))
|
||||||
return;
|
return;
|
||||||
if (tensor1.ndimension() < 8) {
|
if (tensor1.ndimension() < 8) {
|
||||||
@ -383,7 +382,7 @@ inline void CPU_tensor_parallel_apply2(
|
|||||||
Tensor tensor1,
|
Tensor tensor1,
|
||||||
Tensor tensor2,
|
Tensor tensor2,
|
||||||
const Op op,
|
const Op op,
|
||||||
int64_t grain_size = internal::TBB_GRAIN_SIZE) {
|
int64_t grain_size = internal::GRAIN_SIZE) {
|
||||||
if (!_apply_preamble({tensor1, tensor2}))
|
if (!_apply_preamble({tensor1, tensor2}))
|
||||||
return;
|
return;
|
||||||
if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) {
|
if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) {
|
||||||
|
@ -1,56 +0,0 @@
|
|||||||
#include <ATen/CPUGeneral.h>
|
|
||||||
#include <ATen/Parallel.h>
|
|
||||||
#include <tbb/blocked_range.h>
|
|
||||||
#include <tbb/parallel_reduce.h>
|
|
||||||
#include <tbb/partitioner.h>
|
|
||||||
#include <tbb/tbb.h>
|
|
||||||
#include <cassert>
|
|
||||||
#include <thread>
|
|
||||||
|
|
||||||
|
|
||||||
namespace at { namespace internal {
|
|
||||||
|
|
||||||
// thread_local variable with internal linkage
|
|
||||||
// requires no guarding as it's storage duration is defined to be per thread
|
|
||||||
static thread_local tbb::task_scheduler_init tbbinit(
|
|
||||||
tbb::task_scheduler_init::deferred);
|
|
||||||
// Tracks number of threads uses which TBB doesn't track.
|
|
||||||
static thread_local int num_threads_ = -1;
|
|
||||||
|
|
||||||
// Negative number of threads means default value
|
|
||||||
void init_tbb_num_threads() {
|
|
||||||
static thread_local bool first_call = true;
|
|
||||||
int num_threads = at::get_num_threads();
|
|
||||||
// In order to have control over the number of threads this function
|
|
||||||
// must be called first before any other tbb parallel construct is
|
|
||||||
// excercised within a particular thread. Otherwise the default
|
|
||||||
// scheduler will be created over which we do not have control.
|
|
||||||
// The following code will and must throw an error if tbb has
|
|
||||||
// already been initialized before this function was called.
|
|
||||||
if (!tbbinit.is_active() && !first_call)
|
|
||||||
throw std::runtime_error(
|
|
||||||
"tbb initialization failed: scheduler not active after first call");
|
|
||||||
if (first_call) {
|
|
||||||
if (tbbinit.is_active())
|
|
||||||
throw std::runtime_error(
|
|
||||||
"tbb initialization failed: scheduler active on first call");
|
|
||||||
if (num_threads < 0) {
|
|
||||||
int max_threads = tbbinit.default_num_threads();
|
|
||||||
tbbinit.initialize(max_threads);
|
|
||||||
} else {
|
|
||||||
tbbinit.initialize(num_threads);
|
|
||||||
}
|
|
||||||
first_call = false;
|
|
||||||
}
|
|
||||||
if (num_threads == 0) {
|
|
||||||
// TODO: For PyTorch 0 means 1
|
|
||||||
num_threads = 1;
|
|
||||||
}
|
|
||||||
if (num_threads > 0 && (num_threads_ != num_threads)) {
|
|
||||||
tbbinit.terminate();
|
|
||||||
tbbinit.initialize(num_threads);
|
|
||||||
num_threads_ = num_threads;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // namespace internal
|
|
||||||
} // namespace at
|
|
@ -1,54 +1,59 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include <ATen/ATen.h>
|
#include <ATen/ATen.h>
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <tbb/tbb.h>
|
|
||||||
|
|
||||||
|
#ifdef _OPENMP
|
||||||
|
#include <omp.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace at {
|
namespace at {
|
||||||
namespace internal {
|
namespace internal {
|
||||||
// This needs to be called before the first use of any algorithm such as
|
|
||||||
// parallel or it will have no effect and the default task scheduler is
|
|
||||||
// created which uses all available cores.
|
|
||||||
// See
|
|
||||||
// https://www.threadingbuildingblocks.org/docs/help/reference/task_scheduler/task_scheduler_init_cls.html
|
|
||||||
// This does not initializes the number of workers in the market (the overall
|
|
||||||
// of workers available to a process). It is merely a request to the market
|
|
||||||
// for a certain number of workers. If there are multiple threads making
|
|
||||||
// a request at the size of the maximum number of threads, they will
|
|
||||||
// be allocated a number proportional to the other requests.
|
|
||||||
AT_API void init_tbb_num_threads();
|
|
||||||
// This parameter is heuristically chosen to determine the minimum number of
|
// This parameter is heuristically chosen to determine the minimum number of
|
||||||
// work that warrants paralellism. For example, when summing an array, it is
|
// work that warrants paralellism. For example, when summing an array, it is
|
||||||
// deemed inefficient to parallelise over arrays shorter than 32768. Further,
|
// deemed inefficient to parallelise over arrays shorter than 32768. Further,
|
||||||
// no parallel algorithm (such as parallel_reduce) should split work into
|
// no parallel algorithm (such as parallel_reduce) should split work into
|
||||||
// smaller than GRAIN_SIZE chunks.
|
// smaller than GRAIN_SIZE chunks.
|
||||||
constexpr int64_t TBB_GRAIN_SIZE = 32768;
|
constexpr int64_t GRAIN_SIZE = 32768;
|
||||||
} // namespace internal
|
} // namespace internal
|
||||||
|
|
||||||
|
inline int64_t divup(int64_t x, int64_t y) {
|
||||||
|
return (x + y - 1) / y;
|
||||||
|
}
|
||||||
|
|
||||||
template <class F>
|
template <class F>
|
||||||
inline void parallel_for(
|
inline void parallel_for(
|
||||||
int64_t begin,
|
const int64_t begin,
|
||||||
int64_t end,
|
const int64_t end,
|
||||||
int64_t grain_size,
|
const int64_t grain_size_,
|
||||||
const F& f) {
|
const F f) {
|
||||||
internal::init_tbb_num_threads();
|
const int64_t min_grain_size = divup((end - begin), get_num_threads());
|
||||||
|
const int64_t grain_size = std::max(min_grain_size, grain_size_);
|
||||||
#ifdef __PPC64__
|
#pragma omp parallel for if ((end - begin) >= grain_size && get_num_threads() > 1)
|
||||||
using default_partitioner_type = tbb::simple_partitioner;
|
for (int64_t i = begin; i < end; i += grain_size) {
|
||||||
#else
|
f(i, i + std::min(end - i, grain_size));
|
||||||
using default_partitioner_type = tbb::affinity_partitioner;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
thread_local static default_partitioner_type ap;
|
|
||||||
|
|
||||||
if ((end - begin) < grain_size || get_num_threads() == 1) {
|
|
||||||
f(begin, end);
|
|
||||||
} else {
|
|
||||||
tbb::parallel_for(
|
|
||||||
tbb::blocked_range<int64_t>(begin, end, grain_size),
|
|
||||||
[f](const tbb::blocked_range<int64_t>& r) { f(r.begin(), r.end()); },
|
|
||||||
ap);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <class scalar_t, class F, class SF>
|
||||||
|
inline scalar_t parallel_reduce(
|
||||||
|
const int64_t begin,
|
||||||
|
const int64_t end,
|
||||||
|
const int64_t grain_size_,
|
||||||
|
const scalar_t ident,
|
||||||
|
const F f,
|
||||||
|
const SF sf) {
|
||||||
|
const int64_t min_grain_size = divup((end - begin), get_num_threads());
|
||||||
|
const int64_t grain_size = std::max(min_grain_size, grain_size_);
|
||||||
|
const int64_t num_results = divup((end - begin), grain_size);
|
||||||
|
std::vector<scalar_t> results(num_results);
|
||||||
|
scalar_t* results_data = results.data();
|
||||||
|
#pragma omp parallel for if ((end - begin) >= grain_size && get_num_threads() > 1)
|
||||||
|
for (int64_t id = 0; id < num_results; id++) {
|
||||||
|
int64_t i = begin + id * grain_size;
|
||||||
|
results_data[id] = f(i, i + std::min(end - i, grain_size), ident);
|
||||||
|
}
|
||||||
|
return std::accumulate(
|
||||||
|
results_data, results_data + results.size(), ident, sf);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace at
|
} // namespace at
|
||||||
|
@ -1,376 +0,0 @@
|
|||||||
# Based on https://github.com/wjakob/tbb/blob/master/CMakeLists.txt
|
|
||||||
# All credit goes to Wenzel Jakob!
|
|
||||||
|
|
||||||
cmake_minimum_required (VERSION 2.8.12 FATAL_ERROR)
|
|
||||||
project (tbb CXX)
|
|
||||||
|
|
||||||
include(CheckCXXCompilerFlag)
|
|
||||||
include(CheckCXXSourceRuns)
|
|
||||||
|
|
||||||
if(POLICY CMP0058)
|
|
||||||
cmake_policy(SET CMP0058 NEW)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
|
|
||||||
message(STATUS "Setting build type to 'Release' as none was specified.")
|
|
||||||
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
|
|
||||||
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release"
|
|
||||||
"MinSizeRel" "RelWithDebInfo")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if(NOT TBB_ROOT_DIR)
|
|
||||||
set(TBB_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
|
|
||||||
endif()
|
|
||||||
if(NOT TBB_INSTALL_RUNTIME_DIR)
|
|
||||||
set(TBB_INSTALL_RUNTIME_DIR bin)
|
|
||||||
endif()
|
|
||||||
if(NOT TBB_INSTALL_LIBRARY_DIR)
|
|
||||||
set(TBB_INSTALL_LIBRARY_DIR lib)
|
|
||||||
endif()
|
|
||||||
if(NOT TBB_INSTALL_ARCHIVE_DIR)
|
|
||||||
set(TBB_INSTALL_ARCHIVE_DIR lib)
|
|
||||||
endif()
|
|
||||||
if(NOT TBB_INSTALL_INCLUDE_DIR)
|
|
||||||
set(TBB_INSTALL_INCLUDE_DIR "${TBB_ROOT_DIR}/include")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
set(TBB_INCLUDES
|
|
||||||
"${TBB_ROOT_DIR}/include"
|
|
||||||
"${TBB_ROOT_DIR}/src"
|
|
||||||
"${TBB_ROOT_DIR}/src/rml/include"
|
|
||||||
${CMAKE_CURRENT_BINARY_DIR})
|
|
||||||
|
|
||||||
option(TBB_BUILD_SHARED "Build TBB shared library" ON)
|
|
||||||
option(TBB_BUILD_STATIC "Build TBB static library" ON)
|
|
||||||
option(TBB_BUILD_TBBMALLOC "Build TBB malloc library" ON)
|
|
||||||
option(TBB_BUILD_TBBMALLOC_PROXY "Build TBB malloc proxy library" ON)
|
|
||||||
option(TBB_BUILD_TESTS "Build TBB tests and enable testing infrastructure" ON)
|
|
||||||
option(TBB_CI_BUILD "Is this a continuous integration build?" OFF)
|
|
||||||
|
|
||||||
if(APPLE)
|
|
||||||
set(CMAKE_MACOSX_RPATH ON)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
file(GLOB tbb_src "${TBB_ROOT_DIR}/src/tbb/*.cpp" "${TBB_ROOT_DIR}/src/old/*.cpp")
|
|
||||||
list(APPEND tbb_src ${TBB_ROOT_DIR}/src/rml/client/rml_tbb.cpp)
|
|
||||||
file(GLOB to_remove "${TBB_ROOT_DIR}/src/old/test*.cpp")
|
|
||||||
if (NOT "${to_remove}" STREQUAL "")
|
|
||||||
list(REMOVE_ITEM tbb_src ${to_remove})
|
|
||||||
endif()
|
|
||||||
|
|
||||||
set(tbbmalloc_static_src
|
|
||||||
src/tbbmalloc/backend.cpp
|
|
||||||
src/tbbmalloc/large_objects.cpp
|
|
||||||
src/tbbmalloc/backref.cpp
|
|
||||||
src/tbbmalloc/tbbmalloc.cpp
|
|
||||||
src/tbbmalloc/frontend.cpp
|
|
||||||
src/tbb/itt_notify.cpp)
|
|
||||||
|
|
||||||
set(tbbmalloc_src ${tbbmalloc_static_src})
|
|
||||||
|
|
||||||
set(tbbmalloc_proxy_src
|
|
||||||
src/tbbmalloc/proxy.cpp
|
|
||||||
src/tbbmalloc/tbb_function_replacement.cpp)
|
|
||||||
|
|
||||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "(i386|x86_64)")
|
|
||||||
if (NOT APPLE AND NOT MINGW)
|
|
||||||
add_definitions(-DDO_ITT_NOTIFY)
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (APPLE)
|
|
||||||
# Disable annoying "has no symbols" warnings
|
|
||||||
set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
|
|
||||||
set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
|
|
||||||
set(CMAKE_C_ARCHIVE_FINISH "<CMAKE_RANLIB> -no_warning_for_no_symbols -c <TARGET>")
|
|
||||||
set(CMAKE_CXX_ARCHIVE_FINISH "<CMAKE_RANLIB> -no_warning_for_no_symbols -c <TARGET>")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
macro(CHECK_CXX_COMPILER_AND_LINKER_FLAGS _RESULT _CXX_FLAGS _LINKER_FLAGS)
|
|
||||||
set(CMAKE_REQUIRED_FLAGS ${_CXX_FLAGS})
|
|
||||||
set(CMAKE_REQUIRED_LIBRARIES ${_LINKER_FLAGS})
|
|
||||||
set(CMAKE_REQUIRED_QUIET TRUE)
|
|
||||||
check_cxx_source_runs("#include <iostream>\nint main(int argc, char **argv) { std::cout << \"test\"; return 0; }" ${_RESULT})
|
|
||||||
set(CMAKE_REQUIRED_FLAGS "")
|
|
||||||
set(CMAKE_REQUIRED_LIBRARIES "")
|
|
||||||
endmacro()
|
|
||||||
|
|
||||||
# Prefer libc++ in conjunction with Clang
|
|
||||||
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
|
||||||
if (CMAKE_CXX_FLAGS MATCHES "-stdlib=libc\\+\\+")
|
|
||||||
message(STATUS "TBB: using libc++.")
|
|
||||||
else()
|
|
||||||
CHECK_CXX_COMPILER_AND_LINKER_FLAGS(HAS_LIBCPP "-stdlib=libc++" "-stdlib=libc++")
|
|
||||||
if (HAS_LIBCPP)
|
|
||||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++ -D_LIBCPP_VERSION")
|
|
||||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++")
|
|
||||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -stdlib=libc++")
|
|
||||||
message(STATUS "TBB: using libc++.")
|
|
||||||
else()
|
|
||||||
message(STATUS "TBB: NOT using libc++.")
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (UNIX)
|
|
||||||
add_definitions (-DUSE_PTHREAD)
|
|
||||||
|
|
||||||
check_cxx_compiler_flag ("-std=c++11" SUPPORTS_STDCXX11)
|
|
||||||
if (SUPPORTS_STDCXX11)
|
|
||||||
set (CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
check_cxx_compiler_flag ("-mrtm -Werror" SUPPORTS_MRTM)
|
|
||||||
if (SUPPORTS_MRTM)
|
|
||||||
set (CMAKE_CXX_FLAGS "-mrtm ${CMAKE_CXX_FLAGS}")
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
elseif(WIN32)
|
|
||||||
if (MSVC)
|
|
||||||
cmake_minimum_required (VERSION 3.1)
|
|
||||||
enable_language(ASM_MASM)
|
|
||||||
set(CMAKE_CXX_FLAGS "/GS- /Zc:wchar_t /Zc:forScope /DUSE_WINTHREAD ${CMAKE_CXX_FLAGS}")
|
|
||||||
set(CMAKE_CXX_FLAGS "/D_CRT_SECURE_NO_DEPRECATE /D_WIN32_WINNT=0x0600 ${CMAKE_CXX_FLAGS}")
|
|
||||||
check_cxx_compiler_flag ("/volatile:iso" SUPPORTS_VOLATILE_FLAG)
|
|
||||||
if (SUPPORTS_VOLATILE_FLAG)
|
|
||||||
set(CMAKE_CXX_FLAGS "/volatile:iso ${CMAKE_CXX_FLAGS}")
|
|
||||||
endif ()
|
|
||||||
set(CMAKE_CXX_FLAGS "/wd4267 /wd4800 /wd4146 /wd4244 /wd4577 /wd4018 ${CMAKE_CXX_FLAGS}")
|
|
||||||
if (NOT CMAKE_SIZEOF_VOID_P)
|
|
||||||
message(FATAL_ERROR "'CMAKE_SIZEOF_VOID_P' is undefined. Please delete your build directory and rerun CMake again!")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (CMAKE_SIZEOF_VOID_P EQUAL 8)
|
|
||||||
list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/atomic_support.asm")
|
|
||||||
list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/itsx.asm")
|
|
||||||
list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/intel64_misc.asm")
|
|
||||||
list(APPEND tbbmalloc_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/atomic_support.asm")
|
|
||||||
set(CMAKE_ASM_MASM_FLAGS "/DEM64T=1 ${CMAKE_ASM_MASM_FLAGS}")
|
|
||||||
else()
|
|
||||||
list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/ia32-masm/atomic_support.asm"
|
|
||||||
"${TBB_ROOT_DIR}/src/tbb/ia32-masm/itsx.asm src/tbb/ia32-masm/lock_byte.asm")
|
|
||||||
# Enable SAFESEH feature for assembly (x86 builds only).
|
|
||||||
set(CMAKE_ASM_MASM_FLAGS "/safeseh ${CMAKE_ASM_MASM_FLAGS}")
|
|
||||||
endif()
|
|
||||||
elseif (MINGW)
|
|
||||||
add_definitions(-DUSE_WINTHREAD)
|
|
||||||
add_definitions(-D_WIN32_WINNT=0x0502)
|
|
||||||
set(CMAKE_CXX_FLAGS "-mthreads ${CMAKE_CXX_FLAGS}")
|
|
||||||
endif ()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (MSVC)
|
|
||||||
set(ENABLE_RTTI "/EHsc /GR ")
|
|
||||||
set(DISABLE_RTTI "/EHs- /GR- ")
|
|
||||||
elseif (UNIX)
|
|
||||||
set(ENABLE_RTTI "-frtti -fexceptions ")
|
|
||||||
set(DISABLE_RTTI "-fno-rtti -fno-exceptions ")
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
##--------
|
|
||||||
# - Added TBB_USE_GLIBCXX_VERSION macro to specify the version of GNU
|
|
||||||
# libstdc++ when it cannot be properly recognized, e.g. when used
|
|
||||||
# with Clang on Linux* OS. Inspired by a contribution from David A.
|
|
||||||
if (NOT TBB_USE_GLIBCXX_VERSION AND UNIX AND NOT APPLE)
|
|
||||||
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
|
|
||||||
# using Clang
|
|
||||||
string(REPLACE "." "0" TBB_USE_GLIBCXX_VERSION ${CMAKE_CXX_COMPILER_VERSION})
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (TBB_USE_GLIBCXX_VERSION)
|
|
||||||
add_definitions(-DTBB_USE_GLIBCXX_VERSION=${TBB_USE_GLIBCXX_VERSION})
|
|
||||||
endif()
|
|
||||||
|
|
||||||
##-------
|
|
||||||
|
|
||||||
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
|
|
||||||
check_cxx_compiler_flag ("-flifetime-dse=1" SUPPORTS_FLIFETIME)
|
|
||||||
if (SUPPORTS_FLIFETIME)
|
|
||||||
add_definitions(-flifetime-dse=1)
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# Linker export definitions
|
|
||||||
if (APPLE)
|
|
||||||
set (ARCH_PREFIX "mac")
|
|
||||||
elseif(WIN32)
|
|
||||||
set (ARCH_PREFIX "win")
|
|
||||||
else()
|
|
||||||
set (ARCH_PREFIX "lin")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (CMAKE_SIZEOF_VOID_P EQUAL 8)
|
|
||||||
set(ARCH_PREFIX "${ARCH_PREFIX}64")
|
|
||||||
else()
|
|
||||||
set(ARCH_PREFIX "${ARCH_PREFIX}32")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (MINGW)
|
|
||||||
set (ARCH_PREFIX "${ARCH_PREFIX}-gcc")
|
|
||||||
# there's no win32-gcc-tbb-export.def, use lin32-tbb-export.def
|
|
||||||
execute_process (COMMAND ${CMAKE_COMMAND} -E copy ${TBB_ROOT_DIR}/src/tbb/lin32-tbb-export.def ${TBB_ROOT_DIR}/src/tbb/win32-gcc-tbb-export.def)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (MSVC)
|
|
||||||
add_custom_command(OUTPUT tbb.def
|
|
||||||
COMMAND ${CMAKE_CXX_COMPILER} /TC /EP ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def -I ${TBB_ROOT_DIR}/include > tbb.def
|
|
||||||
MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def
|
|
||||||
COMMENT "Preprocessing tbb.def"
|
|
||||||
)
|
|
||||||
|
|
||||||
add_custom_command(OUTPUT tbbmalloc.def
|
|
||||||
COMMAND ${CMAKE_CXX_COMPILER} /TC /EP ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def -I ${TBB_ROOT_DIR}/include > tbbmalloc.def
|
|
||||||
MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def
|
|
||||||
COMMENT "Preprocessing tbbmalloc.def"
|
|
||||||
)
|
|
||||||
else()
|
|
||||||
add_custom_command(OUTPUT tbb.def
|
|
||||||
COMMAND ${CMAKE_CXX_COMPILER} -xc++ -E ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def -I ${TBB_ROOT_DIR}/include -o tbb.def
|
|
||||||
MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def
|
|
||||||
COMMENT "Preprocessing tbb.def"
|
|
||||||
)
|
|
||||||
|
|
||||||
add_custom_command(OUTPUT tbbmalloc.def
|
|
||||||
COMMAND ${CMAKE_CXX_COMPILER} -xc++ -E ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def -I ${TBB_ROOT_DIR}/include -o tbbmalloc.def
|
|
||||||
MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def
|
|
||||||
COMMENT "Preprocessing tbbmalloc.def"
|
|
||||||
)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
add_custom_target(tbb_def_files DEPENDS tbb.def tbbmalloc.def)
|
|
||||||
|
|
||||||
# TBB library
|
|
||||||
if (TBB_BUILD_STATIC)
|
|
||||||
add_library(tbb_static STATIC ${tbb_src})
|
|
||||||
target_include_directories(tbb_static PRIVATE ${TBB_INCLUDES})
|
|
||||||
set_property(TARGET tbb_static APPEND PROPERTY COMPILE_DEFINITIONS "__TBB_BUILD=1")
|
|
||||||
set_property(TARGET tbb_static APPEND_STRING PROPERTY COMPILE_FLAGS ${ENABLE_RTTI})
|
|
||||||
install(TARGETS tbb_static ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR})
|
|
||||||
if (MSVC)
|
|
||||||
target_compile_definitions(tbb_static PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (UNIX AND NOT APPLE)
|
|
||||||
target_link_libraries(tbb_static PUBLIC pthread dl)
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (TBB_BUILD_SHARED)
|
|
||||||
add_library(tbb SHARED ${tbb_src})
|
|
||||||
target_include_directories(tbb PRIVATE ${TBB_INCLUDES})
|
|
||||||
set_property(TARGET tbb APPEND PROPERTY COMPILE_DEFINITIONS "__TBB_BUILD=1")
|
|
||||||
set_property(TARGET tbb APPEND_STRING PROPERTY COMPILE_FLAGS ${ENABLE_RTTI})
|
|
||||||
add_dependencies(tbb tbb_def_files)
|
|
||||||
|
|
||||||
if (APPLE)
|
|
||||||
set_property(TARGET tbb APPEND PROPERTY LINK_FLAGS "-Wl,-exported_symbols_list,\"${CMAKE_CURRENT_BINARY_DIR}/tbb.def\"")
|
|
||||||
elseif (MSVC)
|
|
||||||
set_property(TARGET tbb APPEND PROPERTY LINK_FLAGS "/DEF:\"${CMAKE_CURRENT_BINARY_DIR}/tbb.def\"")
|
|
||||||
else ()
|
|
||||||
set_property(TARGET tbb APPEND PROPERTY LINK_FLAGS "-Wl,-version-script,\"${CMAKE_CURRENT_BINARY_DIR}/tbb.def\"")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
install(TARGETS tbb
|
|
||||||
LIBRARY DESTINATION ${TBB_INSTALL_LIBRARY_DIR}
|
|
||||||
ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR}
|
|
||||||
RUNTIME DESTINATION ${TBB_INSTALL_RUNTIME_DIR})
|
|
||||||
if (UNIX AND NOT APPLE)
|
|
||||||
target_link_libraries(tbb PUBLIC pthread dl)
|
|
||||||
endif()
|
|
||||||
if (MSVC)
|
|
||||||
target_compile_definitions(tbb PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1)
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
|
|
||||||
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
|
|
||||||
# Quench a warning on GCC
|
|
||||||
set_source_files_properties(${TBB_ROOT_DIR}/src/tbb/governor.cpp COMPILE_FLAGS "-Wno-missing-field-initializers ")
|
|
||||||
elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
|
|
||||||
# Quench a warning on Clang
|
|
||||||
set_source_files_properties(${TBB_ROOT_DIR}/src/tbb/itt_notify.cpp COMPILE_FLAGS "-Wno-varargs ")
|
|
||||||
elseif(MSVC)
|
|
||||||
# Quench a warning on MSVC
|
|
||||||
set_source_files_properties(${TBB_ROOT_DIR}/src/tbb/scheduler.cpp COMPILE_FLAGS "/wd4458 ")
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if(TBB_BUILD_TBBMALLOC)
|
|
||||||
# TBB malloc library
|
|
||||||
if (TBB_BUILD_STATIC)
|
|
||||||
add_library(tbbmalloc_static STATIC ${tbbmalloc_static_src})
|
|
||||||
target_include_directories(tbbmalloc_static PRIVATE ${TBB_INCLUDES})
|
|
||||||
set_property(TARGET tbbmalloc_static APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1")
|
|
||||||
set_property(TARGET tbbmalloc_static APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI})
|
|
||||||
if (MSVC)
|
|
||||||
target_compile_definitions(tbbmalloc_static PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1 __TBBMALLOC_NO_IMPLICIT_LINKAGE=1)
|
|
||||||
endif()
|
|
||||||
install(TARGETS tbbmalloc_static ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR})
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (TBB_BUILD_SHARED)
|
|
||||||
add_library(tbbmalloc SHARED ${tbbmalloc_src})
|
|
||||||
target_include_directories(tbbmalloc PRIVATE ${TBB_INCLUDES})
|
|
||||||
set_property(TARGET tbbmalloc APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1")
|
|
||||||
set_property(TARGET tbbmalloc APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI})
|
|
||||||
add_dependencies(tbbmalloc tbb_def_files)
|
|
||||||
if (APPLE)
|
|
||||||
set_property(TARGET tbbmalloc APPEND PROPERTY LINK_FLAGS "-Wl,-exported_symbols_list,\"${CMAKE_CURRENT_BINARY_DIR}/tbbmalloc.def\"")
|
|
||||||
elseif (MSVC)
|
|
||||||
set_property(TARGET tbbmalloc APPEND PROPERTY LINK_FLAGS "/DEF:\"${CMAKE_CURRENT_BINARY_DIR}/tbbmalloc.def\"")
|
|
||||||
else ()
|
|
||||||
set_property(TARGET tbbmalloc APPEND PROPERTY LINK_FLAGS "-Wl,-version-script,\"${CMAKE_CURRENT_BINARY_DIR}/tbbmalloc.def\"")
|
|
||||||
endif()
|
|
||||||
if (MSVC)
|
|
||||||
target_compile_definitions(tbbmalloc PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1 __TBBMALLOC_NO_IMPLICIT_LINKAGE=1)
|
|
||||||
endif()
|
|
||||||
install(TARGETS tbbmalloc
|
|
||||||
LIBRARY DESTINATION ${TBB_INSTALL_LIBRARY_DIR}
|
|
||||||
ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR}
|
|
||||||
RUNTIME DESTINATION ${TBB_INSTALL_RUNTIME_DIR})
|
|
||||||
if (UNIX AND NOT APPLE)
|
|
||||||
target_link_libraries(tbbmalloc PUBLIC pthread dl)
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if(TBB_BUILD_TBBMALLOC_PROXY)
|
|
||||||
# TBB malloc proxy library
|
|
||||||
if (TBB_BUILD_STATIC)
|
|
||||||
add_library(tbbmalloc_proxy_static STATIC ${tbbmalloc_proxy_src})
|
|
||||||
set_property(TARGET tbbmalloc_proxy_static APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1")
|
|
||||||
set_property(TARGET tbbmalloc_proxy_static APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI})
|
|
||||||
install(TARGETS tbbmalloc_proxy_static ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR})
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (TBB_BUILD_SHARED)
|
|
||||||
add_library(tbbmalloc_proxy SHARED ${tbbmalloc_proxy_src})
|
|
||||||
set_property(TARGET tbbmalloc_proxy APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1")
|
|
||||||
set_property(TARGET tbbmalloc_proxy APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI})
|
|
||||||
target_link_libraries(tbbmalloc_proxy PUBLIC tbbmalloc)
|
|
||||||
install(TARGETS tbbmalloc_proxy
|
|
||||||
LIBRARY DESTINATION ${TBB_INSTALL_LIBRARY_DIR}
|
|
||||||
ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR}
|
|
||||||
RUNTIME DESTINATION ${TBB_INSTALL_RUNTIME_DIR})
|
|
||||||
if (UNIX AND NOT APPLE)
|
|
||||||
target_link_libraries(tbbmalloc_proxy PUBLIC pthread dl)
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
install(DIRECTORY "${TBB_ROOT_DIR}/include/tbb" DESTINATION ${TBB_INSTALL_INCLUDE_DIR})
|
|
||||||
|
|
||||||
# version_string.ver
|
|
||||||
if (UNIX)
|
|
||||||
execute_process (COMMAND date "+%a, %d %b %Y %H:%M:%S %z"
|
|
||||||
OUTPUT_VARIABLE _configure_date
|
|
||||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
|
||||||
elseif (WIN32)
|
|
||||||
execute_process (COMMAND cmd " /C date /T"
|
|
||||||
OUTPUT_VARIABLE _configure_date
|
|
||||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
|
||||||
else ()
|
|
||||||
set (_configure_date "Unknown")
|
|
||||||
endif()
|
|
||||||
include_directories (${CMAKE_BINARY_DIR})
|
|
||||||
configure_file (extra/version_string.ver.in version_string.ver @ONLY)
|
|
@ -1,11 +0,0 @@
|
|||||||
#define __TBB_VERSION_STRINGS(N) \
|
|
||||||
#N": BUILD_HOST @CMAKE_SYSTEM_NAME@" ENDL \
|
|
||||||
#N": BUILD_OS @CMAKE_SYSTEM@" ENDL \
|
|
||||||
#N": BUILD_KERNEL @CMAKE_SYSTEM_VERSION@" ENDL \
|
|
||||||
#N": BUILD_GCC @CMAKE_CXX_COMPILER_ID@" ENDL \
|
|
||||||
#N": BUILD_LIBC Unknown" ENDL \
|
|
||||||
#N": BUILD_LD Unknown" ENDL \
|
|
||||||
#N": BUILD_TARGET Unknown" ENDL \
|
|
||||||
#N": BUILD_COMMAND Unknown" ENDL
|
|
||||||
|
|
||||||
#define __TBB_DATETIME "@_configure_date@"
|
|
@ -22,7 +22,7 @@ namespace {
|
|||||||
template <class T>
|
template <class T>
|
||||||
struct Vec256 {
|
struct Vec256 {
|
||||||
static constexpr int size = 32 / sizeof(T);
|
static constexpr int size = 32 / sizeof(T);
|
||||||
T values[32 / sizeof(T)];
|
T values[32 / sizeof(T)] = {0};
|
||||||
Vec256() {}
|
Vec256() {}
|
||||||
Vec256(T val) {
|
Vec256(T val) {
|
||||||
for (int i = 0; i != size; i++) {
|
for (int i = 0; i != size; i++) {
|
||||||
|
@ -23,7 +23,7 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) {
|
|||||||
int64_t outer_stride = dim_size * dim_stride;
|
int64_t outer_stride = dim_size * dim_stride;
|
||||||
scalar_t* input_data_base = input.data<scalar_t>();
|
scalar_t* input_data_base = input.data<scalar_t>();
|
||||||
scalar_t* output_data_base = output.data<scalar_t>();
|
scalar_t* output_data_base = output.data<scalar_t>();
|
||||||
int64_t grain_size = std::min(internal::TBB_GRAIN_SIZE / dim_size, (int64_t)1);
|
int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1);
|
||||||
parallel_for(
|
parallel_for(
|
||||||
0, outer_size * inner_size, grain_size,
|
0, outer_size * inner_size, grain_size,
|
||||||
[&](int64_t begin, int64_t end) {
|
[&](int64_t begin, int64_t end) {
|
||||||
@ -80,7 +80,7 @@ void host_softmax_backward(
|
|||||||
scalar_t* gradInput_data_base = gI.data<scalar_t>();
|
scalar_t* gradInput_data_base = gI.data<scalar_t>();
|
||||||
scalar_t* output_data_base = output.data<scalar_t>();
|
scalar_t* output_data_base = output.data<scalar_t>();
|
||||||
scalar_t* gradOutput_data_base = grad.data<scalar_t>();
|
scalar_t* gradOutput_data_base = grad.data<scalar_t>();
|
||||||
int64_t grain_size = std::min(internal::TBB_GRAIN_SIZE / dim_size, (int64_t)1);
|
int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1);
|
||||||
parallel_for(
|
parallel_for(
|
||||||
0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) {
|
0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) {
|
||||||
for (int64_t i = begin; i < end; i++) {
|
for (int64_t i = begin; i < end; i++) {
|
||||||
|
@ -9,12 +9,6 @@
|
|||||||
#include "ATen/cpu/vec256/vec256.h"
|
#include "ATen/cpu/vec256/vec256.h"
|
||||||
#include "ATen/optional.h"
|
#include "ATen/optional.h"
|
||||||
|
|
||||||
#ifdef __PPC64__
|
|
||||||
using default_partitioner_type = tbb::simple_partitioner;
|
|
||||||
#else
|
|
||||||
using default_partitioner_type = tbb::affinity_partitioner;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace at { namespace native { namespace {
|
namespace at { namespace native { namespace {
|
||||||
|
|
||||||
using namespace vec256;
|
using namespace vec256;
|
||||||
@ -23,19 +17,22 @@ static inline int64_t round_down(int64_t a, int64_t m) {
|
|||||||
return a - (a % m);
|
return a - (a % m);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename F>
|
template <typename F>
|
||||||
static void parallel_for(int64_t end, int64_t step, bool parallelize, F func) {
|
static void _parallel_for(int64_t size, int64_t step, bool parallelize, F func) {
|
||||||
if (parallelize) {
|
if (parallelize) {
|
||||||
tbb::parallel_for<int64_t>(0, end, step, func);
|
parallel_for(0, size / step, 1, [func, step](int64_t begin, int64_t end) {
|
||||||
|
int64_t k = begin * step;
|
||||||
|
for (int64_t i = begin; i < end; i++, k += step) {
|
||||||
|
func(k);
|
||||||
|
}
|
||||||
|
});
|
||||||
} else {
|
} else {
|
||||||
for (int64_t i = 0; i != end; i += step) {
|
for (int64_t i = 0; i != size; i += step) {
|
||||||
func(i);
|
func(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static default_partitioner_type ap;
|
|
||||||
|
|
||||||
// Vectorized reduction defined by reduce operation `Op` with identity `ident`.
|
// Vectorized reduction defined by reduce operation `Op` with identity `ident`.
|
||||||
// The reduction is built on top of reduce128, which reduces down a column
|
// The reduction is built on top of reduce128, which reduces down a column
|
||||||
// 128 bytes wide (WIDTH scalar elements). The width of 128 bytes is chosen
|
// 128 bytes wide (WIDTH scalar elements). The width of 128 bytes is chosen
|
||||||
@ -50,8 +47,6 @@ struct Reduction {
|
|||||||
using ReduceScalar = Op<scalar_t>;
|
using ReduceScalar = Op<scalar_t>;
|
||||||
|
|
||||||
static void apply(Tensor& res, const Tensor& self, at::optional<int64_t> dim) {
|
static void apply(Tensor& res, const Tensor& self, at::optional<int64_t> dim) {
|
||||||
internal::init_tbb_num_threads();
|
|
||||||
|
|
||||||
auto out = res.data<scalar_t>();
|
auto out = res.data<scalar_t>();
|
||||||
auto data = self.data<scalar_t>();
|
auto data = self.data<scalar_t>();
|
||||||
auto numel = self.numel();
|
auto numel = self.numel();
|
||||||
@ -71,8 +66,8 @@ struct Reduction {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
int64_t batch = numel / (n * stride);
|
int64_t batch = numel / (n * stride);
|
||||||
bool paralellize = batch * n > internal::TBB_GRAIN_SIZE;
|
bool paralellize = batch * n > internal::GRAIN_SIZE;
|
||||||
parallel_for(batch, 1, paralellize, [=](int64_t b) {
|
_parallel_for(batch, 1, paralellize, [=](int64_t b) {
|
||||||
if (stride == 1) {
|
if (stride == 1) {
|
||||||
out[b] = reduce_all(&data[b * n], n);
|
out[b] = reduce_all(&data[b * n], n);
|
||||||
} else {
|
} else {
|
||||||
@ -84,23 +79,17 @@ struct Reduction {
|
|||||||
static scalar_t reduce_all(const scalar_t* data, int64_t size) {
|
static scalar_t reduce_all(const scalar_t* data, int64_t size) {
|
||||||
int64_t k = size / WIDTH;
|
int64_t k = size / WIDTH;
|
||||||
|
|
||||||
scalar_t sum;
|
scalar_t sum = parallel_reduce(
|
||||||
if (size > internal::TBB_GRAIN_SIZE) {
|
0,
|
||||||
sum = tbb::parallel_reduce(
|
k,
|
||||||
tbb::blocked_range<int64_t>(0, k, internal::TBB_GRAIN_SIZE / WIDTH),
|
internal::GRAIN_SIZE / WIDTH,
|
||||||
scalar_t(ident),
|
(scalar_t)ident,
|
||||||
[=](const tbb::blocked_range<int64_t>& r, scalar_t init) {
|
[data](int64_t begin, int64_t end, scalar_t init) {
|
||||||
scalar_t buf[WIDTH];
|
scalar_t buf[WIDTH];
|
||||||
reduce128(&data[r.begin() * WIDTH], buf, r.end() - r.begin(), WIDTH);
|
reduce128(&data[begin * WIDTH], buf, end - begin, WIDTH);
|
||||||
return std::accumulate(buf, buf + WIDTH, init, ReduceScalar());
|
return std::accumulate(buf, buf + WIDTH, init, ReduceScalar());
|
||||||
},
|
},
|
||||||
ReduceScalar(),
|
ReduceScalar());
|
||||||
ap);
|
|
||||||
} else {
|
|
||||||
scalar_t buf[WIDTH];
|
|
||||||
reduce128(data, buf, k, WIDTH);
|
|
||||||
sum = std::accumulate(buf, buf + WIDTH, scalar_t(ident), ReduceScalar());
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int64_t i = k * WIDTH; i != size; i++) {
|
for (int64_t i = k * WIDTH; i != size; i++) {
|
||||||
sum = ReduceScalar()(sum, data[i]);
|
sum = ReduceScalar()(sum, data[i]);
|
||||||
@ -127,8 +116,8 @@ struct Reduction {
|
|||||||
// Reduce a 2d matrix down each column. Stores the results in out[0 ... cols-1]
|
// Reduce a 2d matrix down each column. Stores the results in out[0 ... cols-1]
|
||||||
static void reduce2d(const scalar_t* data, scalar_t* out, int64_t rows, int64_t cols, int64_t stride) {
|
static void reduce2d(const scalar_t* data, scalar_t* out, int64_t rows, int64_t cols, int64_t stride) {
|
||||||
int64_t cols_rounded = round_down(cols, WIDTH);
|
int64_t cols_rounded = round_down(cols, WIDTH);
|
||||||
bool paralellize = cols * rows > internal::TBB_GRAIN_SIZE;
|
bool paralellize = cols * rows > internal::GRAIN_SIZE;
|
||||||
parallel_for(cols_rounded, WIDTH, paralellize, [=](int64_t col) {
|
_parallel_for(cols_rounded, WIDTH, paralellize, [=](int64_t col) {
|
||||||
reduce128(&data[col], &out[col], rows, stride);
|
reduce128(&data[col], &out[col], rows, stride);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -14,7 +14,7 @@
|
|||||||
// compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in
|
// compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in
|
||||||
// Glibc2.23 See https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1663280
|
// Glibc2.23 See https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1663280
|
||||||
//
|
//
|
||||||
// On grainsize: The grainsize is chosen to roughly get TBB_GRAIN_SIZE number of
|
// On grainsize: The grainsize is chosen to roughly get GRAIN_SIZE number of
|
||||||
// computations per task. Each task works across dim_size elements. 16 should be
|
// computations per task. Each task works across dim_size elements. 16 should be
|
||||||
// a very rough approximation of the number of computations per dim_size element
|
// a very rough approximation of the number of computations per dim_size element
|
||||||
// by counting simple computations (*, +, -) as 1 and exp or log as 4.
|
// by counting simple computations (*, +, -) as 1 and exp or log as 4.
|
||||||
@ -30,7 +30,7 @@ inline void _vec_log_softmax_lastdim(
|
|||||||
int64_t dim_size) {
|
int64_t dim_size) {
|
||||||
using Vec = vec256::Vec256<scalar_t>;
|
using Vec = vec256::Vec256<scalar_t>;
|
||||||
static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size;
|
static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size;
|
||||||
int64_t grain_size = internal::TBB_GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
|
int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
|
||||||
if (grain_size < CHUNK_SIZE)
|
if (grain_size < CHUNK_SIZE)
|
||||||
grain_size = CHUNK_SIZE;
|
grain_size = CHUNK_SIZE;
|
||||||
|
|
||||||
@ -93,7 +93,7 @@ inline void _vec_softmax_lastdim(
|
|||||||
int64_t outer_size,
|
int64_t outer_size,
|
||||||
int64_t dim_size) {
|
int64_t dim_size) {
|
||||||
using Vec = vec256::Vec256<scalar_t>;
|
using Vec = vec256::Vec256<scalar_t>;
|
||||||
int64_t grain_size = internal::TBB_GRAIN_SIZE / (16 * dim_size);
|
int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size);
|
||||||
if (grain_size < 1)
|
if (grain_size < 1)
|
||||||
grain_size = 1;
|
grain_size = 1;
|
||||||
|
|
||||||
@ -134,7 +134,7 @@ inline void _vec_host_softmax_backward_lastdim(
|
|||||||
int64_t outer_size,
|
int64_t outer_size,
|
||||||
int64_t dim_size) {
|
int64_t dim_size) {
|
||||||
using Vec = vec256::Vec256<scalar_t>;
|
using Vec = vec256::Vec256<scalar_t>;
|
||||||
int64_t grain_size = internal::TBB_GRAIN_SIZE / (16 * dim_size);
|
int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size);
|
||||||
if (grain_size < 1)
|
if (grain_size < 1)
|
||||||
grain_size = 1;
|
grain_size = 1;
|
||||||
|
|
||||||
|
1
setup.py
1
setup.py
@ -355,7 +355,6 @@ class build_deps(PytorchCommand):
|
|||||||
check_file(os.path.join(third_party_path, "nanopb", "CMakeLists.txt"))
|
check_file(os.path.join(third_party_path, "nanopb", "CMakeLists.txt"))
|
||||||
check_file(os.path.join(third_party_path, "pybind11", "CMakeLists.txt"))
|
check_file(os.path.join(third_party_path, "pybind11", "CMakeLists.txt"))
|
||||||
check_file(os.path.join(third_party_path, 'cpuinfo', 'CMakeLists.txt'))
|
check_file(os.path.join(third_party_path, 'cpuinfo', 'CMakeLists.txt'))
|
||||||
check_file(os.path.join(third_party_path, 'tbb', 'Makefile'))
|
|
||||||
check_file(os.path.join(third_party_path, 'catch', 'CMakeLists.txt'))
|
check_file(os.path.join(third_party_path, 'catch', 'CMakeLists.txt'))
|
||||||
check_file(os.path.join(third_party_path, 'onnx', 'CMakeLists.txt'))
|
check_file(os.path.join(third_party_path, 'onnx', 'CMakeLists.txt'))
|
||||||
|
|
||||||
|
1
third_party/tbb
vendored
1
third_party/tbb
vendored
Submodule third_party/tbb deleted from 633b01ad27
@ -27,7 +27,7 @@ git fetch fullrepo
|
|||||||
git checkout -b temporary-split-branch fullrepo/master
|
git checkout -b temporary-split-branch fullrepo/master
|
||||||
# Cribbed from https://stackoverflow.com/questions/2982055/detach-many-subdirectories-into-a-new-separate-git-repository
|
# Cribbed from https://stackoverflow.com/questions/2982055/detach-many-subdirectories-into-a-new-separate-git-repository
|
||||||
# and https://stackoverflow.com/questions/42355621/git-filter-branch-moving-a-folder-with-index-filter-does-not-work
|
# and https://stackoverflow.com/questions/42355621/git-filter-branch-moving-a-folder-with-index-filter-does-not-work
|
||||||
git filter-branch -f --index-filter 'git rm --cached -qr --ignore-unmatch -- . && git reset -q $GIT_COMMIT -- aten cmake third_party/tbb third_party/catch third_party/cpuinfo && (git ls-files -s | sed "s-.travis.aten.yml-.travis.yml-" | sed "s-.gitmodules.aten-.gitmodules-" | git update-index --index-info)'
|
git filter-branch -f --index-filter 'git rm --cached -qr --ignore-unmatch -- . && git reset -q $GIT_COMMIT -- aten cmake third_party/catch third_party/cpuinfo && (git ls-files -s | sed "s-.travis.aten.yml-.travis.yml-" | sed "s-.gitmodules.aten-.gitmodules-" | git update-index --index-info)'
|
||||||
git checkout master
|
git checkout master
|
||||||
git merge temporary-split-branch
|
git merge temporary-split-branch
|
||||||
git push
|
git push
|
||||||
|
Reference in New Issue
Block a user