Temporarily remove TBB (#8255)

2025-10-20 21:14:14 +08:00 · 2018-06-18 19:31:57 -04:00
parent 4f37a6481d
commit 05c473b85c
14 changed files with 73 additions and 547 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -1,7 +1,3 @@
 [submodule "third_party/tbb"]
 	path = third_party/tbb
 	url = https://github.com/01org/tbb
 	branch = tbb_2018
 [submodule "third_party/catch"]
 	path = third_party/catch
 	url = https://github.com/catchorg/Catch2.git
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@ -89,24 +89,6 @@ IF(NOT AT_LINK_STYLE)
  SET(AT_LINK_STYLE SHARED)
 ENDIF()
 # Unset our restrictive C++ flags here and reset them later.
 # Remove this once we use proper target_compile_options.
 set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
 set(CMAKE_CXX_FLAGS)
 set(TBB_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/tbb")
 set(TBB_BUILD_STATIC ON CACHE BOOL " " FORCE)
 set(TBB_BUILD_SHARED OFF CACHE BOOL " " FORCE)
 set(TBB_BUILD_TBBMALLOC OFF CACHE BOOL " " FORCE)
 set(TBB_BUILD_TBBMALLOC_PROXY OFF CACHE BOOL " " FORCE)
 set(TBB_BUILD_TESTS OFF CACHE BOOL " " FORCE)
 add_subdirectory(cpu/tbb)
 set_property(TARGET tbb_static tbb_def_files PROPERTY FOLDER "dependencies")
 list(APPEND ATen_THIRD_PARTY_INCLUDE ${TBB_ROOT_DIR}/include)
 list(APPEND ATen_CPU_DEPENDENCY_LIBS tbb_static)
 set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS})
 IF(BLAS_FOUND)
  IF ($ENV{TH_BINARY_BUILD})
    MESSAGE(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.")
--- a/aten/src/ATen/CPUApplyUtils.h
+++ b/aten/src/ATen/CPUApplyUtils.h
@ -149,7 +149,6 @@ inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
  for (auto& t : tensors)
    if (t.sizes().equals({0}))
      return false;
  internal::init_tbb_num_threads();
  return true;
 }
@ -351,7 +350,7 @@ template <typename scalar1, typename Op>
 inline void CPU_tensor_parallel_apply1(
    Tensor tensor1,
    const Op op,
-    int64_t grain_size = internal::TBB_GRAIN_SIZE) {
+    int64_t grain_size = internal::GRAIN_SIZE) {
  if (!_apply_preamble({tensor1}))
    return;
  if (tensor1.ndimension() < 8) {
@ -383,7 +382,7 @@ inline void CPU_tensor_parallel_apply2(
    Tensor tensor1,
    Tensor tensor2,
    const Op op,
-    int64_t grain_size = internal::TBB_GRAIN_SIZE) {
+    int64_t grain_size = internal::GRAIN_SIZE) {
  if (!_apply_preamble({tensor1, tensor2}))
    return;
  if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) {
--- a/aten/src/ATen/Parallel.cpp
+++ b/aten/src/ATen/Parallel.cpp
@ -1,56 +0,0 @@
 #include <ATen/CPUGeneral.h>
 #include <ATen/Parallel.h>
 #include <tbb/blocked_range.h>
 #include <tbb/parallel_reduce.h>
 #include <tbb/partitioner.h>
 #include <tbb/tbb.h>
 #include <cassert>
 #include <thread>
 namespace at { namespace internal {
 // thread_local variable with internal linkage
 // requires no guarding as its storage duration is defined to be per thread
 static thread_local tbb::task_scheduler_init tbbinit(
    tbb::task_scheduler_init::deferred);
 // Tracks the number of threads in use, which TBB doesn't track.
 static thread_local int num_threads_ = -1;
 // Negative number of threads means default value
 // Initializes (and, when the requested thread count changes, re-initializes)
 // the calling thread's TBB scheduler so that it matches at::get_num_threads().
 //
 // Throws std::runtime_error if the scheduler is found in an unexpected state:
 // already active on the very first call, or inactive on any later call.
 void init_tbb_num_threads() {
  static thread_local bool first_call = true;
  int num_threads = at::get_num_threads();
  if (num_threads == 0) {
    // TODO: For PyTorch 0 means 1
    // Normalized before any initialize() call so that 0 is never handed to
    // TBB, which expects a positive thread count (or "automatic").
    num_threads = 1;
  }
  // In order to have control over the number of threads this function
  // must be called first before any other tbb parallel construct is
  // exercised within a particular thread. Otherwise the default
  // scheduler will be created over which we do not have control.
  // The following code will and must throw an error if tbb has
  // already been initialized before this function was called.
  if (!tbbinit.is_active() && !first_call)
    throw std::runtime_error(
        "tbb initialization failed: scheduler not active after first call");
  if (first_call) {
    if (tbbinit.is_active())
      throw std::runtime_error(
          "tbb initialization failed: scheduler active on first call");
    if (num_threads < 0) {
      // Negative means "use the default": ask TBB for its default count.
      tbbinit.initialize(tbbinit.default_num_threads());
    } else {
      tbbinit.initialize(num_threads);
      // Remember the count so the check below does not immediately tear the
      // freshly created scheduler down and rebuild it with the same value.
      num_threads_ = num_threads;
    }
    first_call = false;
  }
  // Re-create the scheduler only when an explicit thread count differs from
  // the one this thread last initialized with.
  if (num_threads > 0 && (num_threads_ != num_threads)) {
    tbbinit.terminate();
    tbbinit.initialize(num_threads);
    num_threads_ = num_threads;
  }
 }
 } // namespace internal
 } // namespace at
--- a/aten/src/ATen/Parallel.h
+++ b/aten/src/ATen/Parallel.h
@ -1,54 +1,59 @@
 #pragma once
 #include <ATen/ATen.h>
 #include <cstddef>
 #include <tbb/tbb.h>
 #ifdef _OPENMP
 #include <omp.h>
 #endif
 namespace at {
 namespace internal {
 // This needs to be called before the first use of any algorithm such as
 // parallel or it will have no effect and the default task scheduler is
 // created which uses all available cores.
 // See
 // https://www.threadingbuildingblocks.org/docs/help/reference/task_scheduler/task_scheduler_init_cls.html
 // This does not initialize the number of workers in the market (the overall
 // number of workers available to a process). It is merely a request to the market
 // for a certain number of workers. If there are multiple threads making
 // a request at the size of the maximum number of threads, they will
 // be allocated a number proportional to the other requests.
 AT_API void init_tbb_num_threads();
 // This parameter is heuristically chosen to determine the minimum amount of
 // work that warrants parallelism. For example, when summing an array, it is
 // deemed inefficient to parallelise over arrays shorter than 32768. Further,
 // no parallel algorithm (such as parallel_reduce) should split work into
 // smaller than GRAIN_SIZE chunks.
-constexpr int64_t TBB_GRAIN_SIZE = 32768;
+constexpr int64_t GRAIN_SIZE = 32768;
 } // namespace internal
 // Integer division of x by y rounded up (for non-negative x and positive y).
 inline int64_t divup(int64_t x, int64_t y) {
  const int64_t padded = x + y - 1;
  return padded / y;
 }
 template <class F>
 inline void parallel_for(
-    int64_t begin,
+    const int64_t begin,
-    int64_t end,
+    const int64_t end,
-    int64_t grain_size,
+    const int64_t grain_size_,
-    const F& f) {
+    const F f) {
-  internal::init_tbb_num_threads();
+  const int64_t min_grain_size = divup((end - begin), get_num_threads());
-
+  const int64_t grain_size = std::max(min_grain_size, grain_size_);
-#ifdef __PPC64__
+#pragma omp parallel for if ((end - begin) >= grain_size && get_num_threads() > 1)
-  using default_partitioner_type = tbb::simple_partitioner;
+  for (int64_t i = begin; i < end; i += grain_size) {
-#else
+    f(i, i + std::min(end - i, grain_size));
  using default_partitioner_type = tbb::affinity_partitioner;
 #endif
  thread_local static default_partitioner_type ap;
  if ((end - begin) < grain_size || get_num_threads() == 1) {
    f(begin, end);
  } else {
    tbb::parallel_for(
        tbb::blocked_range<int64_t>(begin, end, grain_size),
        [f](const tbb::blocked_range<int64_t>& r) { f(r.begin(), r.end()); },
        ap);
  }
 }
 // Reduces the half-open range [begin, end) chunk by chunk.
 //
 //   begin, end   bounds of the range; an empty or reversed range yields ident.
 //   grain_size_  requested minimum chunk length; the effective chunk length
 //                is never smaller than (end - begin) / thread count, rounded
 //                up.
 //   ident        seed value. It is passed to every chunk call of `f` and is
 //                also the initial value of the final fold, so it is combined
 //                into the result more than once.
 //   f            f(chunk_begin, chunk_end, ident) -> partial result.
 //   sf           binary op used to fold the partial results together.
 //
 // Chunks are processed by an OpenMP parallel loop when the range is at least
 // one chunk long and more than one thread is configured.
 template <class scalar_t, class F, class SF>
 inline scalar_t parallel_reduce(
    const int64_t begin,
    const int64_t end,
    const int64_t grain_size_,
    const scalar_t ident,
    const F f,
    const SF sf) {
  // Nothing to reduce; also keeps a reversed range from producing a negative
  // chunk count below.
  if (end <= begin) {
    return ident;
  }
  // A reported thread count of 0 would make divup divide by zero; treat it
  // as a single thread.
  const auto num_threads = get_num_threads();
  const int64_t min_grain_size =
      divup((end - begin), num_threads == 0 ? 1 : num_threads);
  // Clamp to 1 so a non-positive grain size can never reach the division
  // inside divup or stall the chunk arithmetic.
  const int64_t grain_size =
      std::max<int64_t>(1, std::max(min_grain_size, grain_size_));
  const int64_t num_results = divup((end - begin), grain_size);
  std::vector<scalar_t> results(num_results);
  scalar_t* results_data = results.data();
 #pragma omp parallel for if ((end - begin) >= grain_size && get_num_threads() > 1)
  for (int64_t id = 0; id < num_results; id++) {
    int64_t i = begin + id * grain_size;
    // The last chunk may be shorter than grain_size.
    results_data[id] = f(i, i + std::min(end - i, grain_size), ident);
  }
  return std::accumulate(
      results_data, results_data + results.size(), ident, sf);
 }
 } // namespace at
--- a/aten/src/ATen/cpu/tbb/CMakeLists.txt
+++ b/aten/src/ATen/cpu/tbb/CMakeLists.txt
@ -1,376 +0,0 @@
 # Based on https://github.com/wjakob/tbb/blob/master/CMakeLists.txt
 # All credit goes to Wenzel Jakob!
 cmake_minimum_required (VERSION 2.8.12 FATAL_ERROR)
 project (tbb CXX)
 include(CheckCXXCompilerFlag)
 include(CheckCXXSourceRuns)
 if(POLICY CMP0058)
  cmake_policy(SET CMP0058 NEW)
 endif()
 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  message(STATUS "Setting build type to 'Release' as none was specified.")
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release"
    "MinSizeRel" "RelWithDebInfo")
 endif()
 if(NOT TBB_ROOT_DIR)
  set(TBB_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
 endif()
 if(NOT TBB_INSTALL_RUNTIME_DIR)
  set(TBB_INSTALL_RUNTIME_DIR bin)
 endif()
 if(NOT TBB_INSTALL_LIBRARY_DIR)
  set(TBB_INSTALL_LIBRARY_DIR lib)
 endif()
 if(NOT TBB_INSTALL_ARCHIVE_DIR)
  set(TBB_INSTALL_ARCHIVE_DIR lib)
 endif()
 if(NOT TBB_INSTALL_INCLUDE_DIR)
  set(TBB_INSTALL_INCLUDE_DIR "${TBB_ROOT_DIR}/include")
 endif()
 set(TBB_INCLUDES
  "${TBB_ROOT_DIR}/include"
  "${TBB_ROOT_DIR}/src"
  "${TBB_ROOT_DIR}/src/rml/include"
  ${CMAKE_CURRENT_BINARY_DIR})
 option(TBB_BUILD_SHARED          "Build TBB shared library" ON)
 option(TBB_BUILD_STATIC          "Build TBB static library" ON)
 option(TBB_BUILD_TBBMALLOC       "Build TBB malloc library" ON)
 option(TBB_BUILD_TBBMALLOC_PROXY "Build TBB malloc proxy library" ON)
 option(TBB_BUILD_TESTS           "Build TBB tests and enable testing infrastructure" ON)
 option(TBB_CI_BUILD              "Is this a continuous integration build?" OFF)
 if(APPLE)
  set(CMAKE_MACOSX_RPATH ON)
 endif()
 file(GLOB tbb_src "${TBB_ROOT_DIR}/src/tbb/*.cpp" "${TBB_ROOT_DIR}/src/old/*.cpp")
 list(APPEND tbb_src ${TBB_ROOT_DIR}/src/rml/client/rml_tbb.cpp)
 file(GLOB to_remove "${TBB_ROOT_DIR}/src/old/test*.cpp")
 if (NOT "${to_remove}" STREQUAL "")
  list(REMOVE_ITEM tbb_src ${to_remove})
 endif()
 set(tbbmalloc_static_src
  src/tbbmalloc/backend.cpp
  src/tbbmalloc/large_objects.cpp
  src/tbbmalloc/backref.cpp
  src/tbbmalloc/tbbmalloc.cpp
  src/tbbmalloc/frontend.cpp
  src/tbb/itt_notify.cpp)
 set(tbbmalloc_src ${tbbmalloc_static_src})
 set(tbbmalloc_proxy_src
  src/tbbmalloc/proxy.cpp
  src/tbbmalloc/tbb_function_replacement.cpp)
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "(i386|x86_64)")
  if (NOT APPLE AND NOT MINGW)
    add_definitions(-DDO_ITT_NOTIFY)
  endif()
 endif()
 if (APPLE)
  # Disable annoying "has no symbols" warnings
  set(CMAKE_C_ARCHIVE_CREATE   "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
  set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
  set(CMAKE_C_ARCHIVE_FINISH   "<CMAKE_RANLIB> -no_warning_for_no_symbols -c <TARGET>")
  set(CMAKE_CXX_ARCHIVE_FINISH "<CMAKE_RANLIB> -no_warning_for_no_symbols -c <TARGET>")
 endif()
 # Checks whether a trivial iostream program compiles, links and runs with the
 # given flags.
 #   _RESULT        name of the variable that receives the check result
 #   _CXX_FLAGS     flags placed in CMAKE_REQUIRED_FLAGS for the check
 #   _LINKER_FLAGS  flags placed in CMAKE_REQUIRED_LIBRARIES for the check
 # CMAKE_REQUIRED_FLAGS and CMAKE_REQUIRED_LIBRARIES are cleared afterwards;
 # CMAKE_REQUIRED_QUIET is left set to TRUE.
 macro(CHECK_CXX_COMPILER_AND_LINKER_FLAGS _RESULT _CXX_FLAGS _LINKER_FLAGS)
  set(CMAKE_REQUIRED_FLAGS ${_CXX_FLAGS})
  set(CMAKE_REQUIRED_LIBRARIES ${_LINKER_FLAGS})
  set(CMAKE_REQUIRED_QUIET TRUE)
  check_cxx_source_runs("#include <iostream>\nint main(int argc, char **argv) { std::cout << \"test\"; return 0; }" ${_RESULT})
  set(CMAKE_REQUIRED_FLAGS "")
  set(CMAKE_REQUIRED_LIBRARIES "")
 endmacro()
 # Prefer libc++ in conjunction with Clang
 if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
  if (CMAKE_CXX_FLAGS MATCHES "-stdlib=libc\\+\\+")
    message(STATUS "TBB: using libc++.")
  else()
    CHECK_CXX_COMPILER_AND_LINKER_FLAGS(HAS_LIBCPP "-stdlib=libc++" "-stdlib=libc++")
    if (HAS_LIBCPP)
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++ -D_LIBCPP_VERSION")
      set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++")
      set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -stdlib=libc++")
      message(STATUS "TBB: using libc++.")
    else()
      message(STATUS "TBB: NOT using libc++.")
    endif()
  endif()
 endif()
 # Platform-specific threading model, compiler flags and assembly sources.
 if (UNIX)
  add_definitions (-DUSE_PTHREAD)
  check_cxx_compiler_flag ("-std=c++11" SUPPORTS_STDCXX11)
  if (SUPPORTS_STDCXX11)
    set (CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
  endif ()
  check_cxx_compiler_flag ("-mrtm -Werror" SUPPORTS_MRTM)
  if (SUPPORTS_MRTM)
    set (CMAKE_CXX_FLAGS "-mrtm ${CMAKE_CXX_FLAGS}")
  endif ()
 elseif(WIN32)
  if (MSVC)
    cmake_minimum_required (VERSION 3.1)
    enable_language(ASM_MASM)
    set(CMAKE_CXX_FLAGS "/GS- /Zc:wchar_t /Zc:forScope /DUSE_WINTHREAD ${CMAKE_CXX_FLAGS}")
    set(CMAKE_CXX_FLAGS "/D_CRT_SECURE_NO_DEPRECATE /D_WIN32_WINNT=0x0600 ${CMAKE_CXX_FLAGS}")
    check_cxx_compiler_flag ("/volatile:iso" SUPPORTS_VOLATILE_FLAG)
    if (SUPPORTS_VOLATILE_FLAG)
      set(CMAKE_CXX_FLAGS "/volatile:iso ${CMAKE_CXX_FLAGS}")
    endif ()
    set(CMAKE_CXX_FLAGS "/wd4267 /wd4800 /wd4146 /wd4244 /wd4577 /wd4018 ${CMAKE_CXX_FLAGS}")
    if (NOT CMAKE_SIZEOF_VOID_P)
       message(FATAL_ERROR "'CMAKE_SIZEOF_VOID_P' is undefined. Please delete your build directory and rerun CMake again!")
    endif()
    if (CMAKE_SIZEOF_VOID_P EQUAL 8)
      # 64-bit build: MASM sources for intel64.
      list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/atomic_support.asm")
      list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/itsx.asm")
      list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/intel64_misc.asm")
      list(APPEND tbbmalloc_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/atomic_support.asm")
      set(CMAKE_ASM_MASM_FLAGS "/DEM64T=1 ${CMAKE_ASM_MASM_FLAGS}")
    else()
      # 32-bit build: each source is its own list item, rooted at TBB_ROOT_DIR
      # (this CMakeLists.txt does not live inside the TBB source tree).
      list(APPEND tbb_src
        "${TBB_ROOT_DIR}/src/tbb/ia32-masm/atomic_support.asm"
        "${TBB_ROOT_DIR}/src/tbb/ia32-masm/itsx.asm"
        "${TBB_ROOT_DIR}/src/tbb/ia32-masm/lock_byte.asm")
      # Enable SAFESEH feature for assembly (x86 builds only).
      set(CMAKE_ASM_MASM_FLAGS "/safeseh ${CMAKE_ASM_MASM_FLAGS}")
    endif()
  elseif (MINGW)
    add_definitions(-DUSE_WINTHREAD)
    add_definitions(-D_WIN32_WINNT=0x0502)
    set(CMAKE_CXX_FLAGS "-mthreads ${CMAKE_CXX_FLAGS}")
  endif ()
 endif()
 if (MSVC)
  set(ENABLE_RTTI "/EHsc /GR ")
  set(DISABLE_RTTI "/EHs- /GR- ")
 elseif (UNIX)
  set(ENABLE_RTTI "-frtti -fexceptions ")
  set(DISABLE_RTTI "-fno-rtti -fno-exceptions ")
 endif ()
 ##--------
 #   - Added TBB_USE_GLIBCXX_VERSION macro to specify the version of GNU
 #     libstdc++ when it cannot be properly recognized, e.g. when used
 #     with Clang on Linux* OS. Inspired by a contribution from David A.
 if (NOT TBB_USE_GLIBCXX_VERSION AND UNIX AND NOT APPLE)
  if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
    # using Clang
    string(REPLACE "." "0" TBB_USE_GLIBCXX_VERSION ${CMAKE_CXX_COMPILER_VERSION})
  endif()
 endif()
 if (TBB_USE_GLIBCXX_VERSION)
   add_definitions(-DTBB_USE_GLIBCXX_VERSION=${TBB_USE_GLIBCXX_VERSION})
 endif()
 ##-------
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
   check_cxx_compiler_flag ("-flifetime-dse=1" SUPPORTS_FLIFETIME)
   if (SUPPORTS_FLIFETIME)
     add_definitions(-flifetime-dse=1)
   endif()
 endif()
 # Linker export definitions
 if (APPLE)
  set (ARCH_PREFIX "mac")
 elseif(WIN32)
  set (ARCH_PREFIX "win")
 else()
  set (ARCH_PREFIX "lin")
 endif()
 if (CMAKE_SIZEOF_VOID_P EQUAL 8)
  set(ARCH_PREFIX "${ARCH_PREFIX}64")
 else()
  set(ARCH_PREFIX "${ARCH_PREFIX}32")
 endif()
 if (MINGW)
  set (ARCH_PREFIX "${ARCH_PREFIX}-gcc")
  # there's no win32-gcc-tbb-export.def, use lin32-tbb-export.def
  execute_process (COMMAND ${CMAKE_COMMAND} -E copy ${TBB_ROOT_DIR}/src/tbb/lin32-tbb-export.def ${TBB_ROOT_DIR}/src/tbb/win32-gcc-tbb-export.def)
 endif()
 if (MSVC)
  add_custom_command(OUTPUT tbb.def
    COMMAND ${CMAKE_CXX_COMPILER} /TC /EP ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def  -I ${TBB_ROOT_DIR}/include > tbb.def
    MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def
    COMMENT "Preprocessing tbb.def"
  )
  add_custom_command(OUTPUT tbbmalloc.def
    COMMAND ${CMAKE_CXX_COMPILER} /TC /EP ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def  -I ${TBB_ROOT_DIR}/include >   tbbmalloc.def
    MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def
    COMMENT "Preprocessing tbbmalloc.def"
  )
 else()
  add_custom_command(OUTPUT tbb.def
    COMMAND ${CMAKE_CXX_COMPILER} -xc++ -E ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def  -I ${TBB_ROOT_DIR}/include -o tbb.def
    MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def
    COMMENT "Preprocessing tbb.def"
  )
  add_custom_command(OUTPUT tbbmalloc.def
    COMMAND ${CMAKE_CXX_COMPILER} -xc++ -E ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def  -I ${TBB_ROOT_DIR}/include -o   tbbmalloc.def
    MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def
    COMMENT "Preprocessing tbbmalloc.def"
  )
 endif()
 add_custom_target(tbb_def_files DEPENDS tbb.def tbbmalloc.def)
 # TBB library
 if (TBB_BUILD_STATIC)
  add_library(tbb_static STATIC ${tbb_src})
  target_include_directories(tbb_static PRIVATE ${TBB_INCLUDES})
  set_property(TARGET tbb_static APPEND PROPERTY COMPILE_DEFINITIONS "__TBB_BUILD=1")
  set_property(TARGET tbb_static APPEND_STRING PROPERTY COMPILE_FLAGS ${ENABLE_RTTI})
  install(TARGETS tbb_static ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR})
  if (MSVC)
    target_compile_definitions(tbb_static PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1)
  endif()
  if (UNIX AND NOT APPLE)
    target_link_libraries(tbb_static PUBLIC pthread dl)
  endif()
 endif()
 if (TBB_BUILD_SHARED)
  add_library(tbb SHARED ${tbb_src})
  target_include_directories(tbb PRIVATE ${TBB_INCLUDES})
  set_property(TARGET tbb APPEND PROPERTY COMPILE_DEFINITIONS "__TBB_BUILD=1")
  set_property(TARGET tbb APPEND_STRING PROPERTY COMPILE_FLAGS ${ENABLE_RTTI})
  add_dependencies(tbb tbb_def_files)
  if (APPLE)
    set_property(TARGET tbb APPEND PROPERTY LINK_FLAGS "-Wl,-exported_symbols_list,\"${CMAKE_CURRENT_BINARY_DIR}/tbb.def\"")
  elseif (MSVC)
    set_property(TARGET tbb APPEND PROPERTY LINK_FLAGS "/DEF:\"${CMAKE_CURRENT_BINARY_DIR}/tbb.def\"")
  else ()
    set_property(TARGET tbb APPEND PROPERTY LINK_FLAGS "-Wl,-version-script,\"${CMAKE_CURRENT_BINARY_DIR}/tbb.def\"")
  endif()
  install(TARGETS tbb
          LIBRARY DESTINATION ${TBB_INSTALL_LIBRARY_DIR}
          ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR}
          RUNTIME DESTINATION ${TBB_INSTALL_RUNTIME_DIR})
  if (UNIX AND NOT APPLE)
    target_link_libraries(tbb PUBLIC pthread dl)
  endif()
  if (MSVC)
    target_compile_definitions(tbb PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1)
  endif()
 endif()
 if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
  # Quench a warning on GCC
  set_source_files_properties(${TBB_ROOT_DIR}/src/tbb/governor.cpp COMPILE_FLAGS "-Wno-missing-field-initializers ")
 elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
  # Quench a warning on Clang
  set_source_files_properties(${TBB_ROOT_DIR}/src/tbb/itt_notify.cpp COMPILE_FLAGS "-Wno-varargs ")
 elseif(MSVC)
  # Quench a warning on MSVC
  set_source_files_properties(${TBB_ROOT_DIR}/src/tbb/scheduler.cpp COMPILE_FLAGS "/wd4458 ")
 endif()
 if(TBB_BUILD_TBBMALLOC)
  # TBB malloc library
  if (TBB_BUILD_STATIC)
    add_library(tbbmalloc_static STATIC ${tbbmalloc_static_src})
    target_include_directories(tbbmalloc_static PRIVATE ${TBB_INCLUDES})
    set_property(TARGET tbbmalloc_static APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1")
    set_property(TARGET tbbmalloc_static APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI})
    if (MSVC)
      target_compile_definitions(tbbmalloc_static PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1 __TBBMALLOC_NO_IMPLICIT_LINKAGE=1)
    endif()
    install(TARGETS tbbmalloc_static ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR})
  endif()
  if (TBB_BUILD_SHARED)
    add_library(tbbmalloc SHARED ${tbbmalloc_src})
    target_include_directories(tbbmalloc PRIVATE ${TBB_INCLUDES})
    set_property(TARGET tbbmalloc APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1")
    set_property(TARGET tbbmalloc APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI})
    add_dependencies(tbbmalloc tbb_def_files)
    if (APPLE)
      set_property(TARGET tbbmalloc APPEND PROPERTY LINK_FLAGS "-Wl,-exported_symbols_list,\"${CMAKE_CURRENT_BINARY_DIR}/tbbmalloc.def\"")
    elseif (MSVC)
      set_property(TARGET tbbmalloc APPEND PROPERTY LINK_FLAGS "/DEF:\"${CMAKE_CURRENT_BINARY_DIR}/tbbmalloc.def\"")
    else ()
      set_property(TARGET tbbmalloc APPEND PROPERTY LINK_FLAGS "-Wl,-version-script,\"${CMAKE_CURRENT_BINARY_DIR}/tbbmalloc.def\"")
    endif()
    if (MSVC)
      target_compile_definitions(tbbmalloc PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1 __TBBMALLOC_NO_IMPLICIT_LINKAGE=1)
    endif()
    install(TARGETS tbbmalloc
            LIBRARY DESTINATION ${TBB_INSTALL_LIBRARY_DIR}
            ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR}
            RUNTIME DESTINATION ${TBB_INSTALL_RUNTIME_DIR})
    if (UNIX AND NOT APPLE)
      target_link_libraries(tbbmalloc PUBLIC pthread dl)
    endif()
  endif()
 endif()
 if(TBB_BUILD_TBBMALLOC_PROXY)
  # TBB malloc proxy library
  if (TBB_BUILD_STATIC)
    add_library(tbbmalloc_proxy_static STATIC ${tbbmalloc_proxy_src})
    set_property(TARGET tbbmalloc_proxy_static APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1")
    set_property(TARGET tbbmalloc_proxy_static APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI})
    install(TARGETS tbbmalloc_proxy_static ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR})
  endif()
  if (TBB_BUILD_SHARED)
    add_library(tbbmalloc_proxy SHARED ${tbbmalloc_proxy_src})
    set_property(TARGET tbbmalloc_proxy APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1")
    set_property(TARGET tbbmalloc_proxy APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI})
    target_link_libraries(tbbmalloc_proxy PUBLIC tbbmalloc)
    install(TARGETS tbbmalloc_proxy
            LIBRARY DESTINATION ${TBB_INSTALL_LIBRARY_DIR}
            ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR}
            RUNTIME DESTINATION ${TBB_INSTALL_RUNTIME_DIR})
    if (UNIX AND NOT APPLE)
      target_link_libraries(tbbmalloc_proxy PUBLIC pthread dl)
    endif()
  endif()
 endif()
 install(DIRECTORY "${TBB_ROOT_DIR}/include/tbb" DESTINATION ${TBB_INSTALL_INCLUDE_DIR})
 # version_string.ver
 if (UNIX)
  execute_process (COMMAND date "+%a, %d %b %Y %H:%M:%S %z"
                   OUTPUT_VARIABLE _configure_date
                   OUTPUT_STRIP_TRAILING_WHITESPACE)
 elseif (WIN32)
  execute_process (COMMAND cmd " /C date /T"
                   OUTPUT_VARIABLE _configure_date
                   OUTPUT_STRIP_TRAILING_WHITESPACE)
 else ()
  set (_configure_date "Unknown")
 endif()
 include_directories (${CMAKE_BINARY_DIR})
 configure_file (extra/version_string.ver.in version_string.ver @ONLY)
--- a/aten/src/ATen/cpu/tbb/extra/version_string.ver.in
+++ b/aten/src/ATen/cpu/tbb/extra/version_string.ver.in
@ -1,11 +0,0 @@
 #define __TBB_VERSION_STRINGS(N) \
 #N": BUILD_HOST         @CMAKE_SYSTEM_NAME@" ENDL \
 #N": BUILD_OS           @CMAKE_SYSTEM@" ENDL \
 #N": BUILD_KERNEL       @CMAKE_SYSTEM_VERSION@" ENDL \
 #N": BUILD_GCC          @CMAKE_CXX_COMPILER_ID@" ENDL \
 #N": BUILD_LIBC         Unknown" ENDL \
 #N": BUILD_LD           Unknown" ENDL \
 #N": BUILD_TARGET       Unknown" ENDL \
 #N": BUILD_COMMAND      Unknown" ENDL
 #define __TBB_DATETIME "@_configure_date@"
--- a/aten/src/ATen/cpu/vec256/vec256_base.h
+++ b/aten/src/ATen/cpu/vec256/vec256_base.h
@ -22,7 +22,7 @@ namespace {
 template <class T>
 struct Vec256 {
  static constexpr int size = 32 / sizeof(T);
-  T values[32 / sizeof(T)];
+  T values[32 / sizeof(T)] = {0};
  Vec256() {}
  Vec256(T val) {
    for (int i = 0; i != size; i++) {
--- a/aten/src/ATen/native/SoftMax.cpp
+++ b/aten/src/ATen/native/SoftMax.cpp
@ -23,7 +23,7 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) {
  int64_t outer_stride = dim_size * dim_stride;
  scalar_t* input_data_base = input.data<scalar_t>();
  scalar_t* output_data_base = output.data<scalar_t>();
-  int64_t grain_size = std::min(internal::TBB_GRAIN_SIZE / dim_size, (int64_t)1);
+  int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1);
  parallel_for(
      0, outer_size * inner_size, grain_size,
      [&](int64_t begin, int64_t end) {
@ -80,7 +80,7 @@ void host_softmax_backward(
  scalar_t* gradInput_data_base = gI.data<scalar_t>();
  scalar_t* output_data_base = output.data<scalar_t>();
  scalar_t* gradOutput_data_base = grad.data<scalar_t>();
-  int64_t grain_size = std::min(internal::TBB_GRAIN_SIZE / dim_size, (int64_t)1);
+  int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1);
  parallel_for(
      0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) {
        for (int64_t i = begin; i < end; i++) {
--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@ -9,12 +9,6 @@
 #include "ATen/cpu/vec256/vec256.h"
 #include "ATen/optional.h"
 #ifdef __PPC64__
 using default_partitioner_type = tbb::simple_partitioner;
 #else
 using default_partitioner_type = tbb::affinity_partitioner;
 #endif
 namespace at { namespace native { namespace {
 using namespace vec256;
@ -23,19 +17,22 @@ static inline int64_t round_down(int64_t a, int64_t m) {
  return a - (a % m);
 }
-template<typename F>
+template <typename F>
-static void parallel_for(int64_t end, int64_t step, bool parallelize, F func) {
+static void _parallel_for(int64_t size, int64_t step, bool parallelize, F func) {
  if (parallelize) {
-    tbb::parallel_for<int64_t>(0, end, step, func);
+    parallel_for(0, size / step, 1, [func, step](int64_t begin, int64_t end) {
      int64_t k = begin * step;
      for (int64_t i = begin; i < end; i++, k += step) {
        func(k);
      }
    });
  } else {
-    for (int64_t i = 0; i != end; i += step) {
+    for (int64_t i = 0; i != size; i += step) {
      func(i);
    }
  }
 }
 static default_partitioner_type ap;
 // Vectorized reduction defined by reduce operation `Op` with identity `ident`.
 // The reduction is built on top of reduce128, which reduces down a column
 // 128 bytes wide (WIDTH scalar elements). The width of 128 bytes is chosen
@ -50,8 +47,6 @@ struct Reduction {
  using ReduceScalar = Op<scalar_t>;
  static void apply(Tensor& res, const Tensor& self, at::optional<int64_t> dim) {
    internal::init_tbb_num_threads();
    auto out = res.data<scalar_t>();
    auto data = self.data<scalar_t>();
    auto numel = self.numel();
@ -71,8 +66,8 @@ struct Reduction {
      }
    }
    int64_t batch = numel / (n * stride);
-    bool paralellize = batch * n > internal::TBB_GRAIN_SIZE;
+    bool paralellize = batch * n > internal::GRAIN_SIZE;
-    parallel_for(batch, 1, paralellize, [=](int64_t b) {
+    _parallel_for(batch, 1, paralellize, [=](int64_t b) {
      if (stride == 1) {
        out[b] = reduce_all(&data[b * n], n);
      } else {
@ -84,23 +79,17 @@ struct Reduction {
  static scalar_t reduce_all(const scalar_t* data, int64_t size) {
    int64_t k = size / WIDTH;
-    scalar_t sum;
+    scalar_t sum = parallel_reduce(
-    if (size > internal::TBB_GRAIN_SIZE) {
+        0,
-      sum = tbb::parallel_reduce(
+        k,
-          tbb::blocked_range<int64_t>(0, k, internal::TBB_GRAIN_SIZE / WIDTH),
+        internal::GRAIN_SIZE / WIDTH,
-          scalar_t(ident),
+        (scalar_t)ident,
-          [=](const tbb::blocked_range<int64_t>& r, scalar_t init) {
+        [data](int64_t begin, int64_t end, scalar_t init) {
-            scalar_t buf[WIDTH];
+          scalar_t buf[WIDTH];
-            reduce128(&data[r.begin() * WIDTH], buf, r.end() - r.begin(), WIDTH);
+          reduce128(&data[begin * WIDTH], buf, end - begin, WIDTH);
-            return std::accumulate(buf, buf + WIDTH, init, ReduceScalar());
+          return std::accumulate(buf, buf + WIDTH, init, ReduceScalar());
-          },
+        },
-          ReduceScalar(),
+        ReduceScalar());
          ap);
    } else {
      scalar_t buf[WIDTH];
      reduce128(data, buf, k, WIDTH);
      sum = std::accumulate(buf, buf + WIDTH, scalar_t(ident), ReduceScalar());
    }
    for (int64_t i = k * WIDTH; i != size; i++) {
      sum = ReduceScalar()(sum, data[i]);
@ -127,8 +116,8 @@ struct Reduction {
  // Reduce a 2d matrix down each column. Stores the results in out[0 ... cols-1]
  static void reduce2d(const scalar_t* data, scalar_t* out, int64_t rows, int64_t cols, int64_t stride) {
    int64_t cols_rounded = round_down(cols, WIDTH);
-    bool paralellize = cols * rows > internal::TBB_GRAIN_SIZE;
+    bool paralellize = cols * rows > internal::GRAIN_SIZE;
-    parallel_for(cols_rounded, WIDTH, paralellize, [=](int64_t col) {
+    _parallel_for(cols_rounded, WIDTH, paralellize, [=](int64_t col) {
      reduce128(&data[col], &out[col], rows, stride);
    });
--- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp
+++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp
@ -14,7 +14,7 @@
 // compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in
 // Glibc2.23 See https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1663280
 //
-// On grainsize: The grainsize is chosen to roughly get TBB_GRAIN_SIZE number of
+// On grainsize: The grainsize is chosen to roughly get GRAIN_SIZE number of
 // computations per task. Each task works across dim_size elements. 16 should be
 // a very rough approximation of the number of computations per dim_size element
 // by counting simple computations (*, +, -) as 1 and exp or log as 4.
@ -30,7 +30,7 @@ inline void _vec_log_softmax_lastdim(
    int64_t dim_size) {
  using Vec = vec256::Vec256<scalar_t>;
  static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size;
-  int64_t grain_size = internal::TBB_GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
+  int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
  if (grain_size < CHUNK_SIZE)
    grain_size = CHUNK_SIZE;
@ -93,7 +93,7 @@ inline void _vec_softmax_lastdim(
    int64_t outer_size,
    int64_t dim_size) {
  using Vec = vec256::Vec256<scalar_t>;
-  int64_t grain_size = internal::TBB_GRAIN_SIZE / (16 * dim_size);
+  int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size);
  if (grain_size < 1)
    grain_size = 1;
@ -134,7 +134,7 @@ inline void _vec_host_softmax_backward_lastdim(
    int64_t outer_size,
    int64_t dim_size) {
  using Vec = vec256::Vec256<scalar_t>;
-  int64_t grain_size = internal::TBB_GRAIN_SIZE / (16 * dim_size);
+  int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size);
  if (grain_size < 1)
    grain_size = 1;
--- a/setup.py
+++ b/setup.py
@ -355,7 +355,6 @@ class build_deps(PytorchCommand):
        check_file(os.path.join(third_party_path, "nanopb", "CMakeLists.txt"))
        check_file(os.path.join(third_party_path, "pybind11", "CMakeLists.txt"))
        check_file(os.path.join(third_party_path, 'cpuinfo', 'CMakeLists.txt'))
        check_file(os.path.join(third_party_path, 'tbb', 'Makefile'))
        check_file(os.path.join(third_party_path, 'catch', 'CMakeLists.txt'))
        check_file(os.path.join(third_party_path, 'onnx', 'CMakeLists.txt'))
--- a/third_party/tbb
+++ b/third_party/tbb
--- a/tools/aten_mirror.sh
+++ b/tools/aten_mirror.sh
@ -27,7 +27,7 @@ git fetch fullrepo
 git checkout -b temporary-split-branch fullrepo/master
 # Cribbed from https://stackoverflow.com/questions/2982055/detach-many-subdirectories-into-a-new-separate-git-repository
 # and https://stackoverflow.com/questions/42355621/git-filter-branch-moving-a-folder-with-index-filter-does-not-work
-git filter-branch -f --index-filter 'git rm --cached -qr --ignore-unmatch -- . && git reset -q $GIT_COMMIT -- aten cmake third_party/tbb third_party/catch third_party/cpuinfo && (git ls-files -s | sed "s-.travis.aten.yml-.travis.yml-" | sed "s-.gitmodules.aten-.gitmodules-" | git update-index --index-info)'
+git filter-branch -f --index-filter 'git rm --cached -qr --ignore-unmatch -- . && git reset -q $GIT_COMMIT -- aten cmake third_party/catch third_party/cpuinfo && (git ls-files -s | sed "s-.travis.aten.yml-.travis.yml-" | sed "s-.gitmodules.aten-.gitmodules-" | git update-index --index-info)'
 git checkout master
 git merge temporary-split-branch
 git push