mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
Summary: Recent versions of GCC split unaligned load and store intrinsics into two 128-bit instructions. On old processors (Sandy Bridge) this was a bit faster for unaligned data, but a bit slower for aligned data. On new processors (Intel Haswell+, recent AMD) splitting loads is slower on both aligned and unaligned data. Clang, MSVC, and ICC do not split unaligned load and store intrinsics. There's a good explanation here: https://stackoverflow.com/questions/52626726/why-doesnt-gcc-resolve-mm256-loadu-pd-as-single-vmovupd#tab-top Splitting load and store intrinsics makes no sense in our AVX2 configuration because the CPUs that support AVX2 instructions are the same CPUs where splitting is disadvantageous for all data alignments. Note that this doesn't change the AVX configuration (used by CPUs that support AVX but not AVX2). It's possible this would be beneficial for that configuration too (our data is usually 32-byte aligned), but I'd prefer the conservative change for now. torch.add generated assembly (hot loop) (GCC 7.3.0) before: https://gist.github.com/colesbury/066376537bccd514daf8fe4ab54d8295 after: https://gist.github.com/colesbury/8b4b948145001d44b225c51d2428bb91 Timing of `torch.add(x, y, out=z)` for size 10240 (1 thread, Broadwell, no turbo): before: 7.35 us after: 6.39 us (Take the torch.add timings with a grain of salt. The difference in timings is much larger than I would expect.) Pull Request resolved: https://github.com/pytorch/pytorch/pull/20609 Differential Revision: D15385800 Pulled By: colesbury fbshipit-source-id: 66415b148a3b19360b9de9881af594ab46547b6f
203 lines
8.0 KiB
CMake
203 lines
8.0 KiB
CMake
# This ill-named file does a number of things:
# - Installs Caffe2 header files (this has nothing to do with code generation)
# - Configures caffe2/core/macros.h
# - Creates an ATen target for its generated C++ files and adds it
#   as a dependency
|
|
|
|
################################################################################
|
|
# Helper functions
|
|
################################################################################
|
|
|
|
# filter_list(<output> <input> [<pattern>...])
#
# Keeps the elements of a list that match at least one regular expression.
#
#   output   - name of the variable that receives the result in the caller's
#              scope.
#   input    - NAME of the list variable to filter (not its value).
#   pattern  - one or more CMake regular expressions (see if(... MATCHES ...)).
#
# Input order is preserved and each element is emitted at most once, even if
# it matches several patterns. When nothing matches (or no pattern is given)
# the output variable ends up with no value.
#
# Example:
#   set(srcs a.cpp b.cu c.cpp)
#   filter_list(cpp_srcs srcs "\\.cpp$")   # cpp_srcs = a.cpp;c.cpp
function(filter_list output input)
  unset(result)
  foreach(filename ${${input}})
    foreach(pattern ${ARGN})
      if("${filename}" MATCHES "${pattern}")
        list(APPEND result "${filename}")
        # One match is enough: stop here so the element is not appended again
        # for every further pattern it also happens to match.
        break()
      endif()
    endforeach()
  endforeach()
  set(${output} ${result} PARENT_SCOPE)
endfunction()
|
|
|
|
# filter_list_exclude(<output> <input> [<pattern>...])
#
# Removes from a list every element that matches any of the given regular
# expressions.
#
#   output   - name of the variable that receives the result in the caller's
#              scope.
#   input    - NAME of the list variable to filter (not its value).
#   pattern  - one or more CMake regular expressions (see if(... MATCHES ...)).
#
# Input order is preserved and each surviving element is emitted exactly once.
# With no patterns nothing is excluded, so the input is returned unchanged.
#
# Example:
#   set(srcs a.cpp b.cu c.h)
#   filter_list_exclude(kept srcs "\\.cu$" "\\.h$")   # kept = a.cpp
function(filter_list_exclude output input)
  unset(result)
  foreach(filename ${${input}})
    # An element is dropped as soon as ANY pattern matches it. Deciding per
    # element (rather than appending once per non-matching pattern) avoids
    # both duplicates and keeping elements that only some patterns reject.
    set(excluded FALSE)
    foreach(pattern ${ARGN})
      if("${filename}" MATCHES "${pattern}")
        set(excluded TRUE)
        break()
      endif()
    endforeach()
    if(NOT excluded)
      list(APPEND result "${filename}")
    endif()
  endforeach()
  set(${output} ${result} PARENT_SCOPE)
endfunction()
|
|
|
|
################################################################################
|
|
|
|
# ---[ Write the macros file
# Instantiate the macros.h.in template from the source tree into the build
# tree as caffe2/core/macros.h.
configure_file(
  ${CMAKE_CURRENT_LIST_DIR}/../caffe2/core/macros.h.in
  ${CMAKE_BINARY_DIR}/caffe2/core/macros.h
)
|
|
|
|
# ---[ Installing the header files
# Every *.h file found under the caffe2 source directory.
install(
  DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2
  DESTINATION include
  FILES_MATCHING
  PATTERN "*.h"
)

# The ATen/core headers are installed from here only when the ATen operators
# are not being built by this file.
if(NOT INTERN_BUILD_ATEN_OPS)
  install(
    DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core
    DESTINATION include/ATen
    FILES_MATCHING
    PATTERN "*.h"
  )
endif()

# macros.h is produced in the build tree, so it needs its own install rule.
install(
  FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h
  DESTINATION include/caffe2/core
)
|
|
|
|
# ---[ ATen specific
|
|
if (INTERN_BUILD_ATEN_OPS)
|
|
# Optimisation flag prefix used for the specially-compiled sources below.
# The trailing space is intentional: other flags are concatenated directly
# after ${OPT_FLAG}.
if(MSVC)
  set(OPT_FLAG "/Ox /fp:strict ")
else()
  set(OPT_FLAG "-O3 ")
endif()

# Name of the OpenMP runtime library to use (release flavour by default).
set(VCOMP_LIB "vcomp")

# Debug builds: no optimisation flag (a single space keeps the value
# non-empty) and the debug flavour of the OpenMP runtime.
if("${CMAKE_BUILD_TYPE}" MATCHES "Debug")
  set(VCOMP_LIB "vcompd")
  set(OPT_FLAG " ")
endif()
|
|
|
|
# TH ships one hand-vectorised translation unit per instruction set. Each one
# is compiled with the optimisation flag plus the flags for that instruction
# set, and only when the corresponding C_*_FOUND variable is true.

# AVX
if(C_AVX_FOUND)
  if(NOT MSVC)
    set_source_files_properties(
      ${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/vector/AVX.cpp
      PROPERTIES COMPILE_FLAGS "${OPT_FLAG} ${CXX_AVX_FLAGS}")
  else()
    # MSVC additionally gets /arch:AVX, appended directly to OPT_FLAG.
    set_source_files_properties(
      ${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/vector/AVX.cpp
      PROPERTIES COMPILE_FLAGS "${OPT_FLAG}/arch:AVX ${CXX_AVX_FLAGS}")
  endif()
endif()

# AVX2
if(C_AVX2_FOUND)
  if(NOT MSVC)
    set_source_files_properties(
      ${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/vector/AVX2.cpp
      PROPERTIES COMPILE_FLAGS "${OPT_FLAG} ${CXX_AVX2_FLAGS}")
  else()
    # MSVC additionally gets /arch:AVX2, appended directly to OPT_FLAG.
    set_source_files_properties(
      ${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/vector/AVX2.cpp
      PROPERTIES COMPILE_FLAGS "${OPT_FLAG}/arch:AVX2 ${CXX_AVX2_FLAGS}")
  endif()
endif()
|
|
|
|
# THAllocator.cpp is built with -fno-openmp on every compiler except MSVC and
# Clang (matched against the C compiler id).
if(NOT (MSVC OR "${CMAKE_C_COMPILER_ID}" MATCHES "Clang"))
  set_source_files_properties(
    ${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/THAllocator.cpp
    PROPERTIES COMPILE_FLAGS "-fno-openmp")
endif()
|
|
|
|
# All kernel sources directly under ATen/native/cpu.
file(GLOB cpu_kernel_cpp_in "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cpu/*.cpp")

# CPU_CAPABILITY_NAMES and CPU_CAPABILITY_FLAGS are parallel lists: entry i of
# one belongs to entry i of the other. The baseline "DEFAULT" capability uses
# only the optimisation flag.
list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}")
list(APPEND CPU_CAPABILITY_NAMES "DEFAULT")
|
|
|
|
# Register the AVX capability when the C++ compiler supports it.
if(CXX_AVX_FOUND)
  # Tell every C++ translation unit that an AVX variant exists.
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX_CPU_DEFINITION")

  # Name and flags are appended together to keep the two lists aligned.
  list(APPEND CPU_CAPABILITY_NAMES "AVX")
  if(NOT MSVC)
    list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx")
  else()
    list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX")
  endif()
endif()
|
|
|
|
# Register the AVX2 capability when the C++ compiler supports it.
if(CXX_AVX2_FOUND)
  # Tell every C++ translation unit that an AVX2 variant exists.
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX2_CPU_DEFINITION")

  # With its default tuning, GCC may break each unaligned 256-bit load/store
  # into two 128-bit instructions. That is a pessimisation on the newer Intel
  # and AMD CPUs that support AVX2, so the splitting is switched off for the
  # AVX2 build whenever the compiler accepts the flags.
  # See https://stackoverflow.com/questions/52626726/why-doesnt-gcc-resolve-mm256-loadu-pd-as-single-vmovupd#tab-top
  check_cxx_compiler_flag(
    "-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store"
    COMPILER_SUPPORTS_NO_AVX256_SPLIT)
  if(COMPILER_SUPPORTS_NO_AVX256_SPLIT)
    set(CPU_NO_AVX256_SPLIT_FLAGS "-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
  endif()

  # Name and flags are appended together to keep the two lists aligned.
  list(APPEND CPU_CAPABILITY_NAMES "AVX2")
  if(NOT MSVC)
    list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma ${CPU_NO_AVX256_SPLIT_FLAGS}")
  else()
    list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX2")
  endif()
endif()
|
|
|
|
# Compile every CPU kernel source once per registered capability.
#
# For each (capability, kernel) pair the kernel source is copied into the
# build tree as <relative path>.<CAPABILITY>.cpp and the copy is given that
# capability's compile flags plus two macros identifying the capability.

# foreach(RANGE n) iterates 0..n inclusive, so use the last valid index.
list(LENGTH CPU_CAPABILITY_NAMES NUM_CPU_CAPABILITY_NAMES)
math(EXPR NUM_CPU_CAPABILITY_NAMES "${NUM_CPU_CAPABILITY_NAMES}-1")

foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES})
  foreach(IMPL ${cpu_kernel_cpp_in})
    # Capability name and flags live at the same index of the two lists.
    list(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY)
    list(GET CPU_CAPABILITY_FLAGS ${i} FLAGS)

    # CPU_CAPABILITY=<name> and CPU_CAPABILITY_<name>, spelled for the
    # compiler's command-line syntax.
    if(NOT MSVC)
      set(MACRO_FLAG "-DCPU_CAPABILITY=${CPU_CAPABILITY} -DCPU_CAPABILITY_${CPU_CAPABILITY}")
    else()
      set(MACRO_FLAG "/DCPU_CAPABILITY=${CPU_CAPABILITY} /DCPU_CAPABILITY_${CPU_CAPABILITY}")
    endif()

    # Path of the kernel relative to aten/src/ATen, reused under the build dir.
    string(REPLACE "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/" "" NAME ${IMPL})
    set(NEW_IMPL ${CMAKE_BINARY_DIR}/aten/src/ATen/${NAME}.${CPU_CAPABILITY}.cpp)

    # Verbatim copy; the per-capability differences come from the flags only.
    configure_file(${IMPL} ${NEW_IMPL} COPYONLY)
    set_source_files_properties(${NEW_IMPL} PROPERTIES COMPILE_FLAGS "${FLAGS} ${MACRO_FLAG}")

    # Create list of copies (newest first).
    set(cpu_kernel_cpp ${NEW_IMPL} ${cpu_kernel_cpp})
  endforeach()
endforeach()

# Hand the generated copies to the ATen CPU source list.
list(APPEND ATen_CPU_SRCS ${cpu_kernel_cpp})
|
|
|
|
# Declaration files passed to gen.py on its command line; they are also
# listed as dependencies of the generation rule.
set(cwrap_files
  ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/Declarations.cwrap
  ${CMAKE_CURRENT_LIST_DIR}/../aten/src/THNN/generic/THNN.h
  ${CMAKE_CURRENT_LIST_DIR}/../aten/src/THCUNN/generic/THCUNN.h
  ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/nn.yaml
  ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml
)

# Every Python script directly under aten/src/ATen.
file(GLOB all_python "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/*.py")
|
|
|
|
# --rocm is passed to the generator only for ROCm builds; otherwise the
# variable is left without a value so it contributes no argument.
if(USE_ROCM)
  set(GEN_ROCM_FLAG --rocm)
else()
  set(GEN_ROCM_FLAG)
endif()

# Full gen.py command line, shared by the configure-time invocation and the
# build-time generation rule.
set(GEN_COMMAND
  "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/gen.py
  --source-path ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen
  --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen
  ${GEN_ROCM_FLAG}
  ${cwrap_files}
)
|
|
|
|
# Run the generator at configure time with --output-dependencies so that the
# lists of generated files are available to CMake. The two list files read
# below are expected to have been written by this invocation.
execute_process(
  COMMAND ${GEN_COMMAND}
    --output-dependencies ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt
  RESULT_VARIABLE RETURN_VALUE
)

# RETURN_VALUE holds the exit code, or an error string if the process could
# not be run at all; anything other than 0 is a failure. Report that value:
# the generated file list has not been read yet at this point, so printing it
# would show nothing useful.
if(NOT RETURN_VALUE EQUAL 0)
  message(FATAL_ERROR "Failed to get generated_cpp list (generator result: ${RETURN_VALUE})")
endif()

# CPU and CUDA file lists; they become the OUTPUT of the generation rule.
file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt generated_cpp)
file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-cuda cuda_generated_cpp)
|
|
|
|
# Every file below the ATen templates directory, at any depth.
file(GLOB_RECURSE all_templates "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/templates/*")

# These headers are produced by the generator script but are also checked in.
# The script verifies that the checked-in copies are equivalent to what it
# generates, so they have to be tracked as inputs of the script.
set(core_gen_checked_inputs
  ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/Type.h
  ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/Tensor.h
  ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/TensorMethods.h
)

# Create the generator's output directories in the build tree up front.
file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen)
file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen/core_tmp)
|
|
|
|
# Build rule that (re)generates the ATen sources: it produces every file named
# in the CPU and CUDA lists and reruns whenever a generator script, template,
# declaration file or checked-in generated header changes.
add_custom_command(
  OUTPUT
    ${generated_cpp}
    ${cuda_generated_cpp}
  COMMAND ${GEN_COMMAND}
  DEPENDS
    ${all_python}
    ${all_templates}
    ${cwrap_files}
    ${core_gen_checked_inputs}
)
|
|
|
|
# CMake does not correctly track generated headers that are included from CUDA
# (.cu) files. As a workaround, libATen.so is made to depend explicitly on the
# generation step. Each flavour gets a custom target that drives generation
# and an INTERFACE library that depends on that target.

# CPU
add_custom_target(ATEN_CPU_FILES_GEN_TARGET DEPENDS ${generated_cpp})
add_library(ATEN_CPU_FILES_GEN_LIB INTERFACE)
add_dependencies(ATEN_CPU_FILES_GEN_LIB ATEN_CPU_FILES_GEN_TARGET)

# CUDA
add_custom_target(ATEN_CUDA_FILES_GEN_TARGET DEPENDS ${cuda_generated_cpp})
add_library(ATEN_CUDA_FILES_GEN_LIB INTERFACE)
add_dependencies(ATEN_CUDA_FILES_GEN_LIB ATEN_CUDA_FILES_GEN_TARGET)
|
|
endif()
|