diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake
index fc9f1a74fc5f..8a2c3b1a2fc9 100644
--- a/cmake/Codegen.cmake
+++ b/cmake/Codegen.cmake
@@ -102,9 +102,24 @@ if (INTERN_BUILD_ATEN_OPS)
   IF(CXX_AVX2_FOUND)
     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX2_CPU_DEFINITION")
     LIST(APPEND CPU_CAPABILITY_NAMES "AVX2")
     IF(MSVC)
       LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX2")
     ELSE(MSVC)
-      LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma")
+      # Some versions of GCC pessimistically split unaligned load and store
+      # instructions when using the default tuning. This is a bad choice on new
+      # Intel and AMD processors, so disable the splitting when compiling with
+      # AVX2 if the compiler accepts the flags. The probe lives in the non-MSVC
+      # branch because that is the only place its result is consumed.
+      # See https://stackoverflow.com/questions/52626726/why-doesnt-gcc-resolve-mm256-loadu-pd-as-single-vmovupd
+      #
+      # CPU_NO_AVX256_SPLIT_FLAGS is always assigned here and cleared when the
+      # probe fails, so a stale value can never leak into the AVX2 flags.
+      INCLUDE(CheckCXXCompilerFlag)
+      SET(CPU_NO_AVX256_SPLIT_FLAGS "-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
+      check_cxx_compiler_flag("${CPU_NO_AVX256_SPLIT_FLAGS}" COMPILER_SUPPORTS_NO_AVX256_SPLIT)
+      IF(NOT COMPILER_SUPPORTS_NO_AVX256_SPLIT)
+        SET(CPU_NO_AVX256_SPLIT_FLAGS "")
+      ENDIF()
+      LIST(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma ${CPU_NO_AVX256_SPLIT_FLAGS}")
     ENDIF(MSVC)
   ENDIF(CXX_AVX2_FOUND)