This PR provides an initial cutlass implementation of the grouped gemm API as described in this [document](https://docs.google.com/document/d/1985La6wUUVH1AGBkNhaGKUXzx-9ybtbUp567-vYVOM4/edit?tab=t.0#heading=h.g8lzbjnyzzx9). Any combination of 2d and 3d inputs is supported, with 2d inputs being jagged and the offsets of the jagged input given by the device tensor `offs`. Only H100 is supported, and only fp8_e4m3 with bf16 output and rowwise scaling. All dimensions of each individual gemm have to be a multiple of 16; that's a cutlass limitation. I still need to add those checks, and for dynamic dimensions the checks will unfortunately have to be a device assert. I had to copy-paste cutlass's `Sm90RowBroadcast` and `Sm90ColBroadcast` structs with minor changes to allow scales given as pointer arrays; ideally those changes would be part of cutlass itself. I copied the schedules from the similar grouped gemm in FBGEMM, but there's a lot of room to improve perf, especially for `fast_accum=False`. Next steps are perf tuning and extending coverage to B100; I don't know yet how the cutlass grouped gemm example handles blockwise scaling on B100.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/148531
Approved by: https://github.com/drisspg
468 lines · 19 KiB · CMake
# This ill-named file does a number of things:
# - Installs Caffe2 header files (this has nothing to do with code generation)
# - Configures caffe2/core/macros.h
# - Creates an ATen target for its generated C++ files and adds it
#   as a dependency
# - Reads build lists defined in build_variables.bzl

################################################################################
# Helper functions
################################################################################
function(filter_list output input)
  unset(result)
  foreach(filename ${${input}})
    foreach(pattern ${ARGN})
      if("${filename}" MATCHES "${pattern}")
        list(APPEND result "${filename}")
      endif()
    endforeach()
  endforeach()
  set(${output} ${result} PARENT_SCOPE)
endfunction()

function(filter_list_exclude output input)
  unset(result)
  foreach(filename ${${input}})
    foreach(pattern ${ARGN})
      if(NOT "${filename}" MATCHES "${pattern}")
        list(APPEND result "${filename}")
      endif()
    endforeach()
  endforeach()
  set(${output} ${result} PARENT_SCOPE)
endfunction()
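
# Usage sketch for the helpers above (illustrative; these variables are not
# defined in this file):
#
#   set(example_srcs "a.cpp;b.cu;c.cpp")
#   filter_list(cu_only example_srcs "\\.cu$")         # cu_only = "b.cu"
#   filter_list_exclude(no_cu example_srcs "\\.cu$")   # no_cu   = "a.cpp;c.cpp"
#
# Patterns are CMake regular expressions, so a literal dot must be escaped.
# With several patterns, a filename is appended once per pattern it matches
# (or fails to match, for the exclude variant).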

################################################################################

# ---[ Determine commit hash
execute_process(
  COMMAND "${Python_EXECUTABLE}" -c "from tools.generate_torch_version import get_sha;print(get_sha('.'), end='')"
  OUTPUT_VARIABLE COMMIT_SHA
  WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/..
)

# ---[ Write the macros file
configure_file(
    ${CMAKE_CURRENT_LIST_DIR}/../caffe2/core/macros.h.in
    ${CMAKE_BINARY_DIR}/caffe2/core/macros.h)
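
# configure_file() replaces template markers in macros.h.in at configure
# time. Schematically (the macro line below is hypothetical; see
# caffe2/core/macros.h.in for the real template):
#
#   macros.h.in:  #define CAFFE2_GIT_VERSION "@COMMIT_SHA@"
#   macros.h:     #define CAFFE2_GIT_VERSION "0123abc..."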

# ---[ Installing the header files
install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2
        DESTINATION include
        FILES_MATCHING PATTERN "*.h")
if(NOT INTERN_BUILD_ATEN_OPS)
  install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core
          DESTINATION include/ATen
          FILES_MATCHING PATTERN "*.h")
endif()
install(FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h
        DESTINATION include/caffe2/core)

# ---[ ATen specific
if(INTERN_BUILD_ATEN_OPS)
  if(MSVC)
    set(OPT_FLAG "/fp:strict ")
  else(MSVC)
    set(OPT_FLAG "-O3 ")
    if("${CMAKE_BUILD_TYPE}" MATCHES "Debug")
      set(OPT_FLAG " ")
    endif()
  endif(MSVC)

  if(NOT MSVC AND NOT "${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
    set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/MapAllocator.cpp PROPERTIES COMPILE_FLAGS "-fno-openmp")
  endif()

  file(GLOB_RECURSE all_python "${CMAKE_CURRENT_LIST_DIR}/../torchgen/*.py")

  # RowwiseScaledMM.cu requires sm89/sm90a/sm100a gencode flags
  if(USE_CUDA)
    set(ROWWISE_SCALED_MM_FILE "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/RowwiseScaledMM.cu")

    # Get existing arch flags
    torch_cuda_get_nvcc_gencode_flag(EXISTING_ARCH_FLAGS)

    # Check NVCC version and existing arch flags
    set(ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS "")
    if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0)
      if(EXISTING_ARCH_FLAGS MATCHES ".*compute_86.*")
        list(APPEND ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS "-gencode;arch=compute_89,code=sm_89")
      endif()
      if(EXISTING_ARCH_FLAGS MATCHES ".*compute_90.*")
        list(APPEND ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS "-gencode;arch=compute_90a,code=sm_90a")
      endif()
      if(EXISTING_ARCH_FLAGS MATCHES ".*compute_100.*")
        list(APPEND ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS "-gencode;arch=compute_100a,code=sm_100a")
      endif()
    endif()
    list(JOIN ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS " " ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS)
    set_source_files_properties(${ROWWISE_SCALED_MM_FILE} PROPERTIES COMPILE_FLAGS "${ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS}")
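
    # Note on the flag shape (illustrative): each appended "-gencode;arch=..."
    # entry contains a ';' and therefore splits into two list elements, so for
    # a compute_90 build the list(JOIN) above yields the string
    #   "-gencode arch=compute_90a,code=sm_90a"
    # which is what nvcc expects on its command line.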

    # ScaledGroupMM.cu needs the same treatment (the variable name is reused
    # for the second file).
    set(ROWWISE_SCALED_MM_FILE "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/ScaledGroupMM.cu")

    # Get existing arch flags
    torch_cuda_get_nvcc_gencode_flag(EXISTING_ARCH_FLAGS)

    # Check NVCC version and existing arch flags
    set(ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS "")
    if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0)
      if(EXISTING_ARCH_FLAGS MATCHES ".*compute_86.*")
        list(APPEND ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS "-gencode;arch=compute_89,code=sm_89")
      endif()
      if(EXISTING_ARCH_FLAGS MATCHES ".*compute_90.*")
        list(APPEND ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS "-gencode;arch=compute_90a,code=sm_90a")
      endif()
    endif()
    list(JOIN ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS " " ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS)
    set_source_files_properties(${ROWWISE_SCALED_MM_FILE} PROPERTIES COMPILE_FLAGS "${ROWWISE_SCALED_MM_FILE_COMPILE_FLAGS}")
  endif()

  set(GEN_ROCM_FLAG)
  if(USE_ROCM)
    set(GEN_ROCM_FLAG --rocm)
  endif()

  set(GEN_MPS_FLAG)
  if(USE_MPS)
    set(GEN_MPS_FLAG --mps)
  endif()

  set(GEN_XPU_FLAG)
  if(USE_XPU)
    set(GEN_XPU_FLAG --xpu)
  endif()

  set(CUSTOM_BUILD_FLAGS)
  if(INTERN_BUILD_MOBILE)
    if(USE_VULKAN)
      list(APPEND CUSTOM_BUILD_FLAGS --backend_whitelist CPU QuantizedCPU Vulkan)
    else()
      list(APPEND CUSTOM_BUILD_FLAGS --backend_whitelist CPU QuantizedCPU)
    endif()
  endif()

  if(SELECTED_OP_LIST)
    if(TRACING_BASED)
      message(STATUS "Running tracing-based selective build given operator list: ${SELECTED_OP_LIST}")
      list(APPEND CUSTOM_BUILD_FLAGS
        --op_selection_yaml_path ${SELECTED_OP_LIST})
    elseif(NOT STATIC_DISPATCH_BACKEND)
      message(WARNING
        "You have to run tracing-based selective build with dynamic dispatch.\n"
        "Switching to STATIC_DISPATCH_BACKEND=CPU."
      )
      set(STATIC_DISPATCH_BACKEND CPU)
    endif()
  endif()

  if(STATIC_DISPATCH_BACKEND)
    message(STATUS "Custom build with static dispatch backends: ${STATIC_DISPATCH_BACKEND}")
    list(LENGTH STATIC_DISPATCH_BACKEND len)
    list(APPEND CUSTOM_BUILD_FLAGS
      --static_dispatch_backend ${STATIC_DISPATCH_BACKEND})
  endif()

  # Codegen unboxing
  if(USE_LIGHTWEIGHT_DISPATCH)
    file(GLOB_RECURSE all_unboxing_script "${CMAKE_CURRENT_LIST_DIR}/../tools/jit/*.py")
    list(APPEND CUSTOM_BUILD_FLAGS --skip_dispatcher_op_registration)
    set(GEN_UNBOXING_COMMAND
      "${Python_EXECUTABLE}" -m tools.jit.gen_unboxing
      --source-path ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen
      --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen
    )
    if(SELECTED_OP_LIST)
      list(APPEND GEN_UNBOXING_COMMAND
        --TEST_ONLY_op_registration_allowlist_yaml_path "${SELECTED_OP_LIST}")
    endif()
    set("GEN_UNBOXING_COMMAND_sources"
      ${GEN_UNBOXING_COMMAND}
      --output-dependencies ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_unboxing_sources.cmake
    )
    message(STATUS "Generating sources for lightweight dispatch")
    execute_process(
      COMMAND ${GEN_UNBOXING_COMMAND_sources} --dry-run
      RESULT_VARIABLE RETURN_VALUE
      WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/..
    )
    if(NOT RETURN_VALUE EQUAL 0)
      message(FATAL_ERROR "Failed to get generated_unboxing_sources list")
    endif()

    include("${CMAKE_BINARY_DIR}/aten/src/ATen/generated_unboxing_sources.cmake")
    add_custom_command(
      COMMENT "Generating ATen unboxing sources"
      OUTPUT
        ${generated_unboxing_sources}
        ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_unboxing_sources.cmake
      COMMAND ${GEN_UNBOXING_COMMAND_sources}
      DEPENDS ${all_unboxing_script} ${sources_templates}
        ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml
        ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/tags.yaml
      WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/..
    )
  else() # Otherwise do not generate or include sources into build.
    set(generated_unboxing_sources "")
  endif()
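
  # The --dry-run pass above only writes the .cmake manifest, which is
  # (schematically; the source file name below is illustrative) a single
  # set() of everything the generator would emit:
  #
  #   set(generated_unboxing_sources
  #       "${CMAKE_BINARY_DIR}/aten/src/ATen/UnboxingFunctions_0.cpp" ...)
  #
  # include()-ing it at configure time is what lets add_custom_command name
  # its OUTPUT files before they exist.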

  set(GEN_PER_OPERATOR_FLAG)
  if(USE_PER_OPERATOR_HEADERS)
    list(APPEND GEN_PER_OPERATOR_FLAG "--per-operator-headers")
  endif()

  set(GEN_COMMAND
    "${Python_EXECUTABLE}" -m torchgen.gen
    --source-path ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen
    --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen
    ${GEN_PER_OPERATOR_FLAG}
    ${GEN_ROCM_FLAG}
    ${GEN_MPS_FLAG}
    ${GEN_XPU_FLAG}
    ${CUSTOM_BUILD_FLAGS}
  )
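
  # Fully expanded, a typical invocation looks roughly like this
  # (illustrative; the exact flags depend on the options selected above):
  #
  #   python -m torchgen.gen \
  #     --source-path aten/src/ATen \
  #     --install_dir <build>/aten/src/ATen \
  #     --per-operator-headers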

  file(GLOB_RECURSE headers_templates "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/templates/*\.h")
  file(GLOB_RECURSE sources_templates "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/templates/*\.cpp")
  set(declarations_yaml_templates "")

  foreach(gen_type "headers" "sources" "declarations_yaml")
    # The codegen outputs may change dynamically as PyTorch is
    # developed, but add_custom_command requires its OUTPUT list to be
    # fixed at configure time.
    #
    # We work around this by generating a .cmake file which is
    # included below to set the list of output files. If that file
    # ever changes then cmake will be re-run automatically because it
    # was included and so we get fully dynamic outputs.
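    #
    # Schematically, each generated_${gen_type}.cmake manifest is just a
    # set() of output paths (illustrative):
    #
    #   set(generated_headers "<build>/aten/src/ATen/Functions.h;...")
    #
    # so the include() calls below define the variables used to build
    # OUTPUT_LIST.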

    set("GEN_COMMAND_${gen_type}"
      ${GEN_COMMAND}
      --generate ${gen_type}
      --output-dependencies ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_${gen_type}.cmake
    )

    # Dry run to bootstrap the output variables
    execute_process(
      COMMAND ${GEN_COMMAND_${gen_type}} --dry-run
      RESULT_VARIABLE RETURN_VALUE
      WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/..
    )

    if(NOT RETURN_VALUE EQUAL 0)
      message(FATAL_ERROR "Failed to get generated_${gen_type} list")
    endif()

    include("${CMAKE_BINARY_DIR}/aten/src/ATen/generated_${gen_type}.cmake")
    include("${CMAKE_BINARY_DIR}/aten/src/ATen/core_generated_${gen_type}.cmake")
    include("${CMAKE_BINARY_DIR}/aten/src/ATen/cpu_vec_generated_${gen_type}.cmake")
    include("${CMAKE_BINARY_DIR}/aten/src/ATen/cuda_generated_${gen_type}.cmake")
    include("${CMAKE_BINARY_DIR}/aten/src/ATen/ops_generated_${gen_type}.cmake")
    if(USE_XPU)
      include("${CMAKE_BINARY_DIR}/aten/src/ATen/xpu_generated_${gen_type}.cmake")
    endif()
    message(STATUS "${gen_type} outputs: ${gen_outputs}")

    set(OUTPUT_LIST
      ${generated_${gen_type}}
      ${cuda_generated_${gen_type}}
      ${core_generated_${gen_type}}
      ${cpu_vec_generated_${gen_type}}
      ${ops_generated_${gen_type}}
      ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_${gen_type}.cmake
      ${CMAKE_BINARY_DIR}/aten/src/ATen/ops_generated_${gen_type}.cmake
      ${CMAKE_BINARY_DIR}/aten/src/ATen/core_generated_${gen_type}.cmake
      ${CMAKE_BINARY_DIR}/aten/src/ATen/cpu_vec_generated_${gen_type}.cmake
      ${CMAKE_BINARY_DIR}/aten/src/ATen/cuda_generated_${gen_type}.cmake)
    if(USE_XPU)
      list(APPEND OUTPUT_LIST
        ${xpu_generated_${gen_type}}
        ${CMAKE_BINARY_DIR}/aten/src/ATen/xpu_generated_${gen_type}.cmake
      )
    endif()

    add_custom_command(
      COMMENT "Generating ATen ${gen_type}"
      OUTPUT ${OUTPUT_LIST}
      COMMAND ${GEN_COMMAND_${gen_type}}
      DEPENDS ${all_python} ${${gen_type}_templates}
        ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml
        ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/tags.yaml
      WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/..
    )
  endforeach()

  # Generated headers used from a CUDA (.cu) file are
  # not tracked correctly in CMake. We make libATen.so depend explicitly
  # on building the generated ATen files as a workaround.
  add_custom_target(ATEN_CPU_FILES_GEN_TARGET DEPENDS
    ${generated_headers} ${core_generated_headers} ${cpu_vec_generated_headers} ${ops_generated_headers}
    ${generated_sources} ${core_generated_sources} ${cpu_vec_generated_sources} ${ops_generated_sources}
    ${generated_declarations_yaml} ${generated_unboxing_sources})
  add_custom_target(ATEN_CUDA_FILES_GEN_TARGET DEPENDS
    ${cuda_generated_headers} ${cuda_generated_sources})
  add_library(ATEN_CPU_FILES_GEN_LIB INTERFACE)
  add_library(ATEN_CUDA_FILES_GEN_LIB INTERFACE)
  add_dependencies(ATEN_CPU_FILES_GEN_LIB ATEN_CPU_FILES_GEN_TARGET)
  add_dependencies(ATEN_CUDA_FILES_GEN_LIB ATEN_CUDA_FILES_GEN_TARGET)
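
  # Downstream targets consume these INTERFACE libraries to inherit both the
  # dependency on codegen and (below) the AT_PER_OPERATOR_HEADERS define,
  # e.g. (hypothetical target name):
  #
  #   target_link_libraries(my_aten_consumer PRIVATE ATEN_CPU_FILES_GEN_LIB)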

  if(USE_PER_OPERATOR_HEADERS)
    target_compile_definitions(ATEN_CPU_FILES_GEN_LIB INTERFACE AT_PER_OPERATOR_HEADERS)
    target_compile_definitions(ATEN_CUDA_FILES_GEN_LIB INTERFACE AT_PER_OPERATOR_HEADERS)
  endif()

  if(USE_XPU)
    add_custom_target(ATEN_XPU_FILES_GEN_TARGET DEPENDS
      ${xpu_generated_headers} ${xpu_generated_sources})
    add_library(ATEN_XPU_FILES_GEN_LIB INTERFACE)
    add_dependencies(ATEN_XPU_FILES_GEN_LIB ATEN_XPU_FILES_GEN_TARGET)

    if(USE_PER_OPERATOR_HEADERS)
      target_compile_definitions(ATEN_XPU_FILES_GEN_LIB INTERFACE AT_PER_OPERATOR_HEADERS)
    endif()
  endif()

  # Handle source files that need to be compiled multiple times for
  # different vectorization options
  file(GLOB cpu_kernel_cpp_in "${PROJECT_SOURCE_DIR}/aten/src/ATen/native/cpu/*.cpp" "${PROJECT_SOURCE_DIR}/aten/src/ATen/native/quantized/cpu/kernels/*.cpp")

  list(APPEND CPU_CAPABILITY_NAMES "DEFAULT")
  list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}")

  if(CXX_AVX512_FOUND)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX512_CPU_DEFINITION")
    list(APPEND CPU_CAPABILITY_NAMES "AVX512")
    if(MSVC)
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX512")
    else(MSVC)
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx512f -mavx512bw -mavx512vl -mavx512dq -mfma")
    endif(MSVC)
  endif(CXX_AVX512_FOUND)

  if(CXX_AVX2_FOUND)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX2_CPU_DEFINITION")

    # Some versions of GCC pessimistically split unaligned load and store
    # instructions when using the default tuning. This is a bad choice on
    # new Intel and AMD processors so we disable it when compiling with AVX2.
    # See https://stackoverflow.com/questions/52626726/why-doesnt-gcc-resolve-mm256-loadu-pd-as-single-vmovupd#tab-top
    check_cxx_compiler_flag("-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" COMPILER_SUPPORTS_NO_AVX256_SPLIT)
    if(COMPILER_SUPPORTS_NO_AVX256_SPLIT)
      set(CPU_NO_AVX256_SPLIT_FLAGS "-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
    endif(COMPILER_SUPPORTS_NO_AVX256_SPLIT)

    list(APPEND CPU_CAPABILITY_NAMES "AVX2")
    if(DEFINED ENV{ATEN_AVX512_256})
      if($ENV{ATEN_AVX512_256} MATCHES "TRUE")
        if(CXX_AVX512_FOUND)
          message("-- ATen AVX2 kernels will use 32 ymm registers")
          if(MSVC)
            list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX512")
          else(MSVC)
            list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -march=native ${CPU_NO_AVX256_SPLIT_FLAGS}")
          endif(MSVC)
        endif(CXX_AVX512_FOUND)
      endif()
    else()
      if(MSVC)
        list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX2")
      else(MSVC)
        list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma -mf16c ${CPU_NO_AVX256_SPLIT_FLAGS}")
      endif(MSVC)
    endif()
  endif(CXX_AVX2_FOUND)
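
  # CPU_CAPABILITY_NAMES and CPU_CAPABILITY_FLAGS are parallel lists. On a
  # typical x86-64 GCC release build they might look like (illustrative):
  #
  #   CPU_CAPABILITY_NAMES:  DEFAULT;AVX512;AVX2
  #   CPU_CAPABILITY_FLAGS:  -O3 ;-O3  -mavx512f ...;-O3  -mavx2 -mfma ...
  #
  # Every kernel source below is compiled once per entry.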

  if(CXX_VSX_FOUND)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_VSX_CPU_DEFINITION")
    list(APPEND CPU_CAPABILITY_NAMES "VSX")
    list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_VSX_FLAGS}")
  endif(CXX_VSX_FOUND)

  if(CXX_ZVECTOR_FOUND)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_ZVECTOR_CPU_DEFINITION")
    list(APPEND CPU_CAPABILITY_NAMES "ZVECTOR")
    list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_ZVECTOR_FLAGS}")
  endif(CXX_ZVECTOR_FOUND)

  if(CXX_SVE_FOUND)
    if(CXX_SVE256_FOUND)
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_SVE_CPU_DEFINITION -DHAVE_SVE256_CPU_DEFINITION")
      list(APPEND CPU_CAPABILITY_NAMES "SVE256")
      if("${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
        list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -O2 -march=armv8.2-a+sve -DCPU_CAPABILITY_SVE -msve-vector-bits=256")
      else()
        list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -march=armv8.2-a+sve -DCPU_CAPABILITY_SVE -msve-vector-bits=256")
      endif()
    endif(CXX_SVE256_FOUND)
  endif(CXX_SVE_FOUND)

  list(LENGTH CPU_CAPABILITY_NAMES NUM_CPU_CAPABILITY_NAMES)
  math(EXPR NUM_CPU_CAPABILITY_NAMES "${NUM_CPU_CAPABILITY_NAMES}-1")

  # The sources list might get reordered later based on the capabilities.
  # See NOTE [ Linking AVX and non-AVX files ]
  foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES})
    function(process_vec NAME)
      list(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY)
      set(NEW_IMPL ${CMAKE_BINARY_DIR}/aten/src/ATen/${NAME}.${CPU_CAPABILITY}.cpp)
      configure_file("${PROJECT_SOURCE_DIR}/cmake/IncludeSource.cpp.in" ${NEW_IMPL})
      set(cpu_kernel_cpp ${NEW_IMPL} ${cpu_kernel_cpp} PARENT_SCOPE) # Create list of copies
      list(GET CPU_CAPABILITY_FLAGS ${i} FLAGS)
      if(MSVC)
        set(EXTRA_FLAGS "/DCPU_CAPABILITY=${CPU_CAPABILITY} /DCPU_CAPABILITY_${CPU_CAPABILITY}")
      else(MSVC)
        set(EXTRA_FLAGS "-DCPU_CAPABILITY=${CPU_CAPABILITY} -DCPU_CAPABILITY_${CPU_CAPABILITY}")
      endif(MSVC)

      # Only parallelize the SortingKernel for now to avoid side effects
      if(${NAME} STREQUAL "native/cpu/SortingKernel.cpp" AND NOT MSVC AND USE_OMP)
        string(APPEND EXTRA_FLAGS " -D_GLIBCXX_PARALLEL")
      endif()

      # Disable certain warnings for GCC-9.X
      if(CMAKE_COMPILER_IS_GNUCXX)
        if(("${NAME}" STREQUAL "native/cpu/GridSamplerKernel.cpp") AND ("${CPU_CAPABILITY}" STREQUAL "DEFAULT"))
          # See https://github.com/pytorch/pytorch/issues/38855
          set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-uninitialized")
        endif()
        if("${NAME}" STREQUAL "native/quantized/cpu/kernels/QuantizedOpKernels.cpp")
          # See https://github.com/pytorch/pytorch/issues/38854
          set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-deprecated-copy")
        endif()
      endif()
      set_source_files_properties(${NEW_IMPL} PROPERTIES COMPILE_FLAGS "${FLAGS} ${EXTRA_FLAGS}")
    endfunction()
    foreach(IMPL ${cpu_kernel_cpp_in})
      file(RELATIVE_PATH NAME "${PROJECT_SOURCE_DIR}/aten/src/ATen/" "${IMPL}")
      process_vec("${NAME}")
    endforeach()
    foreach(IMPL ${cpu_vec_generated_sources})
      file(RELATIVE_PATH NAME "${CMAKE_BINARY_DIR}/aten/src/ATen/" "${IMPL}")
      process_vec("${NAME}")
    endforeach()
  endforeach()
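
  # Each NEW_IMPL produced by process_vec is a thin wrapper configured from
  # cmake/IncludeSource.cpp.in that simply re-includes the original kernel,
  # conceptually (sketch, not the literal template):
  #
  #   // SortingKernel.cpp.AVX2.cpp
  #   #include "aten/src/ATen/native/cpu/SortingKernel.cpp"
  #
  # compiled with -DCPU_CAPABILITY=AVX2 -DCPU_CAPABILITY_AVX2 plus the AVX2
  # flags, so one source yields one object file per capability.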

  list(APPEND ATen_CPU_SRCS ${cpu_kernel_cpp})
endif()

function(append_filelist name outputvar)
  set(_rootdir "${Torch_SOURCE_DIR}/")
  # configure_file adds its input to the list of CMAKE_RERUN dependencies,
  # so editing build_variables.bzl triggers a re-configure.
  configure_file(
    ${PROJECT_SOURCE_DIR}/build_variables.bzl
    ${PROJECT_BINARY_DIR}/caffe2/build_variables.bzl)
  execute_process(
    COMMAND "${Python_EXECUTABLE}" -c
      "exec(open('${PROJECT_SOURCE_DIR}/build_variables.bzl').read());print(';'.join(['${_rootdir}' + x for x in ${name}]))"
    WORKING_DIRECTORY "${_rootdir}"
    RESULT_VARIABLE _retval
    OUTPUT_VARIABLE _tempvar)
  if(NOT _retval EQUAL 0)
    message(FATAL_ERROR "Failed to fetch filelist ${name} from build_variables.bzl")
  endif()
  string(REPLACE "\n" "" _tempvar "${_tempvar}")
  list(APPEND ${outputvar} ${_tempvar})
  set(${outputvar} "${${outputvar}}" PARENT_SCOPE)
endfunction()
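
# Usage sketch (the list name must exist in build_variables.bzl; this one is
# illustrative):
#
#   append_filelist("jit_core_sources" JIT_CORE_SRCS)
#
# The function evaluates build_variables.bzl as Python and appends the
# repo-rooted paths of the named list to the output variable.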

set(NUM_CPU_CAPABILITY_NAMES ${NUM_CPU_CAPABILITY_NAMES} PARENT_SCOPE)
set(CPU_CAPABILITY_FLAGS ${CPU_CAPABILITY_FLAGS} PARENT_SCOPE)