mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/51957

This is a simplified version of #51554. Compared to #51554, this version only supports statically dispatching to a specific backend. The benefit is that it skips the dispatch key computation logic and thus has less framework overhead. The downside is that if input tensors do not match the specified backend, it throws an error instead of falling back to regular dispatch.

Sample code:
```
Tensor empty(IntArrayRef size, TensorOptions options, c10::optional<MemoryFormat> memory_format) {
    return at::cpu::empty(size, options, memory_format);
}

// aten::conj(Tensor(a) self) -> Tensor(a)
Tensor conj(const Tensor & self) {
    return at::math::conj(self);
}

// aten::conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
Tensor & conj_out(Tensor & out, const Tensor & self) {
    return at::cpu::conj_out(out, self);
}

// aten::conj.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
Tensor & conj_outf(const Tensor & self, Tensor & out) {
    return at::cpu::conj_out(out, self);
}

// aten::_conj(Tensor self) -> Tensor
Tensor _conj(const Tensor & self) {
    return at::defaultbackend::_conj(self);
}
```

For ops without a dispatch entry for the specified backend, it throws an error:
```
// aten::_use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool
bool _use_cudnn_ctc_loss(const Tensor & log_probs, const Tensor & targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t blank) {
    TORCH_CHECK(false, "Static dispatch does not support _use_cudnn_ctc_loss for CPU.");
}
```

Differential Revision: D26337857
Test Plan: Imported from OSS
Reviewed By: bhosmer
Pulled By: ljk53
fbshipit-source-id: a8e95799115c349de3c09f04a26b01d21a679364
265 lines
11 KiB
CMake
# This ill-named file does a number of things:
# - Installs Caffe2 header files (this has nothing to do with code generation)
# - Configures caffe2/core/macros.h
# - Creates an ATen target for its generated C++ files and adds it
#   as a dependency
# - Reads build lists defined in build_variables.bzl

################################################################################
# Helper functions
################################################################################

function(filter_list output input)
  unset(result)
  foreach(filename ${${input}})
    foreach(pattern ${ARGN})
      if("${filename}" MATCHES "${pattern}")
        list(APPEND result "${filename}")
      endif()
    endforeach()
  endforeach()
  set(${output} ${result} PARENT_SCOPE)
endfunction()

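# A minimal usage sketch for filter_list (hypothetical file names): keep the
# entries matching the given regex pattern(s):
#   set(all_files "foo.cpp;bar.h;baz.cpp")
#   filter_list(cpp_files all_files "\\.cpp$")
#   # cpp_files is now "foo.cpp;baz.cpp"
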
function(filter_list_exclude output input)
  unset(result)
  foreach(filename ${${input}})
    foreach(pattern ${ARGN})
      if(NOT "${filename}" MATCHES "${pattern}")
        list(APPEND result "${filename}")
      endif()
    endforeach()
  endforeach()
  set(${output} ${result} PARENT_SCOPE)
endfunction()

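# A minimal usage sketch for filter_list_exclude (hypothetical file names).
# Note that it appends a file once per non-matching pattern, so it is best
# called with a single pattern:
#   set(all_files "foo.cpp;foo_test.cpp")
#   filter_list_exclude(non_test_files all_files "_test\\.cpp$")
#   # non_test_files is now "foo.cpp"
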
################################################################################

# ---[ Write the macros file
configure_file(
    ${CMAKE_CURRENT_LIST_DIR}/../caffe2/core/macros.h.in
    ${CMAKE_BINARY_DIR}/caffe2/core/macros.h)

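# configure_file() expands @VAR@ and #cmakedefine entries from macros.h.in.
# An illustrative (not the actual) input line and its configured output:
#   #cmakedefine CAFFE2_USE_EIGEN_FOR_BLAS
#   ->  #define CAFFE2_USE_EIGEN_FOR_BLAS  (or /* #undef ... */ if unset)
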
# ---[ Installing the header files
install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../caffe2
        DESTINATION include
        FILES_MATCHING PATTERN "*.h")
if(NOT INTERN_BUILD_ATEN_OPS)
  install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core
          DESTINATION include/ATen
          FILES_MATCHING PATTERN "*.h")
endif()
install(FILES ${CMAKE_BINARY_DIR}/caffe2/core/macros.h
        DESTINATION include/caffe2/core)

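# With the rules above, the installed tree looks roughly like (illustrative):
#   <prefix>/include/caffe2/...            (headers matching *.h)
#   <prefix>/include/caffe2/core/macros.h  (the configured file)
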
# ---[ ATen specific
if(INTERN_BUILD_ATEN_OPS)
  if(MSVC)
    set(OPT_FLAG "/fp:strict ")
  else(MSVC)
    set(OPT_FLAG "-O3 ")
    if("${CMAKE_BUILD_TYPE}" MATCHES "Debug")
      set(OPT_FLAG " ")
    endif()
  endif(MSVC)

  if(C_AVX_FOUND)
    if(MSVC)
      set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/vector/AVX.cpp PROPERTIES COMPILE_FLAGS "${OPT_FLAG}/arch:AVX ${CXX_AVX_FLAGS}")
    else(MSVC)
      set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/vector/AVX.cpp PROPERTIES COMPILE_FLAGS "${OPT_FLAG} ${CXX_AVX_FLAGS}")
    endif(MSVC)
  endif(C_AVX_FOUND)

  if(NOT MSVC AND NOT "${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
    set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../aten/src/TH/THAllocator.cpp PROPERTIES COMPILE_FLAGS "-fno-openmp")
  endif()

  file(GLOB cpu_kernel_cpp_in "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cpu/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/quantized/cpu/kernels/*.cpp")

  list(APPEND CPU_CAPABILITY_NAMES "DEFAULT")
  list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}")

  if(CXX_AVX_FOUND)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX_CPU_DEFINITION")
    list(APPEND CPU_CAPABILITY_NAMES "AVX")
    if(MSVC)
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX")
    else(MSVC)
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx")
    endif(MSVC)
  endif(CXX_AVX_FOUND)

  if(CXX_AVX2_FOUND)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_AVX2_CPU_DEFINITION")

    # Some versions of GCC pessimistically split unaligned load and store
    # instructions when using the default tuning. This is a bad choice on
    # new Intel and AMD processors so we disable it when compiling with AVX2.
    # See https://stackoverflow.com/questions/52626726/why-doesnt-gcc-resolve-mm256-loadu-pd-as-single-vmovupd#tab-top
    check_cxx_compiler_flag("-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" COMPILER_SUPPORTS_NO_AVX256_SPLIT)
    if(COMPILER_SUPPORTS_NO_AVX256_SPLIT)
      set(CPU_NO_AVX256_SPLIT_FLAGS "-mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store")
    endif(COMPILER_SUPPORTS_NO_AVX256_SPLIT)

    list(APPEND CPU_CAPABILITY_NAMES "AVX2")
    if(MSVC)
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG}/arch:AVX2")
    else(MSVC)
      list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} -mavx2 -mfma ${CPU_NO_AVX256_SPLIT_FLAGS}")
    endif(MSVC)
  endif(CXX_AVX2_FOUND)

  if(CXX_VSX_FOUND)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAVE_VSX_CPU_DEFINITION")
    list(APPEND CPU_CAPABILITY_NAMES "VSX")
    list(APPEND CPU_CAPABILITY_FLAGS "${OPT_FLAG} ${CXX_VSX_FLAGS}")
  endif(CXX_VSX_FOUND)

  list(LENGTH CPU_CAPABILITY_NAMES NUM_CPU_CAPABILITY_NAMES)
  math(EXPR NUM_CPU_CAPABILITY_NAMES "${NUM_CPU_CAPABILITY_NAMES}-1")

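  # Note: foreach(... RANGE N) is inclusive of N, hence the "-1" above. With
  # CPU_CAPABILITY_NAMES = "DEFAULT;AVX;AVX2", i below takes the values 0, 1, 2.
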
  # The sources list might get reordered later based on the capabilities.
  # See NOTE [ Linking AVX and non-AVX files ]
  foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES})
    foreach(IMPL ${cpu_kernel_cpp_in})
      string(REPLACE "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/" "" NAME ${IMPL})
      list(GET CPU_CAPABILITY_NAMES ${i} CPU_CAPABILITY)
      set(NEW_IMPL ${CMAKE_BINARY_DIR}/aten/src/ATen/${NAME}.${CPU_CAPABILITY}.cpp)
      configure_file(${IMPL} ${NEW_IMPL} COPYONLY)
      set(cpu_kernel_cpp ${NEW_IMPL} ${cpu_kernel_cpp}) # Create list of copies
      list(GET CPU_CAPABILITY_FLAGS ${i} FLAGS)
      if(MSVC)
        set(EXTRA_FLAGS "/DCPU_CAPABILITY=${CPU_CAPABILITY} /DCPU_CAPABILITY_${CPU_CAPABILITY}")
      else(MSVC)
        set(EXTRA_FLAGS "-DCPU_CAPABILITY=${CPU_CAPABILITY} -DCPU_CAPABILITY_${CPU_CAPABILITY}")
      endif(MSVC)
      # Disable certain warnings for GCC-9.X
      if(CMAKE_COMPILER_IS_GNUCXX AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0.0))
        if(("${NAME}" STREQUAL "native/cpu/GridSamplerKernel.cpp") AND ("${CPU_CAPABILITY}" STREQUAL "DEFAULT"))
          # See https://github.com/pytorch/pytorch/issues/38855
          set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-uninitialized")
        endif()
        if("${NAME}" STREQUAL "native/quantized/cpu/kernels/QuantizedOpKernels.cpp")
          # See https://github.com/pytorch/pytorch/issues/38854
          set(EXTRA_FLAGS "${EXTRA_FLAGS} -Wno-deprecated-copy")
        endif()
      endif()
      set_source_files_properties(${NEW_IMPL} PROPERTIES COMPILE_FLAGS "${FLAGS} ${EXTRA_FLAGS}")
    endforeach()
  endforeach()
  list(APPEND ATen_CPU_SRCS ${cpu_kernel_cpp})

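  # For illustration (assuming an AVX2-capable GCC build and a hypothetical
  # kernel source native/cpu/FooKernel.cpp), the loop above produces:
  #   <build>/aten/src/ATen/native/cpu/FooKernel.cpp.DEFAULT.cpp  with "-O3 -DCPU_CAPABILITY=DEFAULT ..."
  #   <build>/aten/src/ATen/native/cpu/FooKernel.cpp.AVX.cpp      with "-O3 -mavx -DCPU_CAPABILITY=AVX ..."
  #   <build>/aten/src/ATen/native/cpu/FooKernel.cpp.AVX2.cpp     with "-O3 -mavx2 -mfma -DCPU_CAPABILITY=AVX2 ..."
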
  file(GLOB_RECURSE all_python "${CMAKE_CURRENT_LIST_DIR}/../tools/codegen/*.py")

  set(GEN_ROCM_FLAG)
  if(USE_ROCM)
    set(GEN_ROCM_FLAG --rocm)
  endif()

  set(CUSTOM_BUILD_FLAGS)
  if(INTERN_BUILD_MOBILE)
    if(USE_VULKAN)
      list(APPEND CUSTOM_BUILD_FLAGS --backend_whitelist CPU QuantizedCPU Vulkan)
    else()
      list(APPEND CUSTOM_BUILD_FLAGS --backend_whitelist CPU QuantizedCPU)
    endif()
  endif()

  if(STATIC_DISPATCH_BACKEND)
    message(STATUS "Custom build with static dispatch backend: ${STATIC_DISPATCH_BACKEND}")
    list(APPEND CUSTOM_BUILD_FLAGS
      --static_dispatch_backend ${STATIC_DISPATCH_BACKEND})
  endif()

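  # Illustrative invocation: a CPU-only static dispatch build (as described in
  # the commit message above) can be selected by configuring with
  #   -DSTATIC_DISPATCH_BACKEND=CPU
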
  if(SELECTED_OP_LIST)
    # With static dispatch we can omit the OP_DEPENDENCY flag: there is no
    # need to calculate the transitive closure of used ops, because only the
    # used root ops need to be registered.
    if(NOT STATIC_DISPATCH_BACKEND AND NOT OP_DEPENDENCY)
      message(STATUS "Using the default op dependency graph .yaml file for custom build with dynamic dispatch.")
      set(OP_DEPENDENCY ${CMAKE_CURRENT_LIST_DIR}/../tools/code_analyzer/default_op_deps.yaml)
    endif()
    execute_process(
      COMMAND
        "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_LIST_DIR}/../tools/code_analyzer/gen_op_registration_allowlist.py
        --op-dependency "${OP_DEPENDENCY}"
        --root-ops "${SELECTED_OP_LIST}"
      OUTPUT_VARIABLE OP_REGISTRATION_WHITELIST
    )
    separate_arguments(OP_REGISTRATION_WHITELIST)
    message(STATUS "Custom build with op registration whitelist: ${OP_REGISTRATION_WHITELIST}")
    list(APPEND CUSTOM_BUILD_FLAGS
      --force_schema_registration
      --op_registration_whitelist ${OP_REGISTRATION_WHITELIST})
  endif()

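  # The SELECTED_OP_LIST file above is a YAML list of root operators to keep;
  # its contents look like (hypothetical example):
  #   - aten::add.Tensor
  #   - aten::relu
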
  set(GEN_COMMAND
      "${PYTHON_EXECUTABLE}" -m tools.codegen.gen
      --source-path ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen
      --install_dir ${CMAKE_BINARY_DIR}/aten/src/ATen
      ${GEN_ROCM_FLAG}
      ${CUSTOM_BUILD_FLAGS}
      ${GEN_VULKAN_FLAGS}
  )

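  # The resulting generator invocation is roughly equivalent to the following
  # shell command (illustrative; run from the repo root):
  #   python -m tools.codegen.gen --source-path aten/src/ATen \
  #       --install_dir <build>/aten/src/ATen [--rocm] [custom build flags]
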
  execute_process(
    COMMAND ${GEN_COMMAND}
      --output-dependencies ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt
    RESULT_VARIABLE RETURN_VALUE
    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/..
  )
  if(NOT RETURN_VALUE EQUAL 0)
    message(FATAL_ERROR "Failed to get generated_cpp list")
  endif()
  # FIXME: the file/variable names say cpp, but these lists contain both .cpp and .h files
  file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt generated_cpp)
  file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-cuda cuda_generated_cpp)
  file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-core core_generated_cpp)

  file(GLOB_RECURSE all_templates "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/templates/*")

  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen)
  file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen/core)

  add_custom_command(OUTPUT ${generated_cpp} ${cuda_generated_cpp} ${core_generated_cpp}
    COMMAND ${GEN_COMMAND}
    DEPENDS ${all_python} ${all_templates}
      ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/native_functions.yaml
    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/..
  )

  # Generated headers used from a CUDA (.cu) file are not tracked correctly in
  # CMake. As a workaround, we make libATen.so depend explicitly on building
  # the generated ATen files.
  add_custom_target(ATEN_CPU_FILES_GEN_TARGET DEPENDS ${generated_cpp} ${core_generated_cpp})
  add_custom_target(ATEN_CUDA_FILES_GEN_TARGET DEPENDS ${cuda_generated_cpp})
  add_library(ATEN_CPU_FILES_GEN_LIB INTERFACE)
  add_library(ATEN_CUDA_FILES_GEN_LIB INTERFACE)
  add_dependencies(ATEN_CPU_FILES_GEN_LIB ATEN_CPU_FILES_GEN_TARGET)
  add_dependencies(ATEN_CUDA_FILES_GEN_LIB ATEN_CUDA_FILES_GEN_TARGET)
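
  # Consumers link these INTERFACE libraries so that code generation runs
  # before they compile, e.g. (illustrative target name):
  #   target_link_libraries(my_target PRIVATE ATEN_CPU_FILES_GEN_LIB)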
endif()

function(append_filelist name outputvar)
  set(_rootdir "${${CMAKE_PROJECT_NAME}_SOURCE_DIR}/")
  # configure_file adds its input to the list of CMAKE_RERUN dependencies,
  # so the CMake configuration is re-run whenever build_variables.bzl changes.
  configure_file(
      ${PROJECT_SOURCE_DIR}/tools/build_variables.bzl
      ${PROJECT_BINARY_DIR}/caffe2/build_variables.bzl)
  execute_process(
    COMMAND "${PYTHON_EXECUTABLE}" -c
            "exec(open('${PROJECT_SOURCE_DIR}/tools/build_variables.bzl').read());print(';'.join(['${_rootdir}' + x for x in ${name}]))"
    WORKING_DIRECTORY "${_rootdir}"
    RESULT_VARIABLE _retval
    OUTPUT_VARIABLE _tempvar)
  if(NOT _retval EQUAL 0)
    message(FATAL_ERROR "Failed to fetch filelist ${name} from build_variables.bzl")
  endif()
  string(REPLACE "\n" "" _tempvar "${_tempvar}")
  list(APPEND ${outputvar} ${_tempvar})
  set(${outputvar} "${${outputvar}}" PARENT_SCOPE)
endfunction()

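# A minimal usage sketch (hypothetical list name and output variable): append
# the paths defined as "jit_core_sources" in build_variables.bzl to MY_SRCS:
#   append_filelist("jit_core_sources" MY_SRCS)
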
set(NUM_CPU_CAPABILITY_NAMES ${NUM_CPU_CAPABILITY_NAMES} PARENT_SCOPE)
set(CPU_CAPABILITY_FLAGS ${CPU_CAPABILITY_FLAGS} PARENT_SCOPE)