Move the prioritized-text linker script handling from CMake back to setup.py.

Summary of what this patch does:
  * CMake: removes the USE_PRIORITIZED_TEXT_FOR_LD option, the AArch64
    max-page-size link option, the linker script generation rules and the
    add_link_options_if_supported / target_link_options_if_supported helpers.
  * setup.py: when USE_PRIORITIZED_TEXT_FOR_LD is set in the environment,
    generates cmake/linker_script.ld and appends the matching options to
    LDFLAGS / CFLAGS / CXXFLAGS.
  * AArch64 CI: exports USE_PRIORITIZED_TEXT_FOR_LD=1 for the wheel build and
    passes the 64 KiB max-page-size through CMAKE_SHARED_LINKER_FLAGS.

Review adjustments applied to the added lines:
  * .gitignore is left untouched: setup.py writes cmake/linker_script.ld into
    the source tree, so the ignore entry must stay.
  * setup.py checks platform.machine() rather than platform.processor() when
    deciding whether to print the AArch64 warning.
  * Comment wording/typo fixes and short why-comments; the hunk headers of the
    two hunks that gained comment lines were updated accordingly.
  * The "index" blob ids below are the ones from the original patch.

diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh
index a0eb0b72df2b..178db42a609a 100644
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@@ -31,7 +31,8 @@ pip install -r /pytorch/requirements.txt
 pip install auditwheel==6.2.0 wheel
 if [ "$DESIRED_CUDA" = "cpu" ]; then
     echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
-    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
+    # USE_PRIORITIZED_TEXT_FOR_LD=1 enables the linker script optimization, see https://github.com/pytorch/pytorch/pull/121975/files
+    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
 else
     echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
     export USE_SYSTEM_NCCL=1
@@ -45,5 +46,6 @@ else
         export USE_NVIDIA_PYPI_LIBS=1
     fi
 
-    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+    # USE_PRIORITIZED_TEXT_FOR_LD=1 enables the linker script optimization, see https://github.com/pytorch/pytorch/pull/121975/files
+    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
 fi
diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
index d4afea81ac0b..1b6429fa8c06 100755
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@@ -317,7 +317,8 @@ if __name__ == "__main__":
         ).decode()
 
     print("Building PyTorch wheel")
-    build_vars = ""
+    # Link shared libraries with a 64 KiB (0x10000) linker max-page-size.
+    build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
     # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
     if enable_cuda:
         build_vars += "MAX_JOBS=5 "
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d367b078604e..5a43e0da8f2e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -379,13 +379,6 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
                        OFF "USE_CUDA" OFF)
 cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
                        "CPU_AARCH64" OFF)
-# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le.
-set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
-if(LINUX AND CPU_AARCH64)
-  set(USE_PRIORITIZED_TEXT_DEFAULT ON)
-endif()
-cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
-  "${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)
 
 option(USE_MIMALLOC "Use mimalloc" OFF)
 # Enable third party mimalloc library to improve memory allocation performance
@@ -664,11 +657,6 @@ endif(MSVC)
 
 string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
 
-# Set linker max-page-size to 64KiB on AArch64 Linux
-if(LINUX AND CPU_AARCH64)
-  add_link_options_if_supported("-z,max-page-size=0x10000")
-endif()
-
 # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
 # applicable to mobile are disabled by this variable. Setting
 # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@@ -1433,57 +1421,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
   install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
           DESTINATION "${CMAKE_INSTALL_BINDIR}")
 endif()
-
-if(USE_PRIORITIZED_TEXT_FOR_LD)
-  add_compile_options(
-    $<$:-ffunction-sections>
-    $<$:-fdata-sections>
-  )
-  set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
-  set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
-
-  add_custom_command(
-    OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
-    COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
-    DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
-    COMMENT "Generating prioritized text linker files"
-    VERBATIM
-  )
-
-  add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
-
-  if(BUILD_PYTHON)
-    set(LINKER_OPT_TARGETS torch_python)
-  endif()
-
-  if(NOT BUILD_LIBTORCHLESS)
-    list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
-    if(USE_CUDA)
-      list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
-    endif()
-    if(USE_XPU)
-      list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
-    endif()
-    if(USE_ROCM)
-      list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
-    endif()
-  endif()
-
-  foreach(tgt IN LISTS LINKER_OPT_TARGETS)
-    if(TARGET ${tgt})
-      add_dependencies("${tgt}" generate_linker_script)
-      target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
-      set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
-    else()
-      message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
-    endif()
-  endforeach()
-
-else()
-  if(LINUX AND CPU_AARCH64)
-    message(WARNING [[
-      It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
-      To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
-    ]])
-  endif()
-endif()
\ No newline at end of file
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index 2e2fd370a994..ffd4b5298a89 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -158,7 +158,6 @@ function(caffe2_print_configuration_summary)
   if(${USE_KLEIDIAI})
     message(STATUS " USE_KLEIDIAI : ${USE_KLEIDIAI}")
   endif()
-  message(STATUS " USE_PRIORITIZED_TEXT_FOR_LD : ${USE_PRIORITIZED_TEXT_FOR_LD}")
   message(STATUS " USE_UCC : ${USE_UCC}")
   if(${USE_UCC})
     message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}")
diff --git a/cmake/public/utils.cmake b/cmake/public/utils.cmake
index c96ffebf858e..68e66bb3fc38 100644
--- a/cmake/public/utils.cmake
+++ b/cmake/public/utils.cmake
@@ -482,7 +482,6 @@ function(torch_update_find_cuda_flags)
 endfunction()
 
 include(CheckCXXCompilerFlag)
-include(CheckLinkerFlag)
 
 ##############################################################################
 # CHeck if given flag is supported and append it to provided outputvar
@@ -512,22 +511,3 @@ function(target_compile_options_if_supported target flag)
     target_compile_options(${target} PRIVATE ${flag})
   endif()
 endfunction()
-
-# Check if a global link option is supported
-function(add_link_options_if_supported flag)
-  check_linker_flag(C "LINKER:${flag}" _supported)
-  if("${_supported}")
-    add_link_options("LINKER:${flag}")
-  else()
-    message(WARNING "Attempted to use unsupported link option : ${flag}.")
-  endif()
-endfunction()
-
-function(target_link_options_if_supported tgt flag)
-  check_linker_flag(C "LINKER:${flag}" _supported)
-  if("${_supported}")
-    target_link_options("${tgt}" PRIVATE "LINKER:${flag}")
-  else()
-    message(WARNING "Attempted to use unsupported link option : ${flag}.")
-  endif()
-endfunction()
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 2bb63a93cec8..c0523a1b5c60 100644
--- a/setup.py
+++ b/setup.py
@@ -227,6 +227,9 @@
 #   Static link mimalloc into C10, and use mimalloc in alloc_cpu & alloc_free.
 #   By default, It is only enabled on Windows.
 #
+# USE_PRIORITIZED_TEXT_FOR_LD
+#   Uses prioritized text from cmake/prioritized_text.txt for LD
+#
 # BUILD_LIBTORCH_WHL
 #   Builds libtorch.so and its dependencies as a wheel
 #
@@ -320,6 +323,7 @@ from tools.setup_helpers.env import (
     IS_LINUX,
     IS_WINDOWS,
 )
+from tools.setup_helpers.generate_linker_script import gen_linker_script
 
 
 def str2bool(value: str | None) -> bool:
@@ -1623,6 +1627,29 @@ def main() -> None:
     if BUILD_PYTHON_ONLY:
         install_requires += [f"{LIBTORCH_PKG_NAME}=={TORCH_VERSION}"]
 
+    if str2bool(os.getenv("USE_PRIORITIZED_TEXT_FOR_LD")):
+        # Generate cmake/linker_script.ld from cmake/prioritized_text.txt.
+        gen_linker_script(
+            filein="cmake/prioritized_text.txt", fout="cmake/linker_script.ld"
+        )
+        linker_script_path = os.path.abspath("cmake/linker_script.ld")
+        # Pass the script to the linker (-T) and compile with per-function /
+        # per-data sections; the flags are appended to any user-provided values.
+        os.environ["LDFLAGS"] = os.getenv("LDFLAGS", "") + f" -T{linker_script_path}"
+        os.environ["CFLAGS"] = (
+            os.getenv("CFLAGS", "") + " -ffunction-sections -fdata-sections"
+        )
+        os.environ["CXXFLAGS"] = (
+            os.getenv("CXXFLAGS", "") + " -ffunction-sections -fdata-sections"
+        )
+    # platform.machine() is used here: platform.processor() may be an empty string.
+    elif platform.system() == "Linux" and platform.machine() == "aarch64":
+        print_box(
+            """
+            WARNING: we strongly recommend enabling linker script optimization for ARM + CUDA.
+            To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
+            """
+        )
+
     # Parse the command line and check the arguments before we proceed with
     # building deps and setup. We need to set values so `--help` works.
     dist = Distribution()
diff --git a/tools/setup_helpers/generate_linker_script.py b/tools/setup_helpers/generate_linker_script.py
index b5a7a4ce7dec..e66fc197062a 100644
--- a/tools/setup_helpers/generate_linker_script.py
+++ b/tools/setup_helpers/generate_linker_script.py
@@ -1,7 +1,5 @@
-import argparse
 import os
 import subprocess
-from pathlib import Path
 
 
 def gen_linker_script(
@@ -30,10 +28,6 @@ def gen_linker_script(
     assert len(text_line_start) == 1, "The linker script has multiple text sections!"
     text_line_start = text_line_start[0]
 
-    # ensure that parent directory exists before writing
-    fout = Path(fout)
-    fout.parent.mkdir(parents=True, exist_ok=True)
-
     with open(fout, "w") as f:
         for lineid, line in enumerate(linker_script_lines):
             if lineid == text_line_start + 2:
@@ -42,20 +36,3 @@ def gen_linker_script(
                     f.write(f" .text.{plines}\n")
                 f.write(" )\n")
             f.write(f"{line}\n")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Generate linker file based on prioritized symbols. Used for link-time optimization.",
-    )
-    parser.add_argument(
-        "--filein",
-        help="Path to prioritized_text.txt input file",
-        default=argparse.SUPPRESS,
-    )
-    parser.add_argument(
-        "--fout", help="Output path for linker ld file", default=argparse.SUPPRESS
-    )
-    # convert args to a dict to pass to gen_linker_script
-    kwargs = vars(parser.parse_args())
-    gen_linker_script(**kwargs)