Move prioritized text linker optimization code from setup.py to cmake (#160078)

Note. This is a replica PR of #155901 which will be closed. I had to create a new PR in order to add it into my ghstack as there are some later commits which depend on it.

### Summary

🚀 This PR moves the prioritized text linker optimization from setup.py to cmake ( and enables by default on Linux aarch64 systems )

This change consolidates what was previously manual CI logic into a single location (cmake), ensuring consistent behavior across local builds, CI pipelines, and developer environments.

### Motivation
Prioritized text layout has measurable performance benefits on Arm systems by reducing code padding and improving cache utilization. This optimization was previously triggered manually via CI scripts (.ci/aarch64_linux/aarch64_ci_build.sh) or user-set environment variables. By detecting the target architecture within setup.py, this change enables the optimization automatically where applicable, improving maintainability and usability.

Note:

Due to ninja/cmake graph generation issues we cannot apply the linker file globally to all targets to the targets must be manually defined. See CMakeLists.txt the main libraries torch_python, torch, torch_cpu, torch_cuda, torch_xpu have been targetted which should be enough to maintain the performance benefits outlined above.

Co-authored-by: Usamah Zaheer <usamah.zaheer@arm.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160078
Approved by: https://github.com/seemethere
This commit is contained in:
Robert Hardwick
2025-09-18 08:42:03 +00:00
committed by PyTorch MergeBot
parent 56893ca1f6
commit 1aeac304b8
8 changed files with 116 additions and 29 deletions

View File

@ -31,8 +31,7 @@ pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
@ -46,6 +45,5 @@ else
export USE_NVIDIA_PYPI_LIBS=1
fi
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi

View File

@ -317,7 +317,7 @@ if __name__ == "__main__":
).decode()
print("Building PyTorch wheel")
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
build_vars = ""
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "

3
.gitignore vendored
View File

@ -259,6 +259,9 @@ gen
.pytest_cache
aten/build/*
# Linker scripts for prioritized text optimization
cmake/linker_script.ld
# Bram
plsdontbreak

View File

@ -380,6 +380,13 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF)
# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le.
set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
if(LINUX AND CPU_AARCH64)
set(USE_PRIORITIZED_TEXT_DEFAULT ON)
endif()
cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
"${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)
option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
@ -657,6 +664,11 @@ endif(MSVC)
string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
# Set linker max-page-size to 64KiB on AArch64 Linux
if(LINUX AND CPU_AARCH64)
add_link_options_if_supported("-z,max-page-size=0x10000")
endif()
# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable. Setting
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@ -1421,3 +1433,57 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()
if(USE_PRIORITIZED_TEXT_FOR_LD)
add_compile_options(
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
)
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
add_custom_command(
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
COMMENT "Generating prioritized text linker files"
VERBATIM
)
add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
if(BUILD_PYTHON)
set(LINKER_OPT_TARGETS torch_python)
endif()
if(NOT BUILD_LIBTORCHLESS)
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
if(USE_CUDA)
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
endif()
if(USE_XPU)
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
endif()
if(USE_ROCM)
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
endif()
endif()
foreach(tgt IN LISTS LINKER_OPT_TARGETS)
if(TARGET ${tgt})
add_dependencies("${tgt}" generate_linker_script)
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
else()
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
endif()
endforeach()
else()
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()

View File

@ -158,6 +158,7 @@ function(caffe2_print_configuration_summary)
if(${USE_KLEIDIAI})
message(STATUS " USE_KLEIDIAI : ${USE_KLEIDIAI}")
endif()
message(STATUS " USE_PRIORITIZED_TEXT_FOR_LD : ${USE_PRIORITIZED_TEXT_FOR_LD}")
message(STATUS " USE_UCC : ${USE_UCC}")
if(${USE_UCC})
message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}")

View File

@ -482,6 +482,7 @@ function(torch_update_find_cuda_flags)
endfunction()
include(CheckCXXCompilerFlag)
include(CheckLinkerFlag)
##############################################################################
# CHeck if given flag is supported and append it to provided outputvar
@ -511,3 +512,22 @@ function(target_compile_options_if_supported target flag)
target_compile_options(${target} PRIVATE ${flag})
endif()
endfunction()
# Check if a global link option is supported
function(add_link_options_if_supported flag)
check_linker_flag(C "LINKER:${flag}" _supported)
if("${_supported}")
add_link_options("LINKER:${flag}")
else()
message(WARNING "Attempted to use unsupported link option : ${flag}.")
endif()
endfunction()
function(target_link_options_if_supported tgt flag)
check_linker_flag(C "LINKER:${flag}" _supported)
if("${_supported}")
target_link_options("${tgt}" PRIVATE "LINKER:${flag}")
else()
message(WARNING "Attempted to use unsupported link option : ${flag}.")
endif()
endfunction()

View File

@ -227,9 +227,6 @@
# Static link mimalloc into C10, and use mimalloc in alloc_cpu & alloc_free.
# By default, It is only enabled on Windows.
#
# USE_PRIORITIZED_TEXT_FOR_LD
# Uses prioritized text form cmake/prioritized_text.txt for LD
#
# BUILD_LIBTORCH_WHL
# Builds libtorch.so and its dependencies as a wheel
#
@ -323,7 +320,6 @@ from tools.setup_helpers.env import (
IS_LINUX,
IS_WINDOWS,
)
from tools.setup_helpers.generate_linker_script import gen_linker_script
def str2bool(value: str | None) -> bool:
@ -1627,26 +1623,6 @@ def main() -> None:
if BUILD_PYTHON_ONLY:
install_requires += [f"{LIBTORCH_PKG_NAME}=={TORCH_VERSION}"]
if str2bool(os.getenv("USE_PRIORITIZED_TEXT_FOR_LD")):
gen_linker_script(
filein="cmake/prioritized_text.txt", fout="cmake/linker_script.ld"
)
linker_script_path = os.path.abspath("cmake/linker_script.ld")
os.environ["LDFLAGS"] = os.getenv("LDFLAGS", "") + f" -T{linker_script_path}"
os.environ["CFLAGS"] = (
os.getenv("CFLAGS", "") + " -ffunction-sections -fdata-sections"
)
os.environ["CXXFLAGS"] = (
os.getenv("CXXFLAGS", "") + " -ffunction-sections -fdata-sections"
)
elif platform.system() == "Linux" and platform.processor() == "aarch64":
print_box(
"""
WARNING: we strongly recommend enabling linker script optimization for ARM + CUDA.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
"""
)
# Parse the command line and check the arguments before we proceed with
# building deps and setup. We need to set values so `--help` works.
dist = Distribution()

View File

@ -1,5 +1,7 @@
import argparse
import os
import subprocess
from pathlib import Path
def gen_linker_script(
@ -28,6 +30,10 @@ def gen_linker_script(
assert len(text_line_start) == 1, "The linker script has multiple text sections!"
text_line_start = text_line_start[0]
# ensure that parent directory exists before writing
fout = Path(fout)
fout.parent.mkdir(parents=True, exist_ok=True)
with open(fout, "w") as f:
for lineid, line in enumerate(linker_script_lines):
if lineid == text_line_start + 2:
@ -36,3 +42,20 @@ def gen_linker_script(
f.write(f" .text.{plines}\n")
f.write(" )\n")
f.write(f"{line}\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Generate linker file based on prioritized symbols. Used for link-time optimization.",
)
parser.add_argument(
"--filein",
help="Path to prioritized_text.txt input file",
default=argparse.SUPPRESS,
)
parser.add_argument(
"--fout", help="Output path for linker ld file", default=argparse.SUPPRESS
)
# convert args to a dict to pass to gen_linker_script
kwargs = vars(parser.parse_args())
gen_linker_script(**kwargs)