[Build] Add linker script optimization (#121975)

This PR adds a linker script optimization based on prioritized symbols that can be extracted from the profiles of popular workloads. The present linker script was generated to target ARM+CUDA and later can be extended if necessary. The reason we target ARM is shown below:

> PyTorch and other applications that access more than 24x 2MB code regions in quick succession can suffer performance bottlenecks in the CPU front-end. The link-time optimization improves executable code locality and thereby improves performance. We recommend always turning on the optimization for PyTorch and other applications that behave similarly.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121975
Approved by: https://github.com/ptrblck, https://github.com/atalman
This commit is contained in:
Aidyn-A
2024-04-09 20:22:20 +00:00
committed by PyTorch MergeBot
parent 178ce1433c
commit a6080f79e9
3 changed files with 1270 additions and 0 deletions

1204
cmake/prioritized_text.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@ -223,6 +223,9 @@
# USE_MIMALLOC
# Static link mimalloc into C10, and use mimalloc in alloc_cpu & alloc_free.
# By default, It is only enabled on Windows.
#
# USE_PRIORITIZED_TEXT_FOR_LD
# Uses prioritized text from cmake/prioritized_text.txt for LD
import sys
@ -263,6 +266,7 @@ from tools.build_pytorch_libs import build_caffe2
from tools.generate_torch_version import get_torch_version
from tools.setup_helpers.cmake import CMake
from tools.setup_helpers.env import build_type, IS_DARWIN, IS_LINUX, IS_WINDOWS
from tools.setup_helpers.generate_linker_script import gen_linker_script
################################################################################
# Parameters parsed from environment
@ -1114,6 +1118,31 @@ def main():
'mkl>=2021.1.1,<=2021.4.0; platform_system == "Windows"',
]
use_prioritized_text = str(os.getenv("USE_PRIORITIZED_TEXT_FOR_LD", ""))
if (
use_prioritized_text == ""
and platform.system() == "Linux"
and platform.processor() == "aarch64"
):
print_box(
"""
WARNING: we strongly recommend enabling linker script optimization for ARM + CUDA.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
"""
)
if use_prioritized_text == "1" or use_prioritized_text == "True":
gen_linker_script(
filein="cmake/prioritized_text.txt", fout="cmake/linker_script.ld"
)
linker_script_path = os.path.abspath("cmake/linker_script.ld")
os.environ["LDFLAGS"] = os.getenv("LDFLAGS", "") + f" -T{linker_script_path}"
os.environ["CFLAGS"] = (
os.getenv("CFLAGS", "") + " -ffunction-sections -fdata-sections"
)
os.environ["CXXFLAGS"] = (
os.getenv("CXXFLAGS", "") + " -ffunction-sections -fdata-sections"
)
# Parse the command line and check the arguments before we proceed with
# building deps and setup. We need to set values so `--help` works.
dist = Distribution()

View File

@ -0,0 +1,37 @@
import subprocess
def gen_linker_script(
    filein: str = "cmake/prioritized_text.txt", fout: str = "cmake/linker_script.ld"
) -> None:
    """Generate a linker script that places prioritized symbols first in ``.text``.

    The script printed by ``ld -verbose`` (the part between the two ``====``
    delimiter lines) is copied to ``fout``. An extra input-section block
    listing ``.text.<symbol>`` for every symbol read from ``filein`` is
    inserted two lines after the ``.text`` output-section header.

    Args:
        filein: Path to a text file with one symbol name per line. Blank and
            whitespace-only lines are ignored; surrounding whitespace is
            stripped from each name.
        fout: Path of the linker script to write (overwritten if it exists).

    Raises:
        RuntimeError: If the ``ld -verbose`` output does not contain the two
            delimiter lines, or does not contain exactly one ``.text``
            output-section header.
        OSError: If ``filein`` cannot be read, ``fout`` cannot be written, or
            ``ld`` cannot be executed.
        subprocess.CalledProcessError: If ``ld -verbose`` exits with a
            non-zero status.
    """
    with open(filein, encoding="utf-8") as f:
        # strip() drops the newline plus any stray padding, so a line holding
        # only spaces is skipped instead of producing a bogus ".text." pattern.
        prioritized_text = [line.strip() for line in f if line.strip()]

    ld_output = subprocess.check_output(["ld", "-verbose"], text=True).split("\n")

    # The script body is the text between the first two delimiter lines.
    delimiter = "=================================================="
    delimiter_ids = [i for i, line in enumerate(ld_output) if line == delimiter]
    if len(delimiter_ids) < 2:
        raise RuntimeError(
            "Could not locate the linker script in the `ld -verbose` output: "
            f"expected two delimiter lines, found {len(delimiter_ids)}."
        )
    linker_script_lines = ld_output[delimiter_ids[0] + 1 : delimiter_ids[1]]

    # Collapse runs of whitespace before matching: the section name may be
    # padded with a variable number of spaces before the colon.
    text_line_start = [
        i
        for i, line in enumerate(linker_script_lines)
        if ".text :" in " ".join(line.split())
    ]
    if len(text_line_start) != 1:
        raise RuntimeError(
            "Expected exactly one .text output section in the linker script, "
            f"found {len(text_line_start)}."
        )
    # Assumes the header line is followed by a line holding the opening "{",
    # so the first input-section line of .text sits two lines below the header.
    insert_at = text_line_start[0] + 2

    with open(fout, "w", encoding="utf-8") as f:
        for lineid, line in enumerate(linker_script_lines):
            if lineid == insert_at:
                # Emit the prioritized symbols ahead of the existing .text
                # input sections so they are laid out first.
                f.write(" *(\n")
                f.writelines(f" .text.{symbol}\n" for symbol in prioritized_text)
                f.write(" )\n")
            f.write(f"{line}\n")