#!/usr/bin/env python3 # This script is for building AARCH64 wheels using AWS EC2 instances. # To generate binaries for the release follow these steps: # 1. Update mappings for each of the Domain Libraries by adding new row to a table like this: # "v1.11.0": ("0.11.0", "rc1"), # 2. Run script with following arguments for each of the supported python versions and required tag, for example: # build_aarch64_wheel.py --key-name --use-docker --python 3.8 --branch v1.11.0-rc3 import os import subprocess import sys import time from typing import Optional, Union import boto3 # AMI images for us-east-1, change the following based on your ~/.aws/config os_amis = { "ubuntu20_04": "ami-052eac90edaa9d08f", # login_name: ubuntu "ubuntu22_04": "ami-0c6c29c5125214c77", # login_name: ubuntu "redhat8": "ami-0698b90665a2ddcf1", # login_name: ec2-user } ubuntu20_04_ami = os_amis["ubuntu20_04"] def compute_keyfile_path(key_name: Optional[str] = None) -> tuple[str, str]: if key_name is None: key_name = os.getenv("AWS_KEY_NAME") if key_name is None: return os.getenv("SSH_KEY_PATH", ""), "" homedir_path = os.path.expanduser("~") default_path = os.path.join(homedir_path, ".ssh", f"{key_name}.pem") return os.getenv("SSH_KEY_PATH", default_path), key_name ec2 = boto3.resource("ec2") def ec2_get_instances(filter_name, filter_value): return ec2.instances.filter( Filters=[{"Name": filter_name, "Values": [filter_value]}] ) def ec2_instances_of_type(instance_type="t4g.2xlarge"): return ec2_get_instances("instance-type", instance_type) def ec2_instances_by_id(instance_id): rc = list(ec2_get_instances("instance-id", instance_id)) return rc[0] if len(rc) > 0 else None def start_instance( key_name, ami=ubuntu20_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50 ): inst = ec2.create_instances( ImageId=ami, InstanceType=instance_type, SecurityGroups=["ssh-allworld"], KeyName=key_name, MinCount=1, MaxCount=1, BlockDeviceMappings=[ { "DeviceName": "/dev/sda1", "Ebs": { "DeleteOnTermination": True, "VolumeSize": ebs_size, "VolumeType": "standard", }, } ], )[0] print(f"Create instance {inst.id}") inst.wait_until_running() running_inst = ec2_instances_by_id(inst.id) print(f"Instance started at {running_inst.public_dns_name}") return running_inst class RemoteHost: addr: str keyfile_path: str login_name: str container_id: Optional[str] = None ami: Optional[str] = None def __init__(self, addr: str, keyfile_path: str, login_name: str = "ubuntu"): self.addr = addr self.keyfile_path = keyfile_path self.login_name = login_name def _gen_ssh_prefix(self) -> list[str]: return [ "ssh", "-o", "StrictHostKeyChecking=no", "-i", self.keyfile_path, f"{self.login_name}@{self.addr}", "--", ] @staticmethod def _split_cmd(args: Union[str, list[str]]) -> list[str]: return args.split() if isinstance(args, str) else args def run_ssh_cmd(self, args: Union[str, list[str]]) -> None: subprocess.check_call(self._gen_ssh_prefix() + self._split_cmd(args)) def check_ssh_output(self, args: Union[str, list[str]]) -> str: return subprocess.check_output( self._gen_ssh_prefix() + self._split_cmd(args) ).decode("utf-8") def scp_upload_file(self, local_file: str, remote_file: str) -> None: subprocess.check_call( [ "scp", "-i", self.keyfile_path, local_file, f"{self.login_name}@{self.addr}:{remote_file}", ] ) def scp_download_file( self, remote_file: str, local_file: Optional[str] = None ) -> None: if local_file is None: local_file = "." subprocess.check_call( [ "scp", "-i", self.keyfile_path, f"{self.login_name}@{self.addr}:{remote_file}", local_file, ] ) def start_docker(self, image="quay.io/pypa/manylinux2014_aarch64:latest") -> None: self.run_ssh_cmd("sudo apt-get install -y docker.io") self.run_ssh_cmd(f"sudo usermod -a -G docker {self.login_name}") self.run_ssh_cmd("sudo service docker start") self.run_ssh_cmd(f"docker pull {image}") self.container_id = self.check_ssh_output( f"docker run -t -d -w /root {image}" ).strip() def using_docker(self) -> bool: return self.container_id is not None def run_cmd(self, args: Union[str, list[str]]) -> None: if not self.using_docker(): return self.run_ssh_cmd(args) assert self.container_id is not None docker_cmd = self._gen_ssh_prefix() + [ "docker", "exec", "-i", self.container_id, "bash", ] p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE) p.communicate( input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode( "utf-8" ) ) rc = p.wait() if rc != 0: raise subprocess.CalledProcessError(rc, docker_cmd) def check_output(self, args: Union[str, list[str]]) -> str: if not self.using_docker(): return self.check_ssh_output(args) assert self.container_id is not None docker_cmd = self._gen_ssh_prefix() + [ "docker", "exec", "-i", self.container_id, "bash", ] p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) (out, err) = p.communicate( input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode( "utf-8" ) ) rc = p.wait() if rc != 0: raise subprocess.CalledProcessError(rc, docker_cmd, output=out, stderr=err) return out.decode("utf-8") def upload_file(self, local_file: str, remote_file: str) -> None: if not self.using_docker(): return self.scp_upload_file(local_file, remote_file) tmp_file = os.path.join("/tmp", os.path.basename(local_file)) self.scp_upload_file(local_file, tmp_file) self.run_ssh_cmd( ["docker", "cp", tmp_file, f"{self.container_id}:/root/{remote_file}"] ) self.run_ssh_cmd(["rm", tmp_file]) def download_file(self, remote_file: str, local_file: Optional[str] = None) -> None: if not self.using_docker(): return self.scp_download_file(remote_file, local_file) tmp_file = os.path.join("/tmp", os.path.basename(remote_file)) self.run_ssh_cmd( ["docker", "cp", f"{self.container_id}:/root/{remote_file}", tmp_file] ) self.scp_download_file(tmp_file, local_file) self.run_ssh_cmd(["rm", tmp_file]) def download_wheel( self, remote_file: str, local_file: Optional[str] = None ) -> None: if self.using_docker() and local_file is None: basename = os.path.basename(remote_file) local_file = basename.replace( "-linux_aarch64.whl", "-manylinux2014_aarch64.whl" ) self.download_file(remote_file, local_file) def list_dir(self, path: str) -> list[str]: return self.check_output(["ls", "-1", path]).split("\n") def wait_for_connection(addr, port, timeout=15, attempt_cnt=5): import socket for i in range(attempt_cnt): try: with socket.create_connection((addr, port), timeout=timeout): return except (ConnectionRefusedError, TimeoutError): # noqa: PERF203 if i == attempt_cnt - 1: raise time.sleep(timeout) def update_apt_repo(host: RemoteHost) -> None: time.sleep(5) host.run_cmd("sudo systemctl stop apt-daily.service || true") host.run_cmd("sudo systemctl stop unattended-upgrades.service || true") host.run_cmd( "while systemctl is-active --quiet apt-daily.service; do sleep 1; done" ) host.run_cmd( "while systemctl is-active --quiet unattended-upgrades.service; do sleep 1; done" ) host.run_cmd("sudo apt-get update") time.sleep(3) host.run_cmd("sudo apt-get update") def install_condaforge( host: RemoteHost, suffix: str = "latest/download/Miniforge3-Linux-aarch64.sh" ) -> None: print("Install conda-forge") host.run_cmd(f"curl -OL https://github.com/conda-forge/miniforge/releases/{suffix}") host.run_cmd(f"sh -f {os.path.basename(suffix)} -b") host.run_cmd(f"rm -f {os.path.basename(suffix)}") if host.using_docker(): host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc") else: host.run_cmd( [ "sed", "-i", "'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH'", ".bashrc", ] ) def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None: if python_version == "3.6": # Python-3.6 EOLed and not compatible with conda-4.11 install_condaforge( host, suffix="download/4.10.3-10/Miniforge3-4.10.3-10-Linux-aarch64.sh" ) host.run_cmd(f"conda install -y python={python_version} numpy pyyaml") else: install_condaforge( host, suffix="download/4.11.0-4/Miniforge3-4.11.0-4-Linux-aarch64.sh" ) # Pytorch-1.10 or older are not compatible with setuptools=59.6 or newer host.run_cmd( f"conda install -y python={python_version} numpy pyyaml setuptools>=59.5.0" ) def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None: print("Building OpenBLAS") host.run_cmd( f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.28 {git_clone_flags}" ) make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8" host.run_cmd( f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS" ) def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None: print("Building Arm Compute Library") acl_build_flags = " ".join( [ "debug=0", "neon=1", "opencl=0", "os=linux", "openmp=1", "cppthreads=0", "arch=armv8a", "multi_isa=1", "fixed_format_kernels=1", "build=native", ] ) host.run_cmd( f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}" ) host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}") def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None: host.run_cmd("pip3 install auditwheel") host.run_cmd( "conda install -y patchelf" if use_conda else "sudo apt-get install -y patchelf" ) from tempfile import NamedTemporaryFile with NamedTemporaryFile() as tmp: tmp.write(embed_library_script.encode("utf-8")) tmp.flush() host.upload_file(tmp.name, "embed_library.py") print("Embedding libgomp into wheel") if host.using_docker(): host.run_cmd(f"python3 embed_library.py {wheel_name} --update-tag") else: host.run_cmd(f"python3 embed_library.py {wheel_name}") def checkout_repo( host: RemoteHost, *, branch: str = "main", url: str, git_clone_flags: str, mapping: dict[str, tuple[str, str]], ) -> Optional[str]: for prefix in mapping: if not branch.startswith(prefix): continue tag = f"v{mapping[prefix][0]}-{mapping[prefix][1]}" host.run_cmd(f"git clone {url} -b {tag} {git_clone_flags}") return mapping[prefix][0] host.run_cmd(f"git clone {url} -b {branch} {git_clone_flags}") return None def build_torchvision( host: RemoteHost, *, branch: str = "main", use_conda: bool = True, git_clone_flags: str, run_smoke_tests: bool = True, ) -> str: print("Checking out TorchVision repo") build_version = checkout_repo( host, branch=branch, url="https://github.com/pytorch/vision", git_clone_flags=git_clone_flags, mapping={ "v1.7.1": ("0.8.2", "rc2"), "v1.8.0": ("0.9.0", "rc3"), "v1.8.1": ("0.9.1", "rc1"), "v1.9.0": ("0.10.0", "rc1"), "v1.10.0": ("0.11.1", "rc1"), "v1.10.1": ("0.11.2", "rc1"), "v1.10.2": ("0.11.3", "rc1"), "v1.11.0": ("0.12.0", "rc1"), "v1.12.0": ("0.13.0", "rc4"), "v1.12.1": ("0.13.1", "rc6"), "v1.13.0": ("0.14.0", "rc4"), "v1.13.1": ("0.14.1", "rc2"), "v2.0.0": ("0.15.1", "rc2"), "v2.0.1": ("0.15.2", "rc2"), }, ) print("Building TorchVision wheel") # Please note libnpg and jpeg are required to build image.so extension if use_conda: host.run_cmd("conda install -y libpng jpeg") # Remove .so files to force static linking host.run_cmd( "rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so" ) # And patch setup.py to include libz dependency for libpng host.run_cmd( [ 'sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py' ] ) build_vars = "" if branch == "nightly": version = host.check_output( ["if [ -f vision/version.txt ]; then cat vision/version.txt; fi"] ).strip() if len(version) == 0: # In older revisions, version was embedded in setup.py version = ( host.check_output(["grep", '"version = \'"', "vision/setup.py"]) .strip() .split("'")[1][:-2] ) build_date = ( host.check_output("cd vision && git log --pretty=format:%s -1") .strip() .split()[0] .replace("-", "") ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation") vision_wheel_name = host.list_dir("vision/dist")[0] embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name)) print("Copying TorchVision wheel") host.download_wheel(os.path.join("vision", "dist", vision_wheel_name)) if run_smoke_tests: host.run_cmd( f"pip3 install {os.path.join('vision', 'dist', vision_wheel_name)}" ) host.run_cmd("python3 vision/test/smoke_test.py") print("Delete vision checkout") host.run_cmd("rm -rf vision") return vision_wheel_name def build_torchdata( host: RemoteHost, *, branch: str = "main", use_conda: bool = True, git_clone_flags: str = "", ) -> str: print("Checking out TorchData repo") git_clone_flags += " --recurse-submodules" build_version = checkout_repo( host, branch=branch, url="https://github.com/pytorch/data", git_clone_flags=git_clone_flags, mapping={ "v1.13.1": ("0.5.1", ""), "v2.0.0": ("0.6.0", "rc5"), "v2.0.1": ("0.6.1", "rc1"), }, ) print("Building TorchData wheel") build_vars = "" if branch == "nightly": version = host.check_output( ["if [ -f data/version.txt ]; then cat data/version.txt; fi"] ).strip() build_date = ( host.check_output("cd data && git log --pretty=format:%s -1") .strip() .split()[0] .replace("-", "") ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation") wheel_name = host.list_dir("data/dist")[0] embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name)) print("Copying TorchData wheel") host.download_wheel(os.path.join("data", "dist", wheel_name)) return wheel_name def build_torchtext( host: RemoteHost, *, branch: str = "main", use_conda: bool = True, git_clone_flags: str = "", ) -> str: print("Checking out TorchText repo") git_clone_flags += " --recurse-submodules" build_version = checkout_repo( host, branch=branch, url="https://github.com/pytorch/text", git_clone_flags=git_clone_flags, mapping={ "v1.9.0": ("0.10.0", "rc1"), "v1.10.0": ("0.11.0", "rc2"), "v1.10.1": ("0.11.1", "rc1"), "v1.10.2": ("0.11.2", "rc1"), "v1.11.0": ("0.12.0", "rc1"), "v1.12.0": ("0.13.0", "rc2"), "v1.12.1": ("0.13.1", "rc5"), "v1.13.0": ("0.14.0", "rc3"), "v1.13.1": ("0.14.1", "rc1"), "v2.0.0": ("0.15.1", "rc2"), "v2.0.1": ("0.15.2", "rc2"), }, ) print("Building TorchText wheel") build_vars = "" if branch == "nightly": version = host.check_output( ["if [ -f text/version.txt ]; then cat text/version.txt; fi"] ).strip() build_date = ( host.check_output("cd text && git log --pretty=format:%s -1") .strip() .split()[0] .replace("-", "") ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation") wheel_name = host.list_dir("text/dist")[0] embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name)) print("Copying TorchText wheel") host.download_wheel(os.path.join("text", "dist", wheel_name)) return wheel_name def build_torchaudio( host: RemoteHost, *, branch: str = "main", use_conda: bool = True, git_clone_flags: str = "", ) -> str: print("Checking out TorchAudio repo") git_clone_flags += " --recurse-submodules" build_version = checkout_repo( host, branch=branch, url="https://github.com/pytorch/audio", git_clone_flags=git_clone_flags, mapping={ "v1.9.0": ("0.9.0", "rc2"), "v1.10.0": ("0.10.0", "rc5"), "v1.10.1": ("0.10.1", "rc1"), "v1.10.2": ("0.10.2", "rc1"), "v1.11.0": ("0.11.0", "rc1"), "v1.12.0": ("0.12.0", "rc3"), "v1.12.1": ("0.12.1", "rc5"), "v1.13.0": ("0.13.0", "rc4"), "v1.13.1": ("0.13.1", "rc2"), "v2.0.0": ("2.0.1", "rc3"), "v2.0.1": ("2.0.2", "rc2"), }, ) print("Building TorchAudio wheel") build_vars = "" if branch == "nightly": version = ( host.check_output(["grep", '"version = \'"', "audio/setup.py"]) .strip() .split("'")[1][:-2] ) build_date = ( host.check_output("cd audio && git log --pretty=format:%s -1") .strip() .split()[0] .replace("-", "") ) build_vars += f"BUILD_VERSION={version}.dev{build_date}" elif build_version is not None: build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" host.run_cmd( f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \ && ./packaging/ffmpeg/build.sh \ && {build_vars} python3 -m build --wheel --no-isolation" ) wheel_name = host.list_dir("audio/dist")[0] embed_libgomp(host, use_conda, os.path.join("audio", "dist", wheel_name)) print("Copying TorchAudio wheel") host.download_wheel(os.path.join("audio", "dist", wheel_name)) return wheel_name def configure_system( host: RemoteHost, *, compiler: str = "gcc-8", use_conda: bool = True, python_version: str = "3.8", ) -> None: if use_conda: install_condaforge_python(host, python_version) print("Configuring the system") if not host.using_docker(): update_apt_repo(host) host.run_cmd("sudo apt-get install -y ninja-build g++ git cmake gfortran unzip") else: host.run_cmd("yum install -y sudo") host.run_cmd("conda install -y ninja scons") if not use_conda: host.run_cmd( "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip" ) host.run_cmd("pip3 install dataclasses typing-extensions") if not use_conda: print("Installing Cython + numpy from PyPy") host.run_cmd("sudo pip3 install Cython") host.run_cmd("sudo pip3 install numpy") def build_domains( host: RemoteHost, *, branch: str = "main", use_conda: bool = True, git_clone_flags: str = "", ) -> tuple[str, str, str, str]: vision_wheel_name = build_torchvision( host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags ) audio_wheel_name = build_torchaudio( host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags ) data_wheel_name = build_torchdata( host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags ) text_wheel_name = build_torchtext( host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags ) return (vision_wheel_name, audio_wheel_name, data_wheel_name, text_wheel_name) def start_build( host: RemoteHost, *, branch: str = "main", compiler: str = "gcc-8", use_conda: bool = True, python_version: str = "3.8", pytorch_only: bool = False, pytorch_build_number: Optional[str] = None, shallow_clone: bool = True, enable_mkldnn: bool = False, ) -> tuple[str, str, str, str, str]: git_clone_flags = " --depth 1 --shallow-submodules" if shallow_clone else "" if host.using_docker() and not use_conda: print("Auto-selecting conda option for docker images") use_conda = True if not host.using_docker(): print("Disable mkldnn for host builds") enable_mkldnn = False configure_system( host, compiler=compiler, use_conda=use_conda, python_version=python_version ) build_OpenBLAS(host, git_clone_flags) if host.using_docker(): print("Move libgfortant.a into a standard location") # HACK: pypa gforntran.a is compiled without PIC, which leads to the following error # libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17' # noqa: E501, B950 # Workaround by copying gfortran library from the host host.run_ssh_cmd("sudo apt-get install -y gfortran-8") host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8") host.run_ssh_cmd( [ "docker", "cp", "/usr/lib/gcc/aarch64-linux-gnu/8/libgfortran.a", f"{host.container_id}:/opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/", ] ) print("Checking out PyTorch repo") host.run_cmd( f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}" ) print("Building PyTorch wheel") build_opts = "" if pytorch_build_number is not None: build_opts += f" -C--build-option=--build-number={pytorch_build_number}" # Breakpad build fails on aarch64 build_vars = "USE_BREAKPAD=0 " if branch == "nightly": build_date = ( host.check_output("cd pytorch && git log --pretty=format:%s -1") .strip() .split()[0] .replace("-", "") ) version = host.check_output("cat pytorch/version.txt").strip()[:-2] build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1" if branch.startswith(("v1.", "v2.")): build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1" if host.using_docker(): build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000" if enable_mkldnn: build_ArmComputeLibrary(host, git_clone_flags) print("build pytorch with mkldnn+acl backend") build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON" host.run_cmd( f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && " f"{build_vars} python3 -m build --wheel --no-isolation{build_opts}" ) print("Repair the wheel") pytorch_wheel_name = host.list_dir("pytorch/dist")[0] ld_library_path = "$HOME/acl/build:$HOME/pytorch/build/lib" host.run_cmd( f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}" ) print("replace the original wheel with the repaired one") pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0] host.run_cmd( f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}" ) else: print("build pytorch without mkldnn backend") host.run_cmd( f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}" ) print("Deleting build folder") host.run_cmd("cd pytorch && rm -rf build") pytorch_wheel_name = host.list_dir("pytorch/dist")[0] embed_libgomp(host, use_conda, os.path.join("pytorch", "dist", pytorch_wheel_name)) print("Copying the wheel") host.download_wheel(os.path.join("pytorch", "dist", pytorch_wheel_name)) print("Installing PyTorch wheel") host.run_cmd(f"pip3 install pytorch/dist/{pytorch_wheel_name}") if pytorch_only: return (pytorch_wheel_name, None, None, None, None) domain_wheels = build_domains( host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags ) return (pytorch_wheel_name, *domain_wheels) embed_library_script = """ #!/usr/bin/env python3 from auditwheel.patcher import Patchelf from auditwheel.wheeltools import InWheelCtx from auditwheel.elfutils import elf_file_filter from auditwheel.repair import copylib from auditwheel.lddtree import lddtree from subprocess import check_call import os import shutil import sys from tempfile import TemporaryDirectory def replace_tag(filename): with open(filename, 'r') as f: lines = f.read().split("\\n") for i,line in enumerate(lines): if not line.startswith("Tag: "): continue lines[i] = line.replace("-linux_", "-manylinux2014_") print(f'Updated tag from {line} to {lines[i]}') with open(filename, 'w') as f: f.write("\\n".join(lines)) class AlignedPatchelf(Patchelf): def set_soname(self, file_name: str, new_soname: str) -> None: check_call(['patchelf', '--page-size', '65536', '--set-soname', new_soname, file_name]) def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None: check_call(['patchelf', '--page-size', '65536', '--replace-needed', soname, new_soname, file_name]) def embed_library(whl_path, lib_soname, update_tag=False): patcher = AlignedPatchelf() out_dir = TemporaryDirectory() whl_name = os.path.basename(whl_path) tmp_whl_name = os.path.join(out_dir.name, whl_name) with InWheelCtx(whl_path) as ctx: torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib') ctx.out_wheel=tmp_whl_name new_lib_path, new_lib_soname = None, None for filename, elf in elf_file_filter(ctx.iter_files()): if not filename.startswith('torch/lib'): continue libtree = lddtree(filename) if lib_soname not in libtree['needed']: continue lib_path = libtree['libs'][lib_soname]['path'] if lib_path is None: print(f"Can't embed {lib_soname} as it could not be found") break if lib_path.startswith(torchlib_path): continue if new_lib_path is None: new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher) patcher.replace_needed(filename, lib_soname, new_lib_soname) print(f'Replacing {lib_soname} with {new_lib_soname} for {filename}') if update_tag: # Add manylinux2014 tag for filename in ctx.iter_files(): if os.path.basename(filename) != 'WHEEL': continue replace_tag(filename) shutil.move(tmp_whl_name, whl_path) if __name__ == '__main__': embed_library(sys.argv[1], 'libgomp.so.1', len(sys.argv) > 2 and sys.argv[2] == '--update-tag') """ def run_tests(host: RemoteHost, whl: str, branch="main") -> None: print("Configuring the system") update_apt_repo(host) host.run_cmd("sudo apt-get install -y python3-pip git") host.run_cmd("sudo pip3 install Cython") host.run_cmd("sudo pip3 install numpy") host.upload_file(whl, ".") host.run_cmd(f"sudo pip3 install {whl}") host.run_cmd("python3 -c 'import torch;print(torch.rand((3,3))'") host.run_cmd(f"git clone -b {branch} https://github.com/pytorch/pytorch") host.run_cmd("cd pytorch/test; python3 test_torch.py -v") def get_instance_name(instance) -> Optional[str]: if instance.tags is None: return None for tag in instance.tags: if tag["Key"] == "Name": return tag["Value"] return None def list_instances(instance_type: str) -> None: print(f"All instances of type {instance_type}") for instance in ec2_instances_of_type(instance_type): ifaces = instance.network_interfaces az = ifaces[0].subnet.availability_zone if len(ifaces) > 0 else None print( f"{instance.id} {get_instance_name(instance)} {instance.public_dns_name} {instance.state['Name']} {az}" ) def terminate_instances(instance_type: str) -> None: print(f"Terminating all instances of type {instance_type}") instances = list(ec2_instances_of_type(instance_type)) for instance in instances: print(f"Terminating {instance.id}") instance.terminate() print("Waiting for termination to complete") for instance in instances: instance.wait_until_terminated() def parse_arguments(): from argparse import ArgumentParser parser = ArgumentParser("Builid and test AARCH64 wheels using EC2") parser.add_argument("--key-name", type=str) parser.add_argument("--debug", action="store_true") parser.add_argument("--build-only", action="store_true") parser.add_argument("--test-only", type=str) group = parser.add_mutually_exclusive_group() group.add_argument("--os", type=str, choices=list(os_amis.keys())) group.add_argument("--ami", type=str) parser.add_argument( "--python-version", type=str, choices=[f"3.{d}" for d in range(6, 12)], default=None, ) parser.add_argument("--alloc-instance", action="store_true") parser.add_argument("--list-instances", action="store_true") parser.add_argument("--pytorch-only", action="store_true") parser.add_argument("--keep-running", action="store_true") parser.add_argument("--terminate-instances", action="store_true") parser.add_argument("--instance-type", type=str, default="t4g.2xlarge") parser.add_argument("--ebs-size", type=int, default=50) parser.add_argument("--branch", type=str, default="main") parser.add_argument("--use-docker", action="store_true") parser.add_argument( "--compiler", type=str, choices=["gcc-7", "gcc-8", "gcc-9", "clang"], default="gcc-8", ) parser.add_argument("--use-torch-from-pypi", action="store_true") parser.add_argument("--pytorch-build-number", type=str, default=None) parser.add_argument("--disable-mkldnn", action="store_true") return parser.parse_args() if __name__ == "__main__": args = parse_arguments() ami = ( args.ami if args.ami is not None else os_amis[args.os] if args.os is not None else ubuntu20_04_ami ) keyfile_path, key_name = compute_keyfile_path(args.key_name) if args.list_instances: list_instances(args.instance_type) sys.exit(0) if args.terminate_instances: terminate_instances(args.instance_type) sys.exit(0) if len(key_name) == 0: raise RuntimeError(""" Cannot start build without key_name, please specify --key-name argument or AWS_KEY_NAME environment variable.""") if len(keyfile_path) == 0 or not os.path.exists(keyfile_path): raise RuntimeError(f""" Cannot find keyfile with name: [{key_name}] in path: [{keyfile_path}], please check `~/.ssh/` folder or manually set SSH_KEY_PATH environment variable.""") # Starting the instance inst = start_instance( key_name, ami=ami, instance_type=args.instance_type, ebs_size=args.ebs_size ) instance_name = f"{args.key_name}-{args.os}" if args.python_version is not None: instance_name += f"-py{args.python_version}" inst.create_tags( DryRun=False, Tags=[ { "Key": "Name", "Value": instance_name, } ], ) addr = inst.public_dns_name wait_for_connection(addr, 22) host = RemoteHost(addr, keyfile_path) host.ami = ami if args.use_docker: update_apt_repo(host) host.start_docker() if args.test_only: run_tests(host, args.test_only) sys.exit(0) if args.alloc_instance: if args.python_version is None: sys.exit(0) install_condaforge_python(host, args.python_version) sys.exit(0) python_version = args.python_version if args.python_version is not None else "3.10" if args.use_torch_from_pypi: configure_system(host, compiler=args.compiler, python_version=python_version) print("Installing PyTorch wheel") host.run_cmd("pip3 install torch") build_domains( host, branch=args.branch, git_clone_flags=" --depth 1 --shallow-submodules" ) else: start_build( host, branch=args.branch, compiler=args.compiler, python_version=python_version, pytorch_only=args.pytorch_only, pytorch_build_number=args.pytorch_build_number, enable_mkldnn=not args.disable_mkldnn, ) if not args.keep_running: print(f"Waiting for instance {inst.id} to terminate") inst.terminate() inst.wait_until_terminated()