Compare commits


2 Commits

Author SHA1 Message Date
af79feb29d Merge branch 'main' into bf/cudagraph-partition 2025-01-09 09:51:28 -08:00
71f2d29ebd init 2024-12-09 10:10:50 -08:00
10745 changed files with 465524 additions and 508572 deletions

View File

@@ -2,7 +2,7 @@ build --cxxopt=--std=c++17
 build --copt=-I.
 # Bazel does not support including its cc_library targets as system
 # headers. We work around this for generated code
-# (e.g. torch/headeronly/macros/cmake_macros.h) by making the generated directory a
+# (e.g. c10/macros/cmake_macros.h) by making the generated directory a
 # system include path.
 build --copt=-isystem --copt bazel-out/k8-fastbuild/bin
 build --copt=-isystem --copt bazel-out/darwin-fastbuild/bin

View File

@@ -3,10 +3,6 @@ set -eux -o pipefail
 GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
-if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
-fi
 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
 source $SCRIPTPATH/aarch64_ci_setup.sh
@@ -18,14 +14,13 @@ cd /
 # on the mounted pytorch repo
 git config --global --add safe.directory /pytorch
 pip install -r /pytorch/requirements.txt
-pip install auditwheel==6.2.0
+pip install auditwheel
 if [ "$DESIRED_CUDA" = "cpu" ]; then
     echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
     #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
     USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
 else
     echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
-    export USE_SYSTEM_NCCL=1
     #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
     USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
 fi
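For readers skimming the change, a minimal Python sketch of the branching this wrapper performs (CUDA arch list for a 12.9 toolkit, CPU vs. CUDA invocation of the wheel build) is shown below. The environment variable names and the script path are taken from the removed lines above; the sketch itself is illustrative and not part of the CI.

```python
import os
import subprocess

# Illustrative only: mirror the wrapper's flag selection before invoking
# aarch64_wheel_ci_build.py (variable and path names taken from the script above).
env = os.environ.copy()
if "12.9" in env.get("GPU_ARCH_VERSION", ""):
    env["TORCH_CUDA_ARCH_LIST"] = "8.0;9.0;10.0;12.0"

cmd = [
    "python",
    "/pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py",
    "--enable-mkldnn",
]
if env.get("DESIRED_CUDA", "cpu") != "cpu":
    env["USE_SYSTEM_NCCL"] = "1"
    cmd.append("--enable-cuda")

# Linker script optimization, see pytorch/pytorch#121975.
env["USE_PRIORITIZED_TEXT_FOR_LD"] = "1"
subprocess.check_call(cmd, env=env)
```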

View File

@@ -5,14 +5,16 @@ set -eux -o pipefail
 # By creating symlinks from desired /opt/python to /usr/local/bin/
 NUMPY_VERSION=2.0.2
-if [[ "$DESIRED_PYTHON" == "3.13" || "$DESIRED_PYTHON" == "3.13t" ]]; then
+PYGIT2_VERSION=1.15.1
+if [[ "$DESIRED_PYTHON" == "3.13" ]]; then
     NUMPY_VERSION=2.1.2
+    PYGIT2_VERSION=1.16.0
 fi
 SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
 source $SCRIPTPATH/../manywheel/set_desired_python.sh
-pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2
+pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2 pygit2==${PYGIT2_VERSION}
 for tool in python python3 pip pip3 ninja scons patchelf; do
     ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;

View File

@@ -4,9 +4,12 @@
 import os
 import shutil
 from subprocess import check_call, check_output
+from typing import List
+
+from pygit2 import Repository
-def list_dir(path: str) -> list[str]:
+def list_dir(path: str) -> List[str]:
     """'
     Helper for getting paths for Python
     """
@@ -31,47 +34,33 @@ def build_ArmComputeLibrary() -> None:
         "build=native",
     ]
     acl_install_dir = "/acl"
-    acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
-    if os.path.isdir(acl_install_dir):
-        shutil.rmtree(acl_install_dir)
-    if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):
-        check_call(
-            [
-                "git",
-                "clone",
-                "https://github.com/ARM-software/ComputeLibrary.git",
-                "-b",
-                "v25.02",
-                "--depth",
-                "1",
-                "--shallow-submodules",
-            ]
-        )
+    acl_checkout_dir = "ComputeLibrary"
+    os.makedirs(acl_install_dir)
+    check_call(
+        [
+            "git",
+            "clone",
+            "https://github.com/ARM-software/ComputeLibrary.git",
+            "-b",
+            "v24.09",
+            "--depth",
+            "1",
+            "--shallow-submodules",
+        ]
+    )
     check_call(
-        ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
+        ["scons", "Werror=1", "-j8", f"build_dir=/{acl_install_dir}/build"]
+        + acl_build_flags,
         cwd=acl_checkout_dir,
     )
-    for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
+    for d in ["arm_compute", "include", "utils", "support", "src"]:
         shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")
-def replace_tag(filename) -> None:
-    with open(filename) as f:
-        lines = f.readlines()
-    for i, line in enumerate(lines):
-        if line.startswith("Tag:"):
-            lines[i] = line.replace("-linux_", "-manylinux_2_28_")
-            print(f"Updated tag from {line} to {lines[i]}")
-            break
-    with open(filename, "w") as f:
-        f.writelines(lines)
-def package_cuda_wheel(wheel_path, desired_cuda) -> None:
+def update_wheel(wheel_path) -> None:
     """
-    Package the cuda wheel libraries
+    Update the cuda wheel libraries
     """
     folder = os.path.dirname(wheel_path)
     wheelname = os.path.basename(wheel_path)
@@ -79,7 +68,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
     os.system(f"unzip {wheel_path} -d {folder}/tmp")
     libs_to_copy = [
         "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
-        "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
         "/usr/local/cuda/lib64/libcudnn.so.9",
         "/usr/local/cuda/lib64/libcublas.so.12",
         "/usr/local/cuda/lib64/libcublasLt.so.12",
@@ -89,9 +77,10 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
         "/usr/local/cuda/lib64/libcusparseLt.so.0",
         "/usr/local/cuda/lib64/libcusolver.so.11",
         "/usr/local/cuda/lib64/libcurand.so.10",
-        "/usr/local/cuda/lib64/libnccl.so.2",
+        "/usr/local/cuda/lib64/libnvToolsExt.so.1",
         "/usr/local/cuda/lib64/libnvJitLink.so.12",
         "/usr/local/cuda/lib64/libnvrtc.so.12",
+        "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.6",
         "/usr/local/cuda/lib64/libcudnn_adv.so.9",
         "/usr/local/cuda/lib64/libcudnn_cnn.so.9",
         "/usr/local/cuda/lib64/libcudnn_graph.so.9",
@@ -103,19 +92,18 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
         "/usr/lib64/libgfortran.so.5",
         "/acl/build/libarm_compute.so",
         "/acl/build/libarm_compute_graph.so",
-        "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-        "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-        "/usr/local/lib/libnvpl_lapack_core.so.0",
-        "/usr/local/lib/libnvpl_blas_core.so.0",
     ]
-    if "129" in desired_cuda:
+    if enable_cuda:
         libs_to_copy += [
-            "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
-            "/usr/local/cuda/lib64/libcufile.so.0",
-            "/usr/local/cuda/lib64/libcufile_rdma.so.1",
+            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
+            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
+            "/usr/local/lib/libnvpl_lapack_core.so.0",
+            "/usr/local/lib/libnvpl_blas_core.so.0",
+        ]
+    else:
+        libs_to_copy += [
+            "/opt/OpenBLAS/lib/libopenblas.so.0",
         ]
     # Copy libraries to unzipped_folder/a/lib
     for lib_path in libs_to_copy:
         lib_name = os.path.basename(lib_path)
@@ -124,13 +112,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
             f"cd {folder}/tmp/torch/lib/; "
             f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}"
         )
-    # Make sure the wheel is tagged with manylinux_2_28
-    for f in os.scandir(f"{folder}/tmp/"):
-        if f.is_dir() and f.name.endswith(".dist-info"):
-            replace_tag(f"{f.path}/WHEEL")
-            break
     os.mkdir(f"{folder}/cuda_wheel")
     os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")
     shutil.move(
@@ -147,9 +128,6 @@ def complete_wheel(folder: str) -> str:
     """
     wheel_name = list_dir(f"/{folder}/dist")[0]
-    # Please note for cuda we don't run auditwheel since we use custom script to package
-    # the cuda dependencies to the wheel file using update_wheel() method.
-    # However we need to make sure filename reflects the correct Manylinux platform.
     if "pytorch" in folder and not enable_cuda:
         print("Repairing Wheel with AuditWheel")
         check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
@@ -161,14 +139,7 @@ def complete_wheel(folder: str) -> str:
             f"/{folder}/dist/{repaired_wheel_name}",
         )
     else:
-        repaired_wheel_name = wheel_name.replace(
-            "linux_aarch64", "manylinux_2_28_aarch64"
-        )
-        print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}")
-        os.rename(
-            f"/{folder}/dist/{wheel_name}",
-            f"/{folder}/dist/{repaired_wheel_name}",
-        )
+        repaired_wheel_name = wheel_name
     print(f"Copying {repaired_wheel_name} to artifacts")
     shutil.copy2(
@@ -200,24 +171,22 @@ if __name__ == "__main__":
     args = parse_arguments()
     enable_mkldnn = args.enable_mkldnn
     enable_cuda = args.enable_cuda
-    branch = check_output(
-        ["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd="/pytorch"
-    ).decode()
+    repo = Repository("/pytorch")
+    branch = repo.head.name
+    if branch == "HEAD":
+        branch = "master"
     print("Building PyTorch wheel")
-    build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
-    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
-    if enable_cuda:
-        build_vars = "MAX_JOBS=5 " + build_vars
+    build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
+    os.system("cd /pytorch; python setup.py clean")
     override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
-    desired_cuda = os.getenv("DESIRED_CUDA")
     if override_package_version is not None:
         version = override_package_version
         build_vars += (
             f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
         )
-    elif branch in ["nightly", "main"]:
+    elif branch in ["nightly", "master"]:
         build_date = (
             check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
             .decode()
@@ -227,11 +196,12 @@ if __name__ == "__main__":
             check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
         )
         if enable_cuda:
+            desired_cuda = os.getenv("DESIRED_CUDA")
             build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
         else:
             build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
     elif branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
+        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
     if enable_mkldnn:
         build_ArmComputeLibrary()
@@ -255,6 +225,6 @@ if __name__ == "__main__":
         print("Updating Cuda Dependency")
         filename = os.listdir("/pytorch/dist/")
         wheel_path = f"/pytorch/dist/{filename[0]}"
-        package_cuda_wheel(wheel_path, desired_cuda)
+        update_wheel(wheel_path)
     pytorch_wheel_name = complete_wheel("/pytorch/")
     print(f"Build Complete. Created {pytorch_wheel_name}..")

View File

@@ -12,22 +12,22 @@ import os
 import subprocess
 import sys
 import time
-from typing import Optional, Union
+from typing import Dict, List, Optional, Tuple, Union
 import boto3
 # AMI images for us-east-1, change the following based on your ~/.aws/config
 os_amis = {
+    "ubuntu18_04": "ami-078eece1d8119409f",  # login_name: ubuntu
     "ubuntu20_04": "ami-052eac90edaa9d08f",  # login_name: ubuntu
     "ubuntu22_04": "ami-0c6c29c5125214c77",  # login_name: ubuntu
     "redhat8": "ami-0698b90665a2ddcf1",  # login_name: ec2-user
 }
-ubuntu20_04_ami = os_amis["ubuntu20_04"]
+ubuntu18_04_ami = os_amis["ubuntu18_04"]
-def compute_keyfile_path(key_name: Optional[str] = None) -> tuple[str, str]:
+def compute_keyfile_path(key_name: Optional[str] = None) -> Tuple[str, str]:
     if key_name is None:
         key_name = os.getenv("AWS_KEY_NAME")
         if key_name is None:
@@ -57,7 +57,7 @@ def ec2_instances_by_id(instance_id):
 def start_instance(
-    key_name, ami=ubuntu20_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50
+    key_name, ami=ubuntu18_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50
 ):
     inst = ec2.create_instances(
         ImageId=ami,
@@ -96,7 +96,7 @@ class RemoteHost:
         self.keyfile_path = keyfile_path
         self.login_name = login_name
-    def _gen_ssh_prefix(self) -> list[str]:
+    def _gen_ssh_prefix(self) -> List[str]:
         return [
             "ssh",
             "-o",
@@ -108,13 +108,13 @@ class RemoteHost:
         ]
     @staticmethod
-    def _split_cmd(args: Union[str, list[str]]) -> list[str]:
+    def _split_cmd(args: Union[str, List[str]]) -> List[str]:
         return args.split() if isinstance(args, str) else args
-    def run_ssh_cmd(self, args: Union[str, list[str]]) -> None:
+    def run_ssh_cmd(self, args: Union[str, List[str]]) -> None:
         subprocess.check_call(self._gen_ssh_prefix() + self._split_cmd(args))
-    def check_ssh_output(self, args: Union[str, list[str]]) -> str:
+    def check_ssh_output(self, args: Union[str, List[str]]) -> str:
         return subprocess.check_output(
             self._gen_ssh_prefix() + self._split_cmd(args)
         ).decode("utf-8")
@@ -157,7 +157,7 @@ class RemoteHost:
     def using_docker(self) -> bool:
         return self.container_id is not None
-    def run_cmd(self, args: Union[str, list[str]]) -> None:
+    def run_cmd(self, args: Union[str, List[str]]) -> None:
         if not self.using_docker():
             return self.run_ssh_cmd(args)
         assert self.container_id is not None
@@ -178,7 +178,7 @@ class RemoteHost:
         if rc != 0:
             raise subprocess.CalledProcessError(rc, docker_cmd)
-    def check_output(self, args: Union[str, list[str]]) -> str:
+    def check_output(self, args: Union[str, List[str]]) -> str:
         if not self.using_docker():
             return self.check_ssh_output(args)
         assert self.container_id is not None
@@ -230,7 +230,7 @@ class RemoteHost:
         )
         self.download_file(remote_file, local_file)
-    def list_dir(self, path: str) -> list[str]:
+    def list_dir(self, path: str) -> List[str]:
         return self.check_output(["ls", "-1", path]).split("\n")
@@ -327,7 +327,7 @@ def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None
         ]
     )
     host.run_cmd(
-        f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}"
+        f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v24.09 {git_clone_flags}"
     )
     host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")
@@ -358,7 +358,7 @@ def checkout_repo(
     branch: str = "main",
     url: str,
     git_clone_flags: str,
-    mapping: dict[str, tuple[str, str]],
+    mapping: Dict[str, Tuple[str, str]],
 ) -> Optional[str]:
     for prefix in mapping:
         if not branch.startswith(prefix):
@@ -657,6 +657,18 @@ def configure_system(
         "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
     )
     host.run_cmd("pip3 install dataclasses typing-extensions")
+    # Install and switch to gcc-8 on Ubuntu-18.04
+    if not host.using_docker() and host.ami == ubuntu18_04_ami and compiler == "gcc-8":
+        host.run_cmd("sudo apt-get install -y g++-8 gfortran-8")
+        host.run_cmd(
+            "sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 100"
+        )
+        host.run_cmd(
+            "sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 100"
+        )
+        host.run_cmd(
+            "sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-8 100"
+        )
     if not use_conda:
         print("Installing Cython + numpy from PyPy")
         host.run_cmd("sudo pip3 install Cython")
@@ -669,7 +681,7 @@ def build_domains(
     branch: str = "main",
     use_conda: bool = True,
     git_clone_flags: str = "",
-) -> tuple[str, str, str, str]:
+) -> Tuple[str, str, str, str]:
     vision_wheel_name = build_torchvision(
         host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
     )
@@ -696,7 +708,7 @@ def start_build(
     pytorch_build_number: Optional[str] = None,
     shallow_clone: bool = True,
     enable_mkldnn: bool = False,
-) -> tuple[str, str, str, str, str]:
+) -> Tuple[str, str, str, str, str]:
     git_clone_flags = " --depth 1 --shallow-submodules" if shallow_clone else ""
     if host.using_docker() and not use_conda:
         print("Auto-selecting conda option for docker images")
@@ -747,7 +759,7 @@ def start_build(
         version = host.check_output("cat pytorch/version.txt").strip()[:-2]
         build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
     if branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
+        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
     if host.using_docker():
         build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
     if enable_mkldnn:
@@ -920,9 +932,9 @@ def parse_arguments():
     parser.add_argument("--debug", action="store_true")
     parser.add_argument("--build-only", action="store_true")
    parser.add_argument("--test-only", type=str)
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument("--os", type=str, choices=list(os_amis.keys()))
-    group.add_argument("--ami", type=str)
+    parser.add_argument(
+        "--os", type=str, choices=list(os_amis.keys()), default="ubuntu20_04"
+    )
     parser.add_argument(
         "--python-version",
         type=str,
@@ -952,13 +964,7 @@ def parse_arguments():
 if __name__ == "__main__":
     args = parse_arguments()
-    ami = (
-        args.ami
-        if args.ami is not None
-        else os_amis[args.os]
-        if args.os is not None
-        else ubuntu20_04_ami
-    )
+    ami = os_amis[args.os]
     keyfile_path, key_name = compute_keyfile_path(args.key_name)
     if args.list_instances:
@@ -1012,7 +1018,7 @@ if __name__ == "__main__":
         install_condaforge_python(host, args.python_version)
         sys.exit(0)
-    python_version = args.python_version if args.python_version is not None else "3.9"
+    python_version = args.python_version if args.python_version is not None else "3.8"
     if args.use_torch_from_pypi:
         configure_system(host, compiler=args.compiler, python_version=python_version)
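The argparse change above is easiest to read in isolation: the left-hand version makes `--os` and `--ami` mutually exclusive and resolves the AMI with a fallback, instead of defaulting `--os` to `ubuntu20_04`. Here is a small standalone sketch, with the AMI IDs copied from the `os_amis` dictionary above; it is illustrative and not part of the script.

```python
import argparse

os_amis = {
    "ubuntu20_04": "ami-052eac90edaa9d08f",
    "ubuntu22_04": "ami-0c6c29c5125214c77",
    "redhat8": "ami-0698b90665a2ddcf1",
}

parser = argparse.ArgumentParser()
# --os and --ami cannot both be given; neither is required.
group = parser.add_mutually_exclusive_group()
group.add_argument("--os", type=str, choices=list(os_amis.keys()))
group.add_argument("--ami", type=str)
args = parser.parse_args()

# Explicit AMI wins, then the named OS, then the ubuntu20_04 default.
ami = (
    args.ami
    if args.ami is not None
    else os_amis[args.os]
    if args.os is not None
    else os_amis["ubuntu20_04"]
)
print(ami)
```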

View File

@@ -10,3 +10,5 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
 built on Jenkins and are used in triggered builds already have this
 environment variable set in their manifest. Also see
 `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
+
+Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.

View File

@@ -5,7 +5,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
 if [[ ${BUILD_ENVIRONMENT} == *onnx* ]]; then
   pip install click mock tabulate networkx==2.0
-  pip -q install "file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx"
+  pip -q install --user "file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx"
 fi
 # Skip tests in environments where they are not built/applicable
@@ -13,6 +13,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
   echo 'Skipping tests'
   exit 0
 fi
+if [[ "${BUILD_ENVIRONMENT}" == *-rocm* ]]; then
+  # temporary to locate some kernel issues on the CI nodes
+  export HSAKMT_DEBUG_LEVEL=4
+fi
 # These additional packages are needed for circleci ROCm builds.
 if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
   # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by
@@ -147,8 +151,8 @@ export DNNL_MAX_CPU_ISA=AVX2
 if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then
   # TODO(sdym@meta.com) remove this when the linked issue resolved.
   # py is temporary until https://github.com/Teemu/pytest-sugar/issues/241 is fixed
-  pip install py==1.11.0
-  pip install pytest-sugar
+  pip install --user py==1.11.0
+  pip install --user pytest-sugar
   # NB: Warnings are disabled because they make it harder to see what
   # the actual erroring test is
   "$PYTHON" \

View File

@@ -34,5 +34,5 @@ See `build.sh` for valid build environments (it's the giant switch).
 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
 # Set flags (see build.sh) and build image
-sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
+sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
 ```

View File

@@ -1,7 +1,6 @@
-ARG CUDA_VERSION=12.6
+ARG CUDA_VERSION=12.4
 ARG BASE_TARGET=cuda${CUDA_VERSION}
-ARG ROCM_IMAGE=rocm/dev-almalinux-8:6.3-complete
-FROM amd64/almalinux:8.10-20250519 as base
+FROM amd64/almalinux:8 as base
 ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
@@ -9,10 +8,12 @@ ENV LANGUAGE en_US.UTF-8
 ARG DEVTOOLSET_VERSION=11
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
 RUN yum -y update
 RUN yum -y install epel-release
-# install glibc-langpack-en make sure en_US.UTF-8 locale is available
-RUN yum -y install glibc-langpack-en
 RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
 # Just add everything as a safe.directory for git since these will be used in multiple places with git
 RUN git config --global --add safe.directory '*'
@@ -40,36 +41,31 @@ RUN bash ./install_conda.sh && rm install_conda.sh
 # Install CUDA
 FROM base as cuda
-ARG CUDA_VERSION=12.6
+ARG CUDA_VERSION=12.4
 RUN rm -rf /usr/local/cuda-*
 ADD ./common/install_cuda.sh install_cuda.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
 ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
 # Preserve CUDA_VERSION for the builds
 ENV CUDA_VERSION=${CUDA_VERSION}
 # Make things in our path by default
 ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH
+FROM cuda as cuda11.8
+RUN bash ./install_cuda.sh 11.8
+ENV DESIRED_CUDA=11.8
+FROM cuda as cuda12.1
+RUN bash ./install_cuda.sh 12.1
+ENV DESIRED_CUDA=12.1
+FROM cuda as cuda12.4
+RUN bash ./install_cuda.sh 12.4
+ENV DESIRED_CUDA=12.4
 FROM cuda as cuda12.6
 RUN bash ./install_cuda.sh 12.6
 ENV DESIRED_CUDA=12.6
-FROM cuda as cuda12.8
-RUN bash ./install_cuda.sh 12.8
-ENV DESIRED_CUDA=12.8
-FROM cuda as cuda12.9
-RUN bash ./install_cuda.sh 12.9
-ENV DESIRED_CUDA=12.9
-FROM ${ROCM_IMAGE} as rocm
-ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-ADD ./common/install_mkl.sh install_mkl.sh
-RUN bash ./install_mkl.sh && rm install_mkl.sh
-ENV MKLROOT /opt/intel
 # Install MNIST test data
 FROM base as mnist
 ADD ./common/install_mnist.sh install_mnist.sh
@@ -77,9 +73,9 @@ RUN bash ./install_mnist.sh
 FROM base as all_cuda
 COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
+COPY --from=cuda12.1 /usr/local/cuda-12.1 /usr/local/cuda-12.1
+COPY --from=cuda12.4 /usr/local/cuda-12.4 /usr/local/cuda-12.4
 COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6
-COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8
-COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9
 # Final step
 FROM ${BASE_TARGET} as final

View File

@@ -1,70 +1,82 @@
 #!/usr/bin/env bash
 # Script used only in CD pipeline
-set -exou pipefail
+set -eou pipefail
 image="$1"
 shift
 if [ -z "${image}" ]; then
-  echo "Usage: $0 IMAGENAME:ARCHTAG"
+  echo "Usage: $0 IMAGE"
   exit 1
 fi
-# Go from imagename:tag to tag
-DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
-CUDA_VERSION=""
-ROCM_VERSION=""
-EXTRA_BUILD_ARGS=""
-if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
-  # extract cuda version from image name and tag. e.g. manylinux2_28-builder:cuda12.8 returns 12.8
-  CUDA_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
-  EXTRA_BUILD_ARGS="--build-arg CUDA_VERSION=${CUDA_VERSION}"
-elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
-  # extract rocm version from image name and tag. e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
-  ROCM_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
-  EXTRA_BUILD_ARGS="--build-arg ROCM_IMAGE=rocm/dev-almalinux-8:${ROCM_VERSION}-complete"
-fi
-case ${DOCKER_TAG_PREFIX} in
-  cpu)
-    BASE_TARGET=base
-    ;;
-  cuda*)
-    BASE_TARGET=cuda${CUDA_VERSION}
-    ;;
-  rocm*)
-    BASE_TARGET=rocm
-    ;;
-  *)
-    echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"
-    exit 1
-    ;;
-esac
-# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
-# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
-sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
-sudo systemctl daemon-reload
-sudo systemctl restart docker
+DOCKER_IMAGE_NAME="pytorch/${image}"
 export DOCKER_BUILDKIT=1
 TOPDIR=$(git rev-parse --show-toplevel)
-tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
-docker build \
-  --target final \
-  --progress plain \
-  --build-arg "BASE_TARGET=${BASE_TARGET}" \
-  --build-arg "DEVTOOLSET_VERSION=11" \
-  ${EXTRA_BUILD_ARGS} \
-  -t ${tmp_tag} \
-  $@ \
-  -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
-  ${TOPDIR}/.ci/docker/
-if [ -n "${CUDA_VERSION}" ]; then
+CUDA_VERSION=${CUDA_VERSION:-12.1}
+case ${CUDA_VERSION} in
+  cpu)
+    BASE_TARGET=base
+    DOCKER_TAG=cpu
+    ;;
+  all)
+    BASE_TARGET=all_cuda
+    DOCKER_TAG=latest
+    ;;
+  *)
+    BASE_TARGET=cuda${CUDA_VERSION}
+    DOCKER_TAG=cuda${CUDA_VERSION}
+    ;;
+esac
+(
+  set -x
+  # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
+  # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
+  sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
+  sudo systemctl daemon-reload
+  sudo systemctl restart docker
+  docker build \
+    --target final \
+    --progress plain \
+    --build-arg "BASE_TARGET=${BASE_TARGET}" \
+    --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
+    --build-arg "DEVTOOLSET_VERSION=11" \
+    -t ${DOCKER_IMAGE_NAME} \
+    $@ \
+    -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
+    ${TOPDIR}/.ci/docker/
+)
+if [[ "${DOCKER_TAG}" =~ ^cuda* ]]; then
   # Test that we're using the right CUDA compiler
-  docker run --rm "${tmp_tag}" nvcc --version | grep "cuda_${CUDA_VERSION}"
+  (
+    set -x
+    docker run --rm "${DOCKER_IMAGE_NAME}" nvcc --version | grep "cuda_${CUDA_VERSION}"
+  )
+fi
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE_NAME}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE_NAME}-${GIT_COMMIT_SHA}
+if [[ "${WITH_PUSH:-}" == true ]]; then
+  (
+    set -x
+    docker push "${DOCKER_IMAGE_NAME}"
+    if [[ -n ${GITHUB_REF} ]]; then
+      docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_BRANCH_TAG}
+      docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_SHA_TAG}
+      docker push "${DOCKER_IMAGE_BRANCH_TAG}"
+      docker push "${DOCKER_IMAGE_SHA_TAG}"
+    fi
+  )
 fi
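The left-hand rewrite above derives all build parameters from the image tag rather than from a `CUDA_VERSION` environment variable. A small sketch of that tag parsing follows, with a hypothetical helper name; the `rocm/dev-almalinux-8:<ver>-complete` base-image pattern comes from the script above.

```python
# Illustrative sketch: "something-builder:cuda12.8" -> CUDA_VERSION=12.8,
# and rocm tags map onto a rocm/dev-almalinux-8 base image.
def build_args_from_tag(image: str) -> list[str]:
    tag = image.split(":", 1)[1]
    if tag.startswith("cuda"):
        return ["--build-arg", f"CUDA_VERSION={tag[len('cuda'):]}"]
    if tag.startswith("rocm"):
        rocm = tag[len("rocm"):]
        return ["--build-arg", f"ROCM_IMAGE=rocm/dev-almalinux-8:{rocm}-complete"]
    return []

assert build_args_from_tag("manylinux2_28-builder:cuda12.8") == [
    "--build-arg", "CUDA_VERSION=12.8"
]
```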

View File

@@ -0,0 +1,5 @@
+0.8b
+manylinux_2_28
+rocm6.2
+6f8cbcac8a92775291bb1ba8f514d4beb350baf4
+e938def5d32869fe2e00aec0300f354c9f157867bebdf2e104d732b94cb238d8

View File

@@ -1,8 +1,4 @@
 #!/bin/bash
-# The purpose of this script is to:
-# 1. Extract the set of parameters to be used for a docker build based on the provided image name.
-# 2. Run docker build with the parameters found in step 1.
-# 3. Run the built image and print out the expected and actual versions of packages installed.
 set -ex
@@ -50,23 +46,30 @@ if [[ "$image" == *xla* ]]; then
   exit 0
 fi
-if [[ "$image" == *-jammy* ]]; then
+if [[ "$image" == *-focal* ]]; then
+  UBUNTU_VERSION=20.04
+elif [[ "$image" == *-jammy* ]]; then
   UBUNTU_VERSION=22.04
-elif [[ "$image" == *-noble* ]]; then
-  UBUNTU_VERSION=24.04
 elif [[ "$image" == *ubuntu* ]]; then
   extract_version_from_image_name ubuntu UBUNTU_VERSION
+elif [[ "$image" == *centos* ]]; then
+  extract_version_from_image_name centos CENTOS_VERSION
 fi
 if [ -n "${UBUNTU_VERSION}" ]; then
   OS="ubuntu"
+elif [ -n "${CENTOS_VERSION}" ]; then
+  OS="centos"
 else
   echo "Unable to derive operating system base..."
   exit 1
 fi
 DOCKERFILE="${OS}/Dockerfile"
-if [[ "$image" == *rocm* ]]; then
+# When using ubuntu - 22.04, start from Ubuntu docker image, instead of nvidia/cuda docker image.
+if [[ "$image" == *cuda* && "$UBUNTU_VERSION" != "22.04" ]]; then
+  DOCKERFILE="${OS}-cuda/Dockerfile"
+elif [[ "$image" == *rocm* ]]; then
   DOCKERFILE="${OS}-rocm/Dockerfile"
 elif [[ "$image" == *xpu* ]]; then
   DOCKERFILE="${OS}-xpu/Dockerfile"
@@ -78,82 +81,427 @@ elif [[ "$image" == *linter* ]]; then
   DOCKERFILE="linter/Dockerfile"
 fi
PY_HARDCODED_CONFIG_SCRIPT=$(python3 get_config.py --image "$image") # CMake 3.18 is needed to support CUDA17 language variant
CMAKE_VERSION=3.18.5
if [[ $? -eq 0 ]]; then _UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
eval "$PY_HARDCODED_CONFIG_SCRIPT" _UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
else
echo "[Fallback] Python script failed or no match — fallback to hardcoded shell case" # It's annoying to rename jobs every time you want to rewrite a
# Catch-all for builds that are not hardcoded. # configuration, so we hardcode everything here rather than do it
VISION=yes # from scratch
echo "image '$image' did not match an existing build configuration" case "$image" in
if [[ "$image" == *py* ]]; then pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
extract_version_from_image_name py ANACONDA_PYTHON_VERSION CUDA_VERSION=12.4.1
fi CUDNN_VERSION=9
if [[ "$image" == *cuda* ]]; then ANACONDA_PYTHON_VERSION=3.10
extract_version_from_image_name cuda CUDA_VERSION GCC_VERSION=9
extract_version_from_image_name cudnn CUDNN_VERSION PROTOBUF=yes
fi DB=yes
if [[ "$image" == *rocm* ]]; then VISION=yes
extract_version_from_image_name rocm ROCM_VERSION KATEX=yes
NINJA_VERSION=1.9.0 UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes TRITON=yes
# To ensure that any ROCm config will build using conda cmake ;;
# and thus have LAPACK/MKL enabled pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
CUDA_VERSION=11.8.0
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-py3-clang10-onnx)
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=10
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
ONNX=yes
;;
pytorch-linux-focal-py3.9-clang10)
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=10
PROTOBUF=yes
DB=yes
VISION=yes
VULKAN_SDK_VERSION=1.2.162.1
SWIFTSHADER=yes
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-py3.11-clang10)
ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=10
PROTOBUF=yes
DB=yes
VISION=yes
VULKAN_SDK_VERSION=1.2.162.1
SWIFTSHADER=yes
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-py3.9-gcc9)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-rocm-n-1-py3)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.1
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-rocm-n-py3)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.2.4
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-jammy-xpu-2024.0-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
XPU_VERSION=0.5
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-jammy-xpu-2025.0-py3)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
XPU_VERSION=2025.0
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
CONDA_CMAKE=yes
TRITON=yes
DOCS=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-clang12)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=11.8
CUDNN_VERSION=9
CLANG_VERSION=12
PROTOBUF=yes
DB=yes
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3-clang12-asan)
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-jammy-py3-clang15-asan)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=15
CONDA_CMAKE=yes
VISION=yes
;;
pytorch-linux-jammy-py3-clang18-asan)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=18
CONDA_CMAKE=yes
VISION=yes
;;
pytorch-linux-jammy-py3.9-gcc11)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
CONDA_CMAKE=yes
TRITON=yes
DOCS=yes
UNINSTALL_DILL=yes
;;
pytorch-linux-jammy-py3-clang12-executorch)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=12
CONDA_CMAKE=yes
EXECUTORCH=yes
;;
pytorch-linux-jammy-py3.12-halide)
CUDA_VERSION=12.4
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
CONDA_CMAKE=yes
HALIDE=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.12-triton-cpu)
CUDA_VERSION=12.4
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
CONDA_CMAKE=yes
TRITON_CPU=yes
;;
pytorch-linux-focal-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
# would be to upgrade mypy to 1.0.0 with Python 3.11
ANACONDA_PYTHON_VERSION=3.9
CONDA_CMAKE=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=11.8
CONDA_CMAKE=yes
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
ACL=yes
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
ACL=yes
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
INDUCTOR_BENCHMARKS=yes
;;
*)
# Catch-all for builds that are not hardcoded.
PROTOBUF=yes
DB=yes
VISION=yes
echo "image '$image' did not match an existing build configuration"
if [[ "$image" == *py* ]]; then
extract_version_from_image_name py ANACONDA_PYTHON_VERSION
fi fi
if [[ "$image" == *centos7* ]]; then if [[ "$image" == *cuda* ]]; then
NINJA_VERSION=1.10.2 extract_version_from_image_name cuda CUDA_VERSION
fi extract_version_from_image_name cudnn CUDNN_VERSION
if [[ "$image" == *gcc* ]]; then fi
extract_version_from_image_name gcc GCC_VERSION if [[ "$image" == *rocm* ]]; then
fi extract_version_from_image_name rocm ROCM_VERSION
if [[ "$image" == *clang* ]]; then NINJA_VERSION=1.9.0
extract_version_from_image_name clang CLANG_VERSION TRITON=yes
fi # To ensure that any ROCm config will build using conda cmake
if [[ "$image" == *devtoolset* ]]; then # and thus have LAPACK/MKL enabled
extract_version_from_image_name devtoolset DEVTOOLSET_VERSION CONDA_CMAKE=yes
fi fi
if [[ "$image" == *glibc* ]]; then if [[ "$image" == *centos7* ]]; then
extract_version_from_image_name glibc GLIBC_VERSION NINJA_VERSION=1.10.2
fi fi
;; if [[ "$image" == *gcc* ]]; then
extract_version_from_image_name gcc GCC_VERSION
fi
if [[ "$image" == *clang* ]]; then
extract_version_from_image_name clang CLANG_VERSION
fi
if [[ "$image" == *devtoolset* ]]; then
extract_version_from_image_name devtoolset DEVTOOLSET_VERSION
fi
if [[ "$image" == *glibc* ]]; then
extract_version_from_image_name glibc GLIBC_VERSION
fi
if [[ "$image" == *cmake* ]]; then
extract_version_from_image_name cmake CMAKE_VERSION
fi
;;
esac esac
 tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
-no_cache_flag=""
-progress_flag=""
-# Do not use cache and progress=plain when in CI
-if [[ -n "${CI:-}" ]]; then
-  no_cache_flag="--no-cache"
-  progress_flag="--progress=plain"
+#when using cudnn version 8 install it separately from cuda
+if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
+  IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
+  if [[ ${CUDNN_VERSION} == 9 ]]; then
+    IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
+  fi
 fi
 # Build image
 docker build \
-  ${no_cache_flag} \
-  ${progress_flag} \
+  --no-cache \
+  --progress=plain \
   --build-arg "BUILD_ENVIRONMENT=${image}" \
+  --build-arg "PROTOBUF=${PROTOBUF:-}" \
   --build-arg "LLVMDEV=${LLVMDEV:-}" \
+  --build-arg "DB=${DB:-}" \
   --build-arg "VISION=${VISION:-}" \
   --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \
+  --build-arg "CENTOS_VERSION=${CENTOS_VERSION}" \
   --build-arg "DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" \
   --build-arg "GLIBC_VERSION=${GLIBC_VERSION}" \
   --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
   --build-arg "ANACONDA_PYTHON_VERSION=${ANACONDA_PYTHON_VERSION}" \
+  --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
   --build-arg "GCC_VERSION=${GCC_VERSION}" \
   --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
   --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
   --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
   --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
+  --build-arg "VULKAN_SDK_VERSION=${VULKAN_SDK_VERSION}" \
+  --build-arg "SWIFTSHADER=${SWIFTSHADER}" \
+  --build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \
   --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
   --build-arg "KATEX=${KATEX:-}" \
   --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
-  --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
+  --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a}" \
   --build-arg "IMAGE_NAME=${IMAGE_NAME}" \
   --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
   --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
+  --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
   --build-arg "TRITON=${TRITON}" \
   --build-arg "TRITON_CPU=${TRITON_CPU}" \
   --build-arg "ONNX=${ONNX}" \
@@ -162,9 +510,7 @@ docker build \
   --build-arg "EXECUTORCH=${EXECUTORCH}" \
   --build-arg "HALIDE=${HALIDE}" \
   --build-arg "XPU_VERSION=${XPU_VERSION}" \
-  --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
   --build-arg "ACL=${ACL:-}" \
-  --build-arg "OPENBLAS=${OPENBLAS:-}" \
   --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
   --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
   -f $(dirname ${DOCKERFILE})/Dockerfile \
@@ -181,7 +527,7 @@ docker build \
 UBUNTU_VERSION=$(echo ${UBUNTU_VERSION} | sed 's/-rc$//')
 function drun() {
-  docker run --rm "$tmp_tag" "$@"
+  docker run --rm "$tmp_tag" $*
 }
 if [[ "$OS" == "ubuntu" ]]; then
@@ -229,23 +575,3 @@ if [ -n "$KATEX" ]; then
     exit 1
   fi
 fi
-HAS_TRITON=$(drun python -c "import triton" > /dev/null 2>&1 && echo "yes" || echo "no")
-if [[ -n "$TRITON" || -n "$TRITON_CPU" ]]; then
-  if [ "$HAS_TRITON" = "no" ]; then
-    echo "expecting triton to be installed, but it is not"
-    exit 1
-  fi
-elif [ "$HAS_TRITON" = "yes" ]; then
-  echo "expecting triton to not be installed, but it is"
-  exit 1
-fi
-# Sanity check cmake version. Executorch reinstalls cmake and I'm not sure if
-# they support 4.0.0 yet, so exclude them from this check.
-CMAKE_VERSION=$(drun cmake --version)
-if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then
-  echo "CMake version is not 4.0.0:"
-  drun cmake --version
-  exit 1
-fi
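The removed block above probes the freshly built image (is Triton importable, is CMake 4.x present) by running commands inside it via the `drun` wrapper. A hedged Python equivalent of the Triton probe, using plain `docker run`, would look roughly like this; it is a sketch, not the script's actual mechanism.

```python
import subprocess

def image_has_triton(tag: str) -> bool:
    """Return True if `import triton` succeeds inside the given image."""
    result = subprocess.run(
        ["docker", "run", "--rm", tag, "python", "-c", "import triton"],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    return result.returncode == 0
```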

View File

@@ -17,8 +17,9 @@ RUN bash ./install_base.sh && rm install_base.sh
 # Update CentOS git version
 RUN yum -y remove git
 RUN yum -y remove git-*
-RUN yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm && \
-    sed -i 's/packages.endpoint/packages.endpointdev/' /etc/yum.repos.d/endpoint.repo
+RUN yum -y install https://packages.endpoint.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm || \
+    (yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm && \
+    sed -i "s/packages.endpoint/packages.endpointdev/" /etc/yum.repos.d/endpoint.repo)
 RUN yum install -y git
 # Install devtoolset
@@ -39,7 +40,7 @@ RUN bash ./install_user.sh && rm install_user.sh
 # Install conda and other packages (e.g., numpy, pytest)
 ARG ANACONDA_PYTHON_VERSION
-ARG BUILD_ENVIRONMENT
+ARG CONDA_CMAKE
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
@@ -47,6 +48,20 @@ COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
 RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
+# (optional) Install protobuf for ONNX
+ARG PROTOBUF
+COPY ./common/install_protobuf.sh install_protobuf.sh
+RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
+RUN rm install_protobuf.sh
+ENV INSTALLED_PROTOBUF ${PROTOBUF}
+# (optional) Install database packages like LMDB and LevelDB
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@@ -60,7 +75,7 @@ COPY ./common/install_rocm.sh install_rocm.sh
 RUN bash ./install_rocm.sh
 RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
-RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
+RUN bash ./install_rocm_magma.sh
 RUN rm install_rocm_magma.sh
 COPY ./common/install_amdsmi.sh install_amdsmi.sh
 RUN bash ./install_amdsmi.sh
@@ -74,6 +89,12 @@ ENV MAGMA_HOME /opt/rocm/magma
 ENV LANG en_US.utf8
 ENV LC_ALL en_US.utf8
+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
 COPY ./common/install_ninja.sh install_ninja.sh
@@ -92,6 +113,13 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
+# Install AOTriton (Early fail)
+COPY ./aotriton_version.txt aotriton_version.txt
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./common/install_aotriton.sh install_aotriton.sh
+RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
+ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH

View File

@@ -1 +1 @@
-56392aa978594cc155fa8af48cd949f5b5f1823a
+6f638937d64e3396793956d75ee3e14802022745

View File

@@ -1 +0,0 @@
-v2.21.5-1

View File

@@ -1 +0,0 @@
-v2.27.5-1

View File

@@ -1 +1 @@
-5d535d7a2d4b435b1b5c1177fd8f04a12b942b9a
+ac3470188b914c5d7a5058a7e28b9eb685a62427

View File

@@ -1 +0,0 @@
-e03a63be43e33596f7f0a43b0f530353785e4a59

View File

@@ -1 +1 @@
-ae324eeac8e102a2b40370e341460f3791353398
+e98b6fcb8df5b44eb0d0addb6767c573d37ba024

View File

@@ -1 +1 @@
-11ec6354315768a85da41032535e3b7b99c5f706
+0d4682f073ded4d1a8260dd4208a43d735ae3a2b

View File

@@ -23,10 +23,6 @@ conda_install() {
   as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
 }
-conda_install_through_forge() {
-  as_jenkins conda install -c conda-forge -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
-}
 conda_run() {
   as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $*
 }

View File

@@ -1,7 +1,7 @@
 set -euo pipefail
-readonly version=v25.02
-readonly src_host=https://github.com/ARM-software
+readonly version=v24.04
+readonly src_host=https://review.mlplatform.org/ml
 readonly src_repo=ComputeLibrary
 # Clone ACL

View File

@ -0,0 +1,23 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
TARBALL='aotriton.tar.gz'
# This read command alwasy returns with exit code 1
read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
ARCH=$(uname -m)
AOTRITON_INSTALL_PREFIX="$1"
AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.gz"
cd "${AOTRITON_INSTALL_PREFIX}"
# Must use -L to follow redirects
curl -L --retry 3 -o "${TARBALL}" "${AOTRITON_URL}"
ACTUAL_SHA256=$(sha256sum "${TARBALL}" | cut -d " " -f 1)
if [ "${SHA256}" != "${ACTUAL_SHA256}" ]; then
echo -n "Error: The SHA256 of downloaded tarball is ${ACTUAL_SHA256},"
echo " which does not match the expected value ${SHA256}."
exit 1
fi
tar xf "${TARBALL}" && rm -rf "${TARBALL}"

View File

@ -15,9 +15,6 @@ install_ubuntu() {
elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
cmake3="cmake=3.22*" cmake3="cmake=3.22*"
maybe_libiomp_dev="" maybe_libiomp_dev=""
elif [[ "$UBUNTU_VERSION" == "24.04"* ]]; then
cmake3="cmake=3.28*"
maybe_libiomp_dev=""
else
cmake3="cmake=3.5*"
maybe_libiomp_dev="libiomp-dev"
@ -33,6 +30,14 @@ install_ubuntu() {
maybe_libomp_dev=""
fi
# HACK: UCC testing relies on libnccl library from NVIDIA repo, and version 2.16 crashes
# See https://github.com/pytorch/pytorch/pull/105260#issuecomment-1673399729
if [[ "$UBUNTU_VERSION" == "20.04"* && "$CUDA_VERSION" == "11.8"* ]]; then
maybe_libnccl_dev="libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8 --allow-downgrades --allow-change-held-packages"
else
maybe_libnccl_dev=""
fi
# Install common dependencies
apt-get update
# TODO: Some of these may not be necessary
@ -61,6 +66,7 @@ install_ubuntu() {
libasound2-dev \
libsndfile-dev \
${maybe_libomp_dev} \
${maybe_libnccl_dev} \
software-properties-common \
wget \
sudo \
@ -89,6 +95,9 @@ install_centos() {
ccache_deps="asciidoc docbook-dtds docbook-style-xsl libxslt"
numpy_deps="gcc-gfortran"
# Note: protobuf-c-{compiler,devel} on CentOS are too old to be used
# for Caffe2. That said, we still install them to make sure the build
# system opts to build/use protoc and libprotobuf from third-party.
yum install -y \
$ccache_deps \
$numpy_deps \

View File

@ -9,7 +9,7 @@ install_ubuntu() {
# Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh``
apt-get install -y cargo
echo "Checking out sccache repo"
git clone https://github.com/mozilla/sccache -b v0.10.0
git clone https://github.com/mozilla/sccache -b v0.9.0
cd sccache
echo "Building sccache"
cargo build --release

View File

@ -4,10 +4,16 @@ set -ex
if [ -n "$CLANG_VERSION" ]; then if [ -n "$CLANG_VERSION" ]; then
if [[ $UBUNTU_VERSION == 22.04 ]]; then if [[ $CLANG_VERSION == 9 && $UBUNTU_VERSION == 18.04 ]]; then
sudo apt-get update
# gpg-agent is not available by default on 18.04
sudo apt-get install -y --no-install-recommends gpg-agent
wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
apt-add-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-${CLANG_VERSION} main"
elif [[ $UBUNTU_VERSION == 22.04 ]]; then
# work around ubuntu apt-get conflicts
sudo apt-get -y -f install
wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
if [[ $CLANG_VERSION == 18 ]]; then
apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main"
fi
@ -35,7 +41,7 @@ if [ -n "$CLANG_VERSION" ]; then
# clang's packaging is a little messed up (the runtime libs aren't
# added into the linker path), so give it a little help
clang_lib=("/usr/lib/llvm-$CLANG_VERSION/lib/clang/"*"/lib/linux")
echo "$clang_lib" > /etc/ld.so.conf.d/clang.conf
ldconfig
# Cleanup package manager # Cleanup package manager

View File

@ -0,0 +1,31 @@
#!/bin/bash
set -ex
[ -n "$CMAKE_VERSION" ]
# Remove system cmake install so it won't get used instead
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
ubuntu)
apt-get remove cmake -y
;;
centos)
yum remove cmake -y
;;
*)
echo "Unable to determine OS..."
exit 1
;;
esac
# Turn 3.6.3 into v3.6
path=$(echo "${CMAKE_VERSION}" | sed -e 's/\([0-9].[0-9]\+\).*/v\1/')
file="cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz"
# Download and install specific CMake version in /usr/local
pushd /tmp
curl -Os --retry 3 "https://cmake.org/files/${path}/${file}"
tar -C /usr/local --strip-components 1 --no-same-owner -zxf cmake-*.tar.gz
rm -f cmake-*.tar.gz
popd
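Note: the sed above keeps only the major.minor part of CMAKE_VERSION to build the download directory; a quick illustration with an assumed version:
echo "3.18.5" | sed -e 's/\([0-9].[0-9]\+\).*/v\1/'   # prints v3.18
# so the script would fetch https://cmake.org/files/v3.18/cmake-3.18.5-Linux-x86_64.tar.gz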

View File

@ -4,8 +4,12 @@ set -ex
# Optionally install conda
if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download"  # @lint-ignore
BASE_URL="https://repo.anaconda.com/miniconda"
CONDA_FILE="Miniforge3-Linux-$(uname -m).sh"
CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh"
if [[ $(uname -m) == "aarch64" ]] || [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download"
CONDA_FILE="Miniforge3-Linux-$(uname -m).sh"
fi
MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1)
MINOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 2)
@ -17,6 +21,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
exit 1
;;
esac
mkdir -p /opt/conda
chown jenkins:jenkins /opt/conda
@ -57,33 +62,32 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
# libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
# which is provided in libstdcxx 12 and up.
conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge
conda_install libstdcxx-ng=12.3.0 -c conda-forge
# Miniforge installer doesn't install sqlite by default
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
conda_install sqlite
fi
# Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
if [[ $(uname -m) != "aarch64" ]]; then
if [[ $(uname -m) == "aarch64" ]]; then
pip_install mkl==2024.2.0 conda_install "openblas==0.3.28=*openmp*"
pip_install mkl-static==2024.2.0 else
pip_install mkl-include==2024.2.0 conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
fi
# Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
# and libpython-static for torch deploy
conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}"
# Use conda cmake in some cases. Conda cmake will be newer than our supported
# min version (3.5 for xenial and 3.10 for bionic), so we only do it in those
# following builds that we know should use conda. Specifically, Ubuntu bionic
# and focal cannot find conda mkl with stock cmake, so we need a cmake from conda
if [ -n "${CONDA_CMAKE}" ]; then
conda_install cmake
fi
# Magma package names are concatenation of CUDA major and minor ignoring revision
# I.e. magma-cuda102 package corresponds to CUDA_VERSION=10.2 and CUDA_VERSION=10.2.89
# Magma is installed from a tarball in the ossci-linux bucket into the conda env # Magma is installed from a tarball in the ossci-linux bucket into the conda env
if [ -n "$CUDA_VERSION" ]; then if [ -n "$CUDA_VERSION" ]; then
conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) ${ANACONDA_PYTHON_VERSION}
fi
if [[ "$UBUNTU_VERSION" == "24.04"* ]] ; then
conda_install_through_forge libstdcxx-ng=14
fi
# Install some other packages, including those needed for Python test reporting

View File

@ -3,10 +3,11 @@
set -uex -o pipefail
PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python
PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads
GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py
# Python versions to be installed in /opt/$VERSION_NO
CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t 3.14.0 3.14.0t"}
CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"}
function check_var {
if [ -z "$1" ]; then
@ -23,8 +24,9 @@ function do_cpython_build {
tar -xzf Python-$py_ver.tgz
local additional_flags=""
if [[ "$py_ver" == *"t" ]]; then
if [ "$py_ver" == "3.13.0t" ]; then
additional_flags=" --disable-gil"
mv cpython-3.13/ cpython-3.13t/
fi
pushd $py_folder
@ -66,7 +68,7 @@ function do_cpython_build {
ln -s pip3 ${prefix}/bin/pip
fi
# install setuptools since python 3.12 is required to use distutils
${prefix}/bin/pip install wheel==0.45.1 setuptools==80.9.0
${prefix}/bin/pip install wheel==0.34.2 setuptools==68.2.2
local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
ln -sf ${prefix} /opt/python/${abi_tag}
}
@ -74,20 +76,24 @@ function do_cpython_build {
function build_cpython {
local py_ver=$1
check_var $py_ver
local py_suffix=$py_ver check_var $PYTHON_DOWNLOAD_URL
local py_folder=$py_ver local py_ver_folder=$py_ver
# Special handling for nogil if [ "$py_ver" = "3.13.0t" ]; then
if [[ "${py_ver}" == *"t" ]]; then PY_VER_SHORT="3.13"
py_suffix=${py_ver::-1} PYT_VER_SHORT="3.13t"
py_folder=$py_suffix check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH
wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz
do_cpython_build $py_ver cpython-$PYT_VER_SHORT
elif [ "$py_ver" = "3.13.0" ]; then
PY_VER_SHORT="3.13"
check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH
wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz
do_cpython_build $py_ver cpython-$PY_VER_SHORT
else
wget -q $PYTHON_DOWNLOAD_URL/$py_ver_folder/Python-$py_ver.tgz
do_cpython_build $py_ver Python-$py_ver
fi
# Only b3 is available now
if [ "$py_suffix" == "3.14.0" ]; then
py_suffix="3.14.0b3"
fi
wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz
do_cpython_build $py_ver Python-$py_suffix
rm -f Python-$py_ver.tgz
}
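Note: a short sketch (assumed version string, not part of the script) of how the no-GIL handling in build_cpython above folds the trailing t away before the download, while the build itself still sees the t variant and therefore adds --disable-gil:
py_ver=3.13.0t
py_suffix=${py_ver::-1}   # 3.13.0, used for the Python-<ver>.tgz file name
py_folder=$py_suffix      # 3.13.0, used as the download directory on python.org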

View File

@ -2,128 +2,247 @@
set -ex
arch_path='' NCCL_VERSION=v2.21.5-1
targetarch=${TARGETARCH:-$(uname -m)} CUDNN_VERSION=9.5.1.17
if [ ${targetarch} = 'amd64' ] || [ "${targetarch}" = 'x86_64' ]; then
arch_path='x86_64'
else
arch_path='sbsa'
fi
NVSHMEM_VERSION=3.3.9 function install_cusparselt_040 {
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
function install_cuda { mkdir tmp_cusparselt && pushd tmp_cusparselt
version=$1 wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
runfile=$2 tar xf libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
major_minor=${version%.*} cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/include/* /usr/local/cuda/include/
rm -rf /usr/local/cuda-${major_minor} /usr/local/cuda cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/lib/* /usr/local/cuda/lib64/
if [[ ${arch_path} == 'sbsa' ]]; then popd
runfile="${runfile}_sbsa" rm -rf tmp_cusparselt
fi
runfile="${runfile}.run"
wget -q https://developer.download.nvidia.com/compute/cuda/${version}/local_installers/${runfile} -O ${runfile}
chmod +x ${runfile}
./${runfile} --toolkit --silent
rm -f ${runfile}
rm -f /usr/local/cuda && ln -s /usr/local/cuda-${major_minor} /usr/local/cuda
} }
function install_cudnn { function install_cusparselt_052 {
cuda_major_version=$1 # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
cudnn_version=$2 mkdir tmp_cusparselt && pushd tmp_cusparselt
mkdir tmp_cudnn && cd tmp_cudnn wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement tar xf libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz
filepath="cudnn-linux-${arch_path}-${cudnn_version}_cuda${cuda_major_version}-archive" cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/include/* /usr/local/cuda/include/
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-${arch_path}/${filepath}.tar.xz cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
tar xf ${filepath}.tar.xz popd
cp -a ${filepath}/include/* /usr/local/cuda/include/ rm -rf tmp_cusparselt
cp -a ${filepath}/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
} }
function install_nvshmem { function install_cusparselt_062 {
cuda_major_version=$1 # e.g. "12" # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
nvshmem_version=$2 # e.g. "3.3.9" mkdir tmp_cusparselt && pushd tmp_cusparselt
wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz
tar xf libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz
cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/include/* /usr/local/cuda/include/
cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/
popd
rm -rf tmp_cusparselt
}
case "${arch_path}" in function install_cusparselt_063 {
sbsa) # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
dl_arch="aarch64" mkdir tmp_cusparselt && pushd tmp_cusparselt
;; wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
x86_64) tar xf libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
dl_arch="x64" cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/include/* /usr/local/cuda/include/
;; cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/
*) popd
dl_arch="${arch}" rm -rf tmp_cusparselt
;; }
esac
tmpdir="tmp_nvshmem" function install_118 {
mkdir -p "${tmpdir}" && cd "${tmpdir}" CUDNN_VERSION=9.1.0.70
echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
rm -rf /usr/local/cuda-11.8 /usr/local/cuda
# install CUDA 11.8.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
chmod +x cuda_11.8.0_520.61.05_linux.run
./cuda_11.8.0_520.61.05_linux.run --toolkit --silent
rm -f cuda_11.8.0_520.61.05_linux.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.8 /usr/local/cuda
# nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}" mkdir tmp_cudnn && cd tmp_cudnn
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz" wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
# download, unpack, install # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
wget -q "${url}" # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
tar xf "${filename}.tar.gz" git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cp -a "libnvshmem/include/"* /usr/local/include/ cd nccl && make -j src.build
cp -a "libnvshmem/lib/"* /usr/local/lib/ cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
# cleanup install_cusparselt_040
cd ..
rm -rf "${tmpdir}"
echo "nvSHMEM ${nvshmem_version} for CUDA ${cuda_major_version} (${arch_path}) installed." ldconfig
}
function install_121 {
echo "Installing CUDA 12.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
rm -rf /usr/local/cuda-12.1 /usr/local/cuda
# install CUDA 12.1.0 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run
chmod +x cuda_12.1.1_530.30.02_linux.run
./cuda_12.1.1_530.30.02_linux.run --toolkit --silent
rm -f cuda_12.1.1_530.30.02_linux.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.1 /usr/local/cuda
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
install_cusparselt_052
ldconfig
} }
function install_124 {
CUDNN_VERSION=9.1.0.70
echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.2"
echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
install_cuda 12.4.1 cuda_12.4.1_550.54.15_linux rm -rf /usr/local/cuda-12.4 /usr/local/cuda
# install CUDA 12.4.1 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
chmod +x cuda_12.4.1_550.54.15_linux.run
./cuda_12.4.1_550.54.15_linux.run --toolkit --silent
rm -f cuda_12.4.1_550.54.15_linux.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
install_cudnn 12 $CUDNN_VERSION # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
CUDA_VERSION=12.4 bash install_nccl.sh # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
CUDA_VERSION=12.4 bash install_cusparselt.sh install_cusparselt_062
ldconfig ldconfig
} }
function install_126 { function install_126 {
CUDNN_VERSION=9.10.2.21 echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" rm -rf /usr/local/cuda-12.6 /usr/local/cuda
install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux # install CUDA 12.6.3 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run
chmod +x cuda_12.6.3_560.35.05_linux.run
./cuda_12.6.3_560.35.05_linux.run --toolkit --silent
rm -f cuda_12.6.3_560.35.05_linux.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
install_cudnn 12 $CUDNN_VERSION # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
install_nvshmem 12 $NVSHMEM_VERSION # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
CUDA_VERSION=12.6 bash install_nccl.sh install_cusparselt_063
CUDA_VERSION=12.6 bash install_cusparselt.sh
ldconfig ldconfig
} }
function install_129 { function prune_118 {
CUDNN_VERSION=9.10.2.21 echo "Pruning CUDA 11.8 and cuDNN"
echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1" #####################################################################################
# install CUDA 12.9.1 in the same container # CUDA 11.8 prune static libs
install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux #####################################################################################
export NVPRUNE="/usr/local/cuda-11.8/bin/nvprune"
export CUDA_LIB_DIR="/usr/local/cuda-11.8/lib64"
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement export GENCODE="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
install_cudnn 12 $CUDNN_VERSION export GENCODE_CUDNN="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
install_nvshmem 12 $NVSHMEM_VERSION if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi
CUDA_VERSION=12.9 bash install_nccl.sh # all CUDA libs except CuDNN and CuBLAS (cudnn and cublas need arch 3.7 included)
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
| xargs -I {} bash -c \
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
CUDA_VERSION=12.9 bash install_cusparselt.sh # prune CuDNN and CuBLAS
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
ldconfig #####################################################################################
# CUDA 11.8 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-11.8/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2022.3.0 $CUDA_BASE/nsight-systems-2022.4.2/
}
function prune_121 {
echo "Pruning CUDA 12.1"
#####################################################################################
# CUDA 12.1 prune static libs
#####################################################################################
export NVPRUNE="/usr/local/cuda-12.1/bin/nvprune"
export CUDA_LIB_DIR="/usr/local/cuda-12.1/lib64"
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi
# all CUDA libs except CuDNN and CuBLAS
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
| xargs -I {} bash -c \
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
# prune CuDNN and CuBLAS
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
#####################################################################################
# CUDA 12.1 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-12.1/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2023.1.0 $CUDA_BASE/nsight-systems-2023.1.2/
} }
function prune_124 {
@ -194,35 +313,17 @@ function prune_126 {
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
}
function install_128 {
CUDNN_VERSION=9.8.0.87
echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 12.8.1 in the same container
install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
install_cudnn 12 $CUDNN_VERSION
install_nvshmem 12 $NVSHMEM_VERSION
CUDA_VERSION=12.8 bash install_nccl.sh
CUDA_VERSION=12.8 bash install_cusparselt.sh
ldconfig
}
# idiomatic parameter and option handling in sh
while test $# -gt 0
do
case "$1" in
11.8) install_118; prune_118
;;
12.1) install_121; prune_121
;;
12.4) install_124; prune_124
;;
12.6|12.6.*) install_126; prune_126
12.6) install_126; prune_126
;;
12.8|12.8.*) install_128;
;;
12.9|12.9.*) install_129;
;;
*) echo "bad argument $1"; exit 1
;;

View File

@ -0,0 +1,175 @@
#!/bin/bash
# Script used only in CD pipeline
set -ex
NCCL_VERSION=v2.21.5-1
CUDNN_VERSION=9.5.1.17
function install_cusparselt_062 {
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && pushd tmp_cusparselt
wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.6.2.3-archive.tar.xz
tar xf libcusparse_lt-linux-sbsa-0.6.2.3-archive.tar.xz
cp -a libcusparse_lt-linux-sbsa-0.6.2.3-archive/include/* /usr/local/cuda/include/
cp -a libcusparse_lt-linux-sbsa-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/
popd
rm -rf tmp_cusparselt
}
function install_cusparselt_063 {
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && pushd tmp_cusparselt
wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.6.3.2-archive.tar.xz
tar xf libcusparse_lt-linux-sbsa-0.6.3.2-archive.tar.xz
cp -a libcusparse_lt-linux-sbsa-0.6.3.2-archive/include/* /usr/local/cuda/include/
cp -a libcusparse_lt-linux-sbsa-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/
popd
rm -rf tmp_cusparselt
}
function install_124 {
CUDNN_VERSION=9.1.0.70
echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
rm -rf /usr/local/cuda-12.4 /usr/local/cuda
# install CUDA 12.4.1 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux_sbsa.run
chmod +x cuda_12.4.1_550.54.15_linux_sbsa.run
./cuda_12.4.1_550.54.15_linux_sbsa.run --toolkit --silent
rm -f cuda_12.4.1_550.54.15_linux_sbsa.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
install_cusparselt_062
ldconfig
}
function prune_124 {
echo "Pruning CUDA 12.4"
#####################################################################################
# CUDA 12.4 prune static libs
#####################################################################################
export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi
# all CUDA libs except CuDNN and CuBLAS
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
| xargs -I {} bash -c \
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
# prune CuDNN and CuBLAS
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
#####################################################################################
# CUDA 12.4 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-12.4/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
}
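Note: one way to sanity-check a prune like the one above (assuming the CUDA toolkit's cuobjdump is on PATH; this check is not part of the CI script) is to list which SASS architectures are still embedded in a pruned static library:
cuobjdump --list-elf /usr/local/cuda-12.4/lib64/libcublas_static.a | grep -o 'sm_[0-9]*' | sort -u
# after pruning with the GENCODE list above, only the sm_50..sm_90 variants should remain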
function install_126 {
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
rm -rf /usr/local/cuda-12.6 /usr/local/cuda
# install CUDA 12.6.3 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux_sbsa.run
chmod +x cuda_12.6.3_560.35.05_linux_sbsa.run
./cuda_12.6.3_560.35.05_linux_sbsa.run --toolkit --silent
rm -f cuda_12.6.3_560.35.05_linux_sbsa.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
install_cusparselt_063
ldconfig
}
function prune_126 {
echo "Pruning CUDA 12.6"
#####################################################################################
# CUDA 12.6 prune static libs
#####################################################################################
export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi
if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
fi
# all CUDA libs except CuDNN and CuBLAS
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
| xargs -I {} bash -c \
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
# prune CuDNN and CuBLAS
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
#####################################################################################
# CUDA 12.6 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-12.6/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
}
# idiomatic parameter and option handling in sh
while test $# -gt 0
do
case "$1" in
12.4) install_124; prune_124
;;
12.6) install_126; prune_126
;;
*) echo "bad argument $1"; exit 1
;;
esac
shift
done
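Note: the script takes the CUDA release as its only argument; assuming this file is saved as install_cuda_aarch64.sh under .ci/docker/common, a typical invocation would be:
bash install_cuda_aarch64.sh 12.6   # installs CUDA 12.6.3, cuDNN, NCCL and cuSparseLt, then prunes static libs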

View File

@ -4,12 +4,10 @@ if [[ -n "${CUDNN_VERSION}" ]]; then
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn
pushd tmp_cudnn
if [[ ${CUDA_VERSION:0:4} == "12.9" || ${CUDA_VERSION:0:4} == "12.8" ]]; then
if [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive"
elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"
elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive" CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
else else

View File

@ -5,15 +5,7 @@ set -ex
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && cd tmp_cusparselt
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[2-6]$ ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
arch_path='x86_64'
fi
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.7.1.0-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
@ -21,8 +13,17 @@ elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
fi
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.6.2.3-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
else
elif [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
echo "Not sure which libcusparselt version to install for this ${CUDA_VERSION}" arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
arch_path='x86_64'
fi
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.5.2.1-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
fi
tar xf ${CUSPARSELT_NAME}.tar.xz
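Note: the branch selection above keys off only the first four characters of CUDA_VERSION, so patch releases resolve to the same branch; an illustrative check (assumed value):
CUDA_VERSION=12.8.1
[[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]] && echo "matches the 0.7.1 branch"   # 12.8 falls in 12.[5-9]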

.ci/docker/common/install_db.sh Executable file
View File

@ -0,0 +1,38 @@
#!/bin/bash
set -ex
install_ubuntu() {
apt-get update
# Cleanup
apt-get autoclean && apt-get clean
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
}
install_centos() {
# Need EPEL for many packages we depend on.
# See http://fedoraproject.org/wiki/EPEL
yum --enablerepo=extras install -y epel-release
# Cleanup
yum clean all
rm -rf /var/cache/yum
rm -rf /var/lib/yum/yumdb
rm -rf /var/lib/yum/history
}
# Install base packages depending on the base OS
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
ubuntu)
install_ubuntu
;;
centos)
install_centos
;;
*)
echo "Unable to determine OS..."
exit 1
;;
esac

View File

@ -13,7 +13,7 @@ clone_executorch() {
# and fetch the target commit
pushd executorch
git checkout "${EXECUTORCH_PINNED_COMMIT}"
git submodule update --init --recursive
git submodule update --init
popd
chown -R jenkins executorch
@ -37,12 +37,7 @@ install_conda_dependencies() {
install_pip_dependencies() {
pushd executorch
as_jenkins bash install_executorch.sh
as_jenkins bash install_requirements.sh --pybind xnnpack
# A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current
# numba and scipy version used in PyTorch CI
conda_run pip uninstall -y numba scipy
popd
}
@ -50,9 +45,10 @@ setup_executorch() {
pushd executorch
export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
export EXECUTORCH_BUILD_PYBIND=ON
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
as_jenkins .ci/scripts/setup-linux.sh cmake || true
popd
}

View File

@ -17,7 +17,7 @@ if [ -n "${UBUNTU_VERSION}" ];then
libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev
fi
pip_install numpy scipy imageio cmake ninja
conda_install numpy scipy imageio cmake ninja
git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git
cmake -DCMAKE_BUILD_TYPE=Release \
@ -35,9 +35,7 @@ git clone https://github.com/halide/Halide.git
pushd Halide
git checkout ${COMMIT} && git submodule update --init --recursive
pip_install -r requirements.txt
# NOTE: pybind has a requirement for cmake > 3.5 so set the minimum cmake version here with a flag cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
# Context: https://github.com/pytorch/pytorch/issues/150420
cmake -G Ninja -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release -S . -B build
cmake --build build
test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
cmake --install build --prefix ${CONDA_PREFIX}

View File

@ -14,36 +14,19 @@ function install_timm() {
local commit
commit=$(get_pinned_commit timm)
# TODO (huydhn): There is no torchvision release on 3.13 when I write this, so
# I'm using nightly here instead. We just need to package to be able to install
# TIMM. Removing this once vision has a release on 3.13
if [[ "${ANACONDA_PYTHON_VERSION}" == "3.13" ]]; then
pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
fi
pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}" pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
} # Clean up
conda_run pip uninstall -y cmake torch torchvision triton
function install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
python install.py --continue_on_fail
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
# is regressing speedup metric. This needs to be investigated further
pip install transformers==4.38.1
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
# Pango is needed for weasyprint which is needed for doctr
conda_install pango
# Stable packages are ok here, just to satisfy TorchBench check
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
install_torchbench
install_huggingface
install_timm
# Clean up
conda_run pip uninstall -y torch torchvision torchaudio triton

View File

@ -2,6 +2,8 @@
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
if [ -n "${UBUNTU_VERSION}" ]; then
apt update
apt-get install -y clang doxygen git graphviz nodejs npm libtinfo5
@ -13,8 +15,8 @@ chown -R jenkins pytorch
pushd pytorch
# Install all linter dependencies
pip install -r requirements.txt
pip_install -r requirements.txt
lintrunner init
conda_run lintrunner init
# Cache .lintbin directory as part of the Docker image
cp -r .lintbin /tmp

View File

@ -1,23 +1,26 @@
#!/usr/bin/env bash
# Script that installs magma from tarball inside conda environment.
# Script that replaces the magma install from a conda package
# It replaces anaconda magma-cuda package which is no longer published.
# Execute it inside active conda environment.
# See issue: https://github.com/pytorch/pytorch/issues/138506
set -eou pipefail
cuda_version_nodot=${1/./} function do_install() {
anaconda_dir=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} cuda_version_nodot=${1/./}
anaconda_python_version=$2
MAGMA_VERSION="2.6.1"
magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
(
set -x anaconda_dir="/opt/conda/envs/py_${anaconda_python_version}"
tmp_dir=$(mktemp -d) (
pushd ${tmp_dir} set -x
curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive} tmp_dir=$(mktemp -d)
tar -xvf "${magma_archive}" pushd ${tmp_dir}
mv include/* "${anaconda_dir}/include/" curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
mv lib/* "${anaconda_dir}/lib" tar -xvf "${magma_archive}"
popd mv include/* "${anaconda_dir}/include/"
) mv lib/* "${anaconda_dir}/lib"
popd
)
}
do_install $1 $2
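Note: the two variants of this script shown above are invoked differently; a sketch of both call patterns (the environment name and version numbers are assumed examples):
# tarball variant: run inside the already-activated conda env, CUDA version as the only argument
conda run -n py_3.10 bash install_magma_conda.sh 12.4
# do_install variant: CUDA version plus the python version used to locate /opt/conda/envs/py_<version>
bash install_magma_conda.sh 12.4 3.10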

View File

@ -1,26 +0,0 @@
#!/bin/bash
set -ex
NCCL_VERSION=""
if [[ ${CUDA_VERSION:0:2} == "11" ]]; then
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt)
elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt)
else
echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"
exit 1
fi
if [[ -n "${NCCL_VERSION}" ]]; then
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
pushd nccl
make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
popd
rm -rf nccl
ldconfig
fi
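Note: the NCCL version here comes from the one-line pin files under ci_commit_pins shown earlier in this diff; a sketch of how the pin drives the build (tag as listed above):
# with ci_commit_pins/nccl-cu12.txt containing v2.27.5-1
CUDA_VERSION=12.8 bash install_nccl.sh   # clones NVIDIA/nccl at that tag, builds it, installs into /usr/local/cuda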

View File

@ -4,15 +4,10 @@ set -ex
[ -n "$NINJA_VERSION" ] [ -n "$NINJA_VERSION" ]
arch=$(uname -m) url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip"
if [ "$arch" == "aarch64" ]; then
url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux-aarch64.zip"
else
url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip"
fi
pushd /tmp
wget --no-verbose --output-document=ninja-linux.zip "$url"
unzip ninja-linux.zip -d /usr/local/bin
rm -f ninja-linux.zip
popd

View File

@ -8,6 +8,16 @@ retry () {
"$@" || (sleep 10 && "$@") || (sleep 20 && "$@") || (sleep 40 && "$@") "$@" || (sleep 10 && "$@") || (sleep 20 && "$@") || (sleep 40 && "$@")
} }
# A bunch of custom pip dependencies for ONNX
pip_install \
beartype==0.15.0 \
filelock==3.9.0 \
flatbuffers==2.0 \
mock==5.0.1 \
ninja==1.10.2 \
networkx==2.5 \
numpy==1.24.2
# ONNXRuntime should be installed before installing
# onnx-weekly. Otherwise, onnx-weekly could be
# overwritten by onnx.
@ -19,13 +29,17 @@ pip_install \
transformers==4.36.2
pip_install coloredlogs packaging
pip_install onnxruntime==1.18.1
pip_install onnxscript==0.3.1 pip_install onnx==1.16.2
pip_install onnxscript==0.1.0.dev20241124 --no-deps
# required by onnxscript
pip_install ml_dtypes
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
IMPORT_SCRIPT_FILENAME="/tmp/onnx_import_script.py"
as_jenkins echo 'import transformers; transformers.GPTJForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gptj");' > "${IMPORT_SCRIPT_FILENAME}"
as_jenkins echo 'import transformers; transformers.AutoModel.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3");' > "${IMPORT_SCRIPT_FILENAME}"
# Need a PyTorch version for transformers to work
pip_install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu

View File

@ -4,9 +4,9 @@
set -ex
cd /
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules
git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.28 --depth 1 --shallow-submodules
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
OPENBLAS_BUILD_FLAGS="
NUM_THREADS=128
USE_OPENMP=1
@ -14,8 +14,9 @@ NO_SHARED=0
DYNAMIC_ARCH=1
TARGET=ARMV8
CFLAGS=-O3
BUILD_BFLOAT16=1
"
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}

View File

@ -0,0 +1,19 @@
#!/bin/bash
set -ex
pb_dir="/usr/temp_pb_install_dir"
mkdir -p $pb_dir
# On the nvidia/cuda:9-cudnn7-devel-centos7 image we need this symlink or
# else it will fail with
# g++: error: ./../lib64/crti.o: No such file or directory
ln -s /usr/lib64 "$pb_dir/lib64"
curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3
tar -xvz --no-same-owner -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz
NPROC=$[$(nproc) - 2]
pushd "$pb_dir" && ./configure && make -j${NPROC} && make -j${NPROC} check && sudo make -j${NRPOC} install && sudo ldconfig
popd
rm -rf $pb_dir

View File

@ -1,15 +0,0 @@
#!/bin/bash
set -ex
apt-get update
# Use deadsnakes in case we need an older python version
sudo add-apt-repository ppa:deadsnakes/ppa
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-pip python${PYTHON_VERSION}-venv
# Use a venv because uv and some other package managers don't support --user install
ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
python -m venv /var/lib/jenkins/ci_env
source /var/lib/jenkins/ci_env/bin/activate
python -mpip install --upgrade pip
python -mpip install -r /opt/requirements-ci.txt

View File

@ -8,11 +8,13 @@ ver() {
install_ubuntu() {
apt-get update
# gpg-agent is not available by default if [[ $UBUNTU_VERSION == 18.04 ]]; then
apt-get install -y --no-install-recommends gpg-agent # gpg-agent is not available by default on 18.04
if [[ $(ver $UBUNTU_VERSION) -ge $(ver 22.04) ]]; then apt-get install -y --no-install-recommends gpg-agent
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \ fi
| sudo tee /etc/apt/preferences.d/rocm-pin-600 if [[ $UBUNTU_VERSION == 20.04 ]]; then
# gpg-agent is not available by default on 20.04
apt-get install -y --no-install-recommends gpg-agent
fi
apt-get install -y kmod
apt-get install -y wget
@ -21,34 +23,13 @@ install_ubuntu() {
apt-get install -y libc++1
apt-get install -y libc++abi1
# Make sure rocm packages from repo.radeon.com have highest priority
cat << EOF > /etc/apt/preferences.d/rocm-pin-600
Package: *
Pin: release o=repo.radeon.com
Pin-Priority: 600
EOF
# we want the patch version of 6.4 instead
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
ROCM_VERSION="${ROCM_VERSION}.1"
fi
# Default url values
rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
# Special case for ROCM_VERSION == 7.0
if [[ $(ver "$ROCM_VERSION") -eq $(ver 7.0) ]]; then
rocm_baseurl="https://repo.radeon.com/rocm/apt/7.0_alpha2"
amdgpu_baseurl="https://repo.radeon.com/amdgpu/30.10_alpha2/ubuntu"
fi
# Add amdgpu repository
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
# Add rocm repository
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/rocm.list echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/rocm.list
apt-get update --allow-insecure-repositories apt-get update --allow-insecure-repositories
@ -81,35 +62,6 @@ EOF
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
# ROCm 6.3 had a regression where initializing static code objects had significant overhead
# CI no longer builds for ROCm 6.3, but
# ROCm 6.4 did not yet fix the regression, also HIP branch names are different
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.4) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then
if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
HIP_BRANCH=release/rocm-rel-6.4
CLR_HASH=606bc820b4b1f315d135da02a1f0b176ca50a92c # branch release/rocm-rel-6.4.1-statco-hotfix
elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
HIP_BRANCH=release/rocm-rel-6.4
CLR_HASH=600f5b0d2baed94d5121e2174a9de0851b040b0c # branch release/rocm-rel-6.4-statco-hotfix
fi
# clr build needs CppHeaderParser but can only find it using conda's python
python -m pip install CppHeaderParser
git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
HIP_COMMON_DIR=$(readlink -f HIP)
git clone https://github.com/jeffdaily/clr
pushd clr
git checkout $CLR_HASH
popd
mkdir -p clr/build
pushd clr/build
# Need to point CMake to the correct python installation to find CppHeaderParser
cmake .. -DPython3_EXECUTABLE=/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}/bin/python3 -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
make -j
cp hipamd/lib/libamdhip64.so.6.4.* /opt/rocm/lib/libamdhip64.so.6.4.*
popd
rm -rf HIP clr
fi
# Cleanup # Cleanup
apt-get autoclean && apt-get clean apt-get autoclean && apt-get clean
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
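The comparisons above ($(ver $ROCM_VERSION) -ge $(ver 6.4), etc.) go through the ver() helper named in the hunk header; a typical implementation, shown here as an illustrative sketch rather than a quote from this diff, zero-pads each dotted component so a plain integer comparison works:
# Assumed ver() helper: pads each version component so -ge/-lt/-eq compare numerically.
ver() {
    printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ')
}
# e.g. ver 6.4   -> "  6004000000"
#      ver 6.4.1 -> "  6004001000"
# so [[ $(ver 6.4.1) -ge $(ver 6.4) ]] reduces to an ordinary numeric comparison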

View File

@ -115,7 +115,7 @@ index a5007ffc..13fa07fc 100644
if (!fp) { if (!fp) {
- fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE, - fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE,
- strerror(errno)); - strerror(errno));
+ //fprintf(stderr, "amdgpu.ids: No such file or directory\n"); + fprintf(stderr, "amdgpu.ids: No such file or directory\n");
return; return;
} }

View File

@ -1,37 +1,50 @@
#!/usr/bin/env bash #!/bin/bash
# Script used only in CD pipeline # Script used in CI and CD pipeline
set -eou pipefail set -ex
function do_install() { # Magma build scripts need `python`
rocm_version=$1 ln -sf /usr/bin/python3 /usr/bin/python
if [[ ${rocm_version} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
# chop off any patch version
rocm_version="${rocm_version%.*}"
fi
rocm_version_nodot=${rocm_version//./} ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
almalinux)
yum install -y gcc-gfortran
;;
*)
echo "No preinstalls to build magma..."
;;
esac
# Version 2.7.2 + ROCm related updates MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}
MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
rocm_dir="/opt/rocm" # "install" hipMAGMA into /opt/rocm/magma by copying after build
( git clone https://bitbucket.org/icl/magma.git
set -x pushd magma
tmp_dir=$(mktemp -d)
pushd ${tmp_dir}
curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
if tar -xvf "${magma_archive}"
then
mkdir -p "${rocm_dir}/magma"
mv include "${rocm_dir}/magma/include"
mv lib "${rocm_dir}/magma/lib"
else
echo "${magma_archive} not found, skipping magma install"
fi
popd
)
}
do_install $1 # Version 2.7.2 + ROCm related updates
git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6
cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
fi
echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
export PATH="${PATH}:/opt/rocm/bin"
if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'`
else
amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs`
fi
for arch in $amdgpu_targets; do
echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc
done
# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
make -f make.gen.hipMAGMA -j $(nproc)
LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
popd
mv magma /opt/rocm
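The patch-version chop in do_install() above uses bash's shortest-suffix removal; a quick illustration with the version values used elsewhere in this diff:
# Illustration of the "${rocm_version%.*}" chop used in do_install():
rocm_version="6.4.1"
echo "${rocm_version%.*}"   # -> 6.4 (drops the trailing ".1" patch component)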

View File

@ -0,0 +1,24 @@
#!/bin/bash
set -ex
[ -n "${SWIFTSHADER}" ]
retry () {
$* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}
_https_amazon_aws=https://ossci-android.s3.amazonaws.com
# SwiftShader
_swiftshader_dir=/var/lib/jenkins/swiftshader
_swiftshader_file_targz=swiftshader-abe07b943-prebuilt.tar.gz
mkdir -p $_swiftshader_dir
_tmp_swiftshader_targz="/tmp/${_swiftshader_file_targz}"
curl --silent --show-error --location --fail --retry 3 \
--output "${_tmp_swiftshader_targz}" "$_https_amazon_aws/${_swiftshader_file_targz}"
tar -C "${_swiftshader_dir}" -xzf "${_tmp_swiftshader_targz}"
export VK_ICD_FILENAMES="${_swiftshader_dir}/build/Linux/vk_swiftshader_icd.json"
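The retry helper defined above is a simple sleep-and-retry wrapper; a hypothetical use of it (the script itself relies on curl's built-in --retry instead) would look like:
# Hypothetical use of retry(); each failure sleeps 1, 2, 4, then 8 seconds before retrying.
retry curl -fsSL --output "${_tmp_swiftshader_targz}" "${_https_amazon_aws}/${_swiftshader_file_targz}"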

View File

@ -2,16 +2,14 @@
set -ex set -ex
mkdir -p /opt/triton
if [ -z "${TRITON}" ] && [ -z "${TRITON_CPU}" ]; then
echo "TRITON and TRITON_CPU are not set. Exiting..."
exit 0
fi
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
get_pip_version() { get_conda_version() {
conda_run pip list | grep -w $* | head -n 1 | awk '{print $2}' as_jenkins conda list -n py_$ANACONDA_PYTHON_VERSION | grep -w $* | head -n 1 | awk '{print $2}'
}
conda_reinstall() {
as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
} }
if [ -n "${XPU_VERSION}" ]; then if [ -n "${XPU_VERSION}" ]; then
@ -33,9 +31,11 @@ if [ -n "${UBUNTU_VERSION}" ];then
apt-get install -y gpg-agent apt-get install -y gpg-agent
fi fi
# Keep the current cmake and numpy version here, so we can reinstall them later if [ -n "${CONDA_CMAKE}" ]; then
CMAKE_VERSION=$(get_pip_version cmake) # Keep the current cmake and numpy version here, so we can reinstall them later
NUMPY_VERSION=$(get_pip_version numpy) CMAKE_VERSION=$(get_conda_version cmake)
NUMPY_VERSION=$(get_conda_version numpy)
fi
if [ -z "${MAX_JOBS}" ]; then if [ -z "${MAX_JOBS}" ]; then
export MAX_JOBS=$(nproc) export MAX_JOBS=$(nproc)
@ -51,13 +51,7 @@ as_jenkins git clone --recursive ${TRITON_REPO} triton
cd triton cd triton
as_jenkins git checkout ${TRITON_PINNED_COMMIT} as_jenkins git checkout ${TRITON_PINNED_COMMIT}
as_jenkins git submodule update --init --recursive as_jenkins git submodule update --init --recursive
cd python
# Old versions of python have setup.py in ./python; newer versions have it in ./
if [ ! -f setup.py ]; then
cd python
fi
pip_install pybind11==2.13.6
# TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527 # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py
@ -66,42 +60,28 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}"
# Triton needs at least gcc-9 to build # Triton needs at least gcc-9 to build
apt-get install -y g++-9 apt-get install -y g++-9
CXX=g++-9 conda_run python setup.py bdist_wheel CXX=g++-9 pip_install -e .
elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
# Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
add-apt-repository -y ppa:ubuntu-toolchain-r/test add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get install -y g++-9 apt-get install -y g++-9
CXX=g++-9 conda_run python setup.py bdist_wheel CXX=g++-9 pip_install -e .
else else
conda_run python setup.py bdist_wheel pip_install -e .
fi fi
# Copy the wheel to /opt for multi stage docker builds if [ -n "${CONDA_CMAKE}" ]; then
cp dist/*.whl /opt/triton # TODO: This is to make sure that the same cmake and numpy version from install conda
# Install the wheel for docker builds that don't use multi stage # script is used. Without this step, the newer cmake version (3.25.2) downloaded by
pip_install dist/*.whl # triton build step via pip will fail to detect conda MKL. Once that issue is fixed,
# this can be removed.
# TODO: This is to make sure that the same cmake and numpy version from install conda #
# script is used. Without this step, the newer cmake version (3.25.2) downloaded by # The correct numpy version also needs to be set here because conda claims that it
# triton build step via pip will fail to detect conda MKL. Once that issue is fixed, # causes inconsistent environment. Without this, conda will attempt to install the
# this can be removed. # latest numpy version, which fails ASAN tests with the following import error: Numba
# # needs NumPy 1.20 or less.
# The correct numpy version also needs to be set here because conda claims that it conda_reinstall cmake="${CMAKE_VERSION}"
# causes inconsistent environment. Without this, conda will attempt to install the # Note that we install numpy with pip as conda might not have the version we want
# latest numpy version, which fails ASAN tests with the following import error: Numba pip_install --force-reinstall numpy=="${NUMPY_VERSION}"
# needs NumPy 1.20 or less.
# Note that we install numpy with pip as conda might not have the version we want
if [ -n "${CMAKE_VERSION}" ]; then
pip_install "cmake==${CMAKE_VERSION}"
fi
if [ -n "${NUMPY_VERSION}" ]; then
pip_install "numpy==${NUMPY_VERSION}"
fi
# IMPORTANT: helion needs to be installed without dependencies.
# It depends on torch and triton. We don't want to install
# triton and torch from production on Docker CI images
if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then
pip_install helion --no-deps
fi fi
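A short sketch of the pin-and-restore flow above, assuming the get_pip_version and pip_install helpers from this script and an illustrative cmake version; the point is that whatever pip reported before the Triton build is reinstalled afterwards:
# Sketch only; the 3.22.1 value is an example, not taken from this diff.
CMAKE_VERSION=$(get_pip_version cmake)   # e.g. "3.22.1" before the Triton build
# ... the Triton build may pull in a newer cmake via pip ...
pip_install "cmake==${CMAKE_VERSION}"    # restore the original pin afterwards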

View File

@ -8,12 +8,6 @@ else
with_cuda=no with_cuda=no
fi fi
if [[ -d "/opt/rocm" ]]; then
with_rocm=/opt/rocm
else
with_rocm=no
fi
function install_ucx() { function install_ucx() {
set -ex set -ex
git clone --recursive https://github.com/openucx/ucx.git git clone --recursive https://github.com/openucx/ucx.git
@ -25,7 +19,6 @@ function install_ucx() {
./configure --prefix=$UCX_HOME \ ./configure --prefix=$UCX_HOME \
--enable-mt \ --enable-mt \
--with-cuda=$with_cuda \ --with-cuda=$with_cuda \
--with-rocm=$with_rocm \
--enable-profiling \ --enable-profiling \
--enable-stats --enable-stats
time make -j time make -j
@ -43,29 +36,12 @@ function install_ucc() {
git submodule update --init --recursive git submodule update --init --recursive
./autogen.sh ./autogen.sh
# We only run distributed tests on Tesla M60 and A10G # We only run distributed tests on Tesla M60 and A10G
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
if [[ -n "$ROCM_VERSION" ]]; then
if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'`
else
amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs`
fi
for arch in $amdgpu_targets; do
HIP_OFFLOAD="$HIP_OFFLOAD --offload-arch=$arch"
done
else
HIP_OFFLOAD="all-arch-no-native"
fi
./configure --prefix=$UCC_HOME \ ./configure --prefix=$UCC_HOME \
--with-ucx=$UCX_HOME \ --with-ucx=$UCX_HOME \
--with-cuda=$with_cuda \ --with-cuda=$with_cuda \
--with-nvcc-gencode="${NVCC_GENCODE}" \ --with-nvcc-gencode="${NVCC_GENCODE}"
--with-rocm=$with_rocm \
--with-rocm-arch="${HIP_OFFLOAD}"
time make -j time make -j
sudo make install sudo make install

View File

@ -0,0 +1,24 @@
#!/bin/bash
set -ex
[ -n "${VULKAN_SDK_VERSION}" ]
retry () {
$* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}
_vulkansdk_dir=/var/lib/jenkins/vulkansdk
_tmp_vulkansdk_targz=/tmp/vulkansdk.tar.gz
curl \
--silent \
--show-error \
--location \
--fail \
--retry 3 \
--output "${_tmp_vulkansdk_targz}" "https://ossci-android.s3.amazonaws.com/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.gz"
mkdir -p "${_vulkansdk_dir}"
tar -C "${_vulkansdk_dir}" -xzf "${_tmp_vulkansdk_targz}" --strip-components 1
rm -rf "${_tmp_vulkansdk_targz}"

View File

@ -26,7 +26,7 @@ function install_ubuntu() {
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor > /usr/share/keyrings/oneapi-archive-keyring.gpg.gpg | gpg --dearmor > /usr/share/keyrings/oneapi-archive-keyring.gpg.gpg
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg.gpg] \ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg.gpg] \
https://apt.repos.intel.com/oneapi all main" \ https://apt.repos.intel.com/${XPU_REPO_NAME} all main" \
| tee /etc/apt/sources.list.d/oneAPI.list | tee /etc/apt/sources.list.d/oneAPI.list
# Update the packages list and repository index # Update the packages list and repository index
@ -56,10 +56,14 @@ function install_ubuntu() {
function install_rhel() { function install_rhel() {
. /etc/os-release . /etc/os-release
if [[ "${ID}" == "rhel" ]]; then
if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
echo "RHEL version ${VERSION_ID} not supported" echo "RHEL version ${VERSION_ID} not supported"
exit exit
fi
elif [[ "${ID}" == "almalinux" ]]; then
# Workaround for almalinux8 which used by quay.io/pypa/manylinux_2_28_x86_64
VERSION_ID="8.8"
fi fi
dnf install -y 'dnf-command(config-manager)' dnf install -y 'dnf-command(config-manager)'
@ -70,7 +74,7 @@ function install_rhel() {
tee > /etc/yum.repos.d/oneAPI.repo << EOF tee > /etc/yum.repos.d/oneAPI.repo << EOF
[oneAPI] [oneAPI]
name=Intel for Pytorch GPU dev repository name=Intel for Pytorch GPU dev repository
baseurl=https://yum.repos.intel.com/oneapi baseurl=https://yum.repos.intel.com/${XPU_REPO_NAME}
enabled=1 enabled=1
gpgcheck=1 gpgcheck=1
repo_gpgcheck=1 repo_gpgcheck=1
@ -114,7 +118,7 @@ function install_sles() {
https://repositories.intel.com/gpu/sles/${VERSION_SP}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_SP}.repo https://repositories.intel.com/gpu/sles/${VERSION_SP}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_SP}.repo
rpm --import https://repositories.intel.com/gpu/intel-graphics.key rpm --import https://repositories.intel.com/gpu/intel-graphics.key
# To add the online network package repository for the Intel Support Packages # To add the online network package repository for the Intel Support Packages
zypper addrepo https://yum.repos.intel.com/oneapi oneAPI zypper addrepo https://yum.repos.intel.com/${XPU_REPO_NAME} oneAPI
rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
# The xpu-smi packages # The xpu-smi packages
@ -137,10 +141,10 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
XPU_DRIVER_VERSION="" XPU_DRIVER_VERSION=""
fi fi
# Default use Intel® oneAPI Deep Learning Essentials 2025.0 XPU_REPO_NAME="intel-for-pytorch-gpu-dev"
if [[ "$XPU_VERSION" == "2025.1" ]]; then XPU_PACKAGES="intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9"
XPU_PACKAGES="intel-deep-learning-essentials-2025.1" if [[ "$XPU_VERSION" == "2025.0" ]]; then
else XPU_REPO_NAME="oneapi"
XPU_PACKAGES="intel-deep-learning-essentials-2025.0" XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
fi fi
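A minimal sketch, not taken from this diff, of how the XPU_PACKAGES value selected above is presumably consumed by the per-distro install functions:
# Assumed consumer of XPU_PACKAGES on Ubuntu; word-splitting of the list is intentional.
apt-get install -y ${XPU_PACKAGES}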

View File

@ -1,350 +0,0 @@
import argparse
import sys
from enum import Enum
import shlex
class HardwareType(Enum):
DEFAULT = "default"
ROCM = "rocm"
@staticmethod
def from_image_name(image_name: str) -> "HardwareType":
if "rocm" in image_name:
return HardwareType.ROCM
return HardwareType.DEFAULT
class HardcodedBaseConfig:
_UCX_UCC_CONFIGS: dict[HardwareType, dict[str, str]] = {
HardwareType.DEFAULT: {
"UCX_COMMIT": "7bb2722ff2187a0cad557ae4a6afa090569f83fb",
"UCC_COMMIT": "20eae37090a4ce1b32bcce6144ccad0b49943e0b",
},
HardwareType.ROCM: {
"UCX_COMMIT": "cc312eaa4655c0cc5c2bcd796db938f90563bcf6",
"UCC_COMMIT": "0c0fc21559835044ab107199e334f7157d6a0d3d",
},
}
def __init__(self, hardwareType: HardwareType) -> None:
commits = self.get_ucx_ucc_commits(hardwareType)
self.ucx_commit = commits["UCX_COMMIT"]
self.ucc_commit = commits["UCC_COMMIT"]
def _get_tag(self, image: str):
if ":" not in image:
print(f"echo 'Invalid image format (missing :): {image}'", file=sys.stderr)
return
tag = image.split(":")[1]
return tag
def get_all_configs(self):
_TAG_CONFIGS = {
"pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11": {
"CUDA_VERSION": "12.4",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.13",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9": {
"CUDA_VERSION": "12.6.3",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
},
"pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.6",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.6",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks": {
"CUDA_VERSION": "12.6",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.13",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9": {
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "9",
"VISION": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"TRITON": "yes",
},
"pytorch-linux-jammy-py3-clang12-onnx": {
"ANACONDA_PYTHON_VERSION": "3.9",
"CLANG_VERSION": "12",
"VISION": "yes",
"ONNX": "yes",
},
"pytorch-linux-jammy-py3.9-clang12": {
"ANACONDA_PYTHON_VERSION": "3.9",
"CLANG_VERSION": "12",
"VISION": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3.11-clang12": {
"ANACONDA_PYTHON_VERSION": "3.11",
"CLANG_VERSION": "12",
"VISION": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3.9-gcc9": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "9",
"VISION": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-rocm-n-py3": {
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"VISION": "yes",
"ROCM_VERSION": "6.4",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-noble-rocm-n-py3": {
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "11",
"VISION": "yes",
"ROCM_VERSION": "6.4",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-noble-rocm-alpha-py3": {
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "11",
"VISION": "yes",
"ROCM_VERSION": "7.0",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
"KATEX": "yes",
"UCX_COMMIT": self.ucx_commit,
"UCC_COMMIT": self.ucc_commit,
"INDUCTOR_BENCHMARKS": "yes",
"PYTORCH_ROCM_ARCH": "gfx90a;gfx942;gfx950",
},
"pytorch-linux-jammy-xpu-2025.0-py3": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "11",
"VISION": "yes",
"XPU_VERSION": "2025.0",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
},
"pytorch-linux-jammy-xpu-2025.1-py3": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "11",
"VISION": "yes",
"XPU_VERSION": "2025.1",
"NINJA_VERSION": "1.9.0",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "11",
"VISION": "yes",
"KATEX": "yes",
"TRITON": "yes",
"DOCS": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12": {
"ANACONDA_PYTHON_VERSION": "3.9",
"CUDA_VERSION": "12.8.1",
"CUDNN_VERSION": "9",
"CLANG_VERSION": "12",
"VISION": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3-clang18-asan": {
"ANACONDA_PYTHON_VERSION": "3.10",
"CLANG_VERSION": "18",
"VISION": "yes",
},
"pytorch-linux-jammy-py3.9-gcc11": {
"ANACONDA_PYTHON_VERSION": "3.9",
"GCC_VERSION": "11",
"VISION": "yes",
"KATEX": "yes",
"TRITON": "yes",
"DOCS": "yes",
"UNINSTALL_DILL": "yes",
},
"pytorch-linux-jammy-py3-clang12-executorch": {
"ANACONDA_PYTHON_VERSION": "3.10",
"CLANG_VERSION": "12",
"EXECUTORCH": "yes",
},
"pytorch-linux-jammy-py3.12-halide": {
"CUDA_VERSION": "12.6",
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "11",
"HALIDE": "yes",
"TRITON": "yes",
},
"pytorch-linux-jammy-py3.12-triton-cpu": {
"CUDA_VERSION": "12.6",
"ANACONDA_PYTHON_VERSION": "3.12",
"GCC_VERSION": "11",
"TRITON_CPU": "yes",
},
"pytorch-linux-jammy-linter": {
"PYTHON_VERSION": "3.9",
},
"pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter": {
"PYTHON_VERSION": "3.9",
"CUDA_VERSION": "12.8.1",
},
"pytorch-linux-jammy-aarch64-py3.10-gcc11": {
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"ACL": "yes",
"VISION": "yes",
"CONDA_CMAKE": "yes",
"OPENBLAS": "yes",
"SKIP_LLVM_SRC_BUILD_INSTALL": "yes",
},
"pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks": {
"ANACONDA_PYTHON_VERSION": "3.10",
"GCC_VERSION": "11",
"ACL": "yes",
"VISION": "yes",
"CONDA_CMAKE": "yes",
"OPENBLAS": "yes",
"SKIP_LLVM_SRC_BUILD_INSTALL": "yes",
"INDUCTOR_BENCHMARKS": "yes",
},
}
return _TAG_CONFIGS
def get_config(self, image_name:str) -> dict:
tag = self._get_tag(image_name)
config_dict = self.get_all_configs()
if tag not in config_dict:
raise ValueError(f"Unknown tag: {tag}")
return config_dict[tag]
def get_ucx_ucc_commits(self, hw_type: HardwareType) -> dict[str, str]:
if hw_type not in self._UCX_UCC_CONFIGS:
raise ValueError(f"Unsupported hardware type: {hw_type}")
return self._UCX_UCC_CONFIGS[hw_type]
def main():
parser = argparse.ArgumentParser(
description="Return for a given image tag."
)
parser.add_argument(
"--image", required=True, help="Full image string (e.g., repo/name:tag)"
)
args = parser.parse_args()
try:
image_name = args.image
hw_type = HardwareType.from_image_name(image_name)
config_runner = HardcodedBaseConfig(hw_type)
config = config_runner.get_config(args.image)
for key, val in config.items():
print(f'export {key}={shlex.quote(val)}')
except Exception as e:
# Any error will signal fallback
print(f"# Fallback due to error: {e}", file=sys.stderr)
sys.exit(42)
if __name__ == "__main__":
main()
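For reference, a sketch of how this now-removed helper would have been driven from shell; the script filename and image tag are illustrative assumptions, and exit code 42 is the script's own signal for the caller to fall back to hard-coded defaults:
# Hypothetical caller; the filename and image tag are assumptions, not taken from this diff.
if config=$(python3 docker_config.py --image "pytorch/ci-image:pytorch-linux-jammy-py3.9-gcc11"); then
    eval "$config"   # brings ANACONDA_PYTHON_VERSION, TRITON, UCX_COMMIT, ... into the environment
else
    echo "config lookup failed (exit $?), falling back to hardcoded defaults"
fi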

View File

@ -49,28 +49,29 @@ RUN bash ./install_mkl.sh && rm install_mkl.sh
FROM cpu as cuda FROM cpu as cuda
ADD ./common/install_cuda.sh install_cuda.sh ADD ./common/install_cuda.sh install_cuda.sh
ADD ./common/install_magma.sh install_magma.sh ADD ./common/install_magma.sh install_magma.sh
COPY ./common/install_nccl.sh install_nccl.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
COPY ./common/install_cusparselt.sh install_cusparselt.sh
ENV CUDA_HOME /usr/local/cuda ENV CUDA_HOME /usr/local/cuda
FROM cuda as cuda11.8
RUN bash ./install_cuda.sh 11.8
RUN bash ./install_magma.sh 11.8
RUN ln -sf /usr/local/cuda-11.8 /usr/local/cuda
FROM cuda as cuda12.1
RUN bash ./install_cuda.sh 12.1
RUN bash ./install_magma.sh 12.1
RUN ln -sf /usr/local/cuda-12.1 /usr/local/cuda
FROM cuda as cuda12.4
RUN bash ./install_cuda.sh 12.4
RUN bash ./install_magma.sh 12.4
RUN ln -sf /usr/local/cuda-12.4 /usr/local/cuda
FROM cuda as cuda12.6 FROM cuda as cuda12.6
RUN bash ./install_cuda.sh 12.6 RUN bash ./install_cuda.sh 12.6
RUN bash ./install_magma.sh 12.6 RUN bash ./install_magma.sh 12.6
RUN ln -sf /usr/local/cuda-12.6 /usr/local/cuda RUN ln -sf /usr/local/cuda-12.6 /usr/local/cuda
FROM cuda as cuda12.8
RUN bash ./install_cuda.sh 12.8
RUN bash ./install_magma.sh 12.8
RUN ln -sf /usr/local/cuda-12.8 /usr/local/cuda
FROM cuda as cuda12.9
RUN bash ./install_cuda.sh 12.9
RUN bash ./install_magma.sh 12.9
RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda
FROM cpu as rocm FROM cpu as rocm
ARG ROCM_VERSION
ARG PYTORCH_ROCM_ARCH ARG PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
ENV MKLROOT /opt/intel ENV MKLROOT /opt/intel
@ -85,11 +86,18 @@ ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
# gfortran and python needed for building magma from source for ROCm # gfortran and python needed for building magma from source for ROCm
RUN apt-get update -y && \ RUN apt-get update -y && \
apt-get install gfortran -y && \ apt-get install gfortran -y && \
apt-get install python3 python-is-python3 -y && \ apt-get install python -y && \
apt-get clean apt-get clean
RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
# Install AOTriton
COPY ./common/common_utils.sh common_utils.sh
COPY ./aotriton_version.txt aotriton_version.txt
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
FROM ${BASE_TARGET} as final FROM ${BASE_TARGET} as final
COPY --from=openssl /opt/openssl /opt/openssl COPY --from=openssl /opt/openssl /opt/openssl

View File

@ -1,67 +1,83 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# Script used only in CD pipeline # Script used only in CD pipeline
set -eoux pipefail set -eou pipefail
image="$1" image="$1"
shift shift
if [ -z "${image}" ]; then if [ -z "${image}" ]; then
echo "Usage: $0 IMAGENAME:ARCHTAG" echo "Usage: $0 IMAGE"
exit 1 exit 1
fi fi
DOCKER_IMAGE="pytorch/${image}"
TOPDIR=$(git rev-parse --show-toplevel) TOPDIR=$(git rev-parse --show-toplevel)
GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
WITH_PUSH=${WITH_PUSH:-}
DOCKER=${DOCKER:-docker} DOCKER=${DOCKER:-docker}
# Go from imagename:tag to tag case ${GPU_ARCH_TYPE} in
DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
GPU_ARCH_VERSION=""
if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
# extract cuda version from image name. e.g. manylinux2_28-builder:cuda12.8 returns 12.8
GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
# extract rocm version from image name. e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
fi
case ${DOCKER_TAG_PREFIX} in
cpu) cpu)
BASE_TARGET=cpu BASE_TARGET=cpu
DOCKER_TAG=cpu
GPU_IMAGE=ubuntu:20.04 GPU_IMAGE=ubuntu:20.04
DOCKER_GPU_BUILD_ARG="" DOCKER_GPU_BUILD_ARG=""
;; ;;
cuda*) cuda)
BASE_TARGET=cuda${GPU_ARCH_VERSION} BASE_TARGET=cuda${GPU_ARCH_VERSION}
DOCKER_TAG=cuda${GPU_ARCH_VERSION}
GPU_IMAGE=ubuntu:20.04 GPU_IMAGE=ubuntu:20.04
DOCKER_GPU_BUILD_ARG="" DOCKER_GPU_BUILD_ARG=""
;; ;;
rocm*) rocm)
# we want the patch version of 6.4 instead
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
fi
BASE_TARGET=rocm BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete DOCKER_TAG=rocm${GPU_ARCH_VERSION}
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}" PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx942"
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
;; ;;
*) *)
echo "ERROR: Unrecognized DOCKER_TAG_PREFIX: ${DOCKER_TAG_PREFIX}" echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
exit 1 exit 1
;; ;;
esac esac
tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
DOCKER_BUILDKIT=1 ${DOCKER} build \ (
--target final \ set -x
${DOCKER_GPU_BUILD_ARG} \ DOCKER_BUILDKIT=1 ${DOCKER} build \
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \ --target final \
--build-arg "BASE_TARGET=${BASE_TARGET}" \ ${DOCKER_GPU_BUILD_ARG} \
-t "${tmp_tag}" \ --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
$@ \ --build-arg "BASE_TARGET=${BASE_TARGET}" \
-f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \ -t "${DOCKER_IMAGE}" \
"${TOPDIR}/.ci/docker/" $@ \
-f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
"${TOPDIR}/.ci/docker/"
)
GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
GIT_BRANCH_NAME=${GITHUB_REF##*/}
GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
if [[ "${WITH_PUSH}" == true ]]; then
(
set -x
${DOCKER} push "${DOCKER_IMAGE}"
if [[ -n ${GITHUB_REF} ]]; then
${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
${DOCKER} push "${DOCKER_IMAGE_BRANCH_TAG}"
${DOCKER} push "${DOCKER_IMAGE_SHA_TAG}"
fi
)
fi
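To make the new tag parsing above concrete, the awk extractions behave as follows (values are examples only):
echo "manylinux2_28-builder:cuda12.8" | awk -F':' '{print $2}'    # -> cuda12.8
echo "cuda12.8"  | awk -F'cuda' '{print $2}'                      # -> 12.8
echo "rocm6.2.4" | awk -F'rocm' '{print $2}'                      # -> 6.2.4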

View File

@ -18,31 +18,28 @@ COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh RUN bash ./install_user.sh && rm install_user.sh
# Install conda and other packages (e.g., numpy, pytest) # Install conda and other packages (e.g., numpy, pytest)
ARG PYTHON_VERSION ARG ANACONDA_PYTHON_VERSION
ARG PIP_CMAKE ARG CONDA_CMAKE
# Put venv into the env vars so users don't need to activate it ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
ENV PATH /var/lib/jenkins/ci_env/bin:$PATH ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
ENV VIRTUAL_ENV /var/lib/jenkins/ci_env COPY requirements-ci.txt /opt/conda/requirements-ci.txt
COPY requirements-ci.txt /opt/requirements-ci.txt COPY ./common/install_conda.sh install_conda.sh
COPY ./common/install_python.sh install_python.sh COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt COPY ./common/install_magma_conda.sh install_magma_conda.sh
RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
# Install cuda and cudnn # Install cuda and cudnn
ARG CUDA_VERSION ARG CUDA_VERSION
COPY ./common/install_cuda.sh install_cuda.sh COPY ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
COPY ./common/install_cusparselt.sh install_cusparselt.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh
ENV DESIRED_CUDA ${CUDA_VERSION} ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
# Note that Docker build forbids copying file outside the build context # Note that Docker build forbids copying file outside the build context
COPY ./common/install_linter.sh install_linter.sh COPY ./common/install_linter.sh install_linter.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_linter.sh RUN bash ./install_linter.sh
RUN rm install_linter.sh RUN rm install_linter.sh common_utils.sh
RUN chown -R jenkins:jenkins /var/lib/jenkins/ci_env
USER jenkins USER jenkins
CMD ["bash"] CMD ["bash"]

View File

@ -15,19 +15,20 @@ COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh RUN bash ./install_user.sh && rm install_user.sh
# Install conda and other packages (e.g., numpy, pytest) # Install conda and other packages (e.g., numpy, pytest)
ARG PYTHON_VERSION ARG ANACONDA_PYTHON_VERSION
ENV PATH /var/lib/jenkins/ci_env/bin:$PATH ARG CONDA_CMAKE
ENV VIRTUAL_ENV /var/lib/jenkins/ci_env ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
COPY requirements-ci.txt /opt/requirements-ci.txt ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
COPY ./common/install_python.sh install_python.sh COPY requirements-ci.txt /opt/conda/requirements-ci.txt
RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt COPY ./common/install_conda.sh install_conda.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
# Note that Docker build forbids copying file outside the build context # Note that Docker build forbids copying file outside the build context
COPY ./common/install_linter.sh install_linter.sh COPY ./common/install_linter.sh install_linter.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_linter.sh RUN bash ./install_linter.sh
RUN rm install_linter.sh RUN rm install_linter.sh common_utils.sh
RUN chown -R jenkins:jenkins /var/lib/jenkins/ci_env
USER jenkins USER jenkins
CMD ["bash"] CMD ["bash"]

View File

@ -0,0 +1,207 @@
# syntax = docker/dockerfile:experimental
ARG ROCM_VERSION=3.7
ARG BASE_CUDA_VERSION=11.8
ARG GPU_IMAGE=centos:7
FROM centos:7 as base
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
ARG DEVTOOLSET_VERSION=9
# Note: This patch is required since CentOS has reached EOL;
# otherwise any yum install step will fail
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
# Just add everything as a safe.directory for git since these will be used in multiple places with git
RUN git config --global --add safe.directory '*'
RUN yum install -y yum-utils centos-release-scl
RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
# Note: After running yum-config-manager --enable rhel-server-rhscl-7-rpms
# the patch is required once again. Somehow this step adds mirror.centos.org
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
RUN yum --enablerepo=extras install -y epel-release
# cmake-3.18.4 from pip
RUN yum install -y python3-pip && \
python3 -mpip install cmake==3.18.4 && \
ln -s /usr/local/bin/cmake /usr/bin/cmake
RUN yum install -y autoconf aclocal automake make sudo
FROM base as openssl
# Install openssl (this must precede `build python` step)
# (In order to have a proper SSL module, Python is compiled
# against a recent openssl [see env vars above], which is linked
# statically. We delete openssl afterwards.)
ADD ./common/install_openssl.sh install_openssl.sh
RUN bash ./install_openssl.sh && rm install_openssl.sh
# EPEL for cmake
FROM base as patchelf
# Install patchelf
ADD ./common/install_patchelf.sh install_patchelf.sh
RUN bash ./install_patchelf.sh && rm install_patchelf.sh
RUN cp $(which patchelf) /patchelf
FROM patchelf as python
# build python
COPY manywheel/build_scripts /build_scripts
ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh
RUN bash build_scripts/build.sh && rm -r build_scripts
FROM base as cuda
ARG BASE_CUDA_VERSION=10.2
# Install CUDA
ADD ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
FROM base as intel
# MKL
ADD ./common/install_mkl.sh install_mkl.sh
RUN bash ./install_mkl.sh && rm install_mkl.sh
FROM base as magma
ARG BASE_CUDA_VERSION=10.2
# Install magma
ADD ./common/install_magma.sh install_magma.sh
RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
FROM base as jni
# Install java jni header
ADD ./common/install_jni.sh install_jni.sh
ADD ./java/jni.h jni.h
RUN bash ./install_jni.sh && rm install_jni.sh
FROM base as libpng
# Install libpng
ADD ./common/install_libpng.sh install_libpng.sh
RUN bash ./install_libpng.sh && rm install_libpng.sh
FROM ${GPU_IMAGE} as common
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
RUN yum install -y \
aclocal \
autoconf \
automake \
bison \
bzip2 \
curl \
diffutils \
file \
git \
make \
patch \
perl \
unzip \
util-linux \
wget \
which \
xz \
yasm
RUN yum install -y \
https://repo.ius.io/ius-release-el7.rpm \
https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
RUN yum swap -y git git236-core
# git236+ would refuse to run git commands in repos owned by other users
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
# Override this behaviour by treating every folder as safe
# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
RUN git config --global --add safe.directory "*"
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
# Install LLVM version
COPY --from=openssl /opt/openssl /opt/openssl
COPY --from=python /opt/python /opt/python
COPY --from=python /opt/_internal /opt/_internal
COPY --from=python /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel
COPY --from=intel /opt/intel /opt/intel
COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf
COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h
COPY --from=libpng /usr/local/bin/png* /usr/local/bin/
COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/
COPY --from=libpng /usr/local/include/png* /usr/local/include/
COPY --from=libpng /usr/local/include/libpng* /usr/local/include/
COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/
COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig
FROM common as cpu_final
ARG BASE_CUDA_VERSION=10.1
ARG DEVTOOLSET_VERSION=9
# Install Anaconda
ADD ./common/install_conda_docker.sh install_conda.sh
RUN bash ./install_conda.sh && rm install_conda.sh
ENV PATH /opt/conda/bin:$PATH
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
RUN yum install -y yum-utils centos-release-scl
RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
# cmake is already installed inside the rocm base image, so remove if present
RUN rpm -e cmake || true
# cmake-3.18.4 from pip
RUN yum install -y python3-pip && \
python3 -mpip install cmake==3.18.4 && \
ln -s /usr/local/bin/cmake /usr/bin/cmake
# ninja
RUN yum install -y ninja-build
FROM cpu_final as cuda_final
RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
ENV PATH=/usr/local/cuda/bin:$PATH
FROM cpu_final as rocm_final
ARG ROCM_VERSION=3.7
ARG PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
# Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0)
# find HIP works for ROCm5.7. Not needed for ROCm6.0 and above.
# Remove below when ROCm5.7 is not in support matrix anymore.
ENV ROCM_PATH /opt/rocm
ENV MKLROOT /opt/intel
# No need to install ROCm as base docker image should have full ROCm install
#ADD ./common/install_rocm.sh install_rocm.sh
#RUN ROCM_VERSION=${ROCM_VERSION} bash ./install_rocm.sh && rm install_rocm.sh
ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
# cmake3 is needed for the MIOpen build
RUN ln -sf /usr/local/bin/cmake /usr/bin/cmake3
ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
ADD ./common/install_miopen.sh install_miopen.sh
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
# Install AOTriton
COPY ./common/common_utils.sh common_utils.sh
COPY ./aotriton_version.txt aotriton_version.txt
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton

View File

@ -0,0 +1,153 @@
# syntax = docker/dockerfile:experimental
ARG ROCM_VERSION=3.7
ARG BASE_CUDA_VERSION=10.2
ARG GPU_IMAGE=nvidia/cuda:${BASE_CUDA_VERSION}-devel-centos7
FROM quay.io/pypa/manylinux2014_x86_64 as base
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
RUN yum install -y yum-utils centos-release-scl sudo
RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
# cmake
RUN yum install -y cmake3 && \
ln -s /usr/bin/cmake3 /usr/bin/cmake
FROM base as openssl
# Install openssl (this must precede `build python` step)
# (In order to have a proper SSL module, Python is compiled
# against a recent openssl [see env vars above], which is linked
# statically. We delete openssl afterwards.)
ADD ./common/install_openssl.sh install_openssl.sh
RUN bash ./install_openssl.sh && rm install_openssl.sh
# remove unnecessary python versions
RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
FROM base as cuda
ARG BASE_CUDA_VERSION=10.2
# Install CUDA
ADD ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
FROM base as intel
# MKL
ADD ./common/install_mkl.sh install_mkl.sh
RUN bash ./install_mkl.sh && rm install_mkl.sh
FROM base as magma
ARG BASE_CUDA_VERSION=10.2
# Install magma
ADD ./common/install_magma.sh install_magma.sh
RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
FROM base as jni
# Install java jni header
ADD ./common/install_jni.sh install_jni.sh
ADD ./java/jni.h jni.h
RUN bash ./install_jni.sh && rm install_jni.sh
FROM base as libpng
# Install libpng
ADD ./common/install_libpng.sh install_libpng.sh
RUN bash ./install_libpng.sh && rm install_libpng.sh
FROM ${GPU_IMAGE} as common
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
RUN yum install -y \
aclocal \
autoconf \
automake \
bison \
bzip2 \
curl \
diffutils \
file \
git \
make \
patch \
perl \
unzip \
util-linux \
wget \
which \
xz \
yasm
RUN yum install -y \
https://repo.ius.io/ius-release-el7.rpm \
https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
RUN yum swap -y git git236-core
# git236+ would refuse to run git commands in repos owned by other users
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
# Override this behaviour by treating every folder as safe
# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
RUN git config --global --add safe.directory "*"
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
# Install LLVM version
COPY --from=openssl /opt/openssl /opt/openssl
COPY --from=base /opt/python /opt/python
COPY --from=base /opt/_internal /opt/_internal
COPY --from=base /usr/local/bin/auditwheel /usr/local/bin/auditwheel
COPY --from=intel /opt/intel /opt/intel
COPY --from=base /usr/local/bin/patchelf /usr/local/bin/patchelf
COPY --from=libpng /usr/local/bin/png* /usr/local/bin/
COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/
COPY --from=libpng /usr/local/include/png* /usr/local/include/
COPY --from=libpng /usr/local/include/libpng* /usr/local/include/
COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/
COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig
COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h
FROM common as cpu_final
ARG BASE_CUDA_VERSION=10.2
RUN yum install -y yum-utils centos-release-scl
RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
# cmake
RUN yum install -y cmake3 && \
ln -s /usr/bin/cmake3 /usr/bin/cmake
# ninja
RUN yum install -y http://repo.okay.com.mx/centos/7/x86_64/release/okay-release-1-1.noarch.rpm
RUN yum install -y ninja-build
FROM cpu_final as cuda_final
RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
FROM common as rocm_final
ARG ROCM_VERSION=3.7
# Install ROCm
ADD ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
# cmake is already installed inside the rocm base image, but both 2 and 3 exist
# cmake3 is needed for the later MIOpen custom build, so that step is last.
RUN yum install -y cmake3 && \
rm -f /usr/bin/cmake && \
ln -s /usr/bin/cmake3 /usr/bin/cmake
ADD ./common/install_miopen.sh install_miopen.sh
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh

View File

@ -7,8 +7,8 @@ ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8 ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8 ENV LANGUAGE en_US.UTF-8
ARG DEVTOOLSET_VERSION=13 ARG DEVTOOLSET_VERSION=11
RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
@ -26,20 +26,17 @@ ADD ./common/install_openssl.sh install_openssl.sh
RUN bash ./install_openssl.sh && rm install_openssl.sh RUN bash ./install_openssl.sh && rm install_openssl.sh
# remove unnecessary python versions # remove unncessary python versions
RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
FROM base as cuda FROM base as cuda
ARG BASE_CUDA_VERSION=12.6 ARG BASE_CUDA_VERSION=11.8
# Install CUDA # Install CUDA
ADD ./common/install_cuda.sh install_cuda.sh ADD ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
COPY ./common/install_cusparselt.sh install_cusparselt.sh
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh
FROM base as intel FROM base as intel
# MKL # MKL
@ -47,7 +44,7 @@ ADD ./common/install_mkl.sh install_mkl.sh
RUN bash ./install_mkl.sh && rm install_mkl.sh RUN bash ./install_mkl.sh && rm install_mkl.sh
FROM base as magma FROM base as magma
ARG BASE_CUDA_VERSION=12.6 ARG BASE_CUDA_VERSION=10.2
# Install magma # Install magma
ADD ./common/install_magma.sh install_magma.sh ADD ./common/install_magma.sh install_magma.sh
RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
@ -64,7 +61,7 @@ ADD ./common/install_libpng.sh install_libpng.sh
RUN bash ./install_libpng.sh && rm install_libpng.sh RUN bash ./install_libpng.sh && rm install_libpng.sh
FROM ${GPU_IMAGE} as common FROM ${GPU_IMAGE} as common
ARG DEVTOOLSET_VERSION=13 ARG DEVTOOLSET_VERSION=11
ENV LC_ALL en_US.UTF-8 ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8 ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8 ENV LANGUAGE en_US.UTF-8
@ -87,12 +84,13 @@ RUN yum install -y \
wget \ wget \
which \ which \
xz \ xz \
glibc-langpack-en \ gcc-toolset-${DEVTOOLSET_VERSION}-toolchain \
gcc-toolset-${DEVTOOLSET_VERSION}-gcc \ glibc-langpack-en
gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \ RUN yum install -y \
gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \ https://repo.ius.io/ius-release-el7.rpm \
gcc-toolset-${DEVTOOLSET_VERSION}-gdb https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
RUN yum swap -y git git236-core
# git236+ would refuse to run git commands in repos owned by other users # git236+ would refuse to run git commands in repos owned by other users
# Which causes version check to fail, as pytorch repo is bind-mounted into the image # Which causes version check to fail, as pytorch repo is bind-mounted into the image
# Override this behaviour by treating every folder as safe # Override this behaviour by treating every folder as safe
@ -103,7 +101,6 @@ ENV SSL_CERT_FILE=/opt/_internal/certs.pem
# Install LLVM version # Install LLVM version
COPY --from=openssl /opt/openssl /opt/openssl COPY --from=openssl /opt/openssl /opt/openssl
COPY --from=base /opt/python /opt/python COPY --from=base /opt/python /opt/python
COPY --from=base /usr/local/lib/ /usr/local/lib/
COPY --from=base /opt/_internal /opt/_internal COPY --from=base /opt/_internal /opt/_internal
COPY --from=base /usr/local/bin/auditwheel /usr/local/bin/auditwheel COPY --from=base /usr/local/bin/auditwheel /usr/local/bin/auditwheel
COPY --from=intel /opt/intel /opt/intel COPY --from=intel /opt/intel /opt/intel
@ -117,8 +114,8 @@ COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/
COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h
FROM common as cpu_final FROM common as cpu_final
ARG BASE_CUDA_VERSION=12.6 ARG BASE_CUDA_VERSION=11.8
ARG DEVTOOLSET_VERSION=13 ARG DEVTOOLSET_VERSION=11
# Install Anaconda # Install Anaconda
ADD ./common/install_conda_docker.sh install_conda.sh ADD ./common/install_conda_docker.sh install_conda.sh
RUN bash ./install_conda.sh && rm install_conda.sh RUN bash ./install_conda.sh && rm install_conda.sh
@ -157,14 +154,11 @@ ENV ROCM_PATH /opt/rocm
# and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker # and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker
RUN python3 -m pip install --upgrade pip && \ RUN python3 -m pip install --upgrade pip && \
python3 -mpip install cmake==3.28.4 python3 -mpip install cmake==3.28.4
# replace the libdrm in /opt/amdgpu with custom amdgpu.ids lookup path
ADD ./common/install_rocm_drm.sh install_rocm_drm.sh ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
# ROCm 6.4 rocm-smi depends on system drm.h header
RUN yum install -y libdrm-devel
ENV MKLROOT /opt/intel ENV MKLROOT /opt/intel
ADD ./common/install_rocm_magma.sh install_rocm_magma.sh ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
ADD ./common/install_miopen.sh install_miopen.sh ADD ./common/install_miopen.sh install_miopen.sh
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
@ -175,6 +169,6 @@ ENV XPU_DRIVER_TYPE ROLLING
RUN python3 -m pip install --upgrade pip && \ RUN python3 -m pip install --upgrade pip && \
python3 -mpip install cmake==3.28.4 python3 -mpip install cmake==3.28.4
ADD ./common/install_xpu.sh install_xpu.sh ADD ./common/install_xpu.sh install_xpu.sh
ENV XPU_VERSION 2025.1 ENV XPU_VERSION 2025.0
RUN bash ./install_xpu.sh && rm install_xpu.sh RUN bash ./install_xpu.sh && rm install_xpu.sh
RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd

View File

@ -1,8 +1,9 @@
FROM quay.io/pypa/manylinux_2_28_aarch64 as base FROM quay.io/pypa/manylinux_2_28_aarch64 as base
ARG GCCTOOLSET_VERSION=13 # Graviton needs GCC 10 or above for the build. GCC12 is the default version in almalinux-8.
ARG GCCTOOLSET_VERSION=11
# Language variables # Language variabes
ENV LC_ALL=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8 ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US.UTF-8 ENV LANGUAGE=en_US.UTF-8
@ -35,16 +36,7 @@ RUN yum install -y \
yasm \ yasm \
zstd \ zstd \
sudo \ sudo \
gcc-toolset-${GCCTOOLSET_VERSION}-gcc \ gcc-toolset-${GCCTOOLSET_VERSION}-toolchain
gcc-toolset-${GCCTOOLSET_VERSION}-gcc-c++ \
gcc-toolset-${GCCTOOLSET_VERSION}-gcc-gfortran \
gcc-toolset-${GCCTOOLSET_VERSION}-gdb
# (optional) Install non-default Ninja version
ARG NINJA_VERSION
COPY ./common/install_ninja.sh install_ninja.sh
RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
RUN rm install_ninja.sh
# Ensure the expected devtoolset is used # Ensure the expected devtoolset is used
ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
@ -58,13 +50,12 @@ RUN git config --global --add safe.directory "*"
FROM base as openblas FROM base as openblas
# Install openblas # Install openblas
ARG OPENBLAS_VERSION
ADD ./common/install_openblas.sh install_openblas.sh ADD ./common/install_openblas.sh install_openblas.sh
RUN bash ./install_openblas.sh && rm install_openblas.sh RUN bash ./install_openblas.sh && rm install_openblas.sh
FROM base as final FROM base as final
# remove unnecessary python versions # remove unncessary python versions
RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6

View File

@ -0,0 +1,94 @@
FROM quay.io/pypa/manylinux2014_aarch64 as base
# Graviton needs GCC 10 for the build
ARG DEVTOOLSET_VERSION=10
# Language variables
ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US.UTF-8
# Install needed OS packages. This is to support all
# the binary builds (torch, vision, audio, text, data)
RUN yum -y install epel-release
RUN yum -y update
RUN yum install -y \
autoconf \
automake \
bison \
bzip2 \
curl \
diffutils \
file \
git \
make \
patch \
perl \
unzip \
util-linux \
wget \
which \
xz \
yasm \
less \
zstd \
libgomp \
sudo \
devtoolset-${DEVTOOLSET_VERSION}-gcc \
devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ \
devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
devtoolset-${DEVTOOLSET_VERSION}-binutils
# Ensure the expected devtoolset is used
ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
# git 2.36+ refuses to run git commands in repos owned by other users,
# which causes the version check to fail, as the pytorch repo is bind-mounted into the image.
# Override this behaviour by treating every folder as safe.
# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
RUN git config --global --add safe.directory "*"
###############################################################################
# libgfortran.a hack
#
# libgfortran.a from quay.io/pypa/manylinux2014_aarch64 is not compiled with -fPIC.
# This causes link (relocation) errors against __stack_chk_guard@@GLIBC_2.17 during the pytorch build. To solve, get
# Ubuntu's libgfortran.a, which is compiled with -fPIC.
# NOTE: Need a better way to get this library, as Ubuntu's package can be removed or changed by the vendor
###############################################################################
RUN cd ~/ \
&& curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-4ubuntu2_arm64.deb \
&& ar x ~/libgfortran-10-dev.deb \
&& tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ \
&& cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/
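# (Illustrative sketch, not part of the image build.) One way to sanity-check that
# the replacement archive really is position-independent is to extract its objects
# and look at how they reference __stack_chk_guard; PIC objects should go through
# GOT relocations (e.g. R_AARCH64_ADR_GOT_PAGE) rather than direct ones:
#   mkdir -p /tmp/gfortran-check && cd /tmp/gfortran-check
#   ar x /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/libgfortran.a
#   readelf -r *.o | grep __stack_chk_guard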
# install cmake
RUN yum install -y cmake3 && \
ln -s /usr/bin/cmake3 /usr/bin/cmake
FROM base as openssl
# Install openssl (this must precede `build python` step)
# (In order to have a proper SSL module, Python is compiled
# against a recent openssl [see env vars above], which is linked
# statically. We delete openssl afterwards.)
ADD ./common/install_openssl.sh install_openssl.sh
RUN bash ./install_openssl.sh && rm install_openssl.sh
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
FROM base as openblas
# Install openblas
ADD ./common/install_openblas.sh install_openblas.sh
RUN bash ./install_openblas.sh && rm install_openblas.sh
FROM openssl as final
# remove unnecessary python versions
RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/
ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH

View File

@ -1,7 +1,7 @@
FROM quay.io/pypa/manylinux_2_28_aarch64 as base FROM quay.io/pypa/manylinux_2_28_aarch64 as base
# Cuda ARM build needs gcc 11 # Cuda ARM build needs gcc 11
ARG DEVTOOLSET_VERSION=13 ARG DEVTOOLSET_VERSION=11
# Language variables # Language variables
ENV LC_ALL=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8
@ -34,10 +34,7 @@ RUN yum install -y \
zstd \ zstd \
libgomp \ libgomp \
sudo \ sudo \
gcc-toolset-${DEVTOOLSET_VERSION}-gcc \ gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \
gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
gcc-toolset-${DEVTOOLSET_VERSION}-gdb
# Ensure the expected devtoolset is used # Ensure the expected devtoolset is used
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
@ -60,7 +57,7 @@ RUN bash ./install_openssl.sh && rm install_openssl.sh
ENV SSL_CERT_FILE=/opt/_internal/certs.pem ENV SSL_CERT_FILE=/opt/_internal/certs.pem
FROM openssl as final FROM openssl as final
# remove unnecessary python versions # remove unncessary python versions
RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
@ -69,11 +66,8 @@ RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
FROM base as cuda FROM base as cuda
ARG BASE_CUDA_VERSION ARG BASE_CUDA_VERSION
# Install CUDA # Install CUDA
ADD ./common/install_cuda.sh install_cuda.sh ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh
COPY ./common/install_nccl.sh install_nccl.sh RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh
COPY ./common/install_cusparselt.sh install_cusparselt.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh
FROM base as magma FROM base as magma
ARG BASE_CUDA_VERSION ARG BASE_CUDA_VERSION

View File

@ -5,9 +5,7 @@ ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8 ENV LANG=C.UTF-8
ENV LANGUAGE=C.UTF-8 ENV LANGUAGE=C.UTF-8
# there is a bugfix in gcc >= 14 for precompiled headers and s390x vectorization interaction. ARG DEVTOOLSET_VERSION=13
# with earlier gcc versions test/inductor/test_cpu_cpp_wrapper.py will fail.
ARG DEVTOOLSET_VERSION=14
# Install needed OS packages. This is to support all # Install needed OS packages. This is to support all
# the binary builds (torch, vision, audio, text, data) # the binary builds (torch, vision, audio, text, data)
RUN yum -y install epel-release RUN yum -y install epel-release
@ -44,7 +42,6 @@ RUN yum install -y \
llvm-devel \ llvm-devel \
libzstd-devel \ libzstd-devel \
python3.12-devel \ python3.12-devel \
python3.12-test \
python3.12-setuptools \ python3.12-setuptools \
python3.12-pip \ python3.12-pip \
python3-virtualenv \ python3-virtualenv \
@ -60,8 +57,7 @@ RUN yum install -y \
libxslt-devel \ libxslt-devel \
libxml2-devel \ libxml2-devel \
openssl-devel \ openssl-devel \
valgrind \ valgrind
ninja-build
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
@ -105,37 +101,24 @@ CMD ["/bin/bash"]
# install test dependencies: # install test dependencies:
# - grpcio requires system openssl, bundled crypto fails to build # - grpcio requires system openssl, bundled crypto fails to build
# - ml_dtypes 0.4.0 requires some fixes provided in later commits to build
RUN dnf install -y \ RUN dnf install -y \
hdf5-devel \ protobuf-devel \
python3-h5py \ protobuf-c-devel \
git protobuf-lite-devel \
wget \
patch
RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio==1.65.4
RUN cd ~ && \
# cmake-3.28.0 from pip for onnxruntime git clone https://github.com/jax-ml/ml_dtypes && \
RUN python3 -mpip install cmake==3.28.0 cd ml_dtypes && \
git checkout v0.4.0 && \
# build onnxruntime 1.21.0 from sources.
# it is not possible to build it from sources using pip,
# so just build it from upstream repository.
# h5py is dependency of onnxruntime_training.
# h5py==3.11.0 builds with hdf5-devel 1.10.5 from repository.
# h5py 3.11.0 doesn't build with numpy >= 2.3.0.
# install newest flatbuffers version first:
# for some reason old version is getting pulled in otherwise.
# packaging package is required for onnxruntime wheel build.
RUN pip3 install flatbuffers && \
pip3 install cython 'pkgconfig>=1.5.5' 'setuptools>=77' 'numpy<2.3.0' && \
pip3 install --no-build-isolation h5py==3.11.0 && \
pip3 install packaging && \
git clone https://github.com/microsoft/onnxruntime && \
cd onnxruntime && git checkout v1.21.0 && \
git submodule update --init --recursive && \ git submodule update --init --recursive && \
wget https://github.com/microsoft/onnxruntime/commit/f57db79743c4d1a3553aa05cf95bcd10966030e6.patch && \ wget https://github.com/jax-ml/ml_dtypes/commit/b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
patch -p1 < f57db79743c4d1a3553aa05cf95bcd10966030e6.patch && \ wget https://github.com/jax-ml/ml_dtypes/commit/d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
./build.sh --config Release --parallel 0 --enable_pybind \ patch -p1 < b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
--build_wheel --enable_training --enable_training_apis \ patch -p1 < d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
--enable_training_ops --skip_tests --allow_running_as_root \ python3 setup.py bdist_wheel && \
--compile_no_warning_as_error && \ pip3 install dist/*.whl && \
pip3 install ./build/Linux/Release/dist/onnxruntime_training-*.whl && \ rm -rf ml_dtypes
cd .. && /bin/rm -rf ./onnxruntime

View File

@ -1,7 +1,7 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# Script used only in CD pipeline # Script used only in CD pipeline
set -exou pipefail set -eou pipefail
TOPDIR=$(git rev-parse --show-toplevel) TOPDIR=$(git rev-parse --show-toplevel)
@ -9,115 +9,151 @@ image="$1"
shift shift
if [ -z "${image}" ]; then if [ -z "${image}" ]; then
echo "Usage: $0 IMAGE:ARCHTAG" echo "Usage: $0 IMAGE"
exit 1 exit 1
fi fi
# Go from imagename:tag to tag DOCKER_IMAGE="pytorch/${image}"
DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
GPU_ARCH_VERSION="" DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.io}"
if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
# extract cuda version from image name. e.g. manylinux2_28-builder:cuda12.8 returns 12.8
GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
# extract rocm version from image name. e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
fi
GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-} MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-} DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
OPENBLAS_VERSION=${OPENBLAS_VERSION:-} WITH_PUSH=${WITH_PUSH:-}
case ${image} in case ${GPU_ARCH_TYPE} in
manylinux2_28-builder:cpu) cpu)
TARGET=cpu_final TARGET=cpu_final
DOCKER_TAG=cpu
GPU_IMAGE=centos:7
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
;;
cpu-manylinux_2_28)
TARGET=cpu_final
DOCKER_TAG=cpu
GPU_IMAGE=amd64/almalinux:8 GPU_IMAGE=amd64/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13" DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
MANY_LINUX_VERSION="2_28" MANY_LINUX_VERSION="2_28"
;; ;;
manylinux2_28_aarch64-builder:cpu-aarch64) cpu-aarch64)
TARGET=final TARGET=final
GPU_IMAGE=arm64v8/almalinux:8 DOCKER_TAG=cpu-aarch64
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1" GPU_IMAGE=arm64v8/centos:7
MANY_LINUX_VERSION="2_28_aarch64" DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10"
OPENBLAS_VERSION="v0.3.30" MANY_LINUX_VERSION="aarch64"
;; ;;
manylinuxcxx11-abi-builder:cpu-cxx11-abi) cpu-aarch64-2_28)
TARGET=final TARGET=final
DOCKER_TAG=cpu-aarch64
GPU_IMAGE=arm64v8/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
MANY_LINUX_VERSION="2_28_aarch64"
;;
cpu-cxx11-abi)
TARGET=final
DOCKER_TAG=cpu-cxx11-abi
GPU_IMAGE="" GPU_IMAGE=""
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
MANY_LINUX_VERSION="cxx11-abi" MANY_LINUX_VERSION="cxx11-abi"
;; ;;
manylinuxs390x-builder:cpu-s390x) cpu-s390x)
TARGET=final TARGET=final
DOCKER_TAG=cpu-s390x
GPU_IMAGE=s390x/almalinux:8 GPU_IMAGE=s390x/almalinux:8
DOCKER_GPU_BUILD_ARG="" DOCKER_GPU_BUILD_ARG=""
MANY_LINUX_VERSION="s390x" MANY_LINUX_VERSION="s390x"
;; ;;
manylinux2_28-builder:cuda11*) cuda)
TARGET=cuda_final TARGET=cuda_final
DOCKER_TAG=cuda${GPU_ARCH_VERSION}
# Keep this up to date with the minimum version of CUDA we currently support
GPU_IMAGE=centos:7
DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9"
;;
cuda-manylinux_2_28)
TARGET=cuda_final
DOCKER_TAG=cuda${GPU_ARCH_VERSION}
GPU_IMAGE=amd64/almalinux:8 GPU_IMAGE=amd64/almalinux:8
DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11" DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
MANY_LINUX_VERSION="2_28" MANY_LINUX_VERSION="2_28"
;; ;;
manylinux2_28-builder:cuda12*) cuda-aarch64)
TARGET=cuda_final TARGET=cuda_final
GPU_IMAGE=amd64/almalinux:8 DOCKER_TAG=cuda${GPU_ARCH_VERSION}
DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" GPU_IMAGE=arm64v8/centos:7
MANY_LINUX_VERSION="2_28" DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
;;
manylinuxaarch64-builder:cuda*)
TARGET=cuda_final
GPU_IMAGE=amd64/almalinux:8
DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"
MANY_LINUX_VERSION="aarch64" MANY_LINUX_VERSION="aarch64"
DOCKERFILE_SUFFIX="_cuda_aarch64" DOCKERFILE_SUFFIX="_cuda_aarch64"
;; ;;
manylinux2_28-builder:rocm*) rocm|rocm-manylinux_2_28)
# we want the patch version of 6.4 instead
if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
fi
TARGET=rocm_final TARGET=rocm_final
MANY_LINUX_VERSION="2_28" DOCKER_TAG=rocm${GPU_ARCH_VERSION}
DEVTOOLSET_VERSION="11" GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete DEVTOOLSET_VERSION="9"
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" if [ ${GPU_ARCH_TYPE} == "rocm-manylinux_2_28" ]; then
MANY_LINUX_VERSION="2_28"
DEVTOOLSET_VERSION="11"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
fi
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101"
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
;; ;;
manylinux2_28-builder:xpu) xpu)
TARGET=xpu_final TARGET=xpu_final
DOCKER_TAG=xpu
GPU_IMAGE=amd64/almalinux:8 GPU_IMAGE=amd64/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11" DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
MANY_LINUX_VERSION="2_28" MANY_LINUX_VERSION="2_28"
;; ;;
*) *)
echo "ERROR: Unrecognized image name: ${image}" echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
exit 1 exit 1
;; ;;
esac esac
IMAGES=''
if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then
DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION} DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION}
fi fi
# Only activate this if in CI (
if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then set -x
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. if [ "$(uname -m)" != "s390x" ]; then
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
sudo systemctl daemon-reload # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
sudo systemctl restart docker sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
sudo systemctl daemon-reload
sudo systemctl restart docker
fi
DOCKER_BUILDKIT=1 docker build \
${DOCKER_GPU_BUILD_ARG} \
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
--target "${TARGET}" \
-t "${DOCKER_IMAGE}" \
$@ \
-f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
"${TOPDIR}/.ci/docker/"
)
GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
GIT_BRANCH_NAME=${GITHUB_REF##*/}
GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
if [[ "${WITH_PUSH}" == true ]]; then
(
set -x
docker push "${DOCKER_IMAGE}"
if [[ -n ${GITHUB_REF} ]]; then
docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
docker push "${DOCKER_IMAGE_BRANCH_TAG}"
docker push "${DOCKER_IMAGE_SHA_TAG}"
fi
)
fi fi
tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
DOCKER_BUILDKIT=1 docker build \
${DOCKER_GPU_BUILD_ARG} \
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
--build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \
--target "${TARGET}" \
-t "${tmp_tag}" \
$@ \
-f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
"${TOPDIR}/.ci/docker/"

View File

@ -97,7 +97,7 @@ find /opt/_internal -type f -print0 \
| xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true
# We do not need the Python test suites, or indeed the precompiled .pyc and # We do not need the Python test suites, or indeed the precompiled .pyc and
# .pyo files. Partially cribbed from: # .pyo files. Partially cribbed from:
# https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile # @lint-ignore # https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile
find /opt/_internal \ find /opt/_internal \
\( -type d -a -name test -o -name tests \) \ \( -type d -a -name test -o -name tests \) \
-o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \ -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \

View File

@ -2,8 +2,8 @@
# Helper utilities for build # Helper utilities for build
# Script used only in CD pipeline # Script used only in CD pipeline
OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/ # @lint-ignore OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/
CURL_DOWNLOAD_URL=https://curl.se/download CURL_DOWNLOAD_URL=https://curl.askapache.com/download
AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf

View File

@ -16,7 +16,6 @@ click
#test that import: #test that import:
coremltools==5.0b5 ; python_version < "3.12" coremltools==5.0b5 ; python_version < "3.12"
coremltools==8.3 ; python_version == "3.12"
#Description: Apple framework for ML integration #Description: Apple framework for ML integration
#Pinned versions: 5.0b5 #Pinned versions: 5.0b5
#test that import: #test that import:
@ -42,9 +41,9 @@ fbscribelogger==0.1.7
#Pinned versions: 0.1.6 #Pinned versions: 0.1.6
#test that import: #test that import:
flatbuffers==24.12.23 flatbuffers==2.0
#Description: cross platform serialization library #Description: cross platform serialization library
#Pinned versions: 24.12.23 #Pinned versions: 2.0
#test that import: #test that import:
hypothesis==5.35.1 hypothesis==5.35.1
@ -64,7 +63,6 @@ lark==0.12.0
#test that import: #test that import:
librosa>=0.6.2 ; python_version < "3.11" librosa>=0.6.2 ; python_version < "3.11"
librosa==0.10.2 ; python_version == "3.12"
#Description: A python package for music and audio analysis #Description: A python package for music and audio analysis
#Pinned versions: >=0.6.2 #Pinned versions: >=0.6.2
#test that import: test_spectral_ops.py #test that import: test_spectral_ops.py
@ -92,10 +90,10 @@ librosa==0.10.2 ; python_version == "3.12"
#Pinned versions: #Pinned versions:
#test that import: #test that import:
mypy==1.16.0 mypy==1.13.0
# Pin MyPy version because new errors are likely to appear with each release # Pin MyPy version because new errors are likely to appear with each release
#Description: linter #Description: linter
#Pinned versions: 1.16.0 #Pinned versions: 1.10.0
#test that import: test_typing.py, test_type_hints.py #test that import: test_typing.py, test_type_hints.py
networkx==2.8.8 networkx==2.8.8
@ -104,16 +102,15 @@ networkx==2.8.8
#Pinned versions: 2.8.8 #Pinned versions: 2.8.8
#test that import: functorch #test that import: functorch
ninja==1.11.1.3 #ninja
#Description: build system. Used in some tests. Used in build to generate build #Description: build system. Note that it install from
#time tracing information #here breaks things so it is commented out
#Pinned versions: 1.11.1.3 #Pinned versions: 1.10.0.post1
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
numba==0.49.0 ; python_version < "3.9" numba==0.49.0 ; python_version < "3.9"
numba==0.55.2 ; python_version == "3.9" numba==0.55.2 ; python_version == "3.9"
numba==0.55.2 ; python_version == "3.10" numba==0.55.2 ; python_version == "3.10"
numba==0.60.0 ; python_version == "3.12"
#Description: Just-In-Time Compiler for Numerical Functions #Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1 #Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py #test that import: test_numba_integration.py
@ -166,10 +163,10 @@ pillow==11.0.0
#Pinned versions: 10.3.0 #Pinned versions: 10.3.0
#test that import: #test that import:
protobuf==5.29.4 protobuf==3.20.2
#Description: Google's data interchange format #Description: Googles data interchange format
#Pinned versions: 5.29.4 #Pinned versions: 3.20.1
#test that import: test_tensorboard.py, test/onnx/* #test that import: test_tensorboard.py
psutil psutil
#Description: information on running processes and system utilization #Description: information on running processes and system utilization
@ -297,7 +294,7 @@ ghstack==0.8.0
#Pinned versions: 0.8.0 #Pinned versions: 0.8.0
#test that import: #test that import:
jinja2==3.1.6 jinja2==3.1.5
#Description: jinja2 template engine #Description: jinja2 template engine
#Pinned versions: 3.1.4 #Pinned versions: 3.1.4
#test that import: #test that import:
@ -307,7 +304,7 @@ pytest-cpp==2.3.0
#Pinned versions: 2.3.0 #Pinned versions: 2.3.0
#test that import: #test that import:
z3-solver==4.12.6.0 z3-solver==4.12.2.0
#Description: The Z3 Theorem Prover Project #Description: The Z3 Theorem Prover Project
#Pinned versions: #Pinned versions:
#test that import: #test that import:
@ -332,17 +329,17 @@ lxml==5.3.0
PyGithub==2.3.0 PyGithub==2.3.0
sympy==1.13.3 sympy==1.13.1 ; python_version >= "3.9"
#Description: Required by coremltools, also pinned in .github/requirements/pip-requirements-macOS.txt #Description: Required by coremltools, also pinned in .github/requirements/pip-requirements-macOS.txt
#Pinned versions: #Pinned versions:
#test that import: #test that import:
onnx==1.18.0 onnx==1.17.0
#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
#Pinned versions: #Pinned versions:
#test that import: #test that import:
onnxscript==0.3.1 onnxscript==0.1.0.dev20240817
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
#Pinned versions: #Pinned versions:
#test that import: #test that import:
@ -356,41 +353,21 @@ parameterized==0.8.1
#Pinned versions: 1.24.0 #Pinned versions: 1.24.0
#test that import: test_sac_estimator.py #test that import: test_sac_estimator.py
pwlf==2.2.1 pwlf==2.2.1 ; python_version >= "3.8"
#Description: required for testing torch/distributed/_tools/sac_estimator.py #Description: required for testing torch/distributed/_tools/sac_estimator.py
#Pinned versions: 2.2.1 #Pinned versions: 2.2.1
#test that import: test_sac_estimator.py #test that import: test_sac_estimator.py
# To build PyTorch itself
pyyaml
pyzstd
setuptools>=70.1.0
six
# To build PyTorch itself
astunparse
PyYAML
setuptools
ninja==1.11.1 ; platform_machine == "aarch64"
scons==4.5.2 ; platform_machine == "aarch64" scons==4.5.2 ; platform_machine == "aarch64"
pulp==2.9.0 pulp==2.9.0 ; python_version >= "3.8"
#Description: required for testing ilp formulation under torch/distributed/_tools #Description: required for testing ilp formulation under torch/distributed/_tools
#Pinned versions: 2.9.0 #Pinned versions: 2.9.0
#test that import: test_sac_ilp.py #test that import: test_sac_ilp.py
dataclasses_json==0.6.7
#Description: required for data pipeline and scripts under tools/stats
#Pinned versions: 0.6.7
#test that import:
cmake==4.0.0
#Description: required for building
tlparse==0.3.30
#Description: required for log parsing
cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"
#Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits.
#test that import: test_cuda.py
setuptools-git-versioning==2.1.0
scikit-build==0.18.1
pyre-extensions==0.0.32
tabulate==0.9.0
#Description: These package are needed to build FBGEMM and torchrec on PyTorch CI

View File

@ -1,28 +1,18 @@
sphinx==5.3.0 sphinx==5.3.0
#Description: This is used to generate PyTorch docs #Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0 #Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought is that it is probably # but it doesn't seem to work and hangs around idly. The initial thought is probably
# something related to Docker setup. We can investigate this later. # something related to Docker setup. We can investigate this later
sphinxcontrib.katex==0.8.6 sphinxcontrib.katex==0.8.6
#Description: This is used to generate PyTorch docs #Description: This is used to generate PyTorch docs
#Pinned versions: 0.8.6 #Pinned versions: 0.8.6
sphinxext-opengraph==0.9.1 matplotlib==3.5.3
#Description: This is used to generate PyTorch docs #Description: This is used to generate PyTorch docs
#Pinned versions: 0.9.1 #Pinned versions: 3.5.3
sphinx_sitemap==2.6.0
#Description: This is used to generate sitemap for PyTorch docs
#Pinned versions: 2.6.0
matplotlib==3.5.3 ; python_version < "3.13"
matplotlib==3.6.3 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 3.6.3 if python > 3.12. Otherwise 3.5.3.
tensorboard==2.13.0 ; python_version < "3.13" tensorboard==2.13.0 ; python_version < "3.13"
tensorboard==2.18.0 ; python_version >= "3.13" tensorboard==2.18.0 ; python_version >= "3.13"
@ -56,7 +46,5 @@ myst-nb==0.17.2
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5 python-etcd==0.4.5
sphinx-copybutton==0.5.0 sphinx-copybutton==0.5.0
sphinx-design==0.4.0 sphinx-panels==0.4.1
sphinxcontrib-mermaid==1.0.0
myst-parser==0.18.1 myst-parser==0.18.1
myst-nb

View File

@ -1 +1 @@
3.4.0 3.2.0

View File

@ -1 +0,0 @@
3.4.0

View File

@ -0,0 +1,175 @@
ARG UBUNTU_VERSION
ARG CUDA_VERSION
ARG IMAGE_NAME
FROM ${IMAGE_NAME}
ARG UBUNTU_VERSION
ARG CUDA_VERSION
ENV DEBIAN_FRONTEND noninteractive
# Install common dependencies (so that this step can be cached separately)
COPY ./common/install_base.sh install_base.sh
RUN bash ./install_base.sh && rm install_base.sh
# Install user
COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh
# Install katex
ARG KATEX
COPY ./common/install_docs_reqs.sh install_docs_reqs.sh
RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
# Install conda and other packages (e.g., numpy, pytest)
ARG ANACONDA_PYTHON_VERSION
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
ARG CONDA_CMAKE
COPY requirements-ci.txt /opt/conda/requirements-ci.txt
COPY ./common/install_conda.sh install_conda.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_magma_conda.sh install_magma_conda.sh
RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
# Install gcc
ARG GCC_VERSION
COPY ./common/install_gcc.sh install_gcc.sh
RUN bash ./install_gcc.sh && rm install_gcc.sh
# Install clang
ARG CLANG_VERSION
COPY ./common/install_clang.sh install_clang.sh
RUN bash ./install_clang.sh && rm install_clang.sh
# (optional) Install protobuf for ONNX
ARG PROTOBUF
COPY ./common/install_protobuf.sh install_protobuf.sh
RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
ENV INSTALLED_VISION ${VISION}
# (optional) Install UCC
ARG UCX_COMMIT
ARG UCC_COMMIT
ENV UCX_COMMIT $UCX_COMMIT
ENV UCC_COMMIT $UCC_COMMIT
ENV UCX_HOME /usr
ENV UCC_HOME /usr
ADD ./common/install_ucc.sh install_ucc.sh
RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
RUN rm install_ucc.sh
COPY ./common/install_openssl.sh install_openssl.sh
ENV OPENSSL_ROOT_DIR /opt/openssl
RUN bash ./install_openssl.sh
ENV OPENSSL_DIR /opt/openssl
ARG INDUCTOR_BENCHMARKS
ARG ANACONDA_PYTHON_VERSION
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh
RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh
ARG TRITON
# Install triton; this needs to be done before sccache because the latter will
# try to reach out to S3, which docker build runners don't have access to
COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton.txt triton.txt
COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
ARG HALIDE
# Build and install halide
COPY ./common/install_halide.sh install_halide.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/halide.txt halide.txt
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
RUN rm install_halide.sh common_utils.sh halide.txt
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
# See https://github.com/pytorch/pytorch/issues/82174
# TODO(sdym@fb.com):
# check if this is still needed after the full migration off Xenial
ENV CARGO_NET_GIT_FETCH_WITH_CLI true
RUN bash ./install_cache.sh && rm install_cache.sh
ENV CMAKE_CUDA_COMPILER_LAUNCHER=/opt/cache/bin/sccache
# Add jni.h for java host build
COPY ./common/install_jni.sh install_jni.sh
COPY ./java/jni.h jni.h
RUN bash ./install_jni.sh && rm install_jni.sh
# Install Open MPI for CUDA
COPY ./common/install_openmpi.sh install_openmpi.sh
RUN if [ -n "${CUDA_VERSION}" ]; then bash install_openmpi.sh; fi
RUN rm install_openmpi.sh
# Include BUILD_ENVIRONMENT environment variable in image
ARG BUILD_ENVIRONMENT
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
# AWS specific CUDA build guidance
ENV TORCH_CUDA_ARCH_LIST Maxwell
ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all"
ENV CUDA_PATH /usr/local/cuda
# Install LLVM dev version (Defined in the pytorch/builder github repository)
COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
# Install CUDNN
ARG CUDNN_VERSION
ARG CUDA_VERSION
COPY ./common/install_cudnn.sh install_cudnn.sh
RUN if [ -n "${CUDNN_VERSION}" ]; then bash install_cudnn.sh; fi
RUN rm install_cudnn.sh
# Install CUSPARSELT
ARG CUDA_VERSION
COPY ./common/install_cusparselt.sh install_cusparselt.sh
RUN bash install_cusparselt.sh
RUN rm install_cusparselt.sh
# Install CUDSS
ARG CUDA_VERSION
COPY ./common/install_cudss.sh install_cudss.sh
RUN bash install_cudss.sh
RUN rm install_cudss.sh
# Delete /usr/local/cuda-11.X/cuda-11.X symlinks
RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi
RUN if [ -h /usr/local/cuda-12.4/cuda-12.4 ]; then rm /usr/local/cuda-12.4/cuda-12.4; fi
USER jenkins
CMD ["bash"]

View File

@ -14,18 +14,19 @@ ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
COPY ./common/install_base.sh install_base.sh COPY ./common/install_base.sh install_base.sh
RUN bash ./install_base.sh && rm install_base.sh RUN bash ./install_base.sh && rm install_base.sh
# Install clang
ARG LLVMDEV
ARG CLANG_VERSION
COPY ./common/install_clang.sh install_clang.sh
RUN bash ./install_clang.sh && rm install_clang.sh
# Install user # Install user
COPY ./common/install_user.sh install_user.sh COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh RUN bash ./install_user.sh && rm install_user.sh
# Install katex
ARG KATEX
COPY ./common/install_docs_reqs.sh install_docs_reqs.sh
RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
# Install conda and other packages (e.g., numpy, pytest) # Install conda and other packages (e.g., numpy, pytest)
ARG ANACONDA_PYTHON_VERSION ARG ANACONDA_PYTHON_VERSION
ARG BUILD_ENVIRONMENT ARG CONDA_CMAKE
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
COPY requirements-ci.txt /opt/conda/requirements-ci.txt COPY requirements-ci.txt /opt/conda/requirements-ci.txt
@ -38,10 +39,19 @@ ARG GCC_VERSION
COPY ./common/install_gcc.sh install_gcc.sh COPY ./common/install_gcc.sh install_gcc.sh
RUN bash ./install_gcc.sh && rm install_gcc.sh RUN bash ./install_gcc.sh && rm install_gcc.sh
# Install clang # (optional) Install protobuf for ONNX
ARG CLANG_VERSION ARG PROTOBUF
COPY ./common/install_clang.sh install_clang.sh COPY ./common/install_protobuf.sh install_protobuf.sh
RUN bash ./install_clang.sh && rm install_clang.sh RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV # (optional) Install vision packages like OpenCV
ARG VISION ARG VISION
@ -56,7 +66,7 @@ COPY ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh RUN bash ./install_rocm.sh
RUN rm install_rocm.sh RUN rm install_rocm.sh
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} RUN bash ./install_rocm_magma.sh
RUN rm install_rocm_magma.sh RUN rm install_rocm_magma.sh
ADD ./common/install_miopen.sh install_miopen.sh ADD ./common/install_miopen.sh install_miopen.sh
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
@ -75,32 +85,11 @@ COPY ./common/install_amdsmi.sh install_amdsmi.sh
RUN bash ./install_amdsmi.sh RUN bash ./install_amdsmi.sh
RUN rm install_amdsmi.sh RUN rm install_amdsmi.sh
# (optional) Install UCC # (optional) Install non-default CMake version
ARG UCX_COMMIT ARG CMAKE_VERSION
ARG UCC_COMMIT COPY ./common/install_cmake.sh install_cmake.sh
ENV UCX_COMMIT $UCX_COMMIT RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
ENV UCC_COMMIT $UCC_COMMIT RUN rm install_cmake.sh
ENV UCX_HOME /usr
ENV UCC_HOME /usr
ADD ./common/install_ucc.sh install_ucc.sh
RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
RUN rm install_ucc.sh
COPY ./common/install_openssl.sh install_openssl.sh
ENV OPENSSL_ROOT_DIR /opt/openssl
RUN bash ./install_openssl.sh
ENV OPENSSL_DIR /opt/openssl
ARG INDUCTOR_BENCHMARKS
ARG ANACONDA_PYTHON_VERSION
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
# (optional) Install non-default Ninja version # (optional) Install non-default Ninja version
ARG NINJA_VERSION ARG NINJA_VERSION
@ -118,17 +107,24 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
# Install AOTriton
COPY ./aotriton_version.txt aotriton_version.txt
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
# This is needed by sccache
COPY ./common/install_openssl.sh install_openssl.sh
ENV OPENSSL_ROOT_DIR /opt/openssl
RUN bash ./install_openssl.sh
ENV OPENSSL_DIR /opt/openssl
# Install ccache/sccache (do this last, so we get priority in PATH) # Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH ENV PATH /opt/cache/bin:$PATH
RUN bash ./install_cache.sh && rm install_cache.sh RUN bash ./install_cache.sh && rm install_cache.sh
# Install Open MPI for ROCm
COPY ./common/install_openmpi.sh install_openmpi.sh
RUN if [ -n "${CUDA_VERSION}" ]; then bash install_openmpi.sh; fi
RUN rm install_openmpi.sh
# Include BUILD_ENVIRONMENT environment variable in image # Include BUILD_ENVIRONMENT environment variable in image
ARG BUILD_ENVIRONMENT ARG BUILD_ENVIRONMENT
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT} ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}

View File

@ -28,6 +28,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
# Install conda and other packages (e.g., numpy, pytest) # Install conda and other packages (e.g., numpy, pytest)
ARG ANACONDA_PYTHON_VERSION ARG ANACONDA_PYTHON_VERSION
ARG CONDA_CMAKE
ARG DOCS ARG DOCS
ARG BUILD_ENVIRONMENT ARG BUILD_ENVIRONMENT
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
@ -72,10 +73,17 @@ ARG TRITON
COPY ./common/install_triton.sh install_triton.sh COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton-xpu.txt triton-xpu.txt COPY ci_commit_pins/triton-xpu.txt triton-xpu.txt
COPY triton_xpu_version.txt triton_version.txt COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV # (optional) Install vision packages like OpenCV
ARG VISION ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -83,6 +91,12 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
RUN rm install_vision.sh cache_vision_models.sh common_utils.sh RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
ENV INSTALLED_VISION ${VISION} ENV INSTALLED_VISION ${VISION}
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh
RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh
# (optional) Install non-default Ninja version # (optional) Install non-default Ninja version
ARG NINJA_VERSION ARG NINJA_VERSION
COPY ./common/install_ninja.sh install_ninja.sh COPY ./common/install_ninja.sh install_ninja.sh

View File

@ -1,6 +1,6 @@
ARG UBUNTU_VERSION ARG UBUNTU_VERSION
FROM ubuntu:${UBUNTU_VERSION} as base FROM ubuntu:${UBUNTU_VERSION}
ARG UBUNTU_VERSION ARG UBUNTU_VERSION
@ -28,6 +28,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
# Install conda and other packages (e.g., numpy, pytest) # Install conda and other packages (e.g., numpy, pytest)
ARG ANACONDA_PYTHON_VERSION ARG ANACONDA_PYTHON_VERSION
ARG CONDA_CMAKE
ARG DOCS ARG DOCS
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
@ -51,17 +52,9 @@ RUN bash ./install_lcov.sh && rm install_lcov.sh
# Install cuda and cudnn # Install cuda and cudnn
ARG CUDA_VERSION ARG CUDA_VERSION
COPY ./common/install_cuda.sh install_cuda.sh COPY ./common/install_cuda.sh install_cuda.sh
COPY ./common/install_nccl.sh install_nccl.sh RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
COPY ./common/install_cusparselt.sh install_cusparselt.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh
ENV DESIRED_CUDA ${CUDA_VERSION} ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
# No effect if cuda not installed
ENV USE_SYSTEM_NCCL=1
ENV NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
# (optional) Install UCC # (optional) Install UCC
ARG UCX_COMMIT ARG UCX_COMMIT
@ -74,6 +67,20 @@ ADD ./common/install_ucc.sh install_ucc.sh
RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
RUN rm install_ucc.sh RUN rm install_ucc.sh
# (optional) Install protobuf for ONNX
ARG PROTOBUF
COPY ./common/install_protobuf.sh install_protobuf.sh
RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV # (optional) Install vision packages like OpenCV
ARG VISION ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -81,6 +88,24 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
RUN rm install_vision.sh cache_vision_models.sh common_utils.sh RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
ENV INSTALLED_VISION ${VISION} ENV INSTALLED_VISION ${VISION}
# (optional) Install Vulkan SDK
ARG VULKAN_SDK_VERSION
COPY ./common/install_vulkan_sdk.sh install_vulkan_sdk.sh
RUN if [ -n "${VULKAN_SDK_VERSION}" ]; then bash ./install_vulkan_sdk.sh; fi
RUN rm install_vulkan_sdk.sh
# (optional) Install swiftshader
ARG SWIFTSHADER
COPY ./common/install_swiftshader.sh install_swiftshader.sh
RUN if [ -n "${SWIFTSHADER}" ]; then bash ./install_swiftshader.sh; fi
RUN rm install_swiftshader.sh
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh
RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh
# (optional) Install non-default Ninja version # (optional) Install non-default Ninja version
ARG NINJA_VERSION ARG NINJA_VERSION
COPY ./common/install_ninja.sh install_ninja.sh COPY ./common/install_ninja.sh install_ninja.sh
@ -98,26 +123,24 @@ COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps
COPY ./common/common_utils.sh common_utils.sh COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
ARG TRITON ARG TRITON
ARG TRITON_CPU # Install triton, this needs to be done before sccache because the latter will
# try to reach out to S3, which docker build runners don't have access
# Create a separate stage for building Triton and Triton-CPU. install_triton
# will check for the presence of env vars
FROM base as triton-builder
COPY ./common/install_triton.sh install_triton.sh COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton.txt triton.txt COPY ci_commit_pins/triton.txt triton.txt
COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN bash ./install_triton.sh RUN rm install_triton.sh common_utils.sh triton.txt
FROM base as final ARG TRITON_CPU
COPY --from=triton-builder /opt/triton /opt/triton COPY ./common/install_triton.sh install_triton.sh
RUN if [ -n "${TRITON}" ] || [ -n "${TRITON_CPU}" ]; then pip install /opt/triton/*.whl; chown -R jenkins:jenkins /opt/conda; fi COPY ./common/common_utils.sh common_utils.sh
RUN rm -rf /opt/triton COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt
RUN if [ -n "${TRITON_CPU}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-cpu.txt
ARG EXECUTORCH ARG EXECUTORCH
# Build and install executorch # Build and install executorch
@ -148,12 +171,6 @@ RUN if [ -n "${ACL}" ]; then bash ./install_acl.sh; fi
RUN rm install_acl.sh RUN rm install_acl.sh
ENV INSTALLED_ACL ${ACL} ENV INSTALLED_ACL ${ACL}
ARG OPENBLAS
COPY ./common/install_openblas.sh install_openblas.sh
RUN if [ -n "${OPENBLAS}" ]; then bash ./install_openblas.sh; fi
RUN rm install_openblas.sh
ENV INSTALLED_OPENBLAS ${OPENBLAS}
# Install ccache/sccache (do this last, so we get priority in PATH) # Install ccache/sccache (do this last, so we get priority in PATH)
ARG SKIP_SCCACHE_INSTALL ARG SKIP_SCCACHE_INSTALL
COPY ./common/install_cache.sh install_cache.sh COPY ./common/install_cache.sh install_cache.sh

View File

@ -1,2 +0,0 @@
output/
magma-rocm*/

View File

@ -1,35 +0,0 @@
SHELL=/usr/bin/env bash
DOCKER_CMD ?= docker
DESIRED_ROCM ?= 6.4
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
PACKAGE_NAME = magma-rocm
# inherit this from underlying docker image, do not pass this env var to docker
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
-w /builder \
-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_ROCM_SHORT} \
-e DESIRED_ROCM=${DESIRED_ROCM} \
"pytorch/almalinux-builder:rocm${DESIRED_ROCM}" \
magma-rocm/build_magma.sh
.PHONY: all
all: magma-rocm64
all: magma-rocm63
.PHONY:
clean:
$(RM) -r magma-*
$(RM) -r output
.PHONY: magma-rocm64
magma-rocm64: DESIRED_ROCM := 6.4
magma-rocm64:
$(DOCKER_RUN)
.PHONY: magma-rocm63
magma-rocm63: DESIRED_ROCM := 6.3
magma-rocm63:
$(DOCKER_RUN)

View File

@ -1,48 +0,0 @@
# Magma ROCm
This folder contains the scripts and configurations to build libmagma.so, linked for various versions of ROCm.
## Building
Look in the `Makefile` for available targets to build. To build any target, for example `magma-rocm63`, run
```
# Using `docker`
make magma-rocm63
# Using `podman`
DOCKER_CMD=podman make magma-rocm63
```
This spawns a `pytorch/manylinux-rocm<version>` docker image, which has the required `devtoolset` and ROCm versions installed.
Within the docker image, it runs `build_magma.sh` with the correct environment variables set, which packages the necessary files
into a tarball, with the following structure:
```
.
├── include # header files
├── lib # libmagma.so
├── info
│ ├── licenses # license file
│ └── recipe # build script
```
More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the ROCm version.
The output binaries should be in the `output` folder.
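For example, once a ROCm 6.4 build has finished, the tarball contents can be listed before uploading (the exact file name below is illustrative):
```
# Illustrative only: list the contents of a freshly built package
tar tjf output/linux-64/magma-rocm64-*.tar.bz2 | head
```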
## Pushing
Packages can be uploaded to an S3 bucket using:
```
aws s3 cp output/*/magma-rocm*.bz2 <bucket-with-path>
```
If you do not have upload permissions, please ping @seemethere or @soumith to gain access.
## New versions
New ROCm versions can be added by creating a new make target with the next desired version. For ROCm version N.n, the target should be named `magma-rocmNn`.
Make sure to edit the appropriate environment variables (e.g., DESIRED_ROCM) in the `Makefile` accordingly. Remember also to check `build_magma.sh` to ensure the logic for copying over the files remains correct.
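As a sketch, a hypothetical ROCm 7.0 target (version number chosen purely for illustration) would mirror the existing ones:
```
.PHONY: magma-rocm70
magma-rocm70: DESIRED_ROCM := 7.0
magma-rocm70:
	$(DOCKER_RUN)
```
An `all: magma-rocm70` line would also be added next to the other aggregate targets so the new version is built by default.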

View File

@ -1,42 +0,0 @@
#!/usr/bin/env bash
set -eou pipefail
# Environment variables
# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# Version 2.7.2 + ROCm related updates
MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
# Folders for the build
PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
PACKAGE_DIR=${ROOT_DIR}/magma-rocm/${PACKAGE_NAME} # build workspace
PACKAGE_OUTPUT=${ROOT_DIR}/magma-rocm/output # where tarballs are stored
PACKAGE_BUILD=${PACKAGE_DIR} # where the content of the tarball is prepared
PACKAGE_RECIPE=${PACKAGE_BUILD}/info/recipe
PACKAGE_LICENSE=${PACKAGE_BUILD}/info/licenses
mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RECIPE} ${PACKAGE_LICENSE}
# Fetch magma sources and check out the pinned commit
pushd ${PACKAGE_DIR}
git clone https://bitbucket.org/icl/magma.git
pushd magma
git checkout ${MAGMA_VERSION}
popd
popd
# build
pushd ${PACKAGE_DIR}/magma
# The build.sh script expects to be executed from the sources root folder
INSTALL_DIR=${PACKAGE_BUILD} ${PACKAGE_FILES}/build.sh
popd
# Package recipe, license and tarball
# Folder and package name are backward compatible for the build workflow
cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
cp ${PACKAGE_DIR}/magma/COPYRIGHT ${PACKAGE_LICENSE}/COPYRIGHT
pushd ${PACKAGE_BUILD}
tar cjf ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2 include lib info
echo Built in ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2
popd

View File

@ -1,38 +0,0 @@
# Magma build scripts need `python`
ln -sf /usr/bin/python3 /usr/bin/python
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
almalinux)
yum install -y gcc-gfortran
;;
*)
echo "No preinstalls to build magma..."
;;
esac
MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}
cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
fi
echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
export PATH="${PATH}:/opt/rocm/bin"
if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'`
else
amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs`
fi
for arch in $amdgpu_targets; do
echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc
done
# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
make -f make.gen.hipMAGMA -j $(nproc)
LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
cp -R lib ${INSTALL_DIR}
cp -R include ${INSTALL_DIR}

View File

@ -1,7 +1,7 @@
SHELL=/usr/bin/env bash SHELL=/usr/bin/env bash
DOCKER_CMD ?= docker DOCKER_CMD ?= docker
DESIRED_CUDA ?= 12.8 DESIRED_CUDA ?= 11.8
DESIRED_CUDA_SHORT = $(subst .,,$(DESIRED_CUDA)) DESIRED_CUDA_SHORT = $(subst .,,$(DESIRED_CUDA))
PACKAGE_NAME = magma-cuda PACKAGE_NAME = magma-cuda
CUDA_ARCH_LIST ?= -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 CUDA_ARCH_LIST ?= -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90
@ -12,32 +12,37 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \ -e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \
-e DESIRED_CUDA=${DESIRED_CUDA} \ -e DESIRED_CUDA=${DESIRED_CUDA} \
-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \ -e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \
"pytorch/almalinux-builder:cuda${DESIRED_CUDA}-main" \ "pytorch/manylinux-builder:cuda${DESIRED_CUDA}-main" \
magma/build_magma.sh magma/build_magma.sh
.PHONY: all .PHONY: all
all: magma-cuda129
all: magma-cuda128
all: magma-cuda126 all: magma-cuda126
all: magma-cuda124
all: magma-cuda121
all: magma-cuda118
.PHONY: .PHONY:
clean: clean:
$(RM) -r magma-* $(RM) -r magma-*
$(RM) -r output $(RM) -r output
.PHONY: magma-cuda129
magma-cuda129: DESIRED_CUDA := 12.9
magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
magma-cuda129:
$(DOCKER_RUN)
.PHONY: magma-cuda128
magma-cuda128: DESIRED_CUDA := 12.8
magma-cuda128: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
magma-cuda128:
$(DOCKER_RUN)
.PHONY: magma-cuda126 .PHONY: magma-cuda126
magma-cuda126: DESIRED_CUDA := 12.6 magma-cuda126: DESIRED_CUDA := 12.6
magma-cuda126: magma-cuda126:
$(DOCKER_RUN) $(DOCKER_RUN)
.PHONY: magma-cuda124
magma-cuda124: DESIRED_CUDA := 12.4
magma-cuda124:
$(DOCKER_RUN)
.PHONY: magma-cuda121
magma-cuda121: DESIRED_CUDA := 12.1
magma-cuda121:
$(DOCKER_RUN)
.PHONY: magma-cuda118
magma-cuda118: DESIRED_CUDA := 11.8
magma-cuda118: CUDA_ARCH_LIST += -gencode arch=compute_37,code=sm_37
magma-cuda118:
$(DOCKER_RUN)

View File

@ -18,10 +18,12 @@ retry () {
$* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
} }
PLATFORM="" PLATFORM="manylinux2014_x86_64"
# TODO move this into the Docker images # TODO move this into the Docker images
OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release) OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
retry yum install -q -y zip openssl
elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
retry yum install -q -y zip openssl retry yum install -q -y zip openssl
PLATFORM="manylinux_2_28_x86_64" PLATFORM="manylinux_2_28_x86_64"
elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
@ -31,11 +33,9 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
# Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968 # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968
# shellcheck disable=SC2046 # shellcheck disable=SC2046
sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list") sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list")
retry apt-get update retry apt-get update
retry apt-get -y install zip openssl retry apt-get -y install zip openssl
else
echo "Unknown OS: '$OS_NAME'"
exit 1
fi fi
# We use the package name to test the package by passing this to 'pip install' # We use the package name to test the package by passing this to 'pip install'
@ -79,6 +79,8 @@ if [[ -e /opt/openssl ]]; then
export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH
fi fi
mkdir -p /tmp/$WHEELHOUSE_DIR mkdir -p /tmp/$WHEELHOUSE_DIR
export PATCHELF_BIN=/usr/local/bin/patchelf export PATCHELF_BIN=/usr/local/bin/patchelf
@ -97,7 +99,6 @@ if [[ -z "$PYTORCH_ROOT" ]]; then
exit 1 exit 1
fi fi
pushd "$PYTORCH_ROOT" pushd "$PYTORCH_ROOT"
retry pip install -qUr requirements-build.txt
python setup.py clean python setup.py clean
retry pip install -qr requirements.txt retry pip install -qr requirements.txt
case ${DESIRED_PYTHON} in case ${DESIRED_PYTHON} in
@ -110,6 +111,12 @@ case ${DESIRED_PYTHON} in
;; ;;
esac esac
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
export _GLIBCXX_USE_CXX11_ABI=1
else
export _GLIBCXX_USE_CXX11_ABI=0
fi
if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
echo "Calling build_amd.py at $(date)" echo "Calling build_amd.py at $(date)"
python tools/amd_build/build_amd.py python tools/amd_build/build_amd.py
@ -151,7 +158,7 @@ if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \ BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \ BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \ USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR --cmake
echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)" echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
else else
time CMAKE_ARGS=${CMAKE_ARGS[@]} \ time CMAKE_ARGS=${CMAKE_ARGS[@]} \
@ -202,6 +209,12 @@ if [[ -n "$BUILD_PYTHONLESS" ]]; then
mkdir -p /tmp/$LIBTORCH_HOUSE_DIR mkdir -p /tmp/$LIBTORCH_HOUSE_DIR
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
LIBTORCH_ABI="cxx11-abi-"
else
LIBTORCH_ABI=
fi
zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch
cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \ cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \
/tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip
@ -320,8 +333,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
# ROCm workaround for roctracer dlopens # ROCm workaround for roctracer dlopens
if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
patchedpath=$(fname_without_so_number $destpath) patchedpath=$(fname_without_so_number $destpath)
# Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load # Keep the so number for XPU dependencies
elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then elif [[ "$DESIRED_CUDA" == *"xpu"* ]]; then
patchedpath=$destpath patchedpath=$destpath
else else
patchedpath=$(fname_with_sha256 $destpath) patchedpath=$(fname_with_sha256 $destpath)

View File

@ -14,10 +14,6 @@ export USE_CUDA_STATIC_LINK=1
export INSTALL_TEST=0 # dont install test binaries into site-packages export INSTALL_TEST=0 # dont install test binaries into site-packages
export USE_CUPTI_SO=0 export USE_CUPTI_SO=0
export USE_CUSPARSELT=${USE_CUSPARSELT:-1} # Enable if not disabled by libtorch build export USE_CUSPARSELT=${USE_CUSPARSELT:-1} # Enable if not disabled by libtorch build
export USE_CUFILE=${USE_CUFILE:-1}
export USE_SYSTEM_NCCL=1
export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
# Keep an array of cmake variables to add to # Keep an array of cmake variables to add to
if [[ -z "$CMAKE_ARGS" ]]; then if [[ -z "$CMAKE_ARGS" ]]; then
@ -39,8 +35,10 @@ if [[ -n "$DESIRED_CUDA" ]]; then
if [[ ${DESIRED_CUDA} =~ ^[0-9]+\.[0-9]+$ ]]; then if [[ ${DESIRED_CUDA} =~ ^[0-9]+\.[0-9]+$ ]]; then
CUDA_VERSION=${DESIRED_CUDA} CUDA_VERSION=${DESIRED_CUDA}
else else
# cu126, cu128 etc... # cu90, cu92, cu100, cu101
if [[ ${#DESIRED_CUDA} -eq 5 ]]; then if [[ ${#DESIRED_CUDA} -eq 4 ]]; then
CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}"
elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then
CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}" CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}"
fi fi
fi fi
@ -51,23 +49,32 @@ else
fi fi
cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.') cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
case ${CUDA_VERSION} in case ${CUDA_VERSION} in
#removing sm_50-sm_60 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
#however we would like to keep sm_70 architecture see: https://github.com/pytorch/pytorch/issues/157517
12.8)
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0"
;;
12.9)
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX"
# WAR to resolve the ld error in libtorch build with CUDA 12.9
if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
fi
;;
12.6) 12.6)
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0" if [[ "$GPU_ARCH_TYPE" = "cuda-aarch64" ]]; then
TORCH_CUDA_ARCH_LIST="9.0"
else
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0+PTX"
fi
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
;;
12.4)
if [[ "$GPU_ARCH_TYPE" = "cuda-aarch64" ]]; then
TORCH_CUDA_ARCH_LIST="9.0"
else
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
fi
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
;;
12.1)
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
;;
11.8)
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};3.7;9.0"
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
;; ;;
*) *)
echo "unknown cuda version $CUDA_VERSION" echo "unknown cuda version $CUDA_VERSION"
@ -91,15 +98,14 @@ fi
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release) OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
LIBGOMP_PATH="/usr/lib64/libgomp.so.1" LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
LIBGOMP_PATH="/usr/lib64/libgomp.so.1" LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1" LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
else
echo "Unknown OS: '$OS_NAME'"
exit 1
fi fi
DEPS_LIST=( DEPS_LIST=(
@ -109,12 +115,22 @@ DEPS_SONAME=(
"libgomp.so.1" "libgomp.so.1"
) )
# CUDA 11.8 have to ship the libcusparseLt.so.0 with the binary
# since nvidia-cusparselt-cu11 is not available in PYPI
if [[ $USE_CUSPARSELT == "1" && $CUDA_VERSION == "11.8" ]]; then
DEPS_SONAME+=(
"libcusparseLt.so.0"
)
DEPS_LIST+=(
"/usr/local/cuda/lib64/libcusparseLt.so.0"
)
fi
# CUDA_VERSION 12.6, 12.8, 12.9 if [[ $CUDA_VERSION == "12.4" || $CUDA_VERSION == "12.6" ]]; then
if [[ $CUDA_VERSION == 12* ]]; then
export USE_STATIC_CUDNN=0 export USE_STATIC_CUDNN=0
# Try parallelizing nvcc as well # Try parallelizing nvcc as well
export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
echo "Bundling with cudnn and cublas." echo "Bundling with cudnn and cublas."
DEPS_LIST+=( DEPS_LIST+=(
@ -130,12 +146,9 @@ if [[ $CUDA_VERSION == 12* ]]; then
"/usr/local/cuda/lib64/libcublasLt.so.12" "/usr/local/cuda/lib64/libcublasLt.so.12"
"/usr/local/cuda/lib64/libcusparseLt.so.0" "/usr/local/cuda/lib64/libcusparseLt.so.0"
"/usr/local/cuda/lib64/libcudart.so.12" "/usr/local/cuda/lib64/libcudart.so.12"
"/usr/local/cuda/lib64/libnvToolsExt.so.1"
"/usr/local/cuda/lib64/libnvrtc.so.12" "/usr/local/cuda/lib64/libnvrtc.so.12"
"/usr/local/cuda/lib64/libnvrtc-builtins.so" "/usr/local/cuda/lib64/libnvrtc-builtins.so"
"/usr/local/cuda/lib64/libcufile.so.0"
"/usr/local/cuda/lib64/libcufile_rdma.so.1"
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12"
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so"
) )
DEPS_SONAME+=( DEPS_SONAME+=(
"libcudnn_adv.so.9" "libcudnn_adv.so.9"
@ -150,18 +163,10 @@ if [[ $CUDA_VERSION == 12* ]]; then
"libcublasLt.so.12" "libcublasLt.so.12"
"libcusparseLt.so.0" "libcusparseLt.so.0"
"libcudart.so.12" "libcudart.so.12"
"libnvToolsExt.so.1"
"libnvrtc.so.12" "libnvrtc.so.12"
"libnvrtc-builtins.so" "libnvrtc-builtins.so"
"libcufile.so.0"
"libcufile_rdma.so.1"
"libcupti.so.12"
"libnvperf_host.so"
) )
# Add libnvToolsExt only if CUDA version is not 12.9
if [[ $CUDA_VERSION != 12.9* ]]; then
DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
DEPS_SONAME+=("libnvToolsExt.so.1")
fi
else else
echo "Using nvidia libs from pypi." echo "Using nvidia libs from pypi."
CUDA_RPATHS=( CUDA_RPATHS=(
@ -174,21 +179,89 @@ if [[ $CUDA_VERSION == 12* ]]; then
'$ORIGIN/../../nvidia/curand/lib' '$ORIGIN/../../nvidia/curand/lib'
'$ORIGIN/../../nvidia/cusolver/lib' '$ORIGIN/../../nvidia/cusolver/lib'
'$ORIGIN/../../nvidia/cusparse/lib' '$ORIGIN/../../nvidia/cusparse/lib'
'$ORIGIN/../../nvidia/cusparselt/lib'
'$ORIGIN/../../cusparselt/lib' '$ORIGIN/../../cusparselt/lib'
'$ORIGIN/../../nvidia/nccl/lib' '$ORIGIN/../../nvidia/nccl/lib'
'$ORIGIN/../../nvidia/nvshmem/lib'
'$ORIGIN/../../nvidia/nvtx/lib' '$ORIGIN/../../nvidia/nvtx/lib'
'$ORIGIN/../../nvidia/cufile/lib'
) )
CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}") CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib' export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN' export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
export FORCE_RPATH="--force-rpath" export FORCE_RPATH="--force-rpath"
export USE_STATIC_NCCL=0 export USE_STATIC_NCCL=0
export USE_SYSTEM_NCCL=1
export ATEN_STATIC_CUDA=0 export ATEN_STATIC_CUDA=0
export USE_CUDA_STATIC_LINK=0 export USE_CUDA_STATIC_LINK=0
export USE_CUPTI_SO=1 export USE_CUPTI_SO=1
export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
fi
elif [[ $CUDA_VERSION == "11.8" ]]; then
export USE_STATIC_CUDNN=0
# Try parallelizing nvcc as well
export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
# Bundle ptxas into the wheel, see https://github.com/pytorch/pytorch/pull/119750
export BUILD_BUNDLE_PTXAS=1
if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
echo "Bundling with cudnn and cublas."
DEPS_LIST+=(
"/usr/local/cuda/lib64/libcudnn_adv.so.9"
"/usr/local/cuda/lib64/libcudnn_cnn.so.9"
"/usr/local/cuda/lib64/libcudnn_graph.so.9"
"/usr/local/cuda/lib64/libcudnn_ops.so.9"
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9"
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9"
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9"
"/usr/local/cuda/lib64/libcudnn.so.9"
"/usr/local/cuda/lib64/libcublas.so.11"
"/usr/local/cuda/lib64/libcublasLt.so.11"
"/usr/local/cuda/lib64/libcudart.so.11.0"
"/usr/local/cuda/lib64/libnvToolsExt.so.1"
"/usr/local/cuda/lib64/libnvrtc.so.11.2" # this is not a mistake, it links to more specific cuda version
"/usr/local/cuda/lib64/libnvrtc-builtins.so.11.8"
)
DEPS_SONAME+=(
"libcudnn_adv.so.9"
"libcudnn_cnn.so.9"
"libcudnn_graph.so.9"
"libcudnn_ops.so.9"
"libcudnn_engines_runtime_compiled.so.9"
"libcudnn_engines_precompiled.so.9"
"libcudnn_heuristic.so.9"
"libcudnn.so.9"
"libcublas.so.11"
"libcublasLt.so.11"
"libcudart.so.11.0"
"libnvToolsExt.so.1"
"libnvrtc.so.11.2"
"libnvrtc-builtins.so.11.8"
)
else
echo "Using nvidia libs from pypi."
CUDA_RPATHS=(
'$ORIGIN/../../nvidia/cublas/lib'
'$ORIGIN/../../nvidia/cuda_cupti/lib'
'$ORIGIN/../../nvidia/cuda_nvrtc/lib'
'$ORIGIN/../../nvidia/cuda_runtime/lib'
'$ORIGIN/../../nvidia/cudnn/lib'
'$ORIGIN/../../nvidia/cufft/lib'
'$ORIGIN/../../nvidia/curand/lib'
'$ORIGIN/../../nvidia/cusolver/lib'
'$ORIGIN/../../nvidia/cusparse/lib'
'$ORIGIN/../../nvidia/nccl/lib'
'$ORIGIN/../../nvidia/nvtx/lib'
)
CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
export FORCE_RPATH="--force-rpath"
export USE_STATIC_NCCL=0
export USE_SYSTEM_NCCL=1
export ATEN_STATIC_CUDA=0
export USE_CUDA_STATIC_LINK=0
export USE_CUPTI_SO=1
export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
fi fi
else else
echo "Unknown cuda version $CUDA_VERSION" echo "Unknown cuda version $CUDA_VERSION"

View File

@ -22,7 +22,9 @@ retry () {
# TODO move this into the Docker images # TODO move this into the Docker images
OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release` OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
retry yum install -q -y zip openssl
elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
retry yum install -q -y zip openssl retry yum install -q -y zip openssl
elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
retry dnf install -q -y zip openssl retry dnf install -q -y zip openssl
@ -33,9 +35,6 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list") sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list")
retry apt-get update retry apt-get update
retry apt-get -y install zip openssl retry apt-get -y install zip openssl
else
echo "Unknown OS: '$OS_NAME'"
exit 1
fi fi
# Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if # Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if
@ -92,11 +91,16 @@ if [[ -z "$PYTORCH_ROOT" ]]; then
exit 1 exit 1
fi fi
pushd "$PYTORCH_ROOT" pushd "$PYTORCH_ROOT"
retry pip install -qUr requirements-build.txt
python setup.py clean python setup.py clean
retry pip install -qr requirements.txt retry pip install -qr requirements.txt
retry pip install -q numpy==2.0.1 retry pip install -q numpy==2.0.1
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
export _GLIBCXX_USE_CXX11_ABI=1
else
export _GLIBCXX_USE_CXX11_ABI=0
fi
if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
echo "Calling build_amd.py at $(date)" echo "Calling build_amd.py at $(date)"
python tools/amd_build/build_amd.py python tools/amd_build/build_amd.py
@ -104,7 +108,7 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
fi fi
echo "Calling 'python -m pip install .' at $(date)" echo "Calling setup.py install at $(date)"
if [[ $LIBTORCH_VARIANT = *"static"* ]]; then if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
STATIC_CMAKE_FLAG="-DTORCH_STATIC=1" STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
@ -120,7 +124,7 @@ fi
# TODO: Remove this flag once https://github.com/pytorch/pytorch/issues/55952 is closed # TODO: Remove this flag once https://github.com/pytorch/pytorch/issues/55952 is closed
CFLAGS='-Wno-deprecated-declarations' \ CFLAGS='-Wno-deprecated-declarations' \
BUILD_LIBTORCH_CPU_WITH_DEBUG=1 \ BUILD_LIBTORCH_CPU_WITH_DEBUG=1 \
python -m pip install --no-build-isolation -v . python setup.py install
mkdir -p libtorch/{lib,bin,include,share} mkdir -p libtorch/{lib,bin,include,share}
@ -165,6 +169,12 @@ fi
) )
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
LIBTORCH_ABI="cxx11-abi-"
else
LIBTORCH_ABI=
fi
( (
set -x set -x

View File

@ -95,7 +95,6 @@ ROCM_SO_FILES=(
"libroctracer64.so" "libroctracer64.so"
"libroctx64.so" "libroctx64.so"
"libhipblaslt.so" "libhipblaslt.so"
"libhipsparselt.so"
"libhiprtc.so" "libhiprtc.so"
) )
@ -187,28 +186,29 @@ do
OS_SO_FILES[${#OS_SO_FILES[@]}]=$file_name # Append lib to array OS_SO_FILES[${#OS_SO_FILES[@]}]=$file_name # Append lib to array
done done
ARCH=$(echo $PYTORCH_ROCM_ARCH | sed 's/;/|/g') # Replace ; separated arch list to bar for grep # FIXME: Temporary until https://github.com/pytorch/pytorch/pull/137443 lands
# Install AOTriton
if [ -e ${PYTORCH_ROOT}/.ci/docker/aotriton_version.txt ]; then
cp -a ${PYTORCH_ROOT}/.ci/docker/aotriton_version.txt aotriton_version.txt
bash ${PYTORCH_ROOT}/.ci/docker/common/install_aotriton.sh ${ROCM_HOME} && rm aotriton_version.txt
export AOTRITON_INSTALLED_PREFIX=${ROCM_HOME}/aotriton
ROCM_SO_FILES+=("libaotriton_v2.so")
fi
# rocBLAS library files # rocBLAS library files
ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
ROCBLAS_LIB_DST=lib/rocblas/library ROCBLAS_LIB_DST=lib/rocblas/library
ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH) ARCH=$(echo $PYTORCH_ROCM_ARCH | sed 's/;/|/g') # Replace ; seperated arch list to bar for grep
ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx) ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES) OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
ROCBLAS_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES)
# hipblaslt library files # hipblaslt library files
HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library
HIPBLASLT_LIB_DST=lib/hipblaslt/library HIPBLASLT_LIB_DST=lib/hipblaslt/library
HIPBLASLT_ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH) ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH)
HIPBLASLT_OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx) OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx)
HIPBLASLT_LIB_FILES=($HIPBLASLT_ARCH_SPECIFIC_FILES $HIPBLASLT_OTHER_FILES) HIPBLASLT_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES)
# hipsparselt library files
HIPSPARSELT_LIB_SRC=$ROCM_HOME/lib/hipsparselt/library
HIPSPARSELT_LIB_DST=lib/hipsparselt/library
HIPSPARSELT_ARCH_SPECIFIC_FILES=$(ls $HIPSPARSELT_LIB_SRC | grep -E $ARCH)
#HIPSPARSELT_OTHER_FILES=$(ls $HIPSPARSELT_LIB_SRC | grep -v gfx)
HIPSPARSELT_LIB_FILES=($HIPSPARSELT_ARCH_SPECIFIC_FILES $HIPSPARSELT_OTHER_FILES)
# ROCm library files # ROCm library files
ROCM_SO_PATHS=() ROCM_SO_PATHS=()
@ -243,14 +243,12 @@ DEPS_SONAME=(
DEPS_AUX_SRCLIST=( DEPS_AUX_SRCLIST=(
"${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_SRC/}" "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_SRC/}"
"${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_SRC/}" "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_SRC/}"
"${HIPSPARSELT_LIB_FILES[@]/#/$HIPSPARSELT_LIB_SRC/}"
"/opt/amdgpu/share/libdrm/amdgpu.ids" "/opt/amdgpu/share/libdrm/amdgpu.ids"
) )
DEPS_AUX_DSTLIST=( DEPS_AUX_DSTLIST=(
"${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_DST/}" "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_DST/}"
"${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_DST/}" "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_DST/}"
"${HIPSPARSELT_LIB_FILES[@]/#/$HIPSPARSELT_LIB_DST/}"
"share/libdrm/amdgpu.ids" "share/libdrm/amdgpu.ids"
) )
@ -268,6 +266,20 @@ RCCL_SHARE_FILES=($(ls $RCCL_SHARE_SRC))
DEPS_AUX_SRCLIST+=(${RCCL_SHARE_FILES[@]/#/$RCCL_SHARE_SRC/}) DEPS_AUX_SRCLIST+=(${RCCL_SHARE_FILES[@]/#/$RCCL_SHARE_SRC/})
DEPS_AUX_DSTLIST+=(${RCCL_SHARE_FILES[@]/#/$RCCL_SHARE_DST/}) DEPS_AUX_DSTLIST+=(${RCCL_SHARE_FILES[@]/#/$RCCL_SHARE_DST/})
# PyTorch 2.6+ (AOTriton 0.8b+)
# AKS = "AOTriton Kernel Storage", a file format to store GPU kernels compactly
if (( $(echo "${PYTORCH_VERSION} 2.6" | awk '{print ($1 >= $2)}') )); then
LIBAOTRITON_DIR=$(find "$ROCM_HOME/lib/" -name "libaotriton_v2.so" -printf '%h\n')
if [[ -z ${LIBAOTRITON_DIR} ]]; then
LIBAOTRITON_DIR=$(find "$ROCM_HOME/" -name "libaotriton_v2.so" -printf '%h\n')
fi
AKS_FILES=($(find "${LIBAOTRITON_DIR}/aotriton.images" -type f -name '*.aks?' -printf '%P\n'))
AKS_SRC="${LIBAOTRITON_DIR}/aotriton.images"
AKS_DST="lib/aotriton.images"
DEPS_AUX_SRCLIST+=(${AKS_FILES[@]/#/${AKS_SRC}/})
DEPS_AUX_DSTLIST+=(${AKS_FILES[@]/#/${AKS_DST}/})
fi
echo "PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH}" echo "PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH}"
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )" SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"

View File

@ -20,11 +20,7 @@ fi
source /opt/intel/oneapi/compiler/latest/env/vars.sh source /opt/intel/oneapi/compiler/latest/env/vars.sh
source /opt/intel/oneapi/pti/latest/env/vars.sh source /opt/intel/oneapi/pti/latest/env/vars.sh
source /opt/intel/oneapi/umf/latest/env/vars.sh source /opt/intel/oneapi/umf/latest/env/vars.sh
source /opt/intel/oneapi/ccl/latest/env/vars.sh
source /opt/intel/oneapi/mpi/latest/env/vars.sh
export USE_STATIC_MKL=1 export USE_STATIC_MKL=1
export USE_ONEMKL=1
export USE_XCCL=1
WHEELHOUSE_DIR="wheelhousexpu" WHEELHOUSE_DIR="wheelhousexpu"
LIBTORCH_HOUSE_DIR="libtorch_housexpu" LIBTORCH_HOUSE_DIR="libtorch_housexpu"

View File

@ -10,3 +10,5 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
built on Jenkins and are used in triggered builds already have this built on Jenkins and are used in triggered builds already have this
environment variable set in their manifest. Also see environment variable set in their manifest. Also see
`./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.

View File

@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace
if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# TODO: This can be removed later once vision is also part of the Docker image # TODO: This can be removed later once vision is also part of the Docker image
pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)" pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
# JIT C++ extensions require ninja, so put it into PATH. # JIT C++ extensions require ninja, so put it into PATH.
export PATH="/var/lib/jenkins/.local/bin:$PATH" export PATH="/var/lib/jenkins/.local/bin:$PATH"
# NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we # NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we

View File

@ -27,12 +27,6 @@ cmake --version
echo "Environment variables:" echo "Environment variables:"
env env
# The sccache wrapped version of nvcc gets put in /opt/cache/lib in docker since
# there are some issues if it is always wrapped, so we need to add it to PATH
# during CI builds.
# https://github.com/pytorch/pytorch/blob/0b6c0898e6c352c8ea93daec854e704b41485375/.ci/docker/common/install_cache.sh#L97
export PATH="/opt/cache/lib:$PATH"
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
# Use jemalloc during compilation to mitigate https://github.com/pytorch/pytorch/issues/116289 # Use jemalloc during compilation to mitigate https://github.com/pytorch/pytorch/issues/116289
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
@ -41,7 +35,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
fi fi
if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *clang* ]]; then if [[ "$BUILD_ENVIRONMENT" != *cuda11.3* && "$BUILD_ENVIRONMENT" != *clang* ]]; then
# TODO: there is a linking issue when building with UCC using clang, # TODO: there is a linking issue when building with UCC using clang,
# disable it for now and to be fix later. # disable it for now and to be fix later.
# TODO: disable UCC temporarily to enable CUDA 12.1 in CI # TODO: disable UCC temporarily to enable CUDA 12.1 in CI
@ -58,6 +52,12 @@ fi
export USE_LLVM=/opt/llvm export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm export LLVM_DIR=/opt/llvm/lib/cmake/llvm
if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then
# To build test_edge_op_registration
export BUILD_EXECUTORCH=ON
export USE_CUDA=0
fi
if ! which conda; then if ! which conda; then
# In ROCm CIs, we are doing cross compilation on build machines with # In ROCm CIs, we are doing cross compilation on build machines with
# intel cpu and later run tests on machines with amd cpu. # intel cpu and later run tests on machines with amd cpu.
@ -171,15 +171,8 @@ fi
if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
# shellcheck disable=SC1091 # shellcheck disable=SC1091
source /opt/intel/oneapi/compiler/latest/env/vars.sh source /opt/intel/oneapi/compiler/latest/env/vars.sh
# shellcheck disable=SC1091
source /opt/intel/oneapi/ccl/latest/env/vars.sh
# shellcheck disable=SC1091
source /opt/intel/oneapi/mpi/latest/env/vars.sh
# Enable XCCL build
export USE_XCCL=1
# XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
export USE_KINETO=0 export USE_KINETO=0
export TORCH_XPU_ARCH_LIST=pvc
fi fi
# sccache will fail for CUDA builds if all cores are used for compiling # sccache will fail for CUDA builds if all cores are used for compiling
@ -199,7 +192,9 @@ fi
# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
# memory to build and will OOM # memory to build and will OOM
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2" echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
fi fi
if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
@ -233,7 +228,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
export CMAKE_BUILD_TYPE=RelWithAssert export CMAKE_BUILD_TYPE=RelWithAssert
fi fi
# Do not change workspace permissions for ROCm and s390x CI jobs # Do not change workspace permissions for ROCm CI jobs
# as it can leave workspace with bad permissions for cancelled jobs # as it can leave workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96) # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
@ -255,7 +250,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
set -e -o pipefail set -e -o pipefail
get_bazel get_bazel
python3 tools/optional_submodules.py checkout_eigen
# Leave 1 CPU free and use only up to 80% of memory to reduce the change of crashing # Leave 1 CPU free and use only up to 80% of memory to reduce the change of crashing
# the runner # the runner
@ -282,8 +276,10 @@ else
# or building non-XLA tests. # or building non-XLA tests.
if [[ "$BUILD_ENVIRONMENT" != *rocm* && if [[ "$BUILD_ENVIRONMENT" != *rocm* &&
"$BUILD_ENVIRONMENT" != *xla* ]]; then "$BUILD_ENVIRONMENT" != *xla* ]]; then
# Install numpy-2.0.2 for builds which are backward compatible with 1.X if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
python -mpip install numpy==2.0.2 # Install numpy-2.0.2 for builds which are backward compatible with 1.X
python -mpip install numpy==2.0.2
fi
WERROR=1 python setup.py clean WERROR=1 python setup.py clean
@ -306,34 +302,6 @@ else
fi fi
pip_install_whl "$(echo dist/*.whl)" pip_install_whl "$(echo dist/*.whl)"
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *vision* ]]; then
install_torchvision
fi
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *audio* ]]; then
install_torchaudio
fi
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *torchrec* || "${BUILD_ADDITIONAL_PACKAGES:-}" == *fbgemm* ]]; then
install_torchrec_and_fbgemm
fi
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *torchao* ]]; then
install_torchao
fi
if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
echo "Checking that xpu is compiled"
pushd dist/
if python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'; then
echo "XPU support is compiled in."
else
echo "XPU support is NOT compiled in."
exit 1
fi
popd
fi
# TODO: I'm not sure why, but somehow we lose verbose commands # TODO: I'm not sure why, but somehow we lose verbose commands
set -x set -x

View File

@ -59,16 +59,78 @@ else
export install_root="$(dirname $(which python))/../lib/python${py_dot}/site-packages/torch/" export install_root="$(dirname $(which python))/../lib/python${py_dot}/site-packages/torch/"
fi fi
###############################################################################
# Setup XPU ENV
###############################################################################
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
set +u
# Refer https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
source /opt/intel/oneapi/compiler/latest/env/vars.sh
source /opt/intel/oneapi/pti/latest/env/vars.sh
fi
############################################################################### ###############################################################################
# Check GCC ABI # Check GCC ABI
############################################################################### ###############################################################################
# NOTE: As of https://github.com/pytorch/pytorch/issues/126551 we only produce # NOTE [ Building libtorch with old vs. new gcc ABI ]
# wheels with cxx11-abi #
# Packages built with one version of ABI could not be linked against by client
# C++ libraries that were compiled using the other version of ABI. Since both
# gcc ABIs are still common in the wild, we need to support both ABIs. Currently:
#
# - All the nightlies built on CentOS 7 + devtoolset7 use the old gcc ABI.
# - All the nightlies built on Ubuntu 16.04 + gcc 5.4 use the new gcc ABI.
echo "Checking that the gcc ABI is what we expect" echo "Checking that the gcc ABI is what we expect"
if [[ "$(uname)" != 'Darwin' ]]; then if [[ "$(uname)" != 'Darwin' ]]; then
# We also check that there are cxx11 symbols in libtorch function is_expected() {
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* || "$DESIRED_CUDA" == *"rocm"* ]]; then
if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
echo 1
fi
else
if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then
echo 1
fi
fi
}
# First we check that the env var in TorchConfig.cmake is correct
# We search for D_GLIBCXX_USE_CXX11_ABI=1 in torch/TorchConfig.cmake
torch_config="${install_root}/share/cmake/Torch/TorchConfig.cmake"
if [[ ! -f "$torch_config" ]]; then
echo "No TorchConfig.cmake found!"
ls -lah "$install_root/share/cmake/Torch"
exit 1
fi
echo "Checking the TorchConfig.cmake"
cat "$torch_config"
# The sed call below is
# don't print lines by default (only print the line we want)
# -n
# execute the following expression
# e
# replace lines that match with the first capture group and print
# s/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p
# any characters, D_GLIBCXX_USE_CXX11_ABI=, exactly one any character, a
# quote, any characters
# Note the exactly one single character after the '='. In the case that the
# variable is not set the '=' will be followed by a '"' immediately and the
# line will fail the match and nothing will be printed; this is what we
# want. Otherwise it will capture the 0 or 1 after the '='.
# /.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/
# replace the matched line with the capture group and print
# /\1/p
actual_gcc_abi="$(sed -ne 's/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p' < "$torch_config")"
if [[ "$(is_expected "$actual_gcc_abi")" != 1 ]]; then
echo "gcc ABI $actual_gcc_abi not as expected."
exit 1
fi
# We also check that there are [not] cxx11 symbols in libtorch
# #
echo "Checking that symbols in libtorch.so have the right gcc abi" echo "Checking that symbols in libtorch.so have the right gcc abi"
python3 "$(dirname ${BASH_SOURCE[0]})/smoke_test/check_binary_symbols.py" python3 "$(dirname ${BASH_SOURCE[0]})/smoke_test/check_binary_symbols.py"
@ -146,11 +208,35 @@ setup_link_flags () {
TEST_CODE_DIR="$(dirname $(realpath ${BASH_SOURCE[0]}))/test_example_code" TEST_CODE_DIR="$(dirname $(realpath ${BASH_SOURCE[0]}))/test_example_code"
build_and_run_example_cpp () { build_and_run_example_cpp () {
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
GLIBCXX_USE_CXX11_ABI=1
else
GLIBCXX_USE_CXX11_ABI=0
fi
setup_link_flags setup_link_flags
g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1 g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1
./$1 ./$1
} }
build_example_cpp_with_incorrect_abi () {
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
GLIBCXX_USE_CXX11_ABI=0
else
GLIBCXX_USE_CXX11_ABI=1
fi
set +e
setup_link_flags
g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1
ERRCODE=$?
set -e
if [ "$ERRCODE" -eq "0" ]; then
echo "Building example with incorrect ABI didn't throw error. Aborting."
exit 1
else
echo "Building example with incorrect ABI throws expected error. Proceeding."
fi
}
############################################################################### ###############################################################################
# Check simple Python/C++ calls # Check simple Python/C++ calls
############################################################################### ###############################################################################
@ -160,6 +246,11 @@ if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
export LD_LIBRARY_PATH=/usr/local/cuda/lib64 export LD_LIBRARY_PATH=/usr/local/cuda/lib64
fi fi
build_and_run_example_cpp simple-torch-test build_and_run_example_cpp simple-torch-test
# `_GLIBCXX_USE_CXX11_ABI` is always ignored by gcc in devtoolset7, so we test
# the expected failure case for Ubuntu 16.04 + gcc 5.4 only.
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
build_example_cpp_with_incorrect_abi simple-torch-test
fi
else else
pushd /tmp pushd /tmp
python -c 'import torch' python -c 'import torch'
@ -216,14 +307,6 @@ else
fi fi
fi fi
###############################################################################
# Check XPU configured correctly
###############################################################################
if [[ "$DESIRED_CUDA" == 'xpu' && "$PACKAGE_TYPE" != 'libtorch' ]]; then
echo "Checking that xpu is compiled"
python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'
fi
############################################################################### ###############################################################################
# Check CUDA configured correctly # Check CUDA configured correctly
############################################################################### ###############################################################################
@ -302,22 +385,10 @@ except RuntimeError as e:
fi fi
############################################################################### ###############################################################################
# Check for C++ ABI compatibility to GCC-11 - GCC 13 # Check for C++ ABI compatibility between gcc7 and gcc9 compiled binaries
############################################################################### ###############################################################################
if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then if [[ "$(uname)" == 'Linux' && ("$PACKAGE_TYPE" == 'conda' || "$PACKAGE_TYPE" == 'manywheel')]]; then
pushd /tmp pushd /tmp
# Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html python -c "import torch; exit(0 if torch.compiled_with_cxx11_abi() else (0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi1011' else 1))"
# gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19
# gcc 11 - CUDA 11.8, xpu, rocm
# gcc 13 - CUDA 12.6, 12.8 and cpu
# Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426
if [[ "$(uname -m)" == "s390x" ]]; then
cxx_abi="19"
elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then
cxx_abi="18"
else
cxx_abi="16"
fi
python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)"
popd popd
fi fi

View File

@ -13,13 +13,6 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then
fi fi
if which sccache > /dev/null; then if which sccache > /dev/null; then
# Clear SCCACHE_BUCKET and SCCACHE_REGION if they are empty, otherwise
# sccache will complain about invalid bucket configuration
if [[ -z "${SCCACHE_BUCKET:-}" ]]; then
unset SCCACHE_BUCKET
unset SCCACHE_REGION
fi
# Save sccache logs to file # Save sccache logs to file
sccache --stop-server > /dev/null 2>&1 || true sccache --stop-server > /dev/null 2>&1 || true
rm -f ~/sccache_error.log || true rm -f ~/sccache_error.log || true

View File

@ -13,8 +13,12 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
# HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
unset HIP_PLATFORM unset HIP_PLATFORM
export PYTORCH_TEST_WITH_ROCM=1 export PYTORCH_TEST_WITH_ROCM=1
# temporary to locate some kernel issues on the CI nodes
export HSAKMT_DEBUG_LEVEL=4
# improve rccl performance for distributed tests
export HSA_FORCE_FINE_GRAIN_PCIE=1
fi fi
# TODO: Reenable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598 # TODO: Renable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598
# shellcheck disable=SC2034 # shellcheck disable=SC2034
BUILD_TEST_LIBTORCH=0 BUILD_TEST_LIBTORCH=0

View File

@ -78,34 +78,6 @@ function pip_install_whl() {
fi fi
} }
function pip_build_and_install() {
local build_target=$1
local wheel_dir=$2
local found_whl=0
for file in "${wheel_dir}"/*.whl
do
if [[ -f "${file}" ]]; then
found_whl=1
break
fi
done
# Build the wheel if it doesn't exist
if [ "${found_whl}" == "0" ]; then
python3 -m pip wheel \
--no-build-isolation \
--no-deps \
--no-use-pep517 \
-w "${wheel_dir}" \
"${build_target}"
fi
for file in "${wheel_dir}"/*.whl
do
pip_install_whl "${file}"
done
}
function pip_install() { function pip_install() {
# retry 3 times # retry 3 times
@ -152,7 +124,14 @@ function get_pinned_commit() {
function install_torchaudio() { function install_torchaudio() {
local commit local commit
commit=$(get_pinned_commit audio) commit=$(get_pinned_commit audio)
pip_build_and_install "git+https://github.com/pytorch/audio.git@${commit}" dist/audio if [[ "$1" == "cuda" ]]; then
# TODO: This is better to be passed as a parameter from _linux-test workflow
# so that it can be consistent with what is set in build
TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${commit}"
else
pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${commit}"
fi
} }
function install_torchtext() { function install_torchtext() {
@ -160,8 +139,8 @@ function install_torchtext() {
local text_commit local text_commit
data_commit=$(get_pinned_commit data) data_commit=$(get_pinned_commit data)
text_commit=$(get_pinned_commit text) text_commit=$(get_pinned_commit text)
pip_build_and_install "git+https://github.com/pytorch/data.git@${data_commit}" dist/data pip_install --no-use-pep517 --user "git+https://github.com/pytorch/data.git@${data_commit}"
pip_build_and_install "git+https://github.com/pytorch/text.git@${text_commit}" dist/text pip_install --no-use-pep517 --user "git+https://github.com/pytorch/text.git@${text_commit}"
} }
function install_torchvision() { function install_torchvision() {
@ -174,75 +153,40 @@ function install_torchvision() {
echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c - echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c -
LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so
fi fi
pip_install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git@${commit}"
if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
# Not sure if both are needed, but why not
export FORCE_CUDA=1
export WITH_CUDA=1
fi
pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision
if [ -n "${LD_PRELOAD}" ]; then if [ -n "${LD_PRELOAD}" ]; then
LD_PRELOAD=${orig_preload} LD_PRELOAD=${orig_preload}
fi fi
} }
function install_tlparse() {
pip_install --user "tlparse==0.3.30"
PATH="$(python -m site --user-base)/bin:$PATH"
}
function install_torchrec_and_fbgemm() { function install_torchrec_and_fbgemm() {
local torchrec_commit local torchrec_commit
torchrec_commit=$(get_pinned_commit torchrec) torchrec_commit=$(get_pinned_commit torchrec)
local fbgemm_commit local fbgemm_commit
fbgemm_commit=$(get_pinned_commit fbgemm) fbgemm_commit=$(get_pinned_commit fbgemm)
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]] ; then
fbgemm_commit=$(get_pinned_commit fbgemm_rocm)
fi
pip_uninstall torchrec-nightly pip_uninstall torchrec-nightly
pip_uninstall fbgemm-gpu-nightly pip_uninstall fbgemm-gpu-nightly
pip_install setuptools-git-versioning scikit-build pyre-extensions pip_install setuptools-git-versioning scikit-build pyre-extensions
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]] ; then # TODO (huydhn): I still have no clue on why sccache doesn't work with only fbgemm_gpu here, but it
# install torchrec first because it installs fbgemm nightly on top of rocm fbgemm # seems to be an sccache-related issue
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec if [[ "$IS_A100_RUNNER" == "1" ]]; then
pip_uninstall fbgemm-gpu-nightly unset CMAKE_CUDA_COMPILER_LAUNCHER
sudo mv /opt/cache/bin /opt/cache/bin-backup
fi
pip_install tabulate # needed for newer fbgemm # See https://github.com/pytorch/pytorch/issues/106971
pip_install patchelf # needed for rocm fbgemm CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 --user "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
local wheel_dir=dist/fbgemm_gpu if [[ "$IS_A100_RUNNER" == "1" ]]; then
local found_whl=0 export CMAKE_CUDA_COMPILER_LAUNCHER=/opt/cache/bin/sccache
for file in "${wheel_dir}"/*.whl sudo mv /opt/cache/bin-backup /opt/cache/bin
do
if [[ -f "${file}" ]]; then
found_whl=1
break
fi
done
# Build the wheel if it doesn't exist
if [ "${found_whl}" == "0" ]; then
git clone --recursive https://github.com/pytorch/fbgemm
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}"
python setup.py bdist_wheel \
--package_variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
popd
# Save the wheel before cleaning up
mkdir -p dist/fbgemm_gpu
cp fbgemm/fbgemm_gpu/dist/*.whl dist/fbgemm_gpu
fi
for file in "${wheel_dir}"/*.whl
do
pip_install_whl "${file}"
done
rm -rf fbgemm
else
pip_build_and_install "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}" dist/torchrec
pip_build_and_install "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#subdirectory=fbgemm_gpu" dist/fbgemm_gpu
fi fi
} }
@ -258,10 +202,29 @@ function clone_pytorch_xla() {
fi fi
} }
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
git clone https://github.com/pytorch/benchmark torchbench
pushd torchbench
git checkout "$commit"
if [ "$1" ]; then
python install.py --continue_on_fail models "$@"
else
# Occasionally the installation may fail on one model but it is ok to continue
# to install and test other models
python install.py --continue_on_fail
fi
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
function install_torchao() { function install_torchao() {
local commit local commit
commit=$(get_pinned_commit torchao) commit=$(get_pinned_commit torchao)
pip_build_and_install "git+https://github.com/pytorch/ao.git@${commit}" dist/ao pip_install --no-use-pep517 --user "git+https://github.com/pytorch/ao.git@${commit}"
} }
function print_sccache_stats() { function print_sccache_stats() {

View File

@ -1,50 +1,31 @@
#!/bin/bash #!/bin/bash
# Script for installing sccache on the xla build job, which uses xla's docker # Script for installing sccache on the xla build job, which uses xla's docker
# image, which has sccache installed but doesn't write the stubs. This is # image and doesn't have sccache installed on it. This is mostly copied from
# mostly copied from .ci/docker/install_cache.sh. Changes are: removing checks # .ci/docker/install_cache.sh. Changes are: removing checks that will always
# that will always return the same thing, ex checks for for rocm, CUDA, changing # return the same thing, ex checks for for rocm, CUDA, and changing the path
# the path where sccache is installed, not changing /etc/environment, and not # where sccache is installed, and not changing /etc/environment.
# installing/downloading sccache as it is already in the docker image.
set -ex -o pipefail set -ex -o pipefail
install_binary() {
echo "Downloading sccache binary from S3 repo"
curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache
}
mkdir -p /tmp/cache/bin mkdir -p /tmp/cache/bin
mkdir -p /tmp/cache/lib
export PATH="/tmp/cache/bin:$PATH" export PATH="/tmp/cache/bin:$PATH"
install_binary
chmod a+x /tmp/cache/bin/sccache
function write_sccache_stub() { function write_sccache_stub() {
# Unset LD_PRELOAD for ps because of asan + ps issues # Unset LD_PRELOAD for ps because of asan + ps issues
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
if [ "$1" == "gcc" ]; then # shellcheck disable=SC2086
# Do not call sccache recursively when dumping preprocessor argument # shellcheck disable=SC2059
# For some reason it's very important for the first cached nvcc invocation printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n exec sccache $(which $1) \"\$@\"\nelse\n exec $(which $1) \"\$@\"\nfi" > "/tmp/cache/bin/$1"
cat >"/tmp/cache/bin/$1" <<EOF
#!/bin/sh
# sccache does not support -E flag, so we need to call the original compiler directly in order to avoid calling this wrapper recursively
for arg in "\$@"; do
if [ "\$arg" = "-E" ]; then
exec $(which "$1") "\$@"
fi
done
if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
exec sccache $(which "$1") "\$@"
else
exec $(which "$1") "\$@"
fi
EOF
else
cat >"/tmp/cache/bin/$1" <<EOF
#!/bin/sh
if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
exec sccache $(which "$1") "\$@"
else
exec $(which "$1") "\$@"
fi
EOF
fi
chmod a+x "/tmp/cache/bin/$1" chmod a+x "/tmp/cache/bin/$1"
} }

View File

@ -33,15 +33,56 @@ if which sccache > /dev/null; then
export PATH="${tmp_dir}:$PATH" export PATH="${tmp_dir}:$PATH"
fi fi
print_cmake_info cross_compile_arm64() {
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then # Cross compilation for arm64
# Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448 # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64 USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
}
compile_arm64() {
# Compilation for arm64
# TODO: Compile with OpenMP support (but this causes CI regressions as cross-compilation were done with OpenMP disabled)
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
}
compile_x86_64() {
USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel --plat-name=macosx_10_9_x86_64
}
build_lite_interpreter() {
echo "Testing libtorch (lite interpreter)."
CPP_BUILD="$(pwd)/../cpp_build"
# Ensure the removal of the tmp directory
trap 'rm -rfv ${CPP_BUILD}' EXIT
rm -rf "${CPP_BUILD}"
mkdir -p "${CPP_BUILD}/caffe2"
# It looks libtorch need to be built in "${CPP_BUILD}/caffe2 folder.
BUILD_LIBTORCH_PY=$PWD/tools/build_libtorch.py
pushd "${CPP_BUILD}/caffe2" || exit
VERBOSE=1 DEBUG=1 python "${BUILD_LIBTORCH_PY}"
popd || exit
"${CPP_BUILD}/caffe2/build/bin/test_lite_interpreter_runtime"
}
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} = *arm64* ]]; then
if [[ $(uname -m) == "arm64" ]]; then
compile_arm64
else
cross_compile_arm64
fi
elif [[ ${BUILD_ENVIRONMENT} = *lite-interpreter* ]]; then
export BUILD_LITE_INTERPRETER=1
build_lite_interpreter
else
compile_x86_64
fi fi
if which sccache > /dev/null; then if which sccache > /dev/null; then
print_sccache_stats print_sccache_stats
fi fi

View File

@ -20,4 +20,14 @@ print_cmake_info() {
CONDA_INSTALLATION_DIR=$(dirname "$CMAKE_EXEC") CONDA_INSTALLATION_DIR=$(dirname "$CMAKE_EXEC")
# Print all libraries under cmake rpath for debugging # Print all libraries under cmake rpath for debugging
ls -la "$CONDA_INSTALLATION_DIR/../lib" ls -la "$CONDA_INSTALLATION_DIR/../lib"
export CMAKE_EXEC
# Explicitly add conda env lib folder to cmake rpath to address the flaky issue
# where cmake dependencies couldn't be found. This seems to point to how conda
# links $CMAKE_EXEC to its package cache when cloning a new environment
install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true
# Adding the rpath will invalidate cmake signature, so signing it again here
# to trust the executable. EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid))
# with an exit code 137 otherwise
codesign -f -s - "${CMAKE_EXEC}" || true
} }

View File

@ -5,6 +5,11 @@ set -x
# shellcheck source=./macos-common.sh # shellcheck source=./macos-common.sh
source "$(dirname "${BASH_SOURCE[0]}")/macos-common.sh" source "$(dirname "${BASH_SOURCE[0]}")/macos-common.sh"
if [[ -n "$CONDA_ENV" ]]; then
# Use binaries under conda environment
export PATH="$CONDA_ENV/bin":$PATH
fi
# Test that OpenMP is enabled # Test that OpenMP is enabled
pushd test pushd test
if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available()))") == "1" ]]; then if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available()))") == "1" ]]; then
@ -13,9 +18,6 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi fi
popd popd
# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1
setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to
@ -37,16 +39,6 @@ test_python_all() {
assert_git_not_dirty
}
test_python_mps() {
setup_test_python
time python test/run_test.py --verbose --mps
MTL_CAPTURE_ENABLED=1 ${CONDA_RUN} python3 test/test_mps.py --verbose -k test_metal_capture
assert_git_not_dirty
}
test_python_shard() {
if [[ -z "$NUM_TEST_SHARDS" ]]; then
echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
@ -160,7 +152,6 @@ test_jit_hooks() {
torchbench_setup_macos() {
git clone --recursive https://github.com/pytorch/vision torchvision
git clone --recursive https://github.com/pytorch/audio torchaudio
brew install jpeg-turbo libpng
pushd torchvision
git fetch
@ -175,8 +166,7 @@ torchbench_setup_macos() {
git checkout "$(cat ../.github/ci_commit_pins/audio.txt)" git checkout "$(cat ../.github/ci_commit_pins/audio.txt)"
git submodule update --init --recursive git submodule update --init --recursive
python setup.py clean python setup.py clean
#TODO: Remove me once we figure out how to make TorchAudio find brew-installed OpenMP
USE_OPENMP=0 python setup.py develop
popd
# Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
@ -184,8 +174,9 @@ torchbench_setup_macos() {
checkout_install_torchbench
}
pip_benchmark_deps() {
python -mpip install --no-input requests cython scikit-learn six
conda install -y -c conda-forge librosa
}
@ -193,7 +184,7 @@ test_torchbench_perf() {
print_cmake_info
echo "Launching torchbench setup"
pip_benchmark_deps
torchbench_setup_macos
TEST_REPORTS_DIR=$(pwd)/test/test-reports
@ -220,61 +211,32 @@ test_torchbench_smoketest() {
print_cmake_info
echo "Launching torchbench setup"
pip_benchmark_deps
# shellcheck disable=SC2119,SC2120
torchbench_setup_macos
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
local backend=eager
local dtype=notset
local device=mps
local dtypes=(undefined float16 bfloat16 notset)
local dtype=${dtypes[$1]}
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
for backend in eager inductor; do
touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
echo "Launching torchbench inference performance run for backend ${backend} and dtype ${dtype}" echo "Setup complete, launching torchbench training performance run"
local dtype_arg="--${dtype}" for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
if [ "$dtype" == notset ]; then PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
dtype_arg="--float32" --performance --only "$model" --backend "$backend" --training --devices "$device" \
fi --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" done
for model in "${models[@]}"; do
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--performance --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true
if [ "$backend" == "inductor" ]; then
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--accuracy --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_accuracy.csv" || true
fi
done
if [ "$backend" == "inductor" ]; then
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
--performance --backend "$backend" --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_performance.csv" || true
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
--accuracy --backend "$backend" --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_accuracy.csv" || true
fi
if [ "$dtype" == notset ]; then
for dtype_ in notset amp; do
echo "Launching torchbench training performance run for backend ${backend} and dtype ${dtype_}"
touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype_}_training_${device}_performance.csv"
local dtype_arg="--${dtype_}"
if [ "$dtype_" == notset ]; then
dtype_arg="--float32"
fi
for model in "${models[@]}"; do
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--performance --only "$model" --backend "$backend" --training --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype_}_training_${device}_performance.csv" || true
done
done
fi
echo "Launching torchbench inference performance run"
for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--performance --only "$model" --backend "$backend" --inference --devices "$device" \
--output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
done
echo "Pytorch benchmark on mps device completed"
@ -284,7 +246,7 @@ test_hf_perf() {
print_cmake_info
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
pip_benchmark_deps
torchbench_setup_macos
echo "Launching HuggingFace training perf run"
@ -300,7 +262,7 @@ test_timm_perf() {
print_cmake_info
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
pip_benchmark_deps
torchbench_setup_macos
echo "Launching timm training perf run"
@ -312,6 +274,8 @@ test_timm_perf() {
echo "timm benchmark on mps device completed" echo "timm benchmark on mps device completed"
} }
install_tlparse
if [[ $TEST_CONFIG == *"perf_all"* ]]; then if [[ $TEST_CONFIG == *"perf_all"* ]]; then
test_torchbench_perf test_torchbench_perf
test_hf_perf test_hf_perf
@ -323,9 +287,7 @@ elif [[ $TEST_CONFIG == *"perf_hf"* ]]; then
elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then
test_timm_perf test_timm_perf
elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then
test_torchbench_smoketest "${SHARD_NUMBER}" test_torchbench_smoketest
elif [[ $TEST_CONFIG == *"mps"* ]]; then
test_python_mps
elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then
test_python_shard "${SHARD_NUMBER}"
if [[ "${SHARD_NUMBER}" == 1 ]]; then

View File

@ -0,0 +1,22 @@
#!/bin/bash
set -e
run_test () {
rm -rf test_tmp/ && mkdir test_tmp/ && cd test_tmp/
"$@"
cd .. && rm -rf test_tmp/
}
get_runtime_of_command () {
TIMEFORMAT=%R
# runtime=$( { time ($@ &> /dev/null); } 2>&1 1>/dev/null)
runtime=$( { time "$@"; } 2>&1 1>/dev/null)
if [[ $runtime == *"Error"* ]]; then
exit 1
fi
runtime=${runtime#+++ $@}
runtime=$(python -c "print($runtime)")
echo "$runtime"
}
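run_test gives each measurement a throwaway working directory, and get_runtime_of_command captures wall-clock seconds via TIMEFORMAT=%R. A minimal caller, purely illustrative (the measure_example function and the timed command are not part of this diff; the helpers above are assumed to be sourced):

measure_example () {
  local runtime
  # Time a trivial one-second workload; stdout of the workload is discarded by the helper
  runtime=$(get_runtime_of_command python -c "import time; time.sleep(1)")
  echo "sample runtime: ${runtime} seconds"
}
run_test measure_example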

View File

@ -0,0 +1,91 @@
import argparse
import json
import math
import sys
parser = argparse.ArgumentParser()
parser.add_argument(
"--test-name", dest="test_name", action="store", required=True, help="test name"
)
parser.add_argument(
"--sample-stats",
dest="sample_stats",
action="store",
required=True,
help="stats from sample",
)
parser.add_argument(
"--update",
action="store_true",
help="whether to update baseline using stats from sample",
)
args = parser.parse_args()
test_name = args.test_name
if "cpu" in test_name:
backend = "cpu"
elif "gpu" in test_name:
backend = "gpu"
data_file_path = f"../{backend}_runtime.json"
with open(data_file_path) as data_file:
data = json.load(data_file)
if test_name in data:
mean = float(data[test_name]["mean"])
sigma = float(data[test_name]["sigma"])
else:
# Let the test pass if baseline number doesn't exist
mean = sys.maxsize
sigma = 0.001
print("population mean: ", mean)
print("population sigma: ", sigma)
# Let the test pass if baseline number is NaN (which happened in
# the past when we didn't have logic for catching NaN numbers)
if math.isnan(mean) or math.isnan(sigma):
mean = sys.maxsize
sigma = 0.001
sample_stats_data = json.loads(args.sample_stats)
sample_mean = float(sample_stats_data["mean"])
sample_sigma = float(sample_stats_data["sigma"])
print("sample mean: ", sample_mean)
print("sample sigma: ", sample_sigma)
if math.isnan(sample_mean):
raise Exception("""Error: sample mean is NaN""") # noqa: TRY002
elif math.isnan(sample_sigma):
raise Exception("""Error: sample sigma is NaN""") # noqa: TRY002
z_value = (sample_mean - mean) / sigma
print("z-value: ", z_value)
if z_value >= 3:
raise Exception( # noqa: TRY002
f"""\n
z-value >= 3, there is high chance of perf regression.\n
To reproduce this regression, run
`cd .ci/pytorch/perf_test/ && bash {test_name}.sh` on your local machine
and compare the runtime before/after your code change.
"""
)
else:
print("z-value < 3, no perf regression detected.")
if args.update:
print("We will use these numbers as new baseline.")
new_data_file_path = f"../new_{backend}_runtime.json"
with open(new_data_file_path) as new_data_file:
new_data = json.load(new_data_file)
new_data[test_name] = {}
new_data[test_name]["mean"] = sample_mean
new_data[test_name]["sigma"] = max(sample_sigma, sample_mean * 0.1)
with open(new_data_file_path, "w") as new_data_file:
json.dump(new_data, new_data_file, indent=4)
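As a worked example of the z-test above: with a stored baseline of mean 10.0 and sigma 0.5, a sample mean of 12.0 gives z = (12.0 - 10.0) / 0.5 = 4.0, which is >= 3 and raises the regression error, while a sample mean of 11.0 gives z = 2.0 and passes. A hedged invocation sketch follows; the script filename and test name are illustrative, and only the flags defined in the code above are assumed:

# Expects a ../gpu_runtime.json baseline (and, with --update, an existing ../new_gpu_runtime.json to rewrite)
python compare_with_baseline.py \
    --test-name test_gpu_speed_mnist \
    --sample-stats '{"mean": 12.0, "sigma": 0.3}'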

Some files were not shown because too many files have changed in this diff.