Compare commits

..

7 Commits

Author SHA1 Message Date
37257774c6 Triton wheel build using 2.3.x branch (#122403)
* Triton build 2.3.x

* Revert "[Release Only] Build triton using pinned version rather branch (#121765)"

This reverts commit d69c4219127e2cf5d9637b0daacc0a24e65f8133.

* Triton wheel change

* release
2024-03-21 12:52:21 -04:00
c4e5434423 necessary change to make torch2.3 work with triton2.2 (#122139) 2024-03-21 08:24:53 -04:00
b4f90aae1b CI: Specify libc and libstdcxx versions in conda environments (#121929)
Without this we get mismatches between the GLIBC and GLIBCXX ABI used
by conda packages vs pytorch.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121556
Approved by: https://github.com/isuruf, https://github.com/malfet

(cherry picked from commit 7a53dedb07ed72b85d1e083ce38c43c7810fc5f1)

Co-authored-by: Peter Bell <peterbell10@live.co.uk>
2024-03-14 17:56:46 -04:00
94d6463255 [RELEASE ONLY CHANGES] Increase timeout for linux binary jobs, fix workflow lint (#121851)
* [release only] Increase timeout job for linux binary builds by 30min

* fix lint
2024-03-13 19:50:57 -04:00
6a89a753b1 [RELEASE ONLY CHANGES] Apply release only changes Release 2.3 (#121813)
* [Release only changes] Release only changes #2

* common+lint
2024-03-13 11:03:48 -04:00
d69c421912 [Release Only] Build triton using pinned version rather branch (#121765) 2024-03-12 19:05:23 -04:00
6725db07ae [RELEASE ONLY CHANGES] Apply release only changes Release 2.3 (#121726)
* Apply release only changes

* temp changes

* tweak

* fix

* Revert "tweak"

This reverts commit 38edcac21448829ac114c73423c84614628e2598.
2024-03-12 18:14:35 -04:00
8419 changed files with 653862 additions and 377805 deletions

View File

@ -1,4 +1,3 @@
# We do not use this library in our Bazel build. It contains an
# infinitely recursing symlink that makes Bazel very unhappy.
third_party/ittapi/
third_party/opentelemetry-cpp

View File

@ -1,5 +0,0 @@
0.6b
manylinux_2_17
rocm6.1
7f07e8a1cb1f99627eb6d77f5c0e9295c775f3c7
77c29fa3f3b614e187d7213d745e989a92708cee2bc6020419ab49019af399d1

View File

@ -84,30 +84,16 @@ fi
# CMake 3.18 is needed to support CUDA17 language variant
CMAKE_VERSION=3.18.5
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
_UCX_COMMIT=00bcc6bb18fc282eb160623b4c0d300147f579af
_UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea
# It's annoying to rename jobs every time you want to rewrite a
# configuration, so we hardcode everything here rather than do it
# from scratch
case "$image" in
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -119,24 +105,9 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks)
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -149,39 +120,9 @@ case "$image" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9)
CUDA_VERSION=11.8.0
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -193,37 +134,9 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -291,7 +204,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.0
ROCM_VERSION=5.7
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@ -302,7 +215,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.1
ROCM_VERSION=6.0
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@ -313,10 +226,9 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
XPU_VERSION=0.5
BASEKIT_VERSION=2024.0.0-49522
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.8
@ -330,10 +242,10 @@ case "$image" in
DOCS=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12)
pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
CUDA_VERSION=11.8
CUDNN_VERSION=9
CUDNN_VERSION=8
CLANG_VERSION=12
PROTOBUF=yes
DB=yes
@ -373,13 +285,6 @@ case "$image" in
CONDA_CMAKE=yes
EXECUTORCH=yes
;;
pytorch-linux-jammy-py3.12-halide)
CUDA_VERSION=12.4
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
CONDA_CMAKE=yes
HALIDE=yes
;;
pytorch-linux-focal-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
@ -387,7 +292,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
CONDA_CMAKE=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=11.8
CONDA_CMAKE=yes
@ -400,12 +305,6 @@ case "$image" in
DB=yes
VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping sccache due to the following issue
# https://github.com/pytorch/pytorch/issues/121559
SKIP_SCCACHE_INSTALL=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
;;
*)
# Catch-all for builds that are not hardcoded.
@ -454,13 +353,13 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
#when using cudnn version 8 install it separately from cuda
if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
if [[ ${CUDNN_VERSION} == 9 ]]; then
if [[ ${CUDNN_VERSION} == 8 ]]; then
IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
fi
fi
# Build image
docker build \
DOCKER_BUILDKIT=1 docker build \
--no-cache \
--progress=plain \
--build-arg "BUILD_ENVIRONMENT=${image}" \
@ -497,17 +396,14 @@ docker build \
--build-arg "DOCS=${DOCS}" \
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
--build-arg "HALIDE=${HALIDE}" \
--build-arg "XPU_VERSION=${XPU_VERSION}" \
--build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \
--build-arg "ACL=${ACL:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \
.
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
# find the correct image. As a result, here we have to replace the
# "$UBUNTU_VERSION" == "18.04-rc"

View File

@ -62,7 +62,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
# (optional) Install vision packages like OpenCV and ffmpeg
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -77,9 +77,6 @@ RUN rm install_rocm.sh
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh
RUN rm install_rocm_magma.sh
COPY ./common/install_amdsmi.sh install_amdsmi.sh
RUN bash ./install_amdsmi.sh
RUN rm install_amdsmi.sh
ENV PATH /opt/rocm/bin:$PATH
ENV PATH /opt/rocm/hcc/bin:$PATH
ENV PATH /opt/rocm/hip/bin:$PATH
@ -113,13 +110,6 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
# Install AOTriton (Early fail)
COPY ./aotriton_version.txt aotriton_version.txt
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH

View File

@ -1 +1 @@
9d859653ae916d0a72f6b2b5c5925bed38832140
e2a8f9548aecb62a68e264607174a7d207ed2929

View File

@ -1 +0,0 @@
340136fec6d3ebc73e7a19eba1663e9b0ba8ab2d

View File

@ -1 +1 @@
21eae954efa5bf584da70324b640288c3ee7aede
0a22a91d04c2b4a029a69a198eac390089c3e891

View File

@ -1 +0,0 @@
1b2f15840e0d70eec50d84c7a0575cb835524def

View File

@ -1 +1 @@
dedb7bdf339a3546896d4820366ca562c586bfa0
79c6c9b209a5692b9a895398f4f3a033f8f80415

View File

@ -1,6 +1,6 @@
set -euo pipefail
readonly version=v24.04
readonly version=v23.08
readonly src_host=https://review.mlplatform.org/ml
readonly src_repo=ComputeLibrary

View File

@ -1,5 +0,0 @@
#!/bin/bash
set -ex
cd /opt/rocm/share/amd_smi && pip install .

View File

@ -1,23 +0,0 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
TARBALL='aotriton.tar.bz2'
# This read command alwasy returns with exit code 1
read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
ARCH=$(uname -m)
AOTRITON_INSTALL_PREFIX="$1"
AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.bz2"
cd "${AOTRITON_INSTALL_PREFIX}"
# Must use -L to follow redirects
curl -L --retry 3 -o "${TARBALL}" "${AOTRITON_URL}"
ACTUAL_SHA256=$(sha256sum "${TARBALL}" | cut -d " " -f 1)
if [ "${SHA256}" != "${ACTUAL_SHA256}" ]; then
echo -n "Error: The SHA256 of downloaded tarball is ${ACTUAL_SHA256},"
echo " which does not match the expected value ${SHA256}."
exit
fi
tar xf "${TARBALL}" && rm -rf "${TARBALL}"

View File

@ -3,7 +3,7 @@
set -ex
install_ubuntu() {
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
# find the correct image. As a result, here we have to check for
# "$UBUNTU_VERSION" == "18.04"*
@ -113,6 +113,7 @@ install_centos() {
glibc-devel \
glibc-headers \
glog-devel \
hiredis-devel \
libstdc++-devel \
libsndfile-devel \
make \

View File

@ -85,7 +85,7 @@ fi
else
CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.13" ]; then
if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then
conda_install numpy=1.26.0 ${CONDA_COMMON_DEPS}
else
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}

View File

@ -1,18 +1,20 @@
#!/bin/bash
if [[ -n "${CUDNN_VERSION}" ]]; then
if [[ ${CUDNN_VERSION} == 8 ]]; then
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn
pushd tmp_cudnn
if [[ ${CUDA_VERSION:0:2} == "12" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"
elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz
else
print "Unsupported CUDA version ${CUDA_VERSION}"
exit 1
fi
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
tar xf ${CUDNN_NAME}.tar.xz
cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/

View File

@ -5,14 +5,9 @@ set -ex
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && cd tmp_cusparselt
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
arch_path='x86_64'
fi
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.5.2.1-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.5.2.1-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz

View File

@ -4,6 +4,11 @@ set -ex
install_ubuntu() {
apt-get update
apt-get install -y --no-install-recommends \
libhiredis-dev \
libleveldb-dev \
liblmdb-dev \
libsnappy-dev
# Cleanup
apt-get autoclean && apt-get clean
@ -15,6 +20,12 @@ install_centos() {
# See http://fedoraproject.org/wiki/EPEL
yum --enablerepo=extras install -y epel-release
yum install -y \
hiredis-devel \
leveldb-devel \
lmdb-devel \
snappy-devel
# Cleanup
yum clean all
rm -rf /var/cache/yum

View File

@ -37,9 +37,6 @@ install_conda_dependencies() {
install_pip_dependencies() {
pushd executorch/.ci/docker
# Install PyTorch CPU build beforehand to avoid installing the much bigger CUDA
# binaries later, ExecuTorch only needs CPU
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
# Install all Python dependencies
pip_install -r requirements-ci.txt
popd
@ -47,14 +44,13 @@ install_pip_dependencies() {
setup_executorch() {
pushd executorch
# Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
as_jenkins bash .ci/scripts/setup-vulkan-linux-deps.sh
source .ci/scripts/utils.sh
export PYTHON_EXECUTABLE=python
export EXECUTORCH_BUILD_PYBIND=ON
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
install_flatc_from_source
pip_install .
as_jenkins .ci/scripts/setup-linux.sh cmake
# Make sure that all the newly generate files are owned by Jenkins
chown -R jenkins .
popd
}

View File

@ -1,46 +0,0 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
COMMIT=$(get_pinned_commit halide)
test -n "$COMMIT"
# activate conda to populate CONDA_PREFIX
test -n "$ANACONDA_PYTHON_VERSION"
eval "$(conda shell.bash hook)"
conda activate py_$ANACONDA_PYTHON_VERSION
if [ -n "${UBUNTU_VERSION}" ];then
apt update
apt-get install -y lld liblld-15-dev libpng-dev libjpeg-dev libgl-dev \
libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev
fi
conda_install numpy scipy imageio cmake ninja
git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git
cmake -DCMAKE_BUILD_TYPE=Release \
-DLLVM_ENABLE_PROJECTS="clang" \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
-DLLVM_ENABLE_TERMINFO=OFF -DLLVM_ENABLE_ASSERTIONS=ON \
-DLLVM_ENABLE_EH=ON -DLLVM_ENABLE_RTTI=ON -DLLVM_BUILD_32_BITS=OFF \
-S llvm-project/llvm -B llvm-build -G Ninja
cmake --build llvm-build
cmake --install llvm-build --prefix llvm-install
export LLVM_ROOT=`pwd`/llvm-install
export LLVM_CONFIG=$LLVM_ROOT/bin/llvm-config
git clone https://github.com/halide/Halide.git
pushd Halide
git checkout ${COMMIT} && git submodule update --init --recursive
pip_install -r requirements.txt
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
cmake --build build
test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
cmake --install build --prefix ${CONDA_PREFIX}
chown -R jenkins ${CONDA_PREFIX}
popd
rm -rf Halide llvm-build llvm-project llvm-install
python -c "import halide" # check for errors

View File

@ -30,17 +30,15 @@ pip_install \
pip_install coloredlogs packaging
pip_install onnxruntime==1.18
pip_install onnx==1.16.0
pip_install onnxruntime==1.17.0
pip_install onnx==1.15.0
# pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps
pip_install onnxscript==0.1.0.dev20240613 --no-deps
# required by onnxscript
pip_install ml_dtypes
pip_install onnxscript==0.1.0.dev20240301 --no-deps
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
IMPORT_SCRIPT_FILENAME="/tmp/onnx_import_script.py"
as_jenkins echo 'import transformers; transformers.AutoModel.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3");' > "${IMPORT_SCRIPT_FILENAME}"
as_jenkins echo 'import transformers; transformers.AutoModel.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2");' > "${IMPORT_SCRIPT_FILENAME}"
# Need a PyTorch version for transformers to work
pip_install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu

View File

@ -11,8 +11,7 @@ mkdir -p $pb_dir
ln -s /usr/lib64 "$pb_dir/lib64"
curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3
tar -xvz --no-same-owner -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz
tar -xvz -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz
NPROC=$[$(nproc) - 2]
pushd "$pb_dir" && ./configure && make -j${NPROC} && make -j${NPROC} check && sudo make -j${NRPOC} install && sudo ldconfig
popd

View File

@ -6,6 +6,9 @@ ver() {
printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
}
# Map ROCm version to AMDGPU version
declare -A AMDGPU_VERSIONS=( ["5.0"]="21.50" ["5.1.1"]="22.10.1" ["5.2"]="22.20" )
install_ubuntu() {
apt-get update
if [[ $UBUNTU_VERSION == 18.04 ]]; then
@ -23,14 +26,31 @@ install_ubuntu() {
apt-get install -y libc++1
apt-get install -y libc++abi1
# Add amdgpu repository
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
# Add amdgpu repository
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
local amdgpu_baseurl
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
else
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/ubuntu"
fi
echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
fi
ROCM_REPO="ubuntu"
if [[ $(ver $ROCM_VERSION) -lt $(ver 4.2) ]]; then
ROCM_REPO="xenial"
fi
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
ROCM_REPO="${UBUNTU_VERSION_NAME}"
fi
# Add rocm repository
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/rocm.list
echo "deb [arch=amd64] ${rocm_baseurl} ${ROCM_REPO} main" > /etc/apt/sources.list.d/rocm.list
apt-get update --allow-insecure-repositories
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
@ -39,28 +59,34 @@ install_ubuntu() {
rocm-libs \
rccl \
rocprofiler-dev \
roctracer-dev \
amd-smi-lib
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
fi
roctracer-dev
# precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
# search for all unversioned packages
# if search fails it will abort this script; use true to avoid case where search fails
MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
else
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
fi
else
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
MIOPENKERNELS=$(apt-cache search --names-only miopenkernels | awk '{print $1}' | grep -F -v . || true)
if [[ "x${MIOPENKERNELS}" = x ]]; then
echo "miopenkernels package not available" && exit 1
else
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENKERNELS}
fi
fi
# ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
fi
# Cleanup
apt-get autoclean && apt-get clean
@ -77,19 +103,25 @@ install_centos() {
yum install -y epel-release
yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r`
# Add amdgpu repository
local amdgpu_baseurl
if [[ $OS_VERSION == 9 ]]; then
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.0/main/x86_64"
else
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
# Add amdgpu repository
local amdgpu_baseurl
if [[ $OS_VERSION == 9 ]]; then
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/9.0/main/x86_64"
else
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
else
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/7.9/main/x86_64"
fi
fi
echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo
fi
echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo
local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}"
echo "[ROCm]" > /etc/yum.repos.d/rocm.repo
@ -107,23 +139,33 @@ install_centos() {
rocm-libs \
rccl \
rocprofiler-dev \
roctracer-dev \
amd-smi-lib
roctracer-dev
# precompiled miopen kernels; search for all unversioned packages
# if search fails it will abort this script; use true to avoid case where search fails
MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
else
yum install -y ${MIOPENHIPGFX}
fi
else
yum install -y ${MIOPENHIPGFX}
MIOPENKERNELS=$(yum -q search miopenkernels | grep miopenkernels- | awk '{print $1}'| grep -F kdb. || true)
if [[ "x${MIOPENKERNELS}" = x ]]; then
echo "miopenkernels package not available" && exit 1
else
yum install -y ${MIOPENKERNELS}
fi
fi
# ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
fi
# Cleanup
yum clean all

View File

@ -13,11 +13,8 @@ conda_reinstall() {
}
if [ -n "${ROCM_VERSION}" ]; then
TRITON_REPO="https://github.com/openai/triton"
TRITON_REPO="https://github.com/ROCmSoftwarePlatform/triton"
TRITON_TEXT_FILE="triton-rocm"
elif [ -n "${XPU_VERSION}" ]; then
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
TRITON_TEXT_FILE="triton-xpu"
else
TRITON_REPO="https://github.com/openai/triton"
TRITON_TEXT_FILE="triton"

View File

@ -5,7 +5,8 @@ set -ex
install_ubuntu() {
apt-get update
apt-get install -y --no-install-recommends \
libopencv-dev
libopencv-dev \
libavcodec-dev
# Cleanup
apt-get autoclean && apt-get clean
@ -18,7 +19,8 @@ install_centos() {
yum --enablerepo=extras install -y epel-release
yum install -y \
opencv-devel
opencv-devel \
ffmpeg-devel
# Cleanup
yum clean all

View File

@ -3,7 +3,10 @@ set -xe
# Intel® software for general purpose GPU capabilities.
# Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
# Refer to https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html
# Intel® oneAPI Base Toolkit (version 2024.0.0) has been updated to include functional and security updates.
# Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html
# Users should update to the latest version as it becomes available
@ -14,16 +17,14 @@ function install_ubuntu() {
# Set up the repository. To do this, download the key to the system keyring
wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
| gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor --output /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
# Add the signed entry to APT sources and configure the APT client to use the Intel repository
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \
https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/production/2328 unified" \
| tee /etc/apt/sources.list.d/intel-gpu-jammy.list
echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
| tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
| tee /etc/apt/sources.list.d/oneAPI.list
# Update the packages list and repository index
apt-get update
@ -39,11 +40,11 @@ function install_ubuntu() {
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
# Install Intel Support Packages
if [ -n "$XPU_VERSION" ]; then
apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION}
# Install Intel® oneAPI Base Toolkit
if [ -n "$BASEKIT_VERSION" ]; then
apt-get install intel-basekit=$BASEKIT_VERSION -y
else
apt-get install -y intel-for-pytorch-gpu-dev
apt-get install intel-basekit -y
fi
# Cleanup

View File

@ -85,10 +85,10 @@ librosa>=0.6.2 ; python_version < "3.11"
#Pinned versions:
#test that import:
mypy==1.10.0
mypy==1.8.0
# Pin MyPy version because new errors are likely to appear with each release
#Description: linter
#Pinned versions: 1.10.0
#Pinned versions: 1.8.0
#test that import: test_typing.py, test_type_hints.py
networkx==2.8.8
@ -134,9 +134,9 @@ opt-einsum==3.3
#Pinned versions: 3.3
#test that import: test_linalg.py
optree==0.12.1
optree==0.9.1
#Description: A library for tree manipulation
#Pinned versions: 0.12.1
#Pinned versions: 0.9.1
#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
#test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
#common_utils.py, test_eager_transforms.py, test_python_dispatch.py,
@ -147,9 +147,9 @@ optree==0.12.1
#test_pointwise_ops.py, test_dtensor_ops.py, test_torchinductor.py, test_fx.py,
#test_fake_tensor.py, test_mps.py
pillow==10.3.0
pillow==10.2.0
#Description: Python Imaging Library fork
#Pinned versions: 10.3.0
#Pinned versions: 10.2.0
#test that import:
protobuf==3.20.2
@ -228,11 +228,12 @@ scikit-image==0.20.0 ; python_version >= "3.10"
#Pinned versions: 0.20.3
#test that import:
scipy==1.10.1 ; python_version <= "3.11"
scipy==1.12.0 ; python_version == "3.12"
scipy==1.6.3 ; python_version < "3.10"
scipy==1.8.1 ; python_version == "3.10"
scipy==1.10.1 ; python_version == "3.11"
# Pin SciPy because of failing distribution tests (see #60347)
#Description: scientific python
#Pinned versions: 1.10.1
#Pinned versions: 1.6.3
#test that import: test_unary_ufuncs.py, test_torch.py,test_tensor_creation_ops.py
#test_spectral_ops.py, test_sparse_csr.py, test_reductions.py,test_nn.py
#test_linalg.py, test_binary_ufuncs.py
@ -263,10 +264,10 @@ unittest-xml-reporting<=3.2.0,>=2.0.0
#Pinned versions:
#test that import:
#lintrunner is supported on aarch64-linux only from 0.12.4 version
lintrunner==0.12.5
#wheel not found on aarch64, and source build requires rust
lintrunner==0.10.7 ; platform_machine == "x86_64"
#Description: all about linters!
#Pinned versions: 0.12.5
#Pinned versions: 0.10.7
#test that import:
rockset==1.0.3
@ -279,9 +280,9 @@ ghstack==0.8.0
#Pinned versions: 0.8.0
#test that import:
jinja2==3.1.4
jinja2==3.1.3
#Description: jinja2 template engine
#Pinned versions: 3.1.4
#Pinned versions: 3.1.3
#test that import:
pytest-cpp==2.3.0
@ -306,9 +307,7 @@ pywavelets==1.5.0 ; python_version >= "3.12"
#Pinned versions: 1.4.1
#test that import:
lxml==5.0.0
lxml==5.0.0.
#Description: This is a requirement of unittest-xml-reporting
# Python-3.9 binaries
PyGithub==2.3.0

View File

@ -1 +1 @@
3.0.0
2.3.0

View File

@ -56,7 +56,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
# (optional) Install vision packages like OpenCV and ffmpeg
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -103,14 +103,6 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
ARG HALIDE
# Build and install halide
COPY ./common/install_halide.sh install_halide.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/halide.txt halide.txt
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
RUN rm install_halide.sh common_utils.sh halide.txt
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
@ -147,7 +139,7 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
ARG CUDNN_VERSION
ARG CUDA_VERSION
COPY ./common/install_cudnn.sh install_cudnn.sh
RUN if [ -n "${CUDNN_VERSION}" ]; then bash install_cudnn.sh; fi
RUN if [ "${CUDNN_VERSION}" -eq 8 ]; then bash install_cudnn.sh; fi
RUN rm install_cudnn.sh
# Install CUSPARSELT
@ -160,7 +152,6 @@ RUN rm install_cusparselt.sh
RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi
RUN if [ -h /usr/local/cuda-12.4/cuda-12.4 ]; then rm /usr/local/cuda-12.4/cuda-12.4; fi
USER jenkins
CMD ["bash"]

View File

@ -53,7 +53,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
# (optional) Install vision packages like OpenCV and ffmpeg
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -78,11 +78,6 @@ ENV MAGMA_HOME /opt/rocm/magma
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
# Install amdsmi
COPY ./common/install_amdsmi.sh install_amdsmi.sh
RUN bash ./install_amdsmi.sh
RUN rm install_amdsmi.sh
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh
@ -105,13 +100,6 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
# Install AOTriton
COPY ./aotriton_version.txt aotriton_version.txt
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH

View File

@ -61,20 +61,15 @@ COPY ci_commit_pins/timm.txt timm.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
# Install XPU Dependencies
ARG XPU_VERSION
COPY ./common/install_xpu.sh install_xpu.sh
RUN bash ./install_xpu.sh && rm install_xpu.sh
ARG TRITON
# Install triton, this needs to be done before sccache because the latter will
# try to reach out to S3, which docker build runners don't have access
COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton-xpu.txt triton-xpu.txt
COPY triton_version.txt triton_version.txt
# TODO: will add triton xpu commit
COPY ci_commit_pins/triton.txt triton.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt
RUN rm install_triton.sh common_utils.sh triton.txt
# (optional) Install database packages like LMDB and LevelDB
ARG DB
@ -83,13 +78,18 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
# (optional) Install vision packages like OpenCV and ffmpeg
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
ENV INSTALLED_VISION ${VISION}
# Install XPU Dependencies
ARG BASEKIT_VERSION
COPY ./common/install_xpu.sh install_xpu.sh
RUN bash ./install_xpu.sh && rm install_xpu.sh
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh

View File

@ -80,7 +80,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
# (optional) Install vision packages like OpenCV and ffmpeg
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -155,14 +155,6 @@ COPY ci_commit_pins/executorch.txt executorch.txt
RUN if [ -n "${EXECUTORCH}" ]; then bash ./install_executorch.sh; fi
RUN rm install_executorch.sh common_utils.sh executorch.txt
ARG HALIDE
# Build and install halide
COPY ./common/install_halide.sh install_halide.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/halide.txt halide.txt
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
RUN rm install_halide.sh common_utils.sh halide.txt
ARG ONNX
# Install ONNX dependencies
COPY ./common/install_onnx.sh ./common/common_utils.sh ./
@ -177,11 +169,9 @@ RUN rm install_acl.sh
ENV INSTALLED_ACL ${ACL}
# Install ccache/sccache (do this last, so we get priority in PATH)
ARG SKIP_SCCACHE_INSTALL
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
RUN if [ -z "${SKIP_SCCACHE_INSTALL}" ]; then bash ./install_cache.sh; fi
RUN rm install_cache.sh
RUN bash ./install_cache.sh && rm install_cache.sh
# Add jni.h for java host build
COPY ./common/install_jni.sh install_jni.sh
@ -198,9 +188,7 @@ ARG BUILD_ENVIRONMENT
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
# Install LLVM dev version (Defined in the pytorch/builder github repository)
ARG SKIP_LLVM_SRC_BUILD_INSTALL
COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi
# AWS specific CUDA build guidance
ENV TORCH_CUDA_ARCH_LIST Maxwell

View File

@ -1,9 +1,5 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/../pytorch/common_utils.sh"
LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)
TEST_DIR="$ROOT_DIR/test"

View File

@ -3,20 +3,6 @@
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
cleanup_workspace() {
echo "sudo may print the following warning message that can be ignored. The chown command will still run."
echo " sudo: setrlimit(RLIMIT_STACK): Operation not permitted"
echo "For more details refer to https://github.com/sudo-project/sudo/issues/42"
sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace
}
# Disable shellcheck SC2064 as we want to parse the original owner immediately.
# shellcheck disable=SC2064
trap_add cleanup_workspace EXIT
sudo chown -R jenkins /var/lib/jenkins/workspace
git config --global --add safe.directory /var/lib/jenkins/workspace
if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# TODO: This can be removed later once vision is also part of the Docker image
pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"

View File

@ -44,7 +44,15 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
fi
fi
if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* ]]; then
echo "Caffe2 build is ON"
export BUILD_CAFFE2=ON
fi
if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
export ATEN_THREADING=TBB
export USE_TBB=1
elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi
@ -73,22 +81,7 @@ if ! which conda; then
export USE_MKLDNN=0
fi
else
# CMAKE_PREFIX_PATH precedences
# 1. $CONDA_PREFIX, if defined. This follows the pytorch official build instructions.
# 2. /opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}, if ANACONDA_PYTHON_VERSION defined.
# This is for CI, which defines ANACONDA_PYTHON_VERSION but not CONDA_PREFIX.
# 3. $(conda info --base). The fallback value of pytorch official build
# instructions actually refers to this.
# Commonly this is /opt/conda/
if [[ -v CONDA_PREFIX ]]; then
export CMAKE_PREFIX_PATH=${CONDA_PREFIX}
elif [[ -v ANACONDA_PYTHON_VERSION ]]; then
export CMAKE_PREFIX_PATH="/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}"
else
# already checked by `! which conda`
CMAKE_PREFIX_PATH="$(conda info --base)"
export CMAKE_PREFIX_PATH
fi
export CMAKE_PREFIX_PATH=/opt/conda
# Workaround required for MKL library linkage
# https://github.com/pytorch/pytorch/issues/119557
@ -230,28 +223,6 @@ if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
fi
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
export CMAKE_BUILD_TYPE=RelWithAssert
fi
# Do not change workspace permissions for ROCm CI jobs
# as it can leave workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
cleanup_workspace() {
echo "sudo may print the following warning message that can be ignored. The chown command will still run."
echo " sudo: setrlimit(RLIMIT_STACK): Operation not permitted"
echo "For more details refer to https://github.com/sudo-project/sudo/issues/42"
sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace
}
# Disable shellcheck SC2064 as we want to parse the original owner immediately.
# shellcheck disable=SC2064
trap_add cleanup_workspace EXIT
sudo chown -R jenkins /var/lib/jenkins/workspace
git config --global --add safe.directory /var/lib/jenkins/workspace
fi
if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
set -e
@ -277,37 +248,16 @@ else
( ! get_exit_code python setup.py clean bad_argument )
if [[ "$BUILD_ENVIRONMENT" != *libtorch* ]]; then
# rocm builds fail when WERROR=1
# XLA test build fails when WERROR=1
# set only when building other architectures
# or building non-XLA tests.
if [[ "$BUILD_ENVIRONMENT" != *rocm* &&
"$BUILD_ENVIRONMENT" != *xla* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
# Install numpy-2.0 release candidate for builds
# Which should be backward compatible with Numpy-1.X
python -mpip install --pre numpy==2.0.0rc1
fi
WERROR=1 python setup.py clean
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 python setup.py bdist_wheel --cmake
else
WERROR=1 python setup.py bdist_wheel
fi
WERROR=1 python setup.py bdist_wheel
else
python setup.py clean
if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
source .ci/pytorch/install_cache_xla.sh
fi
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "USE_SPLIT_BUILD cannot be used with xla or rocm"
exit 1
else
python setup.py bdist_wheel
fi
python setup.py bdist_wheel
fi
pip_install_whl "$(echo dist/*.whl)"
@ -346,10 +296,9 @@ else
CUSTOM_OP_TEST="$PWD/test/custom_operator"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir -p "$CUSTOM_OP_BUILD"
pushd "$CUSTOM_OP_BUILD"
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@ -362,7 +311,7 @@ else
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir -p "$JIT_HOOK_BUILD"
pushd "$JIT_HOOK_BUILD"
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@ -374,7 +323,7 @@ else
python --version
mkdir -p "$CUSTOM_BACKEND_BUILD"
pushd "$CUSTOM_BACKEND_BUILD"
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@ -405,8 +354,4 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
python tools/stats/export_test_times.py
fi
# snadampal: skipping it till sccache support added for aarch64
# https://github.com/pytorch/pytorch/issues/121559
if [[ "$BUILD_ENVIRONMENT" != *aarch64* ]]; then
print_sccache_stats
fi
print_sccache_stats

View File

@ -56,29 +56,9 @@ function assert_git_not_dirty() {
function pip_install_whl() {
# This is used to install PyTorch and other build artifacts wheel locally
# without using any network connection
# Convert the input arguments into an array
local args=("$@")
# Check if the first argument contains multiple paths separated by spaces
if [[ "${args[0]}" == *" "* ]]; then
# Split the string by spaces into an array
IFS=' ' read -r -a paths <<< "${args[0]}"
# Loop through each path and install individually
for path in "${paths[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
done
else
# Loop through each argument and install individually
for path in "${args[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
done
fi
python3 -mpip install --no-index --no-deps "$@"
}
function pip_install() {
# retry 3 times
# old versions of pip don't have the "--progress-bar" flag
@ -179,7 +159,7 @@ function install_torchvision() {
}
function install_tlparse() {
pip_install --user "tlparse==0.3.7"
pip_install --user "tlparse==0.3.5"
PATH="$(python -m site --user-base)/bin:$PATH"
}
@ -198,7 +178,7 @@ function install_torchrec_and_fbgemm() {
function clone_pytorch_xla() {
if [[ ! -d ./xla ]]; then
git clone --recursive --quiet https://github.com/pytorch/xla.git
git clone --recursive -b r2.3 https://github.com/pytorch/xla.git
pushd xla
# pin the xla hash so that we don't get broken by changes to xla
git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"
@ -208,6 +188,28 @@ function clone_pytorch_xla() {
fi
}
function checkout_install_torchdeploy() {
local commit
commit=$(get_pinned_commit multipy)
pushd ..
git clone --recurse-submodules https://github.com/pytorch/multipy.git
pushd multipy
git checkout "${commit}"
python multipy/runtime/example/generate_examples.py
BUILD_CUDA_TESTS=1 pip install -e .
popd
popd
}
function test_torch_deploy(){
pushd ..
pushd multipy
./multipy/runtime/build/test_deploy
./multipy/runtime/build/test_deploy_gpu
popd
popd
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
@ -222,8 +224,6 @@ function checkout_install_torchbench() {
# to install and test other models
python install.py --continue_on_fail
fi
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}

View File

@ -6,7 +6,6 @@ from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.x509.oid import NameOID
temp_dir = mkdtemp()
print(temp_dir)

View File

@ -6,4 +6,4 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
echo "Testing pytorch docs"
cd docs
TERM=vt100 make doctest
make doctest

View File

@ -1,37 +0,0 @@
#!/bin/bash
# Script for installing sccache on the xla build job, which uses xla's docker
# image and doesn't have sccache installed on it. This is mostly copied from
# .ci/docker/install_cache.sh. Changes are: removing checks that will always
# return the same thing, ex checks for for rocm, CUDA, and changing the path
# where sccache is installed, and not changing /etc/environment.
set -ex
install_binary() {
echo "Downloading sccache binary from S3 repo"
curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache
}
mkdir -p /tmp/cache/bin
mkdir -p /tmp/cache/lib
export PATH="/tmp/cache/bin:$PATH"
install_binary
chmod a+x /tmp/cache/bin/sccache
function write_sccache_stub() {
# Unset LD_PRELOAD for ps because of asan + ps issues
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
# shellcheck disable=SC2086
# shellcheck disable=SC2059
printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n exec sccache $(which $1) \"\$@\"\nelse\n exec $(which $1) \"\$@\"\nfi" > "/tmp/cache/bin/$1"
chmod a+x "/tmp/cache/bin/$1"
}
write_sccache_stub cc
write_sccache_stub c++
write_sccache_stub gcc
write_sccache_stub g++
write_sccache_stub clang
write_sccache_stub clang++

View File

@ -18,9 +18,7 @@ time python test/run_test.py --verbose -i distributed/test_c10d_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_nccl
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
time python test/run_test.py --verbose -i distributed/test_store
time python test/run_test.py --verbose -i distributed/test_symmetric_memory
time python test/run_test.py --verbose -i distributed/test_pg_wrapper
time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent
# FSDP tests
@ -47,13 +45,6 @@ time python test/run_test.py --verbose -i distributed/test_device_mesh
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_ddp_2d_parallel
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_fsdp_2d_parallel
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state
# FSDP2 tests
time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh
# Pipelining composability tests
time python test/run_test.py --verbose -i distributed/pipelining/test_composability.py
# Other tests
time python test/run_test.py --verbose -i test_cuda_primary_ctx

View File

@ -3,7 +3,6 @@ import json
import math
import sys
parser = argparse.ArgumentParser()
parser.add_argument(
"--test-name", dest="test_name", action="store", required=True, help="test name"
@ -60,16 +59,16 @@ print("sample mean: ", sample_mean)
print("sample sigma: ", sample_sigma)
if math.isnan(sample_mean):
raise Exception("""Error: sample mean is NaN""") # noqa: TRY002
raise Exception("""Error: sample mean is NaN""")
elif math.isnan(sample_sigma):
raise Exception("""Error: sample sigma is NaN""") # noqa: TRY002
raise Exception("""Error: sample sigma is NaN""")
z_value = (sample_mean - mean) / sigma
print("z-value: ", z_value)
if z_value >= 3:
raise Exception( # noqa: TRY002
raise Exception(
f"""\n
z-value >= 3, there is high chance of perf regression.\n
To reproduce this regression, run

View File

@ -3,7 +3,6 @@ import sys
import numpy
sample_data_list = sys.argv[1:]
sample_data_list = [float(v.strip()) for v in sample_data_list]

View File

@ -1,7 +1,6 @@
import json
import sys
data_file_path = sys.argv[1]
commit_hash = sys.argv[2]

View File

@ -1,6 +1,5 @@
import sys
log_file_path = sys.argv[1]
with open(log_file_path) as f:

View File

@ -26,8 +26,8 @@ echo "error: python_doc_push_script.sh: version (arg2) not specified"
fi
# Argument 1: Where to copy the built documentation to
# (pytorch_docs/$install_path)
install_path="${1:-${DOCS_INSTALL_PATH:-${DOCS_VERSION}}}"
# (pytorch.github.io/$install_path)
install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
if [ -z "$install_path" ]; then
echo "error: python_doc_push_script.sh: install_path (arg1) not specified"
exit 1
@ -68,8 +68,8 @@ build_docs () {
}
git clone https://github.com/pytorch/docs pytorch_docs -b "$branch" --depth 1
pushd pytorch_docs
git clone https://github.com/pytorch/pytorch.github.io -b "$branch" --depth 1
pushd pytorch.github.io
export LC_ALL=C
export PATH=/opt/conda/bin:$PATH
@ -105,7 +105,6 @@ if [ "$is_main_doc" = true ]; then
echo undocumented objects found:
cat build/coverage/python.txt
echo "Make sure you've updated relevant .rsts in docs/source!"
echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
exit 1
fi
else

View File

@ -6,27 +6,6 @@
set -ex
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# Do not change workspace permissions for ROCm CI jobs
# as it can leave workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
cleanup_workspace() {
echo "sudo may print the following warning message that can be ignored. The chown command will still run."
echo " sudo: setrlimit(RLIMIT_STACK): Operation not permitted"
echo "For more details refer to https://github.com/sudo-project/sudo/issues/42"
sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace
}
# Disable shellcheck SC2064 as we want to parse the original owner immediately.
# shellcheck disable=SC2064
trap_add cleanup_workspace EXIT
sudo chown -R jenkins /var/lib/jenkins/workspace
git config --global --add safe.directory /var/lib/jenkins/workspace
fi
echo "Environment variables:"
env
@ -111,6 +90,9 @@ if [[ -n $TESTS_TO_INCLUDE ]]; then
INCLUDE_CLAUSE="--include $TESTS_TO_INCLUDE"
fi
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
echo "Environment variables"
env
@ -181,11 +163,6 @@ if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
export PATH="$HOME/.local/bin:$PATH"
fi
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
# TODO: revisit this once the CI is stabilized on aarch64 linux
export VALGRIND=OFF
fi
install_tlparse
# DANGER WILL ROBINSON. The LD_PRELOAD here could cause you problems
@ -234,6 +211,8 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
export LD_PRELOAD=/usr/lib/llvm-15/lib/clang/15.0.7/lib/linux/libclang_rt.asan-x86_64.so
# Disable valgrind for asan
export VALGRIND=OFF
# Increase stack size, because ASAN red zones use more stack
ulimit -s 81920
(cd test && python -c "import torch; print(torch.__version__, torch.version.git_version)")
echo "The next four invocations are expected to crash; if they don't that means ASAN/UBSAN is misconfigured"
@ -249,7 +228,9 @@ fi
# This tests that the debug asserts are working correctly.
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
echo "We are in debug mode: $BUILD_ENVIRONMENT. Expect the python assertion to fail"
(cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
# TODO: Enable the check after we setup the build to run debug asserts without having
# to do a full (and slow) debug build
# (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
elif [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
# Noop when debug is disabled. Skip bazel jobs because torch isn't available there yet.
echo "We are not in debug mode: $BUILD_ENVIRONMENT. Expect the assertion to pass"
@ -275,9 +256,6 @@ test_python_shard() {
# Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly
# shellcheck disable=SC2086
# modify LD_LIBRARY_PATH to ensure it has the conda env.
# This set of tests has been shown to be buggy without it for the split-build
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION
assert_git_not_dirty
@ -311,25 +289,19 @@ test_dynamo_shard() {
test_inductor_distributed() {
# Smuggle a few multi-gpu tests here so that we don't have to request another large node
echo "Testing multi_gpu tests in test_torchinductor"
python test/run_test.py -i inductor/test_torchinductor.py -k test_multi_gpu --verbose
python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_cuda_device --verbose
python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose
python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_state_dict.py -k test_dp_state_dict_save_load --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose
pytest test/inductor/test_torchinductor.py -k test_multi_gpu
pytest test/inductor/test_aot_inductor.py -k test_non_default_cuda_device
pytest test/inductor/test_aot_inductor.py -k test_replicate_on_devices
pytest test/distributed/test_c10d_functional_native.py
pytest test/distributed/_tensor/test_dtensor_compile.py
pytest test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
pytest test/distributed/_composable/fsdp/test_fully_shard_comm.py
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp
pytest test/distributed/_composable/fsdp/test_fully_shard_frozen.py
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype
# this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
# with if required # gpus aren't available
@ -337,50 +309,26 @@ test_inductor_distributed() {
assert_git_not_dirty
}
test_inductor_shard() {
if [[ -z "$NUM_TEST_SHARDS" ]]; then
echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
exit 1
fi
test_inductor() {
python tools/dynamo/verify_dynamo.py
python test/run_test.py --inductor \
--include test_modules test_ops test_ops_gradients test_torch \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
python test/run_test.py --inductor --include test_modules test_ops test_ops_gradients test_torch --verbose
# Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state
python test/run_test.py \
--include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
}
python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo --verbose
test_inductor_aoti() {
# docker build uses bdist_wheel which does not work with test_aot_inductor
# TODO: need a faster way to build
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aot_inductor
fi
}
test_inductor_cpp_wrapper_abi_compatible() {
export TORCHINDUCTOR_ABI_COMPATIBLE=1
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
# cpu stack allocation causes segfault and needs more investigation
PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper
TORCHINDUCTOR_STACK_ALLOCATION=0 python test/run_test.py --include inductor/test_cpu_cpp_wrapper
python test/run_test.py --include inductor/test_cuda_cpp_wrapper
TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
--training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
--output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_timm_training.csv"
}
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@ -405,7 +353,7 @@ if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--dynamic-shapes --dynamic-batch-only)
fi
if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
else
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
@ -484,17 +432,6 @@ test_perf_for_dashboard() {
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
--output "$TEST_REPORTS_DIR/${backend}_max_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv"
fi
if [[ "$DASHBOARD_TAG" == *cudagraphs_low_precision-true* ]] && [[ "$mode" == "inference" ]]; then
# TODO: This has a new dtype called quant and the benchmarks script needs to be updated to support this.
# The tentative command is as follows. It doesn't work now, but it's ok because we only need mock data
# to fill the dashboard.
python "benchmarks/dynamo/$suite.py" \
"${target_flag[@]}" --"$mode" --quant --backend "$backend" "$@" \
--output "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_cuda_${target}.csv" || true
# Copy cudagraph results as mock data, easiest choice?
cp "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_cuda_${target}.csv" \
"$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_cuda_${target}.csv"
fi
done
done
}
@ -530,10 +467,9 @@ test_single_dynamo_benchmark() {
test_perf_for_dashboard "$suite" \
"${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}"
else
if [[ "${TEST_CONFIG}" == *aot_inductor* && "${TEST_CONFIG}" != *cpu_aot_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
# Test AOTInductor with the ABI-compatible mode on CI
# This can be removed once the ABI-compatible mode becomes default.
# For CPU device, we perfer non ABI-compatible mode on CI when testing AOTInductor.
export TORCHINDUCTOR_ABI_COMPATIBLE=1
fi
python "benchmarks/dynamo/$suite.py" \
@ -550,16 +486,6 @@ test_single_dynamo_benchmark() {
fi
}
test_inductor_micro_benchmark() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
}
test_inductor_halide() {
python test/run_test.py --include inductor/test_halide.py --verbose
assert_git_not_dirty
}
test_dynamo_benchmark() {
# Usage: test_dynamo_benchmark huggingface 0
TEST_REPORTS_DIR=$(pwd)/test/test-reports
@ -574,16 +500,8 @@ test_dynamo_benchmark() {
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
local dt="float32"
if [[ "${TEST_CONFIG}" == *amp* ]]; then
dt="amp"
fi
if [[ "${TEST_CONFIG}" == *freezing* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "$@"
fi
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
else
@ -597,16 +515,12 @@ test_inductor_torchbench_smoketest_perf() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
# Test some models in the cpp wrapper mode
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
# smoke test the cpp_wrapper mode
TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy --bfloat16 \
--inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
@ -621,8 +535,7 @@ test_inductor_torchbench_smoketest_perf() {
# https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314,
# and thus we lower its threshold to reduce flakiness. If this continues to be a problem,
# we switch to use some other model.
# lowering threshold from 4.9 to 4.7 for cu124. Will bump it up after cuda 12.4.0->12.4.1 update
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.7
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.9
# Check memory compression ratio for a few models
for test in hf_Albert timm_vision_transformer; do
@ -634,15 +547,6 @@ test_inductor_torchbench_smoketest_perf() {
"$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \
--expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
done
# Perform some "warm-start" runs for a few huggingface models.
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv"
done
}
test_inductor_torchbench_cpu_smoketest_perf(){
@ -689,12 +593,6 @@ test_inductor_torchbench_cpu_smoketest_perf(){
done
}
test_torchbench_gcp_smoketest(){
pushd "${TORCHBENCHPATH}"
python test.py -v
popd
}
test_python_gloo_with_tls() {
source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh"
assert_git_not_dirty
@ -726,6 +624,7 @@ test_aten() {
${SUDO} ln -sf "$TORCH_LIB_DIR"/libmkldnn* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libnccl* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libtorch* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libtbb* "$TEST_BASE_DIR"
ls "$TEST_BASE_DIR"
aten/tools/run_tests.sh "$TEST_BASE_DIR"
@ -750,6 +649,21 @@ test_without_numpy() {
popd
}
# pytorch extensions require including torch/extension.h which includes all.h
# which includes utils.h which includes Parallel.h.
# So you can call for instance parallel_for() from your extension,
# but the compilation will fail because of Parallel.h has only declarations
# and definitions are conditionally included Parallel.h(see last lines of Parallel.h).
# I tried to solve it #39612 and #39881 by including Config.h into Parallel.h
# But if Pytorch is built with TBB it provides Config.h
# that has AT_PARALLEL_NATIVE_TBB=1(see #3961 or #39881) and it means that if you include
# torch/extension.h which transitively includes Parallel.h
# which transitively includes tbb.h which is not available!
if [[ "${BUILD_ENVIRONMENT}" == *tbb* ]]; then
sudo mkdir -p /usr/include/tbb
sudo cp -r "$PWD"/third_party/tbb/include/tbb/* /usr/include/tbb
fi
test_libtorch() {
local SHARD="$1"
@ -763,6 +677,7 @@ test_libtorch() {
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libnvfuser* "$TORCH_BIN_DIR"
export CPP_TESTS_DIR="${TORCH_BIN_DIR}"
@ -899,6 +814,7 @@ test_rpc() {
# test reporting process to function as expected.
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
CPP_TESTS_DIR="${TORCH_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_cpp_rpc
}
@ -1178,21 +1094,15 @@ test_executorch() {
pushd /executorch
export PYTHON_EXECUTABLE=python
export EXECUTORCH_BUILD_PYBIND=ON
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
# from the PR
# NB: We need to build ExecuTorch runner here and not inside the Docker image
# because it depends on PyTorch
# shellcheck disable=SC1091
source .ci/scripts/setup-linux.sh cmake
echo "Run ExecuTorch unit tests"
pytest -v -n auto
# shellcheck disable=SC1091
LLVM_PROFDATA=llvm-profdata-12 LLVM_COV=llvm-cov-12 bash test/run_oss_cpp_tests.sh
source .ci/scripts/utils.sh
build_executorch_runner "cmake"
echo "Run ExecuTorch regression tests for some models"
# NB: This is a sample model, more can be added here
export PYTHON_EXECUTABLE=python
# TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
# shellcheck disable=SC1091
source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''
@ -1206,33 +1116,11 @@ test_executorch() {
assert_git_not_dirty
}
test_linux_aarch64(){
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop --verbose
# Dynamo tests
python test/run_test.py --include dynamo/test_compile dynamo/test_backends dynamo/test_comptime dynamo/test_config \
dynamo/test_functions dynamo/test_fx_passes_pre_grad dynamo/test_interop dynamo/test_model_output dynamo/test_modules \
dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles --verbose
# Inductor tests
python test/run_test.py --include inductor/test_torchinductor inductor/test_benchmark_fusion inductor/test_codecache \
inductor/test_config inductor/test_control_flow inductor/test_coordinate_descent_tuner inductor/test_fx_fusion \
inductor/test_group_batch_fusion inductor/test_inductor_freezing inductor/test_inductor_utils \
inductor/test_inplacing_pass inductor/test_kernel_benchmark inductor/test_layout_optim \
inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \
inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \
inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes --verbose
}
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
(cd test && python -c "import torch; print(torch.__config__.show())")
(cd test && python -c "import torch; print(torch.__config__.parallel_info())")
fi
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
test_linux_aarch64
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
if [[ "${TEST_CONFIG}" == *backward* ]]; then
test_forward_backward_compatibility
# Do NOT add tests after bc check tests, see its comment.
elif [[ "${TEST_CONFIG}" == *xla* ]]; then
@ -1252,12 +1140,11 @@ elif [[ "$TEST_CONFIG" == distributed ]]; then
if [[ "${SHARD_NUMBER}" == 1 ]]; then
test_rpc
fi
elif [[ "$TEST_CONFIG" == deploy ]]; then
checkout_install_torchdeploy
test_torch_deploy
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
test_inductor_halide
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
install_torchvision
id=$((SHARD_NUMBER-1))
@ -1267,14 +1154,13 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
id=$((SHARD_NUMBER-1))
test_dynamo_benchmark timm_models "$id"
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
install_torchaudio cpu
else
install_torchaudio cuda
fi
install_torchtext
install_torchvision
TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install git+https://github.com/pytorch/ao.git
id=$((SHARD_NUMBER-1))
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
@ -1286,14 +1172,11 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
shufflenet_v2_x1_0 hf_GPT2
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
checkout_install_torchbench
TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest
else
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu_inductor* && "${TEST_CONFIG}" != *cpu_aot_inductor* ]]; then
if [[ "${TEST_CONFIG}" != *cpu_inductor* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
@ -1301,23 +1184,17 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then
install_torchvision
test_inductor_cpp_wrapper_abi_compatible
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
install_torchvision
test_inductor_shard "${SHARD_NUMBER}"
if [[ "${SHARD_NUMBER}" == 1 ]]; then
test_inductor_aoti
test_inductor_distributed
fi
elif [[ "${TEST_CONFIG}" == *dynamo* ]]; then
test_inductor
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
test_dynamo_shard 1
test_aten
elif [[ "${TEST_CONFIG}" == *dynamo* && $SHARD_NUMBER -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
test_dynamo_shard "${SHARD_NUMBER}"
if [[ "${SHARD_NUMBER}" == 1 ]]; then
test_aten
fi
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
install_torchvision
test_python_shard "$SHARD_NUMBER"
test_aten
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
test_without_numpy
install_torchvision
@ -1347,6 +1224,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
test_libtorch
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
test_docs_test
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
install_torchvision
test_python
test_aten
elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
install_torchvision
test_python

View File

@ -17,22 +17,22 @@ set PATH=C:\Program Files\CMake\bin;C:\Program Files\7-Zip;C:\ProgramData\chocol
set INSTALLER_DIR=%SCRIPT_HELPERS_DIR%\installation-helpers
call %INSTALLER_DIR%\install_magma.bat
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
call %INSTALLER_DIR%\install_sccache.bat
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
:: Miniconda has been installed as part of the Windows AMI with all the dependencies.
:: We just need to activate it here
call %INSTALLER_DIR%\activate_miniconda3.bat
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
:: Override VS env here
pushd .
@ -41,8 +41,8 @@ if "%VC_VERSION%" == "" (
) else (
call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64 -vcvars_ver=%VC_VERSION%
)
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
@echo on
popd
@ -52,12 +52,12 @@ set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION%
if x%CUDA_VERSION:.=%==x%CUDA_VERSION% (
echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.'
goto fail
exit /b 1
)
rem version transformer, for example 10.1 to 10_1.
if x%CUDA_VERSION:.=%==x%CUDA_VERSION% (
echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.'
goto fail
exit /b 1
)
set VERSION_SUFFIX=%CUDA_VERSION:.=_%
set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%
@ -101,8 +101,8 @@ if "%USE_CUDA%"=="1" (
:: CMake requires a single command as CUDA_NVCC_EXECUTABLE, so we push the wrappers
:: randomtemp.exe and sccache.exe into a batch file which CMake invokes.
curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.4/randomtemp.exe --output %TMP_DIR_WIN%\bin\randomtemp.exe
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
echo @"%TMP_DIR_WIN%\bin\randomtemp.exe" "%TMP_DIR_WIN%\bin\sccache.exe" "%CUDA_PATH%\bin\nvcc.exe" %%* > "%TMP_DIR%/bin/nvcc.bat"
cat %TMP_DIR%/bin/nvcc.bat
set CUDA_NVCC_EXECUTABLE=%TMP_DIR%/bin/nvcc.bat
@ -114,8 +114,8 @@ if "%USE_CUDA%"=="1" (
set
python setup.py bdist_wheel
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
sccache --show-stats
python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
(
@ -135,8 +135,3 @@ python -c "import os, glob; os.system('python -mpip install --no-index --no-deps
sccache --show-stats --stats-format json | jq .stats > sccache-stats-%BUILD_ENVIRONMENT%-%OUR_GITHUB_JOB_ID%.json
sccache --stop-server
exit /b 0
:fail
exit /b 1

View File

@ -4,7 +4,6 @@ import os
import subprocess
import sys
COMMON_TESTS = [
(
"Checking that torch is available",

View File

@ -5,7 +5,6 @@ import sys
import yaml
# Need to import modules that lie on an upward-relative path
sys.path.append(os.path.join(sys.path[0], ".."))

View File

@ -46,18 +46,13 @@ if [[ "\$python_nodot" = *310* ]]; then
PROTOBUF_PACKAGE="protobuf>=3.19.0"
fi
if [[ "\$python_nodot" = *39* ]]; then
if [[ "\$python_nodot" = *39* ]]; then
# There's an issue with conda channel priority where it'll randomly pick 1.19 over 1.20
# we set a lower boundary here just to be safe
NUMPY_PIN=">=1.20"
fi
if [[ "\$python_nodot" = *38* ]]; then
# sympy 1.12.1 is the last version that supports Python 3.8
SYMPY_PIN="==1.12.1"
else
SYMPY_PIN=">=1.13.0"
fi
# Move debug wheels out of the package dir so they don't get installed
mkdir -p /tmp/debug_final_pkgs
@ -88,7 +83,7 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
"numpy\${NUMPY_PIN}" \
mkl>=2018 \
ninja \
"sympy\${SYMPY_PIN}" \
sympy \
typing-extensions \
${PROTOBUF_PACKAGE}
if [[ "$DESIRED_CUDA" == 'cpu' ]]; then
@ -101,21 +96,8 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
)
elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)"
pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)"
# todo: after folder is populated use the pypi_pkg channel instead
pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg"
retry pip install -q numpy protobuf typing-extensions
else
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
fi
else
pip install "\$pkg"
retry pip install -q numpy protobuf typing-extensions
fi
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
fi
if [[ "$PACKAGE_TYPE" == libtorch ]]; then
pkg="\$(ls /final_pkgs/*-latest.zip)"
@ -123,18 +105,9 @@ if [[ "$PACKAGE_TYPE" == libtorch ]]; then
cd /tmp/libtorch
fi
if [[ "$GPU_ARCH_TYPE" == xpu ]]; then
# Workaround for __mkl_tmp_MOD unbound variable issue, refer https://github.com/pytorch/pytorch/issues/130543
set +u
source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh
fi
# Test the package
/builder/check_binary.sh
# Clean temp files
cd /builder && git clean -ffdx
# =================== The above code will be executed inside Docker container ===================
EOL
echo

View File

@ -33,9 +33,9 @@ if [[ -z "$DOCKER_IMAGE" ]]; then
if [[ "$PACKAGE_TYPE" == conda ]]; then
export DOCKER_IMAGE="pytorch/conda-cuda"
elif [[ "$DESIRED_CUDA" == cpu ]]; then
export DOCKER_IMAGE="pytorch/manylinux:cpu"
export DOCKER_IMAGE="pytorch/manylinux-cpu"
else
export DOCKER_IMAGE="pytorch/manylinux-builder:${DESIRED_CUDA:2}"
export DOCKER_IMAGE="pytorch/manylinux-cuda${DESIRED_CUDA:2}"
fi
fi
@ -75,9 +75,9 @@ export PYTORCH_BUILD_NUMBER=1
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
# Only linux Python < 3.13 are supported wheels for triton
# Only linux Python < 3.12 are supported wheels for triton
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.12'"
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
@ -87,11 +87,11 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:
fi
# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt)
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}"
fi
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
@ -100,18 +100,30 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
fi
fi
# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* && $(uname) == "Linux" ]]; then
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt)
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+${TRITON_SHORTHASH}"
fi
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
else
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}"
JAVA_HOME=
BUILD_JNI=OFF
if [[ "$PACKAGE_TYPE" == libtorch ]]; then
POSSIBLE_JAVA_HOMES=()
POSSIBLE_JAVA_HOMES+=(/usr/local)
POSSIBLE_JAVA_HOMES+=(/usr/lib/jvm/java-8-openjdk-amd64)
POSSIBLE_JAVA_HOMES+=(/Library/Java/JavaVirtualMachines/*.jdk/Contents/Home)
# Add the Windows-specific JNI path
POSSIBLE_JAVA_HOMES+=("$PWD/pytorch/.circleci/windows-jni/")
for JH in "${POSSIBLE_JAVA_HOMES[@]}" ; do
if [[ -e "$JH/include/jni.h" ]] ; then
# Skip if we're not on Windows but haven't found a JAVA_HOME
if [[ "$JH" == "$PWD/pytorch/.circleci/windows-jni/" && "$OSTYPE" != "msys" ]] ; then
break
fi
echo "Found jni.h under $JH"
JAVA_HOME="$JH"
BUILD_JNI=ON
break
fi
done
if [ -z "$JAVA_HOME" ]; then
echo "Did not find jni.h"
fi
fi
cat >"$envfile" <<EOL
@ -124,7 +136,6 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}"
export DESIRED_CUDA="$DESIRED_CUDA"
export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}"
if [[ "${OSTYPE}" == "msys" ]]; then
export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then
@ -148,6 +159,8 @@ export TORCH_CONDA_BUILD_FOLDER='pytorch-nightly'
export ANACONDA_USER='pytorch'
export USE_FBGEMM=1
export JAVA_HOME=$JAVA_HOME
export BUILD_JNI=$BUILD_JNI
export PIP_UPLOAD_FOLDER="$PIP_UPLOAD_FOLDER"
export DOCKER_IMAGE="$DOCKER_IMAGE"

View File

@ -25,15 +25,6 @@ if [[ "${DRY_RUN}" = "disabled" ]]; then
AWS_S3_CP="aws s3 cp"
fi
if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
fi
# this is special build with all dependencies packaged
if [[ ${BUILD_NAME} == *-full* ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full"
fi
# Sleep 2 minutes between retries for conda upload
retry () {
"$@" || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@")

View File

@ -8,7 +8,6 @@ import time
import requests
AZURE_PIPELINE_BASE_URL = "https://aiinfra.visualstudio.com/PyTorch/"
AZURE_DEVOPS_PAT_BASE64 = os.environ.get("AZURE_DEVOPS_PAT_BASE64_SECRET", "")
PIPELINE_ID = "911"

View File

@ -36,7 +36,6 @@ hicpp-exception-baseclass,
hicpp-avoid-goto,
misc-*,
-misc-const-correctness,
-misc-include-cleaner,
-misc-use-anonymous-namespace,
-misc-unused-parameters,
-misc-no-recursion,
@ -61,7 +60,6 @@ readability-simplify-subscript-expr,
readability-string-compare,
'
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
AnalyzeTemporaryDtors: false
WarningsAsErrors: '*'
CheckOptions:
misc-header-include-cycle.IgnoredFilesList: 'format.h;ivalue.h;custom_class.h;Dict.h;List.h'
...

View File

@ -2,7 +2,7 @@
# NOTE: **Mirror any changes** to this file the [tool.ruff] config in pyproject.toml
# before we can fully move to use ruff
enable-extensions = G
select = B,C,E,F,G,P,SIM1,SIM911,T4,W,B9,TOR0,TOR1,TOR2,TOR9
select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2,TOR9
max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
@ -54,7 +54,6 @@ per-file-ignores =
torch/ao/quantization/fx/_decomposed.py: TOR901
torch/distributed/_functional_collectives.py: TOR901
torch/distributed/_spmd/data_parallel.py: TOR901
torch/distributed/_tensor/_collective_utils.py: TOR901
optional-ascii-coding = True
exclude =
./.git,

View File

@ -40,7 +40,3 @@ e6ec0efaf87703c5f889cfc20b29be455885d58d
a53cda1ddc15336dc1ff0ce1eff2a49cdc5f882e
# 2024-01-02 clangformat: fused adam #116583
9dc68d1aa9e554d09344a10fff69f7b50b2d23a0
# 2024-06-28 enable UFMT in `torch/storage.py`
d80939e5e9337e8078f11489afefec59fd42f93b
# 2024-06-28 enable UFMT in `torch.utils.data`
7cf0b90e49689d45be91aa539fdf54cf2ea8a9a3

1
.gitattributes vendored
View File

@ -4,4 +4,3 @@
.github/generated-* linguist-generated=true
.github/scripts/gql_mocks.json linguist-generated=true
third_party/LICENSES_BUNDLED.txt linguist-generated=true
tools/build/bazel/requirements.txt linguist-generated=true

View File

@ -8,18 +8,7 @@ body:
value: >
#### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the
existing and past issues](https://github.com/pytorch/pytorch/issues)
It's likely that your bug will be resolved by checking our FAQ or troubleshooting guide [documentation](https://pytorch.org/docs/main/dynamo/index.html)
Note: if you're submitting an issue that you generated from a fuzzer. Please do the following:
- Ensure rtol/atol are at default tolerances
- Dont compare indices of max/min etc, because that avoids the above requirement
- If comparing eager and torch.compile at fp16/bf16, you should use fp32 as baseline
If the above requirements are met, add the label "topic: fuzzer" to your issue.
It's likely that your bug will be resolved by checking our FAQ or troubleshooting guide [documentation](https://pytorch.org/docs/master/dynamo/index.html)
- type: textarea
attributes:
label: 🐛 Describe the bug
@ -44,7 +33,7 @@ body:
label: Minified repro
description: |
Please run the minifier on your example and paste the minified code below
Learn more here https://pytorch.org/docs/main/torch.compiler_troubleshooting.html
Learn more here https://pytorch.org/docs/master/compile/troubleshooting.html
placeholder: |
env TORCHDYNAMO_REPRO_AFTER="aot" python your_model.py
or

View File

@ -1,12 +1,9 @@
self-hosted-runner:
labels:
# GitHub hosted x86 Linux runners
- linux.20_04.4x
- linux.20_04.16x
# Repo-specific LF hosted ARC runners
- linux.large.arc
# Organization-wide AWS Linux Runners
- linux.large
- linux.large.arc
- linux.2xlarge
- linux.4xlarge
- linux.12xlarge
@ -16,36 +13,16 @@ self-hosted-runner:
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu
- linux.g5.4xlarge.nvidia.gpu
# Organization-wide AWS Linux Runners on Linux Foundation account
- lf.linux.large
- lf.linux.2xlarge
- lf.linux.4xlarge
- lf.linux.12xlarge
- lf.linux.24xlarge
- lf.linux.arm64.2xlarge
- lf.linux.4xlarge.nvidia.gpu
- lf.linux.8xlarge.nvidia.gpu
- lf.linux.16xlarge.nvidia.gpu
- lf.linux.g5.4xlarge.nvidia.gpu
# Repo-specific IBM hosted S390x runner
- linux.s390x
# Organization wide AWS Windows runners
- windows.4xlarge.nonephemeral
- windows.8xlarge.nvidia.gpu
- windows.8xlarge.nvidia.gpu.nonephemeral
- windows.g5.4xlarge.nvidia.gpu
# Organization-wide AMD hosted MI300 runners
- bm-runner
- linux.rocm.gpu
# Repo-specific Apple hosted runners
- macos-m1-ultra
- macos-m2-14
# Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
- macos-m1-stable
- macos-m1-13
- macos-m1-14
# GitHub-hosted MacOS runners
- macos-12-xl
- macos-12
- macos12.3-m1
- macos-latest-xlarge
- macos-13-xlarge
- macos-14-xlarge
# Organization-wide Intel hosted XPU runners
- linux.idc.xpu

View File

@ -14,14 +14,12 @@ runs:
- name: Cleans up diskspace
shell: bash
run: |
set -ex
diskspace_cutoff=${{ inputs.diskspace-cutoff }}
docker_root_dir=$(docker info -f '{{.DockerRootDir}}')
diskspace=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
diskspace=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then
docker system prune -af
diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
diskspace_new=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
echo "$msg"

View File

@ -9,10 +9,6 @@ inputs:
use-gha:
description: If set to any value, use GHA to download the artifact. Otherwise use s3.
required: false
s3-bucket:
description: S3 bucket to download builds
required: false
default: "gha-artifacts"
runs:
using: composite
@ -22,10 +18,9 @@ runs:
uses: seemethere/download-artifact-s3@v4
with:
name: ${{ inputs.name }}
s3-bucket: ${{ inputs.s3-bucket }}
- name: Download PyTorch Build Artifacts from GHA
if: ${{ inputs.use-gha }}
if: inputs.use-gha
uses: actions/download-artifact@v3
with:
name: ${{ inputs.name }}
@ -34,10 +29,6 @@ runs:
shell: bash
run: unzip -o artifacts.zip
- name: Remove artifacts.zip
shell: bash
run: rm artifacts.zip
- name: Output disk space left
shell: bash
run: df -H

View File

@ -13,13 +13,6 @@ inputs:
required: true
type: string
description: JSON description of what test configs to run.
selected-test-configs:
required: false
type: string
description: |
A comma-separated list of test configurations from the test matrix to keep,
The empty list means we are going to keep every configurations by defaults
default: ""
job-name:
type: string
required: false
@ -47,9 +40,6 @@ outputs:
ci-no-td:
description: True if ci-no-td label was on PR or [ci-no-td] in PR body.
value: ${{ steps.filter.outputs.ci-no-td }}
ci-td-distributed:
description: True if ci-td-distributed label was on PR or [ci-td-distributed] in PR body.
value: ${{ steps.filter.outputs.ci-td-distributed }}
runs:
using: composite
@ -66,8 +56,7 @@ runs:
command: |
set -eux
# PyYAML 6.0 doesn't work with MacOS x86 anymore
# This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2
python3 -m pip install requests==2.27.1 pyyaml==6.0.1
python3 -m pip install requests==2.26.0 pyyaml==6.0.1
- name: Parse ref
id: parse-ref
@ -134,7 +123,6 @@ runs:
--workflow "${GITHUB_WORKFLOW}" \
--job-name "${JOB_NAME}" \
--test-matrix "${{ inputs.test-matrix }}" \
--selected-test-configs "${{ inputs.selected-test-configs }}" \
--pr-number "${PR_NUMBER}" \
--tag "${TAG}" \
--event-name "${EVENT_NAME}" \

View File

@ -1,226 +0,0 @@
name: linux-build
inputs:
build-environment:
required: true
description: Top-level label for what's being built/tested.
docker-image-name:
required: true
description: Name of the base docker image to build with.
build-generates-artifacts:
required: false
default: "true"
description: If set, upload generated build artifacts.
build-with-debug:
required: false
default: "false"
description: If set, build in debug mode.
sync-tag:
required: false
default: ""
description: |
If this is set, our linter will use this to make sure that every other
job with the same `sync-tag` is identical.
cuda-arch-list:
required: false
default: "5.2"
description: Runner label to select worker type
runner:
required: false
default: "linux.2xlarge"
description: |
List of CUDA architectures CI build should target.
test-matrix:
required: false
type: string
description: |
An option JSON description of what test configs to run later on. This
is moved here from the Linux test workflow so that we can apply filter
logic using test-config labels earlier and skip unnecessary builds
s3-bucket:
description: S3 bucket to download artifact
required: false
default: "gha-artifacts"
aws-role-to-assume:
description: role to assume for downloading artifacts
required: false
default: ""
GITHUB_TOKEN:
description: GitHub token
required: true
HUGGING_FACE_HUB_TOKEN:
description: Hugging Face Hub token
required: false
default: ""
use_split_build:
description: |
[Experimental] Build a libtorch only wheel and build pytorch such that
are built from the libtorch wheel.
required: false
type: boolean
default: false
outputs:
docker-image:
value: ${{ steps.calculate-docker-image.outputs.docker-image }}
description: The docker image containing the built PyTorch.
test-matrix:
value: ${{ steps.filter.outputs.test-matrix }}
description: An optional JSON description of what test configs to run later on.
runs:
using: composite
steps:
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: configure aws credentials
uses: aws-actions/configure-aws-credentials@v3
if: ${{ inputs.aws-role-to-assume != '' }}
with:
role-to-assume: ${{ inputs.aws-role-to-assume }}
role-session-name: gha-linux-build
role-duration-seconds: 10800
aws-region: us-east-1
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: ${{ inputs.docker-image-name }}
- name: Use following to pull public copy of the image
id: print-ghcr-mirror
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
run: |
tag=${ECR_DOCKER_IMAGE##*/}
echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Parse ref
id: parse-ref
shell: bash
run: .github/scripts/parse_ref.py
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ inputs.GITHUB_TOKEN }}
# Apply the filter logic to the build step too if the test-config label is already there
- name: Select all requested test configurations (if the test matrix is available)
id: filter
uses: ./.github/actions/filter-test-configs
with:
github-token: ${{ inputs.GITHUB_TOKEN }}
test-matrix: ${{ inputs.test-matrix }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Download pytest cache
uses: ./.github/actions/pytest-cache-download
continue-on-error: true
with:
cache_dir: .pytest_cache
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
s3_bucket: ${{ inputs.s3-bucket }}
- name: Build
if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == ''
id: build
env:
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
# TODO duplicated
AWS_DEFAULT_REGION: us-east-1
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
DEBUG: ${{ inputs.build-with-debug == 'true' && '1' || '0' }}
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
shell: bash
run: |
# detached container should get cleaned up by teardown_ec2_linux
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e PR_NUMBER \
-e SHA1 \
-e BRANCH \
-e SCCACHE_BUCKET \
-e SCCACHE_S3_KEY_PREFIX \
-e XLA_CUDA \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
-e PR_LABELS \
-e OUR_GITHUB_JOB_ID \
-e HUGGING_FACE_HUB_TOKEN \
-e USE_SPLIT_BUILD \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--detach \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'
- name: Archive artifacts into zip
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped'
shell: bash
run: |
zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files
- name: Store PyTorch Build Artifacts on S3
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' && inputs.use_split_build != 'true'
with:
name: ${{ inputs.build-environment }}
retention-days: 14
if-no-files-found: error
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}
- name: Store PyTorch Build Artifacts on S3 for split build
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' && inputs.use_split_build == 'true'
with:
name: ${{ inputs.build-environment }}-experimental-split-build
retention-days: 14
if-no-files-found: error
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}
- name: Upload sccache stats
if: steps.build.outcome != 'skipped'
uses: seemethere/upload-artifact-s3@v5
with:
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 365
if-no-files-found: warn
path: sccache-stats-*.json
s3-bucket: ${{ inputs.s3-bucket }}
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()

View File

@ -1,384 +0,0 @@
name: linux-test
inputs:
build-environment:
required: true
type: string
description: Top-level label for what's being built/tested.
test-matrix:
required: true
type: string
description: JSON description of what test configs to run.
docker-image:
required: true
type: string
description: Docker image to run in.
sync-tag:
required: false
type: string
default: ""
description: |
If this is set, our linter will use this to make sure that every other
job with the same `sync-tag` is identical.
use-gha:
required: false
type: string
default: ""
description: If set to any value, upload to GHA. Otherwise upload to S3.
dashboard-tag:
required: false
type: string
default: ""
s3-bucket:
description: S3 bucket to download artifact
required: false
type: string
default: "gha-artifacts"
aws-role-to-assume:
description: role to assume for downloading artifacts
required: false
type: string
default: ""
HUGGING_FACE_HUB_TOKEN:
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
required: false
default: ""
GITHUB_TOKEN:
description: GitHub token
required: true
#env:
# GIT_DEFAULT_BRANCH: ${{ inputs.default_branch }}
runs:
using: composite
steps:
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: configure aws credentials
if : ${{ inputs.aws-role-to-assume != '' }}
uses: aws-actions/configure-aws-credentials@v3
with:
role-to-assume: ${{ inputs.aws-role-to-assume }}
role-session-name: gha-linux-test
aws-region: us-east-1
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: ${{ inputs.docker-image }}
- name: Use following to pull public copy of the image
id: print-ghcr-mirror
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
run: |
tag=${ECR_DOCKER_IMAGE##*/}
echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Check if in a ARC runner
shell: bash
id: check_arc_runner
run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
- name: Lock NVIDIA A100 40GB Frequency
shell: bash
run: |
sudo nvidia-smi -pm 1
sudo nvidia-smi -ac 1215,1410
nvidia-smi
if: contains(matrix.runner, 'a100')
- name: Start monitoring script
id: monitor-script
shell: bash
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Download build artifacts
uses: ./.github/actions/download-build-artifacts
with:
name: ${{ inputs.build-environment }}
s3-bucket: ${{ inputs.s3-bucket }}
- name: Download TD artifacts
continue-on-error: true
uses: ./.github/actions/download-td-artifacts
- name: Parse ref
id: parse-ref
shell: bash
run: .github/scripts/parse_ref.py
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ inputs.GITHUB_TOKEN }}
- name: Check for keep-going label and re-enabled test issues
# This uses the filter-test-configs action because it conviniently
# checks for labels and re-enabled test issues. It does not actually do
# any filtering. All filtering is done in the build step.
id: keep-going
uses: ./.github/actions/filter-test-configs
with:
github-token: ${{ inputs.GITHUB_TOKEN }}
test-matrix: ${{ inputs.test-matrix }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Test
id: test
env:
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
PR_NUMBER: ${{ github.event.pull_request.number }}
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_WORKFLOW: ${{ github.workflow }}
GITHUB_JOB: ${{ github.job }}
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
shell: bash
run: |
set -x
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.ci/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
TEST_COMMAND=.ci/onnx/test.sh
else
TEST_COMMAND=.ci/pytorch/test.sh
fi
# detached container should get cleaned up by teardown_ec2_linux
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086,SC2090
container_name=$(docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e PR_NUMBER \
-e GITHUB_ACTIONS \
-e GITHUB_REPOSITORY \
-e GITHUB_WORKFLOW \
-e GITHUB_JOB \
-e GITHUB_RUN_ID \
-e GITHUB_RUN_NUMBER \
-e GITHUB_RUN_ATTEMPT \
-e JOB_ID \
-e JOB_NAME \
-e BASE_SHA \
-e BRANCH \
-e SHA1 \
-e AWS_DEFAULT_REGION \
-e IN_WHEEL_TEST \
-e SHARD_NUMBER \
-e TEST_CONFIG \
-e NUM_TEST_SHARDS \
-e REENABLED_ISSUES \
-e CONTINUE_THROUGH_ERROR \
-e VERBOSE_TEST_LOGS \
-e NO_TEST_TIMEOUT \
-e NO_TD \
-e TD_DISTRIBUTED \
-e PR_LABELS \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e SCCACHE_S3_KEY_PREFIX \
-e XLA_CUDA \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e HUGGING_FACE_HUB_TOKEN \
-e DASHBOARD_TAG \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--ipc=host \
--shm-size="${SHM_SIZE}" \
--tty \
--detach \
--name="${container_name}" \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
# Propagate download.pytorch.org IP to container
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
- name: Upload pytest cache if tests failed
uses: ./.github/actions/pytest-cache-upload
continue-on-error: true
if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
with:
cache_dir: .pytest_cache
shard: ${{ matrix.shard }}
sha: ${{ github.event.pull_request.head.sha || github.sha }}
test_config: ${{ matrix.config }}
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Print remaining test logs
shell: bash
if: always() && steps.test.conclusion
run: |
cat test/**/*_toprint.log || true
- name: Stop monitoring script
if: always() && steps.monitor-script.outputs.monitor-script-pid
shell: bash
continue-on-error: true
env:
MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
run: |
kill "$MONITOR_SCRIPT_PID"
- name: Upload test artifacts
uses: ./.github/actions/upload-test-artifacts
if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
with:
file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
use-gha: ${{ inputs.use-gha }}
s3-bucket: ${{ inputs.s3-bucket }}
- name: Collect backtraces from coredumps (if any)
if: always()
shell: bash
run: |
# shellcheck disable=SC2156
find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
- name: Store Core dumps on S3
uses: seemethere/upload-artifact-s3@v5
if: failure()
with:
name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
retention-days: 14
if-no-files-found: ignore
path: ./**/core.[1-9]*
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
# NB: We are currently having an intermittent GPU-related issue on G5 runners with
# A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does
# not seem to help. Here are some symptoms:
# * Calling nvidia-smi timeouts after 60 second
# * Fail to run nvidia-smi with an unable to determine the device handle for GPU
# unknown error
# * Test fails with a missing CUDA GPU error when initializing CUDA in PyTorch
# * Run docker --gpus all fails with error response from daemon
#
# As both the root cause and recovery path are unclear, let's take the runner out of
# service so that it doesn't get any more jobs
- name: Check NVIDIA driver installation step
if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
shell: bash
env:
RUNNER_WORKSPACE: ${{ runner.workspace }}
run: |
set +e
set -x
nvidia-smi
# NB: Surprisingly, nvidia-smi command returns successfully with return code 0 even in
# the case where the driver has already crashed as it still can get the driver version
# and some basic information like the bus ID. However, the rest of the information
# would be missing (ERR!), for example:
#
# +-----------------------------------------------------------------------------+
# | NVIDIA-SMI 525.89.02 Driver Version: 525.89.02 CUDA Version: 12.0 |
# |-------------------------------+----------------------+----------------------+
# | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
# | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
# | | | MIG M. |
# |===============================+======================+======================|
# | 0 ERR! Off | 00000000:00:1E.0 Off | ERR! |
# |ERR! ERR! ERR! ERR! / ERR! | 4184MiB / 23028MiB | ERR! Default |
# | | | ERR! |
# +-------------------------------+----------------------+----------------------+
#
# +-----------------------------------------------------------------------------+
# | Processes: |
# | GPU GI CI PID Type Process name GPU Memory |
# | ID ID Usage |
# |=============================================================================|
# +-----------------------------------------------------------------------------+
#
# This should be reported as a failure instead as it will guarantee to fail when
# Docker tries to run with --gpus all
#
# So, the correct check here is to query one of the missing piece of info like
# GPU name, so that the command can fail accordingly
nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
NVIDIA_SMI_STATUS=$?
# These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action
if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
echo "NVIDIA driver installation has failed, shutting down the runner..."
.github/scripts/stop_runner_service.sh
fi
# For runner with multiple GPUs, we also want to confirm that the number of GPUs are the
# power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issue when one GPU fails
# https://github.com/pytorch/test-infra/issues/4000
GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
NVIDIA_SMI_STATUS=$?
# These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action
if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
echo "NVIDIA driver installation has failed, shutting down the runner..."
.github/scripts/stop_runner_service.sh
fi
# Check the GPU count to be a power of 2
if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
.github/scripts/stop_runner_service.sh
fi

View File

@ -9,10 +9,6 @@ inputs:
job_identifier:
description: Text that uniquely identifies a given job type within a workflow. All shards of a job should share the same job identifier.
required: true
s3_bucket:
description: S3 bucket to download PyTest cache
required: false
default: "gha-artifacts"
runs:
using: composite
@ -34,7 +30,6 @@ runs:
CACHE_DIR: ${{ inputs.cache_dir }}
JOB_IDENTIFIER: ${{ inputs.job_identifier }}
REPO: ${{ github.repository }}
BUCKET: ${{ inputs.s3_bucket }}
run: |
python3 .github/scripts/pytest_cache.py \
--download \
@ -43,4 +38,3 @@ runs:
--job_identifier $JOB_IDENTIFIER \
--temp_dir $RUNNER_TEMP \
--repo $REPO \
--bucket $BUCKET \

View File

@ -15,12 +15,10 @@ runs:
category=$1
# If it is GCP runner (runner name contains gcp), do not run this
runner_name_str=${{ runner.name }}
if [[ -f /.inarc ]]; then
echo "ARC Runner, no info on ec2 metadata"
elif [[ $runner_name_str == *"gcp"* ]]; then
echo "Runner is from Google Cloud Platform, No info on ec2 metadata"
else
if [[ $runner_name_str != *"gcp"* ]]; then
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
else
echo "Runner is from Google Cloud Platform, No info on ec2 metadata"
fi
}
echo "ami-id: $(get_ec2_metadata ami-id)"
@ -28,14 +26,8 @@ runs:
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: Check if in a ARC runner
shell: bash
id: check_arc_runner
run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> $GITHUB_OUTPUT
- name: Start docker if docker deamon is not running
shell: bash
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
run: |
if systemctl is-active --quiet docker; then
echo "Docker daemon is running...";
@ -66,7 +58,6 @@ runs:
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Kill any existing containers, clean up images
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
shell: bash
run: |
# ignore expansion of "docker ps -q" since it could be empty
@ -105,28 +96,3 @@ runs:
echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
cat /etc/hosts
- name: Check that the docker daemon is running
shell: bash
continue-on-error: true
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'true' }}
run: |
set +x
max_attempts=30
delay=10
attempt=1
for attempt in $(seq 1 $max_attempts); do
echo "Attempt $attempt of $max_attempts: Checking if Docker daemon is running..."
if docker info > /dev/null 2>&1; then
echo "Docker is running. Proceeding with the next steps"
exit 0
else
echo "Docker is not running yet."
echo "Retrying in $delay seconds..."
sleep $delay
fi
done
echo "Reached maximum attempts to connect to Docker. Exiting."
exit 1

View File

@ -26,7 +26,6 @@ runs:
-e PYTORCH_FINAL_PACKAGE_DIR \
-e PYTORCH_ROOT \
-e SKIP_ALL_TESTS \
-e USE_SPLIT_BUILD \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
@ -36,8 +35,7 @@ runs:
"${DOCKER_IMAGE}"
)
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" ]]; then
# Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
fi
@ -46,11 +44,3 @@ runs:
# Generate test script
docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh"
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"
- name: Cleanup docker
if: always() && (env.BUILD_ENVIRONMENT == 'linux-s390x-binary-manywheel' || env.GPU_ARCH_TYPE == 'xpu')
shell: bash
run: |
# on s390x or xpu stop the container for clean worker stop
# shellcheck disable=SC2046
docker stop "${{ env.CONTAINER_NAME }}" || true

View File

@ -11,10 +11,6 @@ inputs:
Suffix to add to the filename of the artifacts. This should include the
workflow job id, see [Job id in artifacts].
required: true
s3-bucket:
description: S3 bucket to download builds
required: false
default: "gha-artifacts"
runs:
using: composite
@ -46,7 +42,7 @@ runs:
env:
FILE_SUFFIX: ${{ inputs.file-suffix }}
run: |
# Remove any previous usage logs if they exist
# Remove any previous test reports if they exist
rm -f logs-*.zip
# this workflow is also run in bazel build test, but we dont generate usage reports for it
# so check to see if the file exists first
@ -57,18 +53,6 @@ runs:
zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log'
fi
- name: Zip debugging artifacts for upload
if: runner.os != 'Windows' && !inputs.use-gha
shell: bash
env:
FILE_SUFFIX: ${{ inputs.file-suffix }}
run: |
# Remove any previous debugging artifacts if they exist
rm -f debug-*.zip
if [ -d 'test/debug' ]; then
zip -r "debug-${FILE_SUFFIX}.zip" test/debug
fi
# Windows zip
- name: Zip JSONs for upload
if: runner.os == 'Windows' && !inputs.use-gha
@ -103,7 +87,6 @@ runs:
uses: seemethere/upload-artifact-s3@v5
if: ${{ !inputs.use-gha }}
with:
s3-bucket: ${{ inputs.s3-bucket }}
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 14
@ -114,7 +97,6 @@ runs:
uses: seemethere/upload-artifact-s3@v5
if: ${{ !inputs.use-gha }}
with:
s3-bucket: ${{ inputs.s3-bucket }}
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 14
@ -126,25 +108,12 @@ runs:
if: ${{ !inputs.use-gha }}
continue-on-error: true
with:
s3-bucket: ${{ inputs.s3-bucket }}
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 14
if-no-files-found: ignore
path: logs-*.zip
- name: Store Debug Artifacts on S3
uses: seemethere/upload-artifact-s3@v5
if: ${{ !inputs.use-gha }}
continue-on-error: true
with:
s3-bucket: ${{ inputs.s3-bucket }}
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 14
if-no-files-found: ignore
path: debug-*.zip
# GHA upload
- name: Store Test Downloaded JSONs on Github
uses: actions/upload-artifact@v3

View File

@ -1 +1 @@
69b2a0adc2ec03ab99990d7e8be3d4510438c148
87aeb554d3e2f7855b7abe5120c282f59648ed7a

View File

@ -1 +1 @@
23512dbebd44a11eb84afbf53c3c071dd105297e
d6015d42d9a1834bc7595c4bd6852562fb80b30b

View File

@ -1 +1 @@
d23a6e1664d20707c11781299611436e1f0c104f
2c127da8b5e2e8f44b50994c6cb931bcca267cfe

View File

@ -1 +1 @@
5ea4535f0699f366adb554183a65ebf7dc34a8be
r2.3

View File

@ -1,13 +0,0 @@
# Use this to auto apply labels based on other labels. Applies to both PRs and
# issues. Currently only supports any and all
- any:
- "module: custom operators"
- "module: aotdispatch"
then:
- "module: pt2-dispatcher"
- any:
- "module: dynamo"
- "module: pt2-dispatcher"
- "module: inductor"
then:
- "oncall: pt2"

14
.github/labeler.yml vendored
View File

@ -35,9 +35,6 @@
- test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
- torch/distributed/_tensor/**
- torch/distributed/fsdp/**
- torch/csrc/inductor/**
- test/cpp/aoti_abi_check/**
- test/cpp/aoti_inference/**
"module: cpu":
- aten/src/ATen/cpu/**
@ -58,17 +55,6 @@
- third_party/mkl-dnn.BUILD
- torch/csrc/jit/codegen/onednn/**
- test/test_jit_llga_fuser.py
- test/test_mkldnn.py
"ciflow/linux-aarch64":
- third_party/ideep
- caffe2/ideep/**
- caffe2/python/ideep/**
- cmake/Modules/FindMKLDNN.cmake
- third_party/mkl-dnn.BUILD
- torch/csrc/jit/codegen/onednn/**
- test/test_jit_llga_fuser.py
- test/test_mkldnn.py
"module: amp (automated mixed precision)":
- torch/amp/**

View File

@ -1,281 +0,0 @@
# Defines runner types that will be provisioned by by LF Self-hosted
# runners for pytorch/pytorch-canary and their labels.
#
# Runners listed here will be available as self hosted runners.
# Configuration is directly pulled from the main branch.
#
# Default values:
#
# runner_types:
# runner_label: # label to specify in the Github Actions workflow
# instance_type: m4.large
# os: linux
# max_available: 20
# disk_size: 50
# is_ephemeral: true
runner_types:
lf.c.linux.12xlarge:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: false
max_available: 1000
os: linux
lf.c.linux.24xl.spr-metal:
disk_size: 200
instance_type: c7i.metal-24xl
is_ephemeral: false
max_available: 30
os: linux
lf.c.linux.16xlarge.spr:
disk_size: 200
instance_type: c7i.16xlarge
is_ephemeral: false
max_available: 30
os: linux
lf.c.linux.12xlarge.ephemeral:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: true
max_available: 300
os: linux
lf.c.linux.16xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.16xlarge
is_ephemeral: false
max_available: 30
os: linux
lf.c.linux.24xlarge:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: false
max_available: 250
os: linux
lf.c.linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: false
max_available: 3120
os: linux
lf.c.linux.4xlarge:
disk_size: 150
instance_type: c5.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
lf.c.linux.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.4xlarge
is_ephemeral: false
max_available: 520
os: linux
lf.c.linux.8xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.8xlarge
is_ephemeral: false
max_available: 400
os: linux
lf.c.linux.g4dn.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g4dn.12xlarge
is_ephemeral: false
max_available: 50
os: linux
lf.c.linux.g4dn.metal.nvidia.gpu:
disk_size: 150
instance_type: g4dn.metal
is_ephemeral: false
max_available: 30
os: linux
lf.c.linux.g5.48xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.48xlarge
is_ephemeral: false
max_available: 20
os: linux
lf.c.linux.g5.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.12xlarge
is_ephemeral: false
max_available: 150
os: linux
lf.c.linux.g5.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 1200
os: linux
lf.c.linux.large:
disk_size: 15
instance_type: c5.large
is_ephemeral: false
os: linux
lf.c.linux.arm64.2xlarge:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: false
max_available: 200
os: linux
lf.c.linux.arm64.m7g.2xlarge:
disk_size: 256
instance_type: m7g.2xlarge
is_ephemeral: false
max_available: 20
os: linux
lf.c.windows.4xlarge:
disk_size: 256
instance_type: c5d.4xlarge
is_ephemeral: true
max_available: 420
os: windows
lf.c.windows.4xlarge.nonephemeral:
disk_size: 256
instance_type: c5d.4xlarge
is_ephemeral: false
max_available: 420
os: windows
lf.c.windows.8xlarge.nvidia.gpu:
disk_size: 256
instance_type: p3.2xlarge
is_ephemeral: true
max_available: 150
os: windows
lf.c.windows.8xlarge.nvidia.gpu.nonephemeral:
disk_size: 256
instance_type: p3.2xlarge
is_ephemeral: false
max_available: 150
os: windows
lf.c.windows.g5.4xlarge.nvidia.gpu:
disk_size: 256
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 250
os: windows
### Setup runner types to test the Amazon Linux 2023 AMI
lf.c.amz2023.linux.12xlarge:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.24xl.spr-metal:
disk_size: 200
instance_type: c7i.metal-24xl
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.16xlarge.spr:
disk_size: 200
instance_type: c7i.16xlarge
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.12xlarge.ephemeral:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: true
max_available: 300
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.16xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.16xlarge
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.24xlarge:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: false
max_available: 250
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: false
max_available: 3120
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.4xlarge:
disk_size: 150
instance_type: c5.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.4xlarge
is_ephemeral: false
max_available: 520
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.8xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.8xlarge
is_ephemeral: false
max_available: 400
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.g4dn.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g4dn.12xlarge
is_ephemeral: false
max_available: 50
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.g4dn.metal.nvidia.gpu:
disk_size: 150
instance_type: g4dn.metal
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.g5.48xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.48xlarge
is_ephemeral: false
max_available: 20
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.g5.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.12xlarge
is_ephemeral: false
max_available: 150
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.g5.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 1200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.large:
disk_size: 15
instance_type: c5.large
is_ephemeral: false
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.arm64.2xlarge:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: false
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.arm64.m7g.2xlarge:
disk_size: 256
instance_type: m7g.2xlarge
is_ephemeral: false
max_available: 20
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64

View File

@ -1,281 +0,0 @@
# Defines runner types that will be provisioned by by LF Self-hosted
# runners for pytorch/pytorch and their labels.
#
# Runners listed here will be available as self hosted runners.
# Configuration is directly pulled from the main branch.
#
# Default values:
#
# runner_types:
# runner_label: # label to specify in the Github Actions workflow
# instance_type: m4.large
# os: linux
# max_available: 20
# disk_size: 50
# is_ephemeral: true
runner_types:
lf.linux.12xlarge:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: false
max_available: 1000
os: linux
lf.linux.24xl.spr-metal:
disk_size: 200
instance_type: c7i.metal-24xl
is_ephemeral: false
max_available: 30
os: linux
lf.linux.16xlarge.spr:
disk_size: 200
instance_type: c7i.16xlarge
is_ephemeral: false
max_available: 30
os: linux
lf.linux.12xlarge.ephemeral:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: true
max_available: 300
os: linux
lf.linux.16xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.16xlarge
is_ephemeral: false
max_available: 30
os: linux
lf.linux.24xlarge:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: false
max_available: 250
os: linux
lf.linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: false
max_available: 3120
os: linux
lf.linux.4xlarge:
disk_size: 150
instance_type: c5.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
lf.linux.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.4xlarge
is_ephemeral: false
max_available: 520
os: linux
lf.linux.8xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.8xlarge
is_ephemeral: false
max_available: 400
os: linux
lf.linux.g4dn.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g4dn.12xlarge
is_ephemeral: false
max_available: 50
os: linux
lf.linux.g4dn.metal.nvidia.gpu:
disk_size: 150
instance_type: g4dn.metal
is_ephemeral: false
max_available: 30
os: linux
lf.linux.g5.48xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.48xlarge
is_ephemeral: false
max_available: 20
os: linux
lf.linux.g5.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.12xlarge
is_ephemeral: false
max_available: 150
os: linux
lf.linux.g5.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 1200
os: linux
lf.linux.large:
disk_size: 15
instance_type: c5.large
is_ephemeral: false
os: linux
lf.linux.arm64.2xlarge:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: false
max_available: 200
os: linux
lf.linux.arm64.m7g.2xlarge:
disk_size: 256
instance_type: m7g.2xlarge
is_ephemeral: false
max_available: 20
os: linux
lf.windows.4xlarge:
disk_size: 256
instance_type: c5d.4xlarge
is_ephemeral: true
max_available: 420
os: windows
lf.windows.4xlarge.nonephemeral:
disk_size: 256
instance_type: c5d.4xlarge
is_ephemeral: false
max_available: 420
os: windows
lf.windows.8xlarge.nvidia.gpu:
disk_size: 256
instance_type: p3.2xlarge
is_ephemeral: true
max_available: 150
os: windows
lf.windows.8xlarge.nvidia.gpu.nonephemeral:
disk_size: 256
instance_type: p3.2xlarge
is_ephemeral: false
max_available: 150
os: windows
lf.windows.g5.4xlarge.nvidia.gpu:
disk_size: 256
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 250
os: windows
### Setup runner types to test the Amazon Linux 2023 AMI
lf.amz2023.linux.12xlarge:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.24xl.spr-metal:
disk_size: 200
instance_type: c7i.metal-24xl
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.16xlarge.spr:
disk_size: 200
instance_type: c7i.16xlarge
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.12xlarge.ephemeral:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: true
max_available: 300
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.16xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.16xlarge
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.24xlarge:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: false
max_available: 250
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: false
max_available: 3120
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.4xlarge:
disk_size: 150
instance_type: c5.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.4xlarge
is_ephemeral: false
max_available: 520
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.8xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.8xlarge
is_ephemeral: false
max_available: 400
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.g4dn.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g4dn.12xlarge
is_ephemeral: false
max_available: 50
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.g4dn.metal.nvidia.gpu:
disk_size: 150
instance_type: g4dn.metal
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.g5.48xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.48xlarge
is_ephemeral: false
max_available: 20
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.g5.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.12xlarge
is_ephemeral: false
max_available: 150
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.g5.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 1200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.large:
disk_size: 15
instance_type: c5.large
is_ephemeral: false
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.arm64.2xlarge:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: false
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.arm64.m7g.2xlarge:
disk_size: 256
instance_type: m7g.2xlarge
is_ephemeral: false
max_available: 20
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64

View File

@ -27,12 +27,13 @@
- third_party/onnx
- caffe2/python/onnx/**
approved_by:
- BowenBao
- abock
- justinchuby
- liqunfu
- shubhambhokare1
- thiagocrepaldi
- titaiwangms
- wschin
- xadupre
mandatory_checks_name:
- EasyCLA
- Lint
@ -235,25 +236,6 @@
- Lint
- pull
- name: XPU ATen
patterns:
- aten/src/ATen/xpu/**
- c10/xpu/**
- torch/csrc/xpu/**
- torch/xpu/**
- test/xpu/**
- test/test_xpu.py
- third_party/xpu.txt
- .ci/docker/ci_commit_pins/triton-xpu.txt
approved_by:
- EikanWang
- jgong5
- gujinghui
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: Distributions
patterns:
- torch/distributions/**
@ -286,7 +268,6 @@
- test/cpp/dist_autograd/**
- test/cpp/rpc/**
approved_by:
- wconstab
- mrshenli
- pritamdamania87
- zhaojuanmao
@ -313,25 +294,6 @@
- Lint
- pull
- name: DCP
patterns:
- torch/distributed/checkpoint/**
approved_by:
- LucasLLC
- fegin
- wz337
- saumishr
- daulet-askarov
- pradeepdfb
- kirtiteja
- mhorowitz
- saiteja64
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: IDEEP
patterns:
- third_party/ideep
@ -395,22 +357,12 @@
- name: CPU inductor
patterns:
- torch/_inductor/mkldnn_ir.py
- torch/_inductor/mkldnn_lowerings.py
- torch/_inductor/fx_passes/mkldnn_fusion.py
- torch/_inductor/fx_passes/quantization.py
- torch/_inductor/codegen/cpp_prefix.h
- torch/_inductor/codegen/cpp.py
- torch/_inductor/codegen/cpp_utils.py
- torch/_inductor/codegen/cpp_micro_gemm.py
- torch/_inductor/codegen/cpp_template_kernel.py
- torch/_inductor/codegen/cpp_template.py
- torch/_inductor/codegen/cpp_gemm_template.py
- test/inductor/test_mkldnn_pattern_matcher.py
- test/inductor/test_cpu_repro.py
- test/inductor/test_cpu_repo.py
- test/inductor/test_cpu_cpp_wrapper.py
- test/inductor/test_cpu_select_algorithm.py
- aten/src/ATen/cpu/**
- aten/src/ATen/native/quantized/cpu/**
- test/quantization/core/test_quantized_op.py
- torch/ao/quantization/quantizer/x86_inductor_quantizer.py

View File

@ -7,9 +7,6 @@ ciflow_push_tags:
- ciflow/binaries_wheel
- ciflow/inductor
- ciflow/inductor-perf-compare
- ciflow/inductor-micro-benchmark
- ciflow/inductor-cu124
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
- ciflow/periodic
@ -18,12 +15,9 @@ ciflow_push_tags:
- ciflow/trunk
- ciflow/unstable
- ciflow/xpu
- ciflow/torchbench
retryable_workflows:
- lint
- pull
- trunk
- linux-binary
- windows-binary
labeler_config: labeler.yml
label_to_label_config: label_to_label.yml
mergebot: True

View File

@ -5,11 +5,11 @@
# functorch/docs/requirements.txt
# .ci/docker/requirements-ci.txt
boto3==1.19.12
jinja2==3.1.4
jinja2==3.0.1
lintrunner==0.10.7
ninja==1.10.0.post1
nvidia-ml-py==11.525.84
pyyaml==6.0
requests==2.32.2
requests==2.31.0
rich==10.9.0
rockset==1.0.3

View File

@ -4,5 +4,6 @@ mkl-include=2022.1.0
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0
requests=2.31.0
setuptools=68.2.2
typing-extensions=4.9.0
typing-extensions=4.3.0

View File

@ -3,5 +3,6 @@ cmake=3.22.1
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0
requests=2.31.0
setuptools=68.2.2
typing-extensions=4.9.0
typing-extensions=4.3.0

View File

@ -2,7 +2,7 @@ numpy=1.22.3
pyyaml=6.0
setuptools=61.2.0
cmake=3.22.*
typing-extensions=4.9.0
typing-extensions=4.3.0
dataclasses=0.8
pip=22.2.2
pillow=10.0.1

View File

@ -4,7 +4,7 @@ numpy=1.21.2
pyyaml=5.3
setuptools=46.0.0
cmake=3.22.*
typing-extensions=4.9.0
typing-extensions=4.3.0
dataclasses=0.8
pip=22.2.2
pillow=10.0.1

View File

@ -1,4 +1,4 @@
# iOS simulator requirements
coremltools==5.0b5
protobuf==3.20.2
optree==0.12.1
optree==0.9.1

View File

@ -17,16 +17,13 @@ pytest-xdist==3.3.1
pytest-rerunfailures==10.3
pytest-flakefinder==1.1.0
scipy==1.10.1
sympy==1.12.1 ; python_version == "3.8"
sympy>=1.13.0 ; python_version >= "3.9"
sympy==1.11.1
unittest-xml-reporting<=3.2.0,>=2.0.0
xdoctest==1.1.0
filelock==3.6.0
sympy==1.11.1
pytest-cpp==2.3.0
rockset==1.0.3
z3-solver==4.12.2.0
tensorboard==2.13.0
optree==0.12.1
# NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in
# which the stringify metadata is wrong when escaping double quote
protobuf==3.20.2
optree==0.9.1

View File

@ -1,101 +0,0 @@
set -ex
# Set ROCM_HOME isn't available, use ROCM_PATH if set or /opt/rocm
ROCM_HOME="${ROCM_HOME:-${ROCM_PATH:-/opt/rocm}}"
# Find rocm_version.h header file for ROCm version extract
rocm_version_h="${ROCM_HOME}/include/rocm-core/rocm_version.h"
if [ ! -f "$rocm_version_h" ]; then
rocm_version_h="${ROCM_HOME}/include/rocm_version.h"
fi
# Error out if rocm_version.h not found
if [ ! -f "$rocm_version_h" ]; then
echo "Error: rocm_version.h not found in expected locations." >&2
exit 1
fi
# Extract major, minor and patch ROCm version numbers
MAJOR_VERSION=$(grep 'ROCM_VERSION_MAJOR' "$rocm_version_h" | awk '{print $3}')
MINOR_VERSION=$(grep 'ROCM_VERSION_MINOR' "$rocm_version_h" | awk '{print $3}')
PATCH_VERSION=$(grep 'ROCM_VERSION_PATCH' "$rocm_version_h" | awk '{print $3}')
ROCM_INT=$(($MAJOR_VERSION * 10000 + $MINOR_VERSION * 100 + $PATCH_VERSION))
echo "ROCm version: $ROCM_INT"
# Check TRITON_ROCM_DIR is set
if [[ -z "${TRITON_ROCM_DIR}" ]]; then
export TRITON_ROCM_DIR=third_party/amd/backend
fi
# Remove packaged libs and headers
rm -rf $TRITON_ROCM_DIR/include/*
LIBTINFO_PATH="/usr/lib64/libtinfo.so.5"
LIBNUMA_PATH="/usr/lib64/libnuma.so.1"
LIBELF_PATH="/usr/lib64/libelf.so.1"
OS_SO_PATHS=(
$LIBELF_PATH
$LIBNUMA_PATH
$LIBTINFO_PATH
)
for lib in "${OS_SO_PATHS[@]}"
do
cp $lib $TRITON_ROCM_DIR/lib/
done
# Required ROCm libraries
if [[ "${MAJOR_VERSION}" == "6" ]]; then
libamdhip="libamdhip64.so.6"
else
libamdhip="libamdhip64.so.5"
fi
# Required ROCm libraries - ROCm 6.0
ROCM_SO=(
"${libamdhip}"
"libhsa-runtime64.so.1"
"libamd_comgr.so.2"
"libdrm.so.2"
"libdrm_amdgpu.so.1"
)
if [[ $ROCM_INT -ge 60100 ]]; then
ROCM_SO+=("librocprofiler-register.so.0")
fi
for lib in "${ROCM_SO[@]}"
do
file_path=($(find $ROCM_HOME/lib/ -name "$lib")) # First search in lib
if [[ -z $file_path ]]; then
if [ -d "$ROCM_HOME/lib64/" ]; then
file_path=($(find $ROCM_HOME/lib64/ -name "$lib")) # Then search in lib64
fi
fi
if [[ -z $file_path ]]; then
file_path=($(find $ROCM_HOME/ -name "$lib")) # Then search in ROCM_HOME
fi
if [[ -z $file_path ]]; then
file_path=($(find /opt/ -name "$lib")) # Then search in /opt
fi
if [[ -z $file_path ]]; then
echo "Error: Library file $lib is not found." >&2
exit 1
fi
cp $file_path $TRITON_ROCM_DIR/lib
# When running locally, and not building a wheel, we need to satisfy shared objects requests that don't look for versions
LINKNAME=$(echo $lib | sed -e 's/\.so.*/.so/g')
ln -sf $lib $TRITON_ROCM_DIR/lib/$LINKNAME
done
# Copy Include Files
cp -r $ROCM_HOME/include/hip $TRITON_ROCM_DIR/include
cp -r $ROCM_HOME/include/roctracer $TRITON_ROCM_DIR/include
cp -r $ROCM_HOME/include/hsa $TRITON_ROCM_DIR/include
# Copy linker
mkdir -p $TRITON_ROCM_DIR/llvm/bin
cp $ROCM_HOME/llvm/bin/ld.lld $TRITON_ROCM_DIR/llvm/bin/

View File

@ -1,103 +0,0 @@
#!/bin/bash
set -x
if [ -z "$1" ]; then
echo "Need wheel location argument" && exit 1
fi
WHEELHOUSE_DIR=$1
PATCHELF_BIN=patchelf
ROCM_LIB=backends/amd/lib
ROCM_LD=backends/amd/llvm/bin
PREFIX=triton
fname_without_so_number() {
LINKNAME=$(echo $1 | sed -e 's/\.so.*/.so/g')
echo "$LINKNAME"
}
replace_needed_sofiles() {
find $1 -name '*.so*' -o -name 'ld.lld' | while read sofile; do
origname=$2
patchedname=$3
if [[ "$origname" != "$patchedname" ]]; then
set +e
origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
ERRCODE=$?
set -e
if [ "$ERRCODE" -eq "0" ]; then
echo "patching $sofile entry $origname to $patchedname"
$PATCHELF_BIN --replace-needed $origname $patchedname $sofile
fi
fi
done
}
mkdir -p "/tmp_dir"
pushd /tmp_dir
for pkg in /$WHEELHOUSE_DIR/*triton*.whl; do
echo "Modifying $pkg"
rm -rf tmp
mkdir -p tmp
cd tmp
cp $pkg .
unzip -q $(basename $pkg)
rm -f $(basename $pkg)
$PATCHELF_BIN --set-rpath ${LD_SO_RPATH:-'$ORIGIN:$ORIGIN/../../lib'} $PREFIX/$ROCM_LD/ld.lld
$PATCHELF_BIN --print-rpath $PREFIX/$ROCM_LD/ld.lld
# Modify libtriton.so as it sits in _C directory apart from its dependencies
find $PREFIX/_C -type f -name "*.so*" | while read sofile; do
echo "Setting rpath of $sofile"
$PATCHELF_BIN --set-rpath ${C_SO_RPATH:-'$ORIGIN:$ORIGIN/'../$ROCM_LIB} ${FORCE_RPATH:-} $sofile
$PATCHELF_BIN --print-rpath $sofile
done
# All included dependencies are included in a single lib directory
deps=()
deps_soname=()
while read sofile; do
echo "Setting rpath of $sofile to ${LIB_SO_RPATH:-'$ORIGIN'}"
$PATCHELF_BIN --set-rpath ${LIB_SO_RPATH:-'$ORIGIN'} ${FORCE_RPATH:-} $sofile
$PATCHELF_BIN --print-rpath $sofile
deps+=("$sofile")
deps_soname+=("$(basename $sofile)")
done < <(find $PREFIX/$ROCM_LIB -type f -name "*.so*")
patched=()
for filepath in "${deps[@]}"; do
filename=$(basename $filepath)
destpath=$PREFIX/$ROCM_LIB/$filename
if [[ "$filepath" != "$destpath" ]]; then
cp $filepath $destpath
fi
patchedpath=$(fname_without_so_number $destpath)
patchedname=$(basename $patchedpath)
if [[ "$destpath" != "$patchedpath" ]]; then
mv $destpath $patchedpath
fi
patched+=("$patchedname")
echo "Copied $filepath to $patchedpath"
done
# Go through all required shared objects and see if any of our other objects are dependants. If so, replace so.ver wth so
for ((i=0;i<${#deps[@]};++i)); do
echo "replacing "${deps_soname[i]} ${patched[i]}
replace_needed_sofiles $PREFIX/$ROCM_LIB ${deps_soname[i]} ${patched[i]}
replace_needed_sofiles $PREFIX/_C ${deps_soname[i]} ${patched[i]}
replace_needed_sofiles $PREFIX/$ROCM_LD ${deps_soname[i]} ${patched[i]}
done
# Re-bundle whl with so adjustments
zip -rqy $(basename $pkg) *
if [[ -z "${MANYLINUX_VERSION}" ]]; then
newpkg=$pkg
else
newpkg=$(echo $pkg | sed -e "s/\linux_x86_64/${MANYLINUX_VERSION}/g")
fi
# Remove original whl
rm -f $pkg
# Move rebuilt whl to original location with new name.
mv $(basename $pkg) $newpkg
done

View File

@ -1,5 +1,4 @@
#!/usr/bin/env python3
import os
import shutil
import sys
@ -8,17 +7,15 @@ from subprocess import check_call
from tempfile import TemporaryDirectory
from typing import Optional
SCRIPT_DIR = Path(__file__).parent
REPO_DIR = SCRIPT_DIR.parent.parent
# TODO: Remove me once Triton version is again in sync for vanilla and ROCm
ROCM_TRITION_VERSION = "2.2.0"
def read_triton_pin(device: str = "cuda") -> str:
triton_file = "triton.txt"
if device == "rocm":
triton_file = "triton-rocm.txt"
elif device == "xpu":
triton_file = "triton-xpu.txt"
def read_triton_pin(rocm_hash: bool = False) -> str:
triton_file = "triton.txt" if not rocm_hash else "triton-rocm.txt"
with open(REPO_DIR / ".ci" / "docker" / "ci_commit_pins" / triton_file) as f:
return f.read().strip()
@ -35,6 +32,27 @@ def check_and_replace(inp: str, src: str, dst: str) -> str:
return inp.replace(src, dst)
def patch_setup_py(
path: Path,
*,
version: str,
name: str = "triton",
expected_version: Optional[str] = None,
) -> None:
with open(path) as f:
orig = f.read()
# Replace name
orig = check_and_replace(orig, 'name="triton",', f'name="{name}",')
# Replace version
if not expected_version:
expected_version = read_triton_version()
orig = check_and_replace(
orig, f'version="{expected_version}",', f'version="{version}",'
)
with open(path, "w") as f:
f.write(orig)
def patch_init_py(
path: Path, *, version: str, expected_version: Optional[str] = None
) -> None:
@ -55,7 +73,7 @@ def build_triton(
version: str,
commit_hash: str,
build_conda: bool = False,
device: str = "cuda",
build_rocm: bool = False,
py_version: Optional[str] = None,
release: bool = False,
) -> Path:
@ -74,20 +92,23 @@ def build_triton(
with TemporaryDirectory() as tmpdir:
triton_basedir = Path(tmpdir) / "triton"
triton_pythondir = triton_basedir / "python"
triton_repo = "https://github.com/openai/triton"
if device == "rocm":
if build_rocm:
triton_repo = "https://github.com/ROCmSoftwarePlatform/triton"
triton_pkg_name = "pytorch-triton-rocm"
elif device == "xpu":
triton_pkg_name = "pytorch-triton-xpu"
triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton"
else:
triton_repo = "https://github.com/openai/triton"
triton_pkg_name = "pytorch-triton"
check_call(["git", "clone", triton_repo, "triton"], cwd=tmpdir)
check_call(["git", "clone", triton_repo], cwd=tmpdir)
if release:
ver, rev, patch = version.split(".")
check_call(
["git", "checkout", f"release/{ver}.{rev}.x"], cwd=triton_basedir
)
if build_rocm:
check_call(
["git", "checkout", f"release/2.2.x"], cwd=triton_basedir
)
else:
check_call(
["git", "checkout", f"release/{ver}.{rev}.x"], cwd=triton_basedir
)
else:
check_call(["git", "checkout", commit_hash], cwd=triton_basedir)
@ -100,7 +121,7 @@ def build_triton(
print("source:\n path: .\n", file=meta)
print(
"build:\n string: py{{py}}\n number: 1\n script: cd python; "
"python setup.py install --record=record.txt\n",
"python setup.py install --single-version-externally-managed --record=record.txt\n",
" script_env:\n - MAX_JOBS\n",
file=meta,
)
@ -146,15 +167,18 @@ def build_triton(
patch_init_py(
triton_pythondir / "triton" / "__init__.py",
version=f"{version}",
expected_version=None,
expected_version=ROCM_TRITION_VERSION if build_rocm else None,
)
if device == "rocm":
check_call(
[f"{SCRIPT_DIR}/amd/package_triton_wheel.sh"],
cwd=triton_basedir,
shell=True,
if build_rocm:
# TODO: Remove me when ROCM triton is updated
patch_setup_py(
triton_pythondir / "setup.py",
name=triton_pkg_name,
version=f"{version}",
expected_version=ROCM_TRITION_VERSION,
)
check_call("scripts/amd/setup_rocm_libs.sh", cwd=triton_basedir, shell=True)
print("ROCm libraries setup for triton installation...")
check_call(
@ -164,11 +188,8 @@ def build_triton(
whl_path = next(iter((triton_pythondir / "dist").glob("*.whl")))
shutil.copy(whl_path, Path.cwd())
if device == "rocm":
check_call(
[f"{SCRIPT_DIR}/amd/patch_triton_wheel.sh", Path.cwd()],
cwd=triton_basedir,
)
if build_rocm:
check_call("scripts/amd/fix_so.sh", cwd=triton_basedir, shell=True)
return Path.cwd() / whl_path.name
@ -179,19 +200,17 @@ def main() -> None:
parser = ArgumentParser("Build Triton binaries")
parser.add_argument("--release", action="store_true")
parser.add_argument("--build-conda", action="store_true")
parser.add_argument(
"--device", type=str, default="cuda", choices=["cuda", "rocm", "xpu"]
)
parser.add_argument("--build-rocm", action="store_true")
parser.add_argument("--py-version", type=str)
parser.add_argument("--commit-hash", type=str)
parser.add_argument("--triton-version", type=str, default=read_triton_version())
args = parser.parse_args()
build_triton(
device=args.device,
build_rocm=args.build_rocm,
commit_hash=args.commit_hash
if args.commit_hash
else read_triton_pin(args.device),
else read_triton_pin(args.build_rocm),
version=args.triton_version,
build_conda=args.build_conda,
py_version=args.py_version,

View File

@ -5,6 +5,7 @@ import sys
from typing import Any
from github_utils import gh_delete_comment, gh_post_pr_comment
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from label_utils import has_required_labels, is_label_err_comment, LABEL_ERR_MSG
from trymerge import GitHubPR

View File

@ -3,10 +3,12 @@
import json
import os
import re
from typing import Any, cast, Dict, List, Optional
from typing import Any, Optional
from urllib.error import HTTPError
from github_utils import gh_fetch_url, gh_post_pr_comment, gh_query_issues_by_labels
from github_utils import gh_fetch_url, gh_post_pr_comment
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from trymerge import get_pr_commit_sha, GitHubPR
@ -17,7 +19,6 @@ REQUIRES_ISSUE = {
"critical",
"fixnewfeature",
}
RELEASE_BRANCH_REGEX = re.compile(r"release/(?P<version>.+)")
def parse_args() -> Any:
@ -28,7 +29,7 @@ def parse_args() -> Any:
"--onto-branch", type=str, required=True, help="the target release branch"
)
parser.add_argument(
"--github-actor", type=str, required=True, help="all the world's a stage"
"--github-actor", type=str, required=True, help="all the worlds a stage"
)
parser.add_argument(
"--classification",
@ -57,33 +58,6 @@ def get_merge_commit_sha(repo: GitRepo, pr: GitHubPR) -> Optional[str]:
return commit_sha if pr.is_closed() else None
def get_release_version(onto_branch: str) -> Optional[str]:
"""
Return the release version if the target branch is a release branch
"""
m = re.match(RELEASE_BRANCH_REGEX, onto_branch)
return m.group("version") if m else ""
def get_tracker_issues(
org: str, project: str, onto_branch: str
) -> List[Dict[str, Any]]:
"""
Find the tracker issue from the repo. The tracker issue needs to have the title
like [VERSION] Release Tracker following the convention on PyTorch
"""
version = get_release_version(onto_branch)
if not version:
return []
tracker_issues = gh_query_issues_by_labels(org, project, labels=["release tracker"])
if not tracker_issues:
return []
# Figure out the tracker issue from the list by looking at the title
return [issue for issue in tracker_issues if version in issue.get("title", "")]
def cherry_pick(
github_actor: str,
repo: GitRepo,
@ -103,49 +77,17 @@ def cherry_pick(
)
try:
org, project = repo.gh_owner_and_name()
cherry_pick_pr = ""
if not dry_run:
org, project = repo.gh_owner_and_name()
cherry_pick_pr = submit_pr(repo, pr, cherry_pick_branch, onto_branch)
tracker_issues_comments = []
tracker_issues = get_tracker_issues(org, project, onto_branch)
for issue in tracker_issues:
issue_number = int(str(issue.get("number", "0")))
if not issue_number:
continue
msg = f"The cherry pick PR is at {cherry_pick_pr}"
if fixes:
msg += f" and it is linked with issue {fixes}"
elif classification in REQUIRES_ISSUE:
msg += f" and it is recommended to link a {classification} cherry pick PR with an issue"
res = cast(
Dict[str, Any],
post_tracker_issue_comment(
org,
project,
issue_number,
pr.pr_num,
cherry_pick_pr,
classification,
fixes,
dry_run,
),
)
comment_url = res.get("html_url", "")
if comment_url:
tracker_issues_comments.append(comment_url)
msg = f"The cherry pick PR is at {cherry_pick_pr}"
if fixes:
msg += f" and it is linked with issue {fixes}."
elif classification in REQUIRES_ISSUE:
msg += f" and it is recommended to link a {classification} cherry pick PR with an issue."
if tracker_issues_comments:
msg += " The following tracker issues are updated:\n"
for tracker_issues_comment in tracker_issues_comments:
msg += f"* {tracker_issues_comment}\n"
post_pr_comment(org, project, pr.pr_num, msg, dry_run)
post_comment(org, project, pr.pr_num, msg)
finally:
if current_branch:
@ -217,9 +159,7 @@ def submit_pr(
raise RuntimeError(msg) from error
def post_pr_comment(
org: str, project: str, pr_num: int, msg: str, dry_run: bool = False
) -> List[Dict[str, Any]]:
def post_comment(org: str, project: str, pr_num: int, msg: str) -> None:
"""
Post a comment on the PR itself to point to the cherry picking PR when success
or print the error when failure
@ -242,35 +182,7 @@ def post_pr_comment(
comment = "\n".join(
(f"### Cherry picking #{pr_num}", f"{msg}", "", f"{internal_debugging}")
)
return gh_post_pr_comment(org, project, pr_num, comment, dry_run)
def post_tracker_issue_comment(
org: str,
project: str,
issue_num: int,
pr_num: int,
cherry_pick_pr: str,
classification: str,
fixes: str,
dry_run: bool = False,
) -> List[Dict[str, Any]]:
"""
Post a comment on the tracker issue (if any) to record the cherry pick
"""
comment = "\n".join(
(
"Link to landed trunk PR (if applicable):",
f"* https://github.com/{org}/{project}/pull/{pr_num}",
"",
"Link to release branch PR:",
f"* {cherry_pick_pr}",
"",
"Criteria Category:",
" - ".join((classification.capitalize(), fixes.capitalize())),
)
)
return gh_post_pr_comment(org, project, issue_num, comment, dry_run)
gh_post_pr_comment(org, project, pr_num, comment)
def main() -> None:
@ -302,7 +214,7 @@ def main() -> None:
except RuntimeError as error:
if not args.dry_run:
post_pr_comment(org, project, pr_num, str(error))
post_comment(org, project, pr_num, str(error))
else:
raise error

View File

@ -10,7 +10,6 @@ import requests
import rockset # type: ignore[import]
from gitutils import retries_decorator
LOGS_QUERY = """
with
shas as (

View File

@ -1,12 +1,10 @@
#!/usr/bin/env python3
import sys
from pathlib import Path
from typing import Any, cast, Dict, List, Set
import yaml
GITHUB_DIR = Path(__file__).parent.parent

View File

@ -23,10 +23,8 @@ def main() -> None:
job_link = f"[job]({run_url})" if run_url is not None else "job"
msg = (
f"The {args.action} {job_link} was canceled or timed out. This most often happen if two merge requests were issued"
+ " for the same PR, or if merge job was waiting for more than 6 hours for tests to finish."
+ " In later case, please do not hesitate to reissue the merge command\n"
+ f" For more information see [pytorch-bot wiki]({BOT_COMMANDS_WIKI})."
f"The {args.action} {job_link} was canceled. If you believe this is a mistake,"
+ f" then you can re trigger it through [pytorch-bot]({BOT_COMMANDS_WIKI})."
)
gh_post_pr_comment(org, project, args.pr_num, msg)

View File

@ -1,6 +1,7 @@
import json
import subprocess
import sys
from enum import Enum
from pathlib import Path
from typing import NamedTuple, Optional

View File

@ -2,14 +2,12 @@
import os
import re
from datetime import datetime
from functools import lru_cache
from pathlib import Path
from typing import Any, Callable, Dict, List, Set
from github_utils import gh_fetch_json_dict, gh_graphql
from gitutils import GitRepo
SEC_IN_DAY = 24 * 60 * 60
CLOSED_PR_RETENTION = 30 * SEC_IN_DAY
NO_PR_RETENTION = 1.5 * 365 * SEC_IN_DAY
@ -20,7 +18,7 @@ ESTIMATED_TOKENS = [0]
TOKEN = os.environ["GITHUB_TOKEN"]
if not TOKEN:
raise Exception("GITHUB_TOKEN is not set") # noqa: TRY002
raise Exception("GITHUB_TOKEN is not set")
REPO_ROOT = Path(__file__).parent.parent.parent
@ -189,17 +187,6 @@ def get_recent_prs() -> Dict[str, Any]:
return prs_by_branch_base
@lru_cache(maxsize=1)
def get_open_prs() -> List[Dict[str, Any]]:
return paginate_graphql(
GRAPHQL_OPEN_PRS,
{"owner": "pytorch", "repo": "pytorch"},
lambda data: False,
lambda res: res["data"]["repository"]["pullRequests"]["nodes"],
lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"],
)
def get_branches_with_magic_label_or_open_pr() -> Set[str]:
pr_infos: List[Dict[str, Any]] = paginate_graphql(
GRAPHQL_NO_DELETE_BRANCH_LABEL,
@ -209,7 +196,15 @@ def get_branches_with_magic_label_or_open_pr() -> Set[str]:
lambda res: res["data"]["repository"]["label"]["pullRequests"]["pageInfo"],
)
pr_infos.extend(get_open_prs())
pr_infos.extend(
paginate_graphql(
GRAPHQL_OPEN_PRS,
{"owner": "pytorch", "repo": "pytorch"},
lambda data: False,
lambda res: res["data"]["repository"]["pullRequests"]["nodes"],
lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"],
)
)
# Get the most recent PR for each branch base (group gh together)
branch_bases = set()
@ -275,41 +270,5 @@ def delete_branches() -> None:
delete_branch(git_repo, branch)
def delete_old_ciflow_tags() -> None:
# Deletes ciflow tags if they are associated with a closed PR or a specific
# commit. Lightweight tags don't have information about the date they were
# created, so we can't check how old they are. The script just assumes that
# ciflow tags should be deleted regardless of creation date.
git_repo = GitRepo(str(REPO_ROOT), "origin", debug=True)
def delete_tag(tag: str) -> None:
print(f"Deleting tag {tag}")
ESTIMATED_TOKENS[0] += 1
delete_branch(git_repo, f"refs/tags/{tag}")
tags = git_repo._run_git("tag").splitlines()
open_pr_numbers = [x["number"] for x in get_open_prs()]
for tag in tags:
try:
if ESTIMATED_TOKENS[0] > 400:
print("Estimated tokens exceeded, exiting")
break
if not tag.startswith("ciflow/"):
continue
re_match_pr = re.match(r"^ciflow\/.*\/(\d{5,6})$", tag)
re_match_sha = re.match(r"^ciflow\/.*\/([0-9a-f]{40})$", tag)
if re_match_pr:
pr_number = int(re_match_pr.group(1))
if pr_number in open_pr_numbers:
continue
delete_tag(tag)
elif re_match_sha:
delete_tag(tag)
except Exception as e:
print(f"Failed to check tag {tag}: {e}")
if __name__ == "__main__":
delete_branches()
delete_old_ciflow_tags()

View File

@ -1,52 +0,0 @@
import os
import re
import sys
from github import Github
def main() -> None:
token = os.environ.get("GITHUB_TOKEN")
repo_owner = "pytorch"
repo_name = "pytorch"
pull_request_number = int(sys.argv[1])
g = Github(token)
repo = g.get_repo(f"{repo_owner}/{repo_name}")
pull_request = repo.get_pull(pull_request_number)
pull_request_body = pull_request.body
# PR without description
if pull_request_body is None:
return
# get issue number from the PR body
if not re.search(r"#\d{1,6}", pull_request_body):
print("The pull request does not mention an issue.")
return
issue_number = int(re.findall(r"#(\d{1,6})", pull_request_body)[0])
issue = repo.get_issue(issue_number)
issue_labels = issue.labels
docathon_label_present = any(
label.name == "docathon-h1-2024" for label in issue_labels
)
# if the issue has a docathon label, add all labels from the issue to the PR.
if not docathon_label_present:
print("The 'docathon-h1-2024' label is not present in the issue.")
return
pull_request_labels = pull_request.get_labels()
pull_request_label_names = [label.name for label in pull_request_labels]
issue_label_names = [label.name for label in issue_labels]
labels_to_add = [
label for label in issue_label_names if label not in pull_request_label_names
]
if not labels_to_add:
print("The pull request already has the same labels.")
return
pull_request.add_to_labels(*labels_to_add)
print("Labels added to the pull request!")
if __name__ == "__main__":
main()

Binary file not shown.

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python3
import sys
from pathlib import Path
import yaml

View File

@ -14,6 +14,7 @@ import json
from typing import Any
import boto3 # type: ignore[import]
from label_utils import gh_get_labels

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python3
import json
import logging
import os
import re
import subprocess
@ -9,18 +8,42 @@ import sys
import warnings
from enum import Enum
from functools import lru_cache
from logging import info
from typing import Any, Callable, Dict, List, Optional, Set
from urllib.request import Request, urlopen
import yaml
REENABLE_TEST_REGEX = "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) (#|https://github.com/pytorch/pytorch/issues/)([0-9]+)"
PREFIX = "test-config/"
logging.basicConfig(level=logging.INFO)
# Same as shard names
VALID_TEST_CONFIG_LABELS = {
f"{PREFIX}{label}"
for label in {
"backwards_compat",
"crossref",
"default",
"deploy",
"distributed",
"docs_tests",
"dynamo",
"force_on_cpu",
"functorch",
"inductor",
"inductor_distributed",
"inductor_huggingface",
"inductor_timm",
"inductor_torchbench",
"jit_legacy",
"multigpu",
"nogpu_AVX512",
"nogpu_NO_AVX2",
"slow",
"tsan",
"xla",
}
}
def is_cuda_or_rocm_job(job_name: Optional[str]) -> bool:
@ -39,9 +62,9 @@ SUPPORTED_PERIODICAL_MODES: Dict[str, Callable[[Optional[str]], bool]] = {
}
# The link to the published list of disabled jobs
DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json"
DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=qO7aEr.Og33PtLXfNq0j0yj.bbLC7SzR"
# and unstable jobs
UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json"
UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=7NhgpqKTtGXVUnL1C79KboTW_5qQx8y5"
# Some constants used to handle disabled and unstable jobs
JOB_NAME_SEP = "/"
@ -67,12 +90,6 @@ def parse_args() -> Any:
parser.add_argument(
"--test-matrix", type=str, required=True, help="the original test matrix"
)
parser.add_argument(
"--selected-test-configs",
type=str,
default="",
help="a comma-separated list of test configurations from the test matrix to keep",
)
parser.add_argument(
"--workflow", type=str, help="the name of the current workflow, i.e. pull"
)
@ -138,25 +155,19 @@ def get_labels(pr_number: int) -> Set[str]:
}
def filter_labels(labels: Set[str], label_regex: Any) -> Set[str]:
"""
Return the list of matching labels
"""
return {l for l in labels if re.match(label_regex, l)}
def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, List[Any]]:
"""
Select the list of test config to run from the test matrix. The logic works
as follows:
If the PR has one or more test-config labels as specified, only these test configs
will be selected. This also works with ciflow labels, for example, if a PR has both
ciflow/trunk and test-config/functorch, only trunk functorch builds and tests will
be run.
If the PR has one or more labels as specified in the VALID_TEST_CONFIG_LABELS set, only
these test configs will be selected. This also works with ciflow labels, for example,
if a PR has both ciflow/trunk and test-config/functorch, only trunk functorch builds
and tests will be run
If the PR has none of the test-config label, all tests are run as usual.
"""
filtered_test_matrix: Dict[str, List[Any]] = {"include": []}
for entry in test_matrix.get("include", []):
@ -166,46 +177,23 @@ def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, Lis
label = f"{PREFIX}{config_name.strip()}"
if label in labels:
msg = f"Select {config_name} because label {label} is present in the pull request by the time the test starts"
info(msg)
print(
f"Select {config_name} because label {label} is presented in the pull request by the time the test starts"
)
filtered_test_matrix["include"].append(entry)
test_config_labels = filter_labels(labels, re.compile(f"{PREFIX}.+"))
if not filtered_test_matrix["include"] and not test_config_labels:
info("Found no test-config label on the PR, so all test configs are included")
# Found no test-config label and the filtered test matrix is empty, return the same
valid_test_config_labels = labels.intersection(VALID_TEST_CONFIG_LABELS)
if not filtered_test_matrix["include"] and not valid_test_config_labels:
# Found no valid label and the filtered test matrix is empty, return the same
# test matrix as before so that all tests can be run normally
return test_matrix
else:
msg = f"Found {test_config_labels} on the PR so only these test configs are run"
info(msg)
# When the filter test matrix contain matches or if a valid test config label
# is found in the PR, return the filtered test matrix
return filtered_test_matrix
def filter_selected_test_configs(
test_matrix: Dict[str, List[Any]], selected_test_configs: Set[str]
) -> Dict[str, List[Any]]:
"""
Keep only the selected configs if the list if not empty. Otherwise, keep all test configs.
This filter is used when the workflow is dispatched manually.
"""
if not selected_test_configs:
return test_matrix
filtered_test_matrix: Dict[str, List[Any]] = {"include": []}
for entry in test_matrix.get("include", []):
config_name = entry.get("config", "")
if not config_name:
continue
if config_name in selected_test_configs:
filtered_test_matrix["include"].append(entry)
return filtered_test_matrix
def set_periodic_modes(
test_matrix: Dict[str, List[Any]], job_name: Optional[str]
) -> Dict[str, List[Any]]:
@ -386,33 +374,30 @@ def process_jobs(
# - If the target record has the job (config) name, only that test config
# will be skipped or marked as unstable
if not target_job_cfg:
msg = (
print(
f"Issue {target_url} created by {author} has {issue_type.value} "
+ f"all CI jobs for {workflow} / {job_name}"
)
info(msg)
return _filter_jobs(
test_matrix=test_matrix,
issue_type=issue_type,
)
if target_job_cfg == BUILD_JOB_NAME:
msg = (
print(
f"Issue {target_url} created by {author} has {issue_type.value} "
+ f"the build job for {workflow} / {job_name}"
)
info(msg)
return _filter_jobs(
test_matrix=test_matrix,
issue_type=issue_type,
)
if target_job_cfg in (TEST_JOB_NAME, BUILD_AND_TEST_JOB_NAME):
msg = (
print(
f"Issue {target_url} created by {author} has {issue_type.value} "
+ f"all the test jobs for {workflow} / {job_name}"
)
info(msg)
return _filter_jobs(
test_matrix=test_matrix,
issue_type=issue_type,
@ -478,7 +463,7 @@ def parse_reenabled_issues(s: Optional[str]) -> List[str]:
def get_reenabled_issues(pr_body: str = "") -> List[str]:
default_branch = f"origin/{os.environ.get('GIT_DEFAULT_BRANCH', 'main')}"
default_branch = os.getenv("GIT_DEFAULT_BRANCH", "main")
try:
commit_messages = subprocess.check_output(
f"git cherry -v {default_branch}".split(" ")
@ -509,15 +494,10 @@ def perform_misc_tasks(
"ci-no-test-timeout", check_for_setting(labels, pr_body, "ci-no-test-timeout")
)
set_output("ci-no-td", check_for_setting(labels, pr_body, "ci-no-td"))
# Only relevant for the one linux distributed cuda job, delete this when TD
# is rolled out completely
set_output(
"ci-td-distributed", check_for_setting(labels, pr_body, "ci-td-distributed")
)
# Obviously, if the job name includes unstable, then this is an unstable job
is_unstable = job_name and IssueType.UNSTABLE.value in job_name
if not is_unstable and test_matrix and test_matrix.get("include"):
if not is_unstable and test_matrix:
# Even when the job name doesn't mention unstable, we will also mark it as
# unstable when the test matrix only includes unstable jobs. Basically, this
# logic allows build or build-and-test jobs to be marked as unstable too.
@ -587,16 +567,6 @@ def main() -> None:
# No PR number, no tag, we can just return the test matrix as it is
filtered_test_matrix = test_matrix
if args.selected_test_configs:
selected_test_configs = {
v.strip().lower()
for v in args.selected_test_configs.split(",")
if v.strip()
}
filtered_test_matrix = filter_selected_test_configs(
filtered_test_matrix, selected_test_configs
)
if args.event_name == "schedule" and args.schedule == "29 8 * * *":
# we don't want to run the mem leak check or disabled tests on normal
# periodically scheduled jobs, only the ones at this time

View File

@ -8,25 +8,22 @@ architectures:
* CPU
* Latest CUDA
* Latest ROCM
* Latest XPU
"""
import os
from typing import Dict, List, Optional, Tuple
CUDA_ARCHES = ["11.8", "12.1", "12.4"]
CUDA_ARCHES = ["11.8", "12.1"]
CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.0"}
CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1"}
CUDA_ARCHES_CUDNN_VERSION = {"11.8": "9", "12.1": "9", "12.4": "9"}
CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8"}
ROCM_ARCHES = ["6.0", "6.1"]
ROCM_ARCHES = ["5.7", "6.0"]
XPU_ARCHES = ["xpu"]
CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
@ -34,53 +31,33 @@ CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
CPU_AARCH64_ARCH = ["cpu-aarch64"]
CPU_S390X_ARCH = ["cpu-s390x"]
CUDA_AARCH64_ARCH = ["cuda-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"11.8": (
"nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"12.1": (
"nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"12.4": (
"nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
}
@ -135,16 +112,10 @@ def arch_type(arch_version: str) -> str:
return "cuda"
elif arch_version in ROCM_ARCHES:
return "rocm"
elif arch_version in XPU_ARCHES:
return "xpu"
elif arch_version in CPU_CXX11_ABI_ARCH:
return "cpu-cxx11-abi"
elif arch_version in CPU_AARCH64_ARCH:
return "cpu-aarch64"
elif arch_version in CPU_S390X_ARCH:
return "cpu-s390x"
elif arch_version in CUDA_AARCH64_ARCH:
return "cuda-aarch64"
else: # arch_version should always be "cpu" in this case
return "cpu"
@ -161,12 +132,9 @@ WHEEL_CONTAINER_IMAGES = {
gpu_arch: f"pytorch/manylinux-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in ROCM_ARCHES
},
"xpu": f"pytorch/manylinux2_28-builder:xpu-{DEFAULT_TAG}",
"cpu": f"pytorch/manylinux-builder:cpu-{DEFAULT_TAG}",
"cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
"cpu-aarch64": f"pytorch/manylinuxaarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
"cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}",
"cuda-aarch64": f"pytorch/manylinuxaarch64-builder:cuda12.4-{DEFAULT_TAG}",
}
CONDA_CONTAINER_IMAGES = {
@ -223,11 +191,8 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
"cpu": "cpu",
"cpu-aarch64": "cpu",
"cpu-cxx11-abi": "cpu-cxx11-abi",
"cpu-s390x": "cpu",
"cuda": f"cu{gpu_arch_version.replace('.', '')}",
"cuda-aarch64": "cu124",
"rocm": f"rocm{gpu_arch_version}",
"xpu": "xpu",
}.get(gpu_arch_type, gpu_arch_version)
@ -307,11 +272,11 @@ def generate_libtorch_matrix(
"libtorch_variant": libtorch_variant,
"libtorch_config": abi_version if os == "windows" else "",
"devtoolset": abi_version if os != "windows" else "",
"container_image": (
LIBTORCH_CONTAINER_IMAGES[(arch_version, abi_version)]
if os != "windows"
else ""
),
"container_image": LIBTORCH_CONTAINER_IMAGES[
(arch_version, abi_version)
]
if os != "windows"
else "",
"package_type": "libtorch",
"build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{abi_version}".replace(
".", "_"
@ -327,28 +292,24 @@ def generate_wheels_matrix(
python_versions: Optional[List[str]] = None,
) -> List[Dict[str, str]]:
package_type = "wheel"
if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":
# NOTE: We only build manywheel packages for x86_64 and aarch64 and s390x linux
if os == "linux" or os == "linux-aarch64":
# NOTE: We only build manywheel packages for x86_64 and aarch64 linux
package_type = "manywheel"
if python_versions is None:
python_versions = FULL_PYTHON_VERSIONS + ["3.13"]
python_versions = FULL_PYTHON_VERSIONS
if arches is None:
# Define default compute archivectures
arches = ["cpu"]
if os == "linux":
arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES
elif os == "windows":
arches += CUDA_ARCHES
elif os == "linux-aarch64":
# Only want the one arch as the CPU type is different and
# uses different build/test scripts
arches = ["cpu-aarch64", "cuda-aarch64"]
elif os == "linux-s390x":
# Only want the one arch as the CPU type is different and
# uses different build/test scripts
arches = ["cpu-s390x"]
arches = ["cpu-aarch64"]
ret: List[Dict[str, str]] = []
for python_version in python_versions:
@ -359,24 +320,11 @@ def generate_wheels_matrix(
if arch_version == "cpu"
or arch_version == "cpu-cxx11-abi"
or arch_version == "cpu-aarch64"
or arch_version == "cpu-s390x"
or arch_version == "cuda-aarch64"
or arch_version == "xpu"
else arch_version
)
# TODO: Enable python 3.13 on rocm, xpu, aarch64, windows
if (
gpu_arch_type in ["rocm", "xpu"] or os != "linux"
) and python_version == "3.13":
continue
# 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
if (
arch_version in ["12.4", "12.1", "11.8"]
and os == "linux"
or arch_version == "cuda-aarch64"
):
if arch_version in ["12.1", "11.8"] and os == "linux":
ret.append(
{
"python_version": python_version,
@ -385,64 +333,15 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"devtoolset": (
"cxx11-abi" if arch_version == "cuda-aarch64" else ""
),
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": (
PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version] # fmt: skip
if os != "linux-aarch64"
else ""
),
"pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version], # fmt: skip
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace( # noqa: B950
".", "_"
),
}
)
if arch_version != "cuda-aarch64":
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True",
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": (
PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version] # fmt: skip
if os != "linux-aarch64"
else ""
),
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-split".replace( # noqa: B950
".", "_"
),
}
)
# Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
if python_version == "3.10" and arch_version == "12.1":
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "False",
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": "",
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
".", "_"
),
}
)
else:
ret.append(
{
@ -452,26 +351,21 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"devtoolset": (
"cxx11-abi"
if arch_version in ["cpu-cxx11-abi", "xpu"]
else ""
),
"devtoolset": "cxx11-abi"
if arch_version == "cpu-cxx11-abi"
else "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
".", "_"
),
"pytorch_extra_install_requirements": (
PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"] # fmt: skip
if os != "linux"
else ""
),
"pytorch_extra_install_requirements":
PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"] # fmt: skip
if os != "linux" else "",
}
)
return ret
validate_nccl_dep_consistency("12.4")
validate_nccl_dep_consistency("12.1")
validate_nccl_dep_consistency("11.8")

Some files were not shown because too many files have changed in this diff Show More