Compare commits


2 Commits

SHA1       Message                                                              Date
6d20b39d3f [CI] Release only changes use anaconda token for test env (#108064) 2023-08-28 12:41:57 -04:00
17f400404f [CI] Release only changes for 2.1 release (#108053)                  2023-08-28 11:55:58 -04:00
           * [CI] Release only changes for 2.1 release
           * include circle script
           * release only changes for test-infra
           * More test-infra related
3,710 changed files with 412,031 additions and 227,773 deletions
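Note: a compare page like this corresponds to an ordinary git range; a minimal local equivalent, assuming the two sides are `main` and `release/2.1` (the compare spec itself is not shown on this page):

```
git log --oneline main..release/2.1   # the 2 commits listed above
git diff --stat main release/2.1      # the 3,710-file diff rendered below
```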

View File

@ -71,9 +71,6 @@ if [[ "$image" == *cuda* && "$UBUNTU_VERSION" != "22.04" ]]; then
DOCKERFILE="${OS}-cuda/Dockerfile"
elif [[ "$image" == *rocm* ]]; then
DOCKERFILE="${OS}-rocm/Dockerfile"
elif [[ "$image" == *cuda*linter* ]]; then
# Use a separate Dockerfile for linter to keep a small image size
DOCKERFILE="linter-cuda/Dockerfile"
elif [[ "$image" == *linter* ]]; then
# Use a separate Dockerfile for linter to keep a small image size
DOCKERFILE="linter/Dockerfile"
@ -132,6 +129,35 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc7)
CUDA_VERSION=11.8.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=7
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc7-inductor-benchmarks)
CUDA_VERSION=11.8.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=7
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=8
@ -155,13 +181,13 @@ case "$image" in
CONDA_CMAKE=yes
ONNX=yes
;;
pytorch-linux-focal-py3-clang9-android-ndk-r21e)
pytorch-linux-focal-py3-clang7-android-ndk-r19c)
ANACONDA_PYTHON_VERSION=3.8
CLANG_VERSION=9
CLANG_VERSION=7
LLVMDEV=yes
PROTOBUF=yes
ANDROID=yes
ANDROID_NDK_VERSION=r21e
ANDROID_NDK_VERSION=r19c
GRADLE_VERSION=6.8.3
NINJA_VERSION=1.9.0
;;
@ -202,7 +228,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=5.6
ROCM_VERSION=5.4.2
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@ -213,11 +239,22 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=5.7
ROCM_VERSION=5.6
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-py3.8-gcc7)
ANACONDA_PYTHON_VERSION=3.8
GCC_VERSION=7
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
CONDA_CMAKE=yes
TRITON=yes
DOCS=yes
;;
pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.8
GCC_VERSION=11
@ -249,12 +286,6 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-jammy-py3-clang15-asan)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=15
CONDA_CMAKE=yes
VISION=yes
;;
pytorch-linux-jammy-py3.8-gcc11)
ANACONDA_PYTHON_VERSION=3.8
GCC_VERSION=11
@ -266,12 +297,6 @@ case "$image" in
TRITON=yes
DOCS=yes
;;
pytorch-linux-jammy-py3-clang12-executorch)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=12
CONDA_CMAKE=yes
EXECUTORCH=yes
;;
pytorch-linux-focal-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
@ -279,11 +304,6 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
CONDA_CMAKE=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=11.8
CONDA_CMAKE=yes
;;
*)
# Catch-all for builds that are not hardcoded.
PROTOBUF=yes
@ -301,9 +321,6 @@ case "$image" in
extract_version_from_image_name rocm ROCM_VERSION
NINJA_VERSION=1.9.0
TRITON=yes
# To ensure that any ROCm config will build using conda cmake
# and thus have LAPACK/MKL enabled
CONDA_CMAKE=yes
fi
if [[ "$image" == *centos7* ]]; then
NINJA_VERSION=1.10.2
@ -337,11 +354,14 @@ if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
fi
# Build image
# TODO: build-arg THRIFT is not turned on for any image, remove it once we confirm
# it's no longer needed.
docker build \
--no-cache \
--progress=plain \
--build-arg "BUILD_ENVIRONMENT=${image}" \
--build-arg "PROTOBUF=${PROTOBUF:-}" \
--build-arg "THRIFT=${THRIFT:-}" \
--build-arg "LLVMDEV=${LLVMDEV:-}" \
--build-arg "DB=${DB:-}" \
--build-arg "VISION=${VISION:-}" \
@ -373,7 +393,6 @@ docker build \
--build-arg "ONNX=${ONNX}" \
--build-arg "DOCS=${DOCS}" \
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \

View File

@ -98,18 +98,6 @@ COPY ./common/install_ninja.sh install_ninja.sh
RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
RUN rm install_ninja.sh
ARG TRITON
# Install triton. This needs to be done before sccache because the latter will
# try to reach out to S3, which docker build runners don't have access to
ENV CMAKE_C_COMPILER cc
ENV CMAKE_CXX_COMPILER c++
COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH

View File

@ -1 +0,0 @@
b2f5dfe80704404298467347b8ee3ac229efed47

View File

@ -1 +1 @@
6c26faa159b79a42d7fa46cb66e2d21523351987
4.27.4

View File

@ -1 +1 @@
730b907b4d45a4713cbc425cbf224c46089fd514
b9d43c7dcac1fe05e851dd7be7187b108af593d2

View File

@ -1 +1 @@
dafe1459823b9549417ed95e9720f1b594fab329
05d67b9418cacda0d356c2102d7c1a887948b013

View File

@ -1 +1 @@
bcad9dabe15021c53b6a88296e9d7a210044f108
e6216047b8b0aef1fe8da6ca8667a3ad0a016411

View File

@ -9,7 +9,10 @@ install_ubuntu() {
# "$UBUNTU_VERSION" == "18.04"*
# instead of
# "$UBUNTU_VERSION" == "18.04"
if [[ "$UBUNTU_VERSION" == "20.04"* ]]; then
if [[ "$UBUNTU_VERSION" == "18.04"* ]]; then
cmake3="cmake=3.10*"
maybe_libiomp_dev="libiomp-dev"
elif [[ "$UBUNTU_VERSION" == "20.04"* ]]; then
cmake3="cmake=3.16*"
maybe_libiomp_dev=""
elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
@ -20,9 +23,7 @@ install_ubuntu() {
maybe_libiomp_dev="libiomp-dev"
fi
if [[ "$CLANG_VERSION" == 15 ]]; then
maybe_libomp_dev="libomp-15-dev"
elif [[ "$CLANG_VERSION" == 12 ]]; then
if [[ "$CLANG_VERSION" == 12 ]]; then
maybe_libomp_dev="libomp-12-dev"
elif [[ "$CLANG_VERSION" == 10 ]]; then
maybe_libomp_dev="libomp-10-dev"
@ -61,7 +62,6 @@ install_ubuntu() {
${maybe_libiomp_dev} \
libyaml-dev \
libz-dev \
libjemalloc2 \
libjpeg-dev \
libasound2-dev \
libsndfile-dev \

View File

@ -54,13 +54,23 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ]; then
conda_install numpy=1.23.5 ${CONDA_COMMON_DEPS}
else
elif [ "$ANACONDA_PYTHON_VERSION" = "3.10" ]; then
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
elif [ "$ANACONDA_PYTHON_VERSION" = "3.9" ]; then
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
elif [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
else
# Install `typing-extensions` for 3.7
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS} typing-extensions
fi
# Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
# and libpython-static for torch deploy
conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}"
# This is only supported on Python 3.8 and above
if [ "$MINOR_PYTHON_VERSION" -gt "7" ]; then
# Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
# and libpython-static for torch deploy
conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}"
fi
# Use conda cmake in some cases. Conda cmake will be newer than our supported
# min version (3.5 for xenial and 3.10 for bionic), so we only do it in those
@ -79,7 +89,13 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
# Install some other packages, including those needed for Python test reporting
pip_install -r /opt/conda/requirements-ci.txt
pip_install -U scikit-learn
# Update scikit-learn to a python-3.8 compatible version
if [[ $(python -c "import sys; print(int(sys.version_info >= (3, 8)))") == "1" ]]; then
pip_install -U scikit-learn
else
# Pinned scikit-learn due to https://github.com/scikit-learn/scikit-learn/issues/14485 (affects gcc 5.5 only)
pip_install scikit-learn==0.20.3
fi
if [ -n "$DOCS" ]; then
apt-get update

View File

@ -1,62 +0,0 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
clone_executorch() {
EXECUTORCH_PINNED_COMMIT=$(get_pinned_commit executorch)
# Clone the Executorch
git clone https://github.com/pytorch/executorch.git
# and fetch the target commit
pushd executorch
git checkout "${EXECUTORCH_PINNED_COMMIT}"
git submodule update --init
popd
chown -R jenkins executorch
}
install_buck2() {
pushd executorch/.ci/docker
BUCK2_VERSION=$(cat ci_commit_pins/buck2.txt)
source common/install_buck.sh
popd
}
install_conda_dependencies() {
pushd executorch/.ci/docker
# Install conda dependencies like flatbuffer
conda_install --file conda-env-ci.txt
popd
}
install_pip_dependencies() {
pushd executorch/.ci/docker
# Install all Python dependencies
pip_install -r requirements-ci.txt
popd
}
setup_executorch() {
pushd executorch
source .ci/scripts/utils.sh
install_flatc_from_source
pip_install .
build_executorch_runner "cmake"
# Make sure that all the newly generated files are owned by Jenkins
chown -R jenkins .
popd
}
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
setup_executorch

View File

@ -6,21 +6,23 @@ source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
function install_huggingface() {
local version
commit=$(get_pinned_commit huggingface)
pip_install pandas==2.0.3
pip_install "git+https://github.com/huggingface/transformers@${commit}"
version=$(get_pinned_commit huggingface)
pip_install pandas
pip_install scipy
pip_install z3-solver
pip_install "transformers==${version}"
}
function install_timm() {
local commit
commit=$(get_pinned_commit timm)
pip_install pandas==2.0.3
pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
# Clean up
conda_run pip uninstall -y cmake torch torchvision triton
pip_install pandas
pip_install scipy
pip_install z3-solver
pip_install "git+https://github.com/rwightman/pytorch-image-models@${commit}"
}
# Pango is needed for weasyprint which is needed for doctr
conda_install pango
install_huggingface
install_timm
# install_timm

.ci/docker/common/install_onnx.sh Executable file → Normal file
View File

@ -4,35 +4,36 @@ set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
retry () {
"$@" || (sleep 10 && "$@") || (sleep 20 && "$@") || (sleep 40 && "$@")
}
# A bunch of custom pip dependencies for ONNX
pip_install \
beartype==0.15.0 \
beartype==0.10.4 \
filelock==3.9.0 \
flatbuffers==2.0 \
mock==5.0.1 \
ninja==1.10.2 \
networkx==2.0 \
numpy==1.24.2
numpy==1.22.4
# ONNXRuntime should be installed before installing
# onnx-weekly. Otherwise, onnx-weekly could be
# overwritten by onnx.
pip_install \
onnxruntime==1.15.1 \
parameterized==0.8.1 \
pytest-cov==4.0.0 \
pytest-subtests==0.10.0 \
tabulate==0.9.0 \
transformers==4.32.1
transformers==4.31.0
pip_install coloredlogs packaging
retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.17.0.dev20231005006
# Using the 1.15 dev branch for the following not-yet-released features and fixes:
# - Segfault fix for shape inference.
# - Inliner to work around an ORT segfault.
pip_install onnx-weekly==1.15.0.dev20230717
pip_install -i https://test.pypi.org/simple/ onnx==1.15.0rc2
pip_install onnxscript==0.1.0.dev20231128 --no-deps
# TODO: change this when onnx-script is on testPypi
# pip_install onnxscript-preview==0.1.0.dev20230809 --no-deps
# NOTE: temp change for CI to run on unpublished onnxscript PR.
pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@f69be19ebd3f2e0d7efe64b0c7be3329cbab3822" --no-deps
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/

View File

@ -5,10 +5,8 @@ set -ex
# "install" hipMAGMA into /opt/rocm/magma by copying after build
git clone https://bitbucket.org/icl/magma.git
pushd magma
# Version 2.7.2 + ROCm related updates
git checkout 823531632140d0edcb7e77c3edc0e837421471c5
# Fixes memory leaks of magma found while executing linalg UTs
git checkout 28592a7170e4b3707ed92644bf4a689ed600c27f
cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib' >> make.inc

View File

@ -0,0 +1,14 @@
apt-get update
apt-get install -y sudo wget libboost-dev libboost-test-dev libboost-program-options-dev libboost-filesystem-dev libboost-thread-dev libevent-dev automake libtool flex bison pkg-config g++ libssl-dev
wget https://www-us.apache.org/dist/thrift/0.12.0/thrift-0.12.0.tar.gz
tar -xvf thrift-0.12.0.tar.gz
cd thrift-0.12.0
for file in ./compiler/cpp/Makefile*; do
sed -i 's/\-Werror//' $file
done
./bootstrap.sh
./configure --without-php --without-java --without-python --without-nodejs --without-go --without-ruby
sudo make
sudo make install
cd ..
rm thrift-0.12.0.tar.gz

View File

@ -23,10 +23,8 @@ fi
# The logic here is copied from .ci/pytorch/common_utils.sh
TRITON_PINNED_COMMIT=$(get_pinned_commit ${TRITON_TEXT_FILE})
if [ -n "${UBUNTU_VERSION}" ];then
apt update
apt-get install -y gpg-agent
fi
apt update
apt-get install -y gpg-agent
if [ -n "${CONDA_CMAKE}" ]; then
# Keep the current cmake and numpy version here, so we can reinstall them later
@ -38,12 +36,12 @@ if [ -z "${MAX_JOBS}" ]; then
export MAX_JOBS=$(nproc)
fi
if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" == "7" ]]; then
if [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" == "7" ]]; then
# Triton needs at least gcc-9 to build
apt-get install -y g++-9
CXX=g++-9 pip_install "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python"
elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
elif [ -n "${CLANG_VERSION}" ]; then
# Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get install -y g++-9

View File

@ -1,44 +0,0 @@
ARG UBUNTU_VERSION
FROM ubuntu:${UBUNTU_VERSION}
ARG UBUNTU_VERSION
ENV DEBIAN_FRONTEND noninteractive
# Install common dependencies (so that this step can be cached separately)
COPY ./common/install_base.sh install_base.sh
RUN bash ./install_base.sh && rm install_base.sh
# Install missing libomp-dev
RUN apt-get update && apt-get install -y --no-install-recommends libomp-dev && apt-get autoclean && apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Install user
COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh
# Install conda and other packages (e.g., numpy, pytest)
ARG ANACONDA_PYTHON_VERSION
ARG CONDA_CMAKE
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
COPY requirements-ci.txt /opt/conda/requirements-ci.txt
COPY ./common/install_conda.sh install_conda.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
# Install cuda and cudnn
ARG CUDA_VERSION
RUN wget -q https://raw.githubusercontent.com/pytorch/builder/main/common/install_cuda.sh -O install_cuda.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
# Note that Docker build forbids copying files from outside the build context
COPY ./common/install_linter.sh install_linter.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_linter.sh
RUN rm install_linter.sh common_utils.sh
USER jenkins
CMD ["bash"]

View File

@ -75,10 +75,10 @@ librosa>=0.6.2 ; python_version < "3.11"
#Pinned versions:
#test that import:
mypy==1.7.0
mypy==1.4.1
# Pin MyPy version because new errors are likely to appear with each release
#Description: linter
#Pinned versions: 1.7.0
#Pinned versions: 1.4.1
#test that import: test_typing.py, test_type_hints.py
networkx==2.8.8
@ -124,22 +124,10 @@ opt-einsum==3.3
#Pinned versions: 3.3
#test that import: test_linalg.py
optree==0.9.1
#Description: A library for tree manipulation
#Pinned versions: 0.9.1
#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
#test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
#common_utils.py, test_eager_transforms.py, test_python_dispatch.py,
#test_expanded_weights.py, test_decomp.py, test_overrides.py, test_masked.py,
#test_ops.py, test_prims.py, test_subclass.py, test_functionalization.py,
#test_schema_check.py, test_profiler_tree.py, test_meta.py, test_torchxla_num_output.py,
#test_utils.py, test_proxy_tensor.py, test_memory_profiler.py, test_view_ops.py,
#test_pointwise_ops.py, test_dtensor_ops.py, test_torchinductor.py, test_fx.py,
#test_fake_tensor.py, test_mps.py
pillow==10.0.1
pillow==9.3.0 ; python_version <= "3.8"
pillow==9.5.0 ; python_version > "3.8"
#Description: Python Imaging Library fork
#Pinned versions: 10.0.1
#Pinned versions:
#test that import:
protobuf==3.20.2
@ -283,23 +271,7 @@ pytest-cpp==2.3.0
#Pinned versions: 2.3.0
#test that import:
z3-solver==4.12.2.0
z3-solver
#Description: The Z3 Theorem Prover Project
#Pinned versions:
#test that import:
tensorboard==2.13.0
#Description: Also included in .ci/docker/requirements-docs.txt
#Pinned versions:
#test that import: test_tensorboard
pywavelets==1.4.1
#Description: This is a requirement of scikit-image, we need to pin
# it here because 1.5.0 conflicts with numpy 1.21.2 used in CI
#Pinned versions: 1.4.1
#test that import:
lxml==4.9.4
#Description: This is a requirement of unittest-xml-reporting
# have to pin to 4.9.4 because the 5.0.0 release on Dec 29th is missing
# Python 3.9 binaries

View File

@ -1 +1 @@
2.2.0
2.1.0

View File

@ -79,6 +79,12 @@ ENV OPENSSL_ROOT_DIR /opt/openssl
RUN bash ./install_openssl.sh
ENV OPENSSL_DIR /opt/openssl
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh
RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
@ -87,12 +93,6 @@ COPY ci_commit_pins/timm.txt timm.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh
RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh
ARG TRITON
# Install triton. This needs to be done before sccache because the latter will
# try to reach out to S3, which docker build runners don't have access to

View File

@ -17,6 +17,13 @@ ARG LLVMDEV
COPY ./common/install_clang.sh install_clang.sh
RUN bash ./install_clang.sh && rm install_clang.sh
# (optional) Install thrift.
ARG THRIFT
COPY ./common/install_thrift.sh install_thrift.sh
RUN if [ -n "${THRIFT}" ]; then bash ./install_thrift.sh; fi
RUN rm install_thrift.sh
ENV INSTALLED_THRIFT ${THRIFT}
# Install user
COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh
@ -146,14 +153,6 @@ COPY ci_commit_pins/triton.txt triton.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton.txt
ARG EXECUTORCH
# Build and install executorch
COPY ./common/install_executorch.sh install_executorch.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/executorch.txt executorch.txt
RUN if [ -n "${EXECUTORCH}" ]; then bash ./install_executorch.sh; fi
RUN rm install_executorch.sh common_utils.sh executorch.txt
ARG ONNX
# Install ONNX dependencies
COPY ./common/install_onnx.sh ./common/common_utils.sh ./

View File

@ -3,6 +3,11 @@
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# Use to retry ONNX test, only retry it twice
retry () {
"$@" || (sleep 60 && "$@")
}
if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# TODO: This can be removed later once vision is also part of the Docker image
pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
@ -11,5 +16,5 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# NB: ONNX test is fast (~15m) so it's ok to retry it a few more times to avoid any flaky issue; we
# need to bring this to the standard PyTorch run_test eventually. The issue will be tracked in
# https://github.com/pytorch/pytorch/issues/98626
"$ROOT_DIR/scripts/onnx/test.sh"
retry "$ROOT_DIR/scripts/onnx/test.sh"
fi

View File

@ -28,8 +28,6 @@ echo "Environment variables:"
env
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
# Use jemalloc during compilation to mitigate https://github.com/pytorch/pytorch/issues/116289
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
echo "NVCC version:"
nvcc --version
fi
@ -65,12 +63,6 @@ else
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
fi
if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then
# To build test_edge_op_registration
export BUILD_EXECUTORCH=ON
export USE_CUDA=0
fi
if ! which conda; then
# In ROCm CIs, we are doing cross compilation on build machines with
# intel cpu and later run tests on machines with amd cpu.
@ -167,14 +159,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* && -z "$TORCH_CUDA_ARCH_LIST" ]]; then
exit 1
fi
# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
# memory to build and will OOM
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ "$TORCH_CUDA_ARCH_LIST" == *"8.6"* || "$TORCH_CUDA_ARCH_LIST" == *"8.0"* ]]; then
echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
fi
if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
export CC=clang
export CXX=clang++
@ -184,6 +168,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
export LDSHARED="clang --shared"
export USE_CUDA=0
export USE_ASAN=1
export USE_MKLDNN=0
export UBSAN_FLAGS="-fno-sanitize-recover=all;-fno-sanitize=float-divide-by-zero;-fno-sanitize=float-cast-overflow"
unset USE_LLVM
fi

View File

@ -43,7 +43,7 @@ function assert_git_not_dirty() {
# TODO: we should add an option to `build_amd.py` that reverts the repo to
# an unmodified state.
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *xla* ]] ; then
git_status=$(git status --porcelain | grep -v '?? third_party' || true)
git_status=$(git status --porcelain)
if [[ $git_status ]]; then
echo "Build left local git repository checkout dirty"
echo "git status --porcelain:"
@ -171,9 +171,16 @@ function install_torchrec_and_fbgemm() {
pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
}
function install_numpy_pytorch_interop() {
local commit
commit=$(get_pinned_commit numpy_pytorch_interop)
# TODO: --no-use-pep517 will result in failure.
pip_install --user "git+https://github.com/Quansight-Labs/numpy_pytorch_interop.git@${commit}"
}
function clone_pytorch_xla() {
if [[ ! -d ./xla ]]; then
git clone --recursive -b r2.2 https://github.com/pytorch/xla.git
git clone --recursive -b r2.1 https://github.com/pytorch/xla.git
pushd xla
# pin the xla hash so that we don't get broken by changes to xla
git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"
@ -205,6 +212,15 @@ function test_torch_deploy(){
popd
}
function install_timm() {
local commit
commit=$(get_pinned_commit timm)
pip_install pandas
pip_install scipy
pip_install z3-solver
pip_install "git+https://github.com/rwightman/pytorch-image-models@${commit}"
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)

View File

@ -43,7 +43,7 @@ cross_compile_arm64() {
compile_arm64() {
# Compilation for arm64
# TODO: Compile with OpenMP support (but this causes CI regressions as cross-compilation were done with OpenMP disabled)
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
USE_DISTRIBUTED=0 USE_OPENMP=0 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
}
compile_x86_64() {

View File

@ -36,12 +36,10 @@ time python test/run_test.py --verbose -i distributed/test_functional_api
# DTensor tests
time python test/run_test.py --verbose -i distributed/_tensor/test_device_mesh
time python test/run_test.py --verbose -i distributed/_tensor/test_random_ops
time python test/run_test.py --verbose -i distributed/_tensor/test_dtensor_compile
# DeviceMesh test
time python test/run_test.py --verbose -i distributed/test_device_mesh
# DTensor/TP tests
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_ddp_2d_parallel
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_fsdp_2d_parallel

View File

@ -80,11 +80,6 @@ if [[ "$BUILD_ENVIRONMENT" != *bazel* ]]; then
CUSTOM_TEST_ARTIFACT_BUILD_DIR=$(realpath "${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-"build/custom_test_artifacts"}")
fi
# Reduce set of tests to include when running run_test.py
if [[ -n $TESTS_TO_INCLUDE ]]; then
echo "Setting INCLUDE_CLAUSE"
INCLUDE_CLAUSE="--include $TESTS_TO_INCLUDE"
fi
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
@ -153,7 +148,7 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
export PYTORCH_TEST_WITH_ASAN=1
export PYTORCH_TEST_WITH_UBSAN=1
# TODO: Figure out how to avoid hard-coding these paths
export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-15/bin/llvm-symbolizer
export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-12/bin/llvm-symbolizer
export TORCH_USE_RTLD_GLOBAL=1
# NB: We load libtorch.so with RTLD_GLOBAL for UBSAN, unlike our
# default behavior.
@ -187,7 +182,7 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
# have, and it applies to child processes.
# TODO: get rid of the hardcoded path
export LD_PRELOAD=/usr/lib/llvm-15/lib/clang/15.0.7/lib/linux/libclang_rt.asan-x86_64.so
export LD_PRELOAD=/usr/lib/llvm-12/lib/clang/12.0.1/lib/linux/libclang_rt.asan-x86_64.so
# Disable valgrind for asan
export VALGRIND=OFF
# Increase stack size, because ASAN red zones use more stack
@ -233,16 +228,13 @@ test_python_shard() {
exit 1
fi
# Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly
# shellcheck disable=SC2086
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS" --verbose
assert_git_not_dirty
}
test_python() {
# shellcheck disable=SC2086
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --verbose
assert_git_not_dirty
}
@ -289,10 +281,6 @@ test_inductor_distributed() {
# Smuggle a few multi-gpu tests here so that we don't have to request another large node
echo "Testing multi_gpu tests in test_torchinductor"
pytest test/inductor/test_torchinductor.py -k test_multi_gpu
pytest test/inductor/test_aot_inductor.py -k test_non_default_cuda_device
pytest test/inductor/test_aot_inductor.py -k test_replicate_on_devices
pytest test/distributed/_tensor/test_dtensor_compile.py
pytest test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
# this runs on both single-gpu and multi-gpu instances. It should be smart about skipping tests that aren't supported
# when the required number of GPUs isn't available
@ -315,17 +303,14 @@ test_inductor() {
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
# For example 'dynamic_aot_eager_torchbench' TEST_CONFIG means we run
# the benchmark script with '--dynamic-shapes --backend aot_eager --device cuda'
# The matrix of test options is specified in .github/workflows/inductor.yml,
# .github/workflows/inductor-periodic.yml, and
# .github/workflows/inductor-perf-test-nightly.yml
# The matrix of test options is specified in .github/workflows/periodic.yml
# and .github/workflows/inductor.yml
DYNAMO_BENCHMARK_FLAGS=()
if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--backend eager)
elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--backend aot_eager)
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--export-aot-inductor)
elif [[ "${TEST_CONFIG}" == *inductor* && "${TEST_CONFIG}" != *perf* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--inductor)
fi
@ -334,7 +319,7 @@ if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--dynamic-shapes --dynamic-batch-only)
fi
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_accuracy* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
else
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
@ -398,11 +383,6 @@ test_perf_for_dashboard() {
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_${suite}_${dtype}_${mode}_cuda_${target}.csv"
fi
if [[ "$DASHBOARD_TAG" == *freeze_autotune_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
TORCHINDUCTOR_MAX_AUTOTUNE=1 python "benchmarks/dynamo/$suite.py" \
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv"
fi
if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then
python "benchmarks/dynamo/$suite.py" \
"${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \
@ -453,12 +433,19 @@ test_single_dynamo_benchmark() {
"${DYNAMO_BENCHMARK_FLAGS[@]}" \
"$@" "${partition_flags[@]}" \
--output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
python benchmarks/dynamo/check_graph_breaks.py \
--actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
if [[ "${TEST_CONFIG}" == *inductor* ]] && [[ "${TEST_CONFIG}" != *cpu_accuracy* ]]; then
# other jobs (e.g. periodic, cpu-accuracy) may have a different set of expected models.
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
python benchmarks/dynamo/check_graph_breaks.py \
--actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
else
python benchmarks/dynamo/check_csv.py \
-f "$TEST_REPORTS_DIR/${name}_${suite}.csv"
fi
fi
}
@ -476,10 +463,8 @@ test_dynamo_benchmark() {
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_accuracy* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
@ -494,13 +479,9 @@ test_inductor_torchbench_smoketest_perf() {
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
--output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
# The threshold value needs to be actively maintained to make this check useful
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \
--export-aot-inductor --only nanogpt --output "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv"
# The threshold value needs to be actively maintained to make this check useful
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 5.2
# the reference speedup value is hardcoded in check_hf_bert_perf_csv.py
# this value needs to be actively maintained to make this check useful
python benchmarks/dynamo/check_hf_bert_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
# Check memory compression ratio for a few models
for test in hf_Albert timm_vision_transformer; do
@ -563,10 +544,6 @@ test_without_numpy() {
python -c "import sys;sys.path.insert(0, 'fake_numpy');from unittest import TestCase;import torch;x=torch.randn(3,3);TestCase().assertRaises(RuntimeError, lambda: x.numpy())"
# Regression test for https://github.com/pytorch/pytorch/issues/66353
python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;print(torch.tensor([torch.tensor(0.), torch.tensor(1.)]))"
# Regression test for https://github.com/pytorch/pytorch/issues/109387
if [[ "${TEST_CONFIG}" == *dynamo* ]]; then
python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;torch.compile(lambda x:print(x))('Hello World')"
fi
popd
}
@ -624,7 +601,7 @@ test_libtorch_jit() {
# Run jit and lazy tensor cpp tests together to finish them faster
if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$TEST_CONFIG" != *nogpu* ]]; then
LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy
LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/nvfuser_tests cpp/test_lazy
else
# CUDA tests have already been skipped when CUDA is not available
python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy -k "not CUDA"
@ -685,8 +662,7 @@ test_vulkan() {
test_distributed() {
echo "Testing distributed python tests"
# shellcheck disable=SC2086
time python test/run_test.py --distributed-tests --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" $INCLUDE_CLAUSE --verbose
time python test/run_test.py --distributed-tests --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
assert_git_not_dirty
if [[ ("$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm*) && "$SHARD_NUMBER" == 1 ]]; then
@ -995,28 +971,9 @@ test_docs_test() {
}
test_executorch() {
pushd /executorch
echo "Install torchvision and torchaudio"
# TODO(huydhn): Switch this to the pinned commits on ExecuTorch once they are
# there. These libraries need to be built here, and not part of the Docker
# image because they require the target version of torch to be installed first
pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git"
pip_install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git"
echo "Run ExecuTorch regression tests for some models"
# NB: This is a sample model, more can be added here
export PYTHON_EXECUTABLE=python
# TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
# shellcheck disable=SC1091
source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''
popd
# Test torchgen generated code for Executorch.
echo "Testing ExecuTorch op registration"
echo "Testing Executorch op registration"
"$BUILD_BIN_DIR"/test_edge_op_registration
assert_git_not_dirty
}
@ -1031,8 +988,6 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then
install_torchvision
build_xla
test_xla
elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
test_executorch
elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
test_python_legacy_jit
elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
@ -1055,10 +1010,11 @@ elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
test_dynamo_benchmark huggingface "$id"
elif [[ "${TEST_CONFIG}" == *timm* ]]; then
install_torchvision
install_timm
id=$((SHARD_NUMBER-1))
test_dynamo_benchmark timm_models "$id"
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_accuracy* ]]; then
install_torchaudio cpu
else
install_torchaudio cuda
@ -1075,7 +1031,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu_inductor* ]]; then
if [[ "${TEST_CONFIG}" != *cpu_accuracy* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
@ -1087,10 +1043,12 @@ elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
test_without_numpy
install_torchvision
install_numpy_pytorch_interop
test_dynamo_shard 1
test_aten
elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
install_numpy_pytorch_interop
test_dynamo_shard 2
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
test_without_numpy
@ -1118,10 +1076,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
test_libtorch
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
test_docs_test
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
install_torchvision
test_python
test_aten
else
install_torchvision
install_monkeytype
@ -1134,4 +1088,5 @@ else
test_custom_backend
test_torch_function_benchmark
test_benchmarks
test_executorch
fi

View File

@ -127,7 +127,8 @@ python -c "import os, glob; os.system('python -mpip install --no-index --no-deps
:: export test times so that potential sharded tests that'll branch off this build will use consistent data
python tools/stats/export_test_times.py
robocopy /E ".additional_ci_files" "%PYTORCH_FINAL_PACKAGE_DIR%\.additional_ci_files"
copy /Y ".pytorch-test-times.json" "%PYTORCH_FINAL_PACKAGE_DIR%"
copy /Y ".pytorch-test-file-ratings.json" "%PYTORCH_FINAL_PACKAGE_DIR%"
:: Also save build/.ninja_log as an artifact
copy /Y "build\.ninja_log" "%PYTORCH_FINAL_PACKAGE_DIR%\"

View File

@ -2,7 +2,6 @@
import os
import subprocess
import sys
COMMON_TESTS = [
(
@ -54,4 +53,4 @@ if __name__ == "__main__":
print("Reruning with traceback enabled")
print("Command:", command_string)
subprocess.run(command_args, check=False)
sys.exit(e.returncode)
exit(e.returncode)

View File

@ -26,6 +26,11 @@ popd
python test_custom_ops.py -v
if ERRORLEVEL 1 exit /b 1
:: TODO: fix and re-enable this test
:: See https://github.com/pytorch/pytorch/issues/25155
:: python test_custom_classes.py -v
:: if ERRORLEVEL 1 exit /b 1
python model.py --export-script-module="build/model.pt"
if ERRORLEVEL 1 exit /b 1

View File

@ -1,3 +1,7 @@
:: Skip LibTorch tests when building a GPU binary and testing on a CPU machine
:: because LibTorch tests are not well designed for this use case.
if "%USE_CUDA%" == "0" IF NOT "%CUDA_VERSION%" == "cpu" exit /b 0
call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat
if errorlevel 1 exit /b 1
@ -17,7 +21,7 @@ if not errorlevel 0 exit /b 1
cd %TMP_DIR_WIN%\build\torch\test
for /r "." %%a in (*.exe) do (
call :libtorch_check "%%~na" "%%~fa"
if errorlevel 1 goto fail
if errorlevel 1 exit /b 1
)
goto :eof
@ -30,6 +34,18 @@ set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test
:: Skip verify_api_visibility as it a compile level test
if "%~1" == "verify_api_visibility" goto :eof
:: See https://github.com/pytorch/pytorch/issues/25161
if "%~1" == "c10_metaprogramming_test" goto :eof
if "%~1" == "module_test" goto :eof
:: See https://github.com/pytorch/pytorch/issues/25312
if "%~1" == "converter_nomigraph_test" goto :eof
:: See https://github.com/pytorch/pytorch/issues/35636
if "%~1" == "generate_proposals_op_gpu_test" goto :eof
:: See https://github.com/pytorch/pytorch/issues/35648
if "%~1" == "reshape_op_gpu_test" goto :eof
:: See https://github.com/pytorch/pytorch/issues/35651
if "%~1" == "utility_ops_gpu_test" goto :eof
echo Running "%~2"
if "%~1" == "c10_intrusive_ptr_benchmark" (
:: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp
@ -40,15 +56,11 @@ if "%~1" == "c10_intrusive_ptr_benchmark" (
python test\run_test.py --cpp --verbose -i "cpp/%~1"
if errorlevel 1 (
echo %1 failed with exit code %errorlevel%
goto fail
exit /b 1
)
if not errorlevel 0 (
echo %1 failed with exit code %errorlevel%
goto fail
exit /b 1
)
:eof
exit /b 0
:fail
exit /b 1
goto :eof

View File

@ -1,7 +1,8 @@
call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat
echo Copying over test times file
robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"
copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%"
copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-file-ratings.json" "%PROJECT_DIR_WIN%"
pushd test

View File

@ -22,7 +22,8 @@ if "%SHARD_NUMBER%" == "1" (
)
echo Copying over test times file
robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"
copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%"
copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-file-ratings.json" "%PROJECT_DIR_WIN%"
echo Run nn tests
python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose

View File

@ -35,10 +35,10 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
fi
# TODO: Move both of them to Windows AMI
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0
# Install Z3 optional dependency for Windows builds.
python -m pip install z3-solver==4.12.2.0
python -m pip install z3-solver
run_tests() {
# Run nvidia-smi if available

View File

@ -0,0 +1,28 @@
from collections import OrderedDict
from cimodel.data.simple.util.branch_filters import gen_filter_dict
from cimodel.lib.miniutils import quote
CHANNELS_TO_PRUNE = ["pytorch-nightly", "pytorch-test"]
PACKAGES_TO_PRUNE = "pytorch torchvision torchaudio torchtext ignite torchcsprng"
def gen_workflow_job(channel: str):
return OrderedDict(
{
"anaconda_prune": OrderedDict(
{
"name": f"anaconda-prune-{channel}",
"context": quote("org-member"),
"packages": quote(PACKAGES_TO_PRUNE),
"channel": channel,
"filters": gen_filter_dict(branches_list=["postnightly"]),
}
)
}
)
def get_workflow_jobs():
return [gen_workflow_job(channel) for channel in CHANNELS_TO_PRUNE]

View File

@ -32,4 +32,4 @@ def gen_mobile_docker(specifier):
DOCKER_IMAGE_ASAN, DOCKER_REQUIREMENT_ASAN = gen_mobile_docker("asan")
DOCKER_IMAGE_NDK, DOCKER_REQUIREMENT_NDK = gen_mobile_docker("android-ndk-r21e")
DOCKER_IMAGE_NDK, DOCKER_REQUIREMENT_NDK = gen_mobile_docker("android-ndk-r19c")

.circleci/config.yml generated
View File

@ -444,6 +444,35 @@ jobs:
script="/Users/distiller/project/.circleci/scripts/binary_ios_upload.sh"
cat "$script"
source "$script"
anaconda_prune:
parameters:
packages:
type: string
description: "What packages are we pruning? (quoted, space-separated string. eg. 'pytorch', 'torchvision torchaudio', etc.)"
default: "pytorch"
channel:
type: string
description: "What channel are we pruning? (eq. pytorch-nightly)"
default: "pytorch-nightly"
docker:
- image: continuumio/miniconda3
environment:
- PACKAGES: "<< parameters.packages >>"
- CHANNEL: "<< parameters.channel >>"
steps:
- checkout
- run:
name: Install dependencies
no_output_timeout: "1h"
command: |
conda install -yq anaconda-client
- run:
name: Prune packages
no_output_timeout: "1h"
command: |
ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN}" \
scripts/release/anaconda-prune/run.sh
pytorch_doc_push:
resource_class: medium
machine:
@ -623,7 +652,7 @@ jobs:
- run:
name: Archive artifacts into zip
command: |
zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .additional_ci_files
zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .pytorch-test-times.json .pytorch-test-file-ratings.json
cp artifacts.zip /Users/distiller/workspace
- persist_to_workspace:
@ -1385,4 +1414,22 @@ workflows:
requires:
- pytorch_ios_full_jit_12_5_1_nightly_x86_64_build
- pytorch_ios_full_jit_12_5_1_nightly_arm64_build
- anaconda_prune:
name: anaconda-prune-pytorch-nightly
context: "org-member"
packages: "pytorch torchvision torchaudio torchtext ignite torchcsprng"
channel: pytorch-nightly
filters:
branches:
only:
- postnightly
- anaconda_prune:
name: anaconda-prune-pytorch-test
context: "org-member"
packages: "pytorch torchvision torchaudio torchtext ignite torchcsprng"
channel: pytorch-test
filters:
branches:
only:
- postnightly
when: << pipeline.parameters.run_build >>

View File

@ -10,6 +10,8 @@ import shutil
import sys
from collections import namedtuple
import cimodel.data.simple.anaconda_prune_defintions
import cimodel.data.simple.docker_definitions
import cimodel.data.simple.mobile_definitions
import cimodel.data.simple.nightly_ios
@ -142,6 +144,7 @@ def gen_build_workflows_tree():
build_workflows_functions = [
cimodel.data.simple.mobile_definitions.get_workflow_jobs,
cimodel.data.simple.nightly_ios.get_workflow_jobs,
cimodel.data.simple.anaconda_prune_defintions.get_workflow_jobs,
]
build_jobs = [f() for f in build_workflows_functions]
build_jobs.extend(

View File

@ -62,7 +62,7 @@ git --no-pager log --max-count 1
popd
# Clone the Builder main repo
retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT"
retry git clone -q https://github.com/pytorch/builder.git -b release/2.1 "$BUILDER_ROOT"
pushd "$BUILDER_ROOT"
echo "Using builder from "
git --no-pager log --max-count 1

View File

@ -33,7 +33,7 @@ fi
cp ${PROJ_ROOT}/LICENSE ${ZIP_DIR}/
# zip the library
export DATE="$(date -u +%Y%m%d)"
export IOS_NIGHTLY_BUILD_VERSION="2.2.0.${DATE}"
export IOS_NIGHTLY_BUILD_VERSION="2.1.0.${DATE}"
if [ "${BUILD_LITE_INTERPRETER}" == "1" ]; then
# libtorch_lite_ios_nightly_1.11.0.20210810.zip
ZIPFILE="libtorch_lite_ios_nightly_${IOS_NIGHTLY_BUILD_VERSION}.zip"

View File

@ -54,7 +54,7 @@ fi
# Move debug wheels out of the package dir so they don't get installed
# Move debug wheels out of the the package dir so they don't get installed
mkdir -p /tmp/debug_final_pkgs
mv /final_pkgs/debug-*.zip /tmp/debug_final_pkgs || echo "no debug packages to move"
@ -66,12 +66,6 @@ mv /final_pkgs/debug-*.zip /tmp/debug_final_pkgs || echo "no debug packages to m
# conda build scripts themselves. These should really be consolidated
# Pick only one package of multiple available (which happens as result of workflow re-runs)
pkg="/final_pkgs/\$(ls -1 /final_pkgs|sort|tail -1)"
if [[ "\$PYTORCH_BUILD_VERSION" == *dev* ]]; then
CHANNEL="nightly"
else
CHANNEL="test"
fi
if [[ "$PACKAGE_TYPE" == conda ]]; then
(
# For some reason conda likes to re-activate the conda environment when attempting this install
@ -89,14 +83,25 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
if [[ "$DESIRED_CUDA" == 'cpu' ]]; then
retry conda install -c pytorch -y cpuonly
else
cu_ver="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4}"
CUDA_PACKAGE="pytorch-cuda"
retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c "pytorch-\${CHANNEL}" "pytorch-cuda=\${cu_ver}"
PYTORCH_CHANNEL="pytorch"
if [[ "\${TORCH_CONDA_BUILD_FOLDER}" == "pytorch-nightly" ]]; then
PYTORCH_CHANNEL="pytorch-nightly"
fi
retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c "\${PYTORCH_CHANNEL}" "pytorch-cuda=\${cu_ver}"
fi
conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
)
elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
if [[ "$(uname -m)" == aarch64 ]]; then
# Using "extra-index-url" until all needed aarch64 dependencies are
# added to "https://download.pytorch.org/whl/nightly/"
pip install "\$pkg" --extra-index-url "https://download.pytorch.org/whl/nightly/${DESIRED_CUDA}"
else
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/nightly/${DESIRED_CUDA}"
fi
retry pip install -q numpy protobuf typing-extensions
fi
if [[ "$PACKAGE_TYPE" == libtorch ]]; then

View File

@ -59,7 +59,7 @@ PIP_UPLOAD_FOLDER='nightly/'
# We put this here so that OVERRIDE_PACKAGE_VERSION below can read from it
export DATE="$(date -u +%Y%m%d)"
#TODO: We should be pulling semver version from the base version.txt
BASE_BUILD_VERSION="2.2.0.dev$DATE"
BASE_BUILD_VERSION="2.1.0.dev$DATE"
# Change BASE_BUILD_VERSION to git tag when on a git tag
# Use 'git -C' to make doubly sure we're in the correct directory for checking
# the git tag
@ -77,8 +77,13 @@ else
export PYTORCH_BUILD_VERSION="${BASE_BUILD_VERSION}+$DESIRED_CUDA"
fi
if [[ -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
export PYTORCH_BUILD_VERSION="${PYTORCH_BUILD_VERSION}-with-pypi-cudnn"
fi
export PYTORCH_BUILD_NUMBER=1
JAVA_HOME=
BUILD_JNI=OFF
if [[ "$PACKAGE_TYPE" == libtorch ]]; then
@ -150,8 +155,8 @@ EOL
# nproc doesn't exist on darwin
if [[ "$(uname)" != Darwin ]]; then
# This was lowered from 18 to 12 to avoid OOMs when compiling FlashAttentionV2
MEMORY_LIMIT_MAX_JOBS=12
# Because most Circle executors only have 20 CPUs, using more causes OOMs w/ Ninja and nvcc parallelization
MEMORY_LIMIT_MAX_JOBS=18
NUM_CPUS=$(( $(nproc) - 2 ))
# Defaults here for **binary** linux builds so they can be changed in one place

View File

@ -11,11 +11,16 @@ PKG_DIR=${PKG_DIR:-/tmp/workspace/final_pkgs}
# currently set within `designate_upload_channel`
UPLOAD_CHANNEL=${UPLOAD_CHANNEL:-nightly}
# Designates what subfolder to put packages into
UPLOAD_SUBFOLDER=${UPLOAD_SUBFOLDER:-}
UPLOAD_SUBFOLDER=${UPLOAD_SUBFOLDER:-cpu}
UPLOAD_BUCKET="s3://pytorch"
BACKUP_BUCKET="s3://pytorch-backup"
BUILD_NAME=${BUILD_NAME:-}
# this is a temporary change to upload pypi-cudnn builds to a separate folder
if [[ ${BUILD_NAME} == *with-pypi-cudnn* ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_cudnn"
fi
DRY_RUN=${DRY_RUN:-enabled}
# Don't actually do work unless explicit
ANACONDA="true anaconda"
@ -59,17 +64,12 @@ s3_upload() {
local pkg_type
extension="$1"
pkg_type="$2"
s3_root_dir="${UPLOAD_BUCKET}/${pkg_type}/${UPLOAD_CHANNEL}"
if [[ -z ${UPLOAD_SUBFOLDER:-} ]]; then
s3_upload_dir="${s3_root_dir}/"
else
s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/"
fi
s3_dir="${UPLOAD_BUCKET}/${pkg_type}/${UPLOAD_CHANNEL}/${UPLOAD_SUBFOLDER}/"
(
for pkg in ${PKG_DIR}/*.${extension}; do
(
set -x
${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}"
${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_dir}"
)
done
)
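Note: with the defaults visible above (`UPLOAD_BUCKET=s3://pytorch`, `UPLOAD_CHANNEL=nightly`, `UPLOAD_SUBFOLDER=cpu`), the hardcoded `s3_dir` resolves to, for example:

```
s3://pytorch/whl/nightly/cpu/        # pkg_type=whl
s3://pytorch/libtorch/nightly/cpu/   # pkg_type=libtorch
```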
@ -82,17 +82,15 @@ pip install -q awscli
case "${PACKAGE_TYPE}" in
conda)
conda_upload
for conda_archive in ${PKG_DIR}/*.tar.bz2; do
# Fetch platform (eg. win-64, linux-64, etc.) from index file because
# there's no actual conda command to read this
subdir=$(\
tar -xOf "${conda_archive}" info/index.json \
| grep subdir \
| cut -d ':' -f2 \
| sed -e 's/[[:space:]]//' -e 's/"//g' -e 's/,//' \
)
BACKUP_DIR="conda/${subdir}"
done
# Fetch platform (eg. win-64, linux-64, etc.) from index file
# Because there's no actual conda command to read this
subdir=$(\
tar -xOf ${PKG_DIR}/*.bz2 info/index.json \
| grep subdir \
| cut -d ':' -f2 \
| sed -e 's/[[:space:]]//' -e 's/"//g' -e 's/,//' \
)
BACKUP_DIR="conda/${subdir}"
;;
libtorch)
s3_upload "zip" "libtorch"

View File

@ -42,3 +42,32 @@ jobs:
script="/Users/distiller/project/.circleci/scripts/binary_ios_upload.sh"
cat "$script"
source "$script"
anaconda_prune:
parameters:
packages:
type: string
description: "What packages are we pruning? (quoted, space-separated string. eg. 'pytorch', 'torchvision torchaudio', etc.)"
default: "pytorch"
channel:
type: string
description: "What channel are we pruning? (eq. pytorch-nightly)"
default: "pytorch-nightly"
docker:
- image: continuumio/miniconda3
environment:
- PACKAGES: "<< parameters.packages >>"
- CHANNEL: "<< parameters.channel >>"
steps:
- checkout
- run:
name: Install dependencies
no_output_timeout: "1h"
command: |
conda install -yq anaconda-client
- run:
name: Prune packages
no_output_timeout: "1h"
command: |
ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN}" \
scripts/release/anaconda-prune/run.sh

View File

@ -177,7 +177,7 @@
- run:
name: Archive artifacts into zip
command: |
zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .additional_ci_files
zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .pytorch-test-times.json .pytorch-test-file-ratings.json
cp artifacts.zip /Users/distiller/workspace
- persist_to_workspace:

View File

@ -1,8 +1,5 @@
---
# NOTE there must be no spaces before the '-', so put the comma last.
# The check bugprone-unchecked-optional-access is also turned off atm
# because it causes clang-tidy to hang randomly. The tracking issue
# can be found at https://github.com/llvm/llvm-project/issues/69369.
InheritParentConfig: true
Checks: '
bugprone-*,
@ -12,7 +9,6 @@ bugprone-*,
-bugprone-lambda-function-name,
-bugprone-reserved-identifier,
-bugprone-swapped-arguments,
-bugprone-unchecked-optional-access,
clang-diagnostic-missing-prototypes,
cppcoreguidelines-*,
-cppcoreguidelines-avoid-do-while,
@ -34,13 +30,8 @@ cppcoreguidelines-*,
-facebook-hte-RelativeInclude,
hicpp-exception-baseclass,
hicpp-avoid-goto,
misc-*,
-misc-const-correctness,
-misc-use-anonymous-namespace,
-misc-unused-parameters,
-misc-no-recursion,
-misc-non-private-member-variables-in-classes,
-misc-confusable-identifiers,
misc-unused-alias-decls,
misc-unused-using-decls,
modernize-*,
-modernize-concat-nested-namespaces,
-modernize-macro-to-enum,
@ -53,7 +44,7 @@ modernize-*,
performance-*,
readability-container-size-empty,
'
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
HeaderFilterRegex: '^(c10/(?!test)|torch/csrc/(?!deploy/interpreter/cpython)).*$'
AnalyzeTemporaryDtors: false
WarningsAsErrors: '*'
...

View File

@ -1,72 +0,0 @@
# Step by step guide on using PyTorch's DevContainer
Using PyTorch's DevContainer environment involves a series of steps that will help you set up a development environment that is isolated and replicable. Below, we'll guide you through each step to make this process as smooth as possible:
## Step 1: Install VSCode
1. Navigate to the [Visual Studio Code website](https://code.visualstudio.com/).
2. Download the appropriate installer for your operating system (Windows, Linux, or macOS).
3. Run the installer and follow the on-screen instructions to install VSCode on your system.
4. After installation, launch VSCode.
## Step 2: Install DevContainer Extension
1. In VSCode, go to the Extensions view by clicking on the Extensions icon in the Activity Bar on the side of the window.
2. Search for "Dev Containers" in the Extensions view search bar.
3. Find the "Dev Containers" extension in the search results and click on the install button to install it.
You can also go to the extension's [homepage](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) and [documentation page](https://code.visualstudio.com/docs/devcontainers/containers) to find more details.
## Step 3: Install Docker and Add Current Login User to Docker Group
1. Follow the [official guide](https://docs.docker.com/get-docker/) to install Docker. Don't forget the [post installation steps](https://docs.docker.com/engine/install/linux-postinstall/).
If you are using [Visual Studio Code Remote - SSH](https://code.visualstudio.com/docs/remote/ssh), you only need to install Docker on the remote host, not on your local computer, and the following steps should be run on the remote host.
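2. The usual post-install step is adding your login user to the `docker` group so that `docker` commands work without `sudo`; the commands below follow Docker's post-installation guide:
```
sudo groupadd docker           # the group usually exists already
sudo usermod -aG docker $USER
newgrp docker                  # or log out and back in for the change to take effect
```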
## Step 4 (Optional): Install NVIDIA Container Toolkit for GPU Usage
1. If you intend to use GPU resources, first ensure you have NVIDIA drivers installed on your system. Check if `nvidia-smi` works to verify your GPU setup.
2. Follow the [official guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#docker) to install the NVIDIA Container Toolkit.
3. After installation, verify that the toolkit is installed correctly by running:
```
docker run --rm --runtime=nvidia --gpus all nvidia/cuda:11.6.2-base-ubuntu20.04 nvidia-smi
```
## Step 5: Clone PyTorch
1. Open a terminal or command prompt.
2. Use the following command to clone the PyTorch repository:
```
git clone https://github.com/pytorch/pytorch
```
3. Navigate to the cloned directory:
```
cd pytorch
```
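PyTorch vendors a number of third-party dependencies as git submodules. The dev container's setup typically takes care of fetching them, but if you end up with missing submodules you can initialize them manually with the standard git commands from PyTorch's build instructions:
```
git submodule sync
git submodule update --init --recursive
```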
## Step 6: Open in DevContainer
1. In VSCode, use the Command Palette (`Ctrl+Shift+P` or `Cmd+Shift+P` on macOS) to run the "Remote-Containers: Open Folder in Container..." command.
2. You will be prompted with two options: CPU dev container or CUDA dev container. Choose the one you want to run.
## Step 7: Wait for Building the Environment
1. After opening the folder in a DevContainer, VSCode will start building the container. This process can take some time as it involves downloading necessary images and setting up the environment.
2. You can monitor the progress in the VSCode terminal.
3. Once the build process completes, you'll have a fully configured PyTorch development environment in a container.
4. The next time you open the same dev container, it will be much faster, as it does not require building the image again.
You are now all set to start developing with PyTorch in a DevContainer environment. This setup ensures you have a consistent and isolated development environment for your PyTorch projects.
## Step 8: Build PyTorch
To build PyTorch from source, simply run:
```
python setup.py develop
```
The process involves compiling thousands of files and can take a long time. Fortunately, the compiled objects are reused across builds: when you modify some files, only the changed files need to be recompiled the next time.
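For example, after the initial (long) build, editing a file and rerunning the same command performs an incremental rebuild (a sketch; actual build times depend on your hardware):
```
# First build: compiles everything, may take a long time
python setup.py develop
# After editing a few files, rerun; only the changed files are recompiled
python setup.py develop
```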
Note that only the contents of the `pytorch` directory are saved to disk. This directory is mounted into the Docker container, while everything else in the container is temporary and will be lost if Docker restarts the container or the server reboots.
For an in-depth understanding of Dev Containers and their caveats, please refer to [the full documentation](https://code.visualstudio.com/docs/devcontainers/containers).

View File

@ -9,5 +9,3 @@ make setup_lint
# Add CMAKE_PREFIX_PATH to bashrc
echo 'export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}' >> ~/.bashrc
# Add linker path so that cuda-related libraries can be found
echo 'export LDFLAGS="-L${CONDA_PREFIX}/lib/ $LDFLAGS"' >> ~/.bashrc

.flake8
View File

@ -2,7 +2,7 @@
# NOTE: **Mirror any changes** to this file the [tool.ruff] config in pyproject.toml
# before we can fully move to use ruff
enable-extensions = G
select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2
select = B,C,E,F,G,P,SIM1,T4,W,B9
max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
@ -14,21 +14,15 @@ ignore =
# to line this up with executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907
B007,B008,B017,B019,B020,B023,B024,B026,B028,B903,B904,B905,B906,B907
# these ignores are from flake8-comprehensions; please fix!
C407,
# these ignores are from flake8-logging-format; please fix!
G100,G101,G200
G100,G101,G200,G201,G202
# these ignores are from flake8-simplify. please fix or ignore with commented reason
SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
# flake8-simplify code styles
SIM102,SIM103,SIM106,SIM112,
# TorchFix codes that don't make sense for PyTorch itself:
# removed and deprecated PyTorch functions.
TOR001,TOR101,
# TODO(kit1980): fix all TOR102 issues
# `torch.load` without `weights_only` parameter is unsafe
TOR102,
per-file-ignores =
__init__.py: F401
torch/utils/cpp_extension.py: B950

View File

@ -7,7 +7,7 @@ self-hosted-runner:
- linux.4xlarge
- linux.12xlarge
- linux.24xlarge
- linux.arm64.2xlarge
- linux.t4g.2xlarge
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu
@ -23,5 +23,3 @@ self-hosted-runner:
- macos-12-xl
- macos-12
- macos12.3-m1
- macos-latest-xlarge
- macos-13-xlarge

View File

@ -13,10 +13,6 @@ inputs:
required: true
type: string
description: JSON description of what test configs to run.
job-name:
type: string
required: false
default: ""
outputs:
test-matrix:
@ -46,8 +42,7 @@ runs:
retry_wait_seconds: 30
command: |
set -eux
# PyYAML 6.0 doesn't work with MacOS x86 anymore
python3 -m pip install requests==2.26.0 pyyaml==6.0.1
python3 -m pip install requests==2.26.0 pyyaml==6.0
- name: Parse ref
id: parse-ref
@ -61,7 +56,6 @@ runs:
- name: Get the job name
id: get-job-name
if: inputs.job-name == ''
continue-on-error: true
shell: bash
run: |
@ -97,7 +91,7 @@ runs:
shell: bash
env:
GITHUB_TOKEN: ${{ inputs.github-token }}
JOB_NAME: ${{ inputs.job-name == '' && steps.get-job-name.outputs.job-name || inputs.job-name }}
JOB_NAME: ${{ steps.get-job-name.outputs.job-name }}
PR_NUMBER: ${{ github.event.pull_request.number }}
TAG: ${{ steps.parse-ref.outputs.tag }}
EVENT_NAME: ${{ github.event_name }}

View File

@ -11,20 +11,18 @@ outputs:
job-id:
description: The retrieved workflow job id
value: ${{ steps.get-job-id.outputs.job-id }}
job-name:
description: The retrieved workflow job name
value: ${{ steps.get-job-id.outputs.job-name }}
runs:
using: composite
steps:
- name: Get job id and name or fail
- name: Get jobid or fail
# timeout-minutes is unsupported for composite workflows, see https://github.com/actions/runner/issues/1979
# timeout-minutes: 10
shell: bash
id: get-job-id
run: |
set -eux
python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}"
GHA_WORKFLOW_JOB_ID=$(python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}")
echo "job-id=${GHA_WORKFLOW_JOB_ID}" >> "${GITHUB_OUTPUT}"
env:
GITHUB_TOKEN: ${{ inputs.github-token }}

View File

@ -10,13 +10,6 @@ inputs:
description: Shard number for the current job
required: false
default: "0"
sha:
description: SHA for the commit
required: true
test_config:
description: Name of the test config
required: false
default: "default"
job_identifier:
description: Text that uniquely identifies a given job type within a workflow. All shards of a job should share the same job identifier.
required: true
@ -40,8 +33,6 @@ runs:
env:
CACHE_DIR: ${{ inputs.cache_dir }}
JOB_IDENTIFIER: ${{ inputs.job_identifier }}
SHA: ${{ inputs.sha }}
TEST_CONFIG: ${{ inputs.test_config }}
SHARD: ${{ inputs.shard }}
REPO: ${{ github.repository }}
run: |
@ -50,8 +41,6 @@ runs:
--cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \
--pr_identifier $GITHUB_REF \
--job_identifier $JOB_IDENTIFIER \
--sha $SHA \
--test_config $TEST_CONFIG \
--shard $SHARD \
--repo $REPO \
--temp_dir $RUNNER_TEMP \

View File

@ -43,14 +43,14 @@ runs:
FILE_SUFFIX: ${{ inputs.file-suffix }}
run: |
# Remove any previous test reports if they exist
rm -f logs-*.zip
rm -f usage-log-*.zip
# this workflow is also run in bazel build test, but we don't generate usage reports for it
# so check to see if the file exists first
if [ -f 'usage_log.txt' ]; then
zip "logs-${FILE_SUFFIX}.zip" 'usage_log.txt'
zip "usage-log-${FILE_SUFFIX}.zip" 'usage_log.txt'
fi
if ls test/**/*.log 1> /dev/null 2>&1; then
zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log'
zip -r "usage-log-${FILE_SUFFIX}.zip" test -i '*.log'
fi
# Windows zip
@ -80,7 +80,7 @@ runs:
FILE_SUFFIX: ${{ inputs.file-suffix }}
run: |
# -ir => recursive include all files in pattern
7z a "logs-$Env:FILE_SUFFIX.zip" 'usage_log.txt' -ir'!test\*.log'
7z a "usage-log-$Env:FILE_SUFFIX.zip" 'usage_log.txt' -ir'!test\*.log'
# S3 upload
- name: Store Test Downloaded JSONs on S3
@ -112,7 +112,7 @@ runs:
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 14
if-no-files-found: ignore
path: logs-*.zip
path: usage-log-*.zip
# GHA upload
- name: Store Test Downloaded JSONs on Github
@ -146,7 +146,7 @@ runs:
continue-on-error: true
with:
# Add the run attempt, see [Artifact run attempt]
name: logs-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip
name: usage-log-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip
retention-days: 14
if-no-files-found: ignore
path: |

View File

@ -1 +1 @@
6518fa9b2c74e84d7eb1fc6e3eb51e43213f0c05
a8f4e97bd5356a7a77510cdf6a3a62e25a5dc602

View File

@ -1 +1 @@
de731af65b4f04696e85c729e3282450b51b95fd
1b2746f642cc2c99fe9d1a0c34359c0de45341c2

View File

@ -0,0 +1 @@
0c4e82511d349358d2c8c492dd833334e742f27f

.github/ci_commit_pins/timm.txt vendored Normal file
View File

@ -0,0 +1 @@
b9d43c7dcac1fe05e851dd7be7187b108af593d2

View File

@ -1 +1 @@
99944a2fb8624947f9c0e2edc898ff42a16124da
9371b9e13c826f3930e54346b4d619cb59182f68

View File

@ -1 +1 @@
c1e2095c3a16fbe7db25b9e2f206025488c2c203
47cd5ea8e21d7596a24907710411d6b4a43f628d

View File

@ -1 +1 @@
r2.2
r2.1

.github/labeler.yml vendored
View File

@ -15,7 +15,6 @@
"ciflow/inductor":
- torch/_decomp/**
- torch/_dynamo/**
- torch/_export/**
- torch/_inductor/**
- benchmarks/dynamo/**
- torch/_subclasses/fake_tensor.py
@ -29,10 +28,6 @@
- .github/ci_commit_pins/**
- c10/core/Sym*
- torch/fx/experimental/symbolic_shapes.py
- test/distributed/_tensor/test_dtensor_compile.py
- test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
- torch/distributed/_tensor/**
- torch/distributed/fsdp/**
"module: cpu":
- aten/src/ATen/cpu/**
@ -71,10 +66,3 @@
"ciflow/trunk":
- .ci/docker/ci_commit_pins/triton.txt
"module: distributed":
- torch/csrc/distributed/**
- torch/distributed/**
- torch/nn/parallel/**
- test/distributed/**
- torch/testing/_internal/distributed/**

View File

@ -4,19 +4,15 @@
- .ci/onnx/*
- .ci/docker/common/install_onnx.sh
- aten/src/ATen/core/interned_strings.h
- benchmarks/dynamo/**
- docs/source/onnx.rst
- docs/source/onnx*
- docs/source/scripts/onnx/**
- docs/source/_static/img/onnx/**
- scripts/onnx/**
- test/onnx/**
- test/onnx_caffe2/**
- tools/onnx/**
- torch/_dynamo/backends/onnxrt.py
- torch/_C/__init__.pyi.in
- torch/_C/_onnx.pyi
- torch/_logging/**
- torch/csrc/jit/passes/onnx.*
- torch/csrc/jit/passes/onnx/**
- torch/csrc/jit/serialization/export.*
@ -26,6 +22,8 @@
- torch/testing/_internal/common_methods_invocations.py
- third_party/onnx
- caffe2/python/onnx/**
- benchmarks/dynamo/_onnx/**
- torch/_logging/**
approved_by:
- BowenBao
- abock
@ -74,7 +72,6 @@
- name: OSS CI / pytorchbot
patterns:
- .github/ci_commit_pins/audio.txt
- .github/ci_commit_pins/vision.txt
- .github/ci_commit_pins/torchdynamo.txt
- .ci/docker/ci_commit_pins/triton.txt
@ -85,19 +82,6 @@
- EasyCLA
- Lint
- pull
- inductor
- name: OSS CI /pytorchbot / Executorch
patterns:
- .ci/docker/ci_commit_pins/executorch.txt
approved_by:
- pytorchbot
ignore_flaky_failures: false
mandatory_checks_name:
- EasyCLA
- Lint
- pull / linux-jammy-py3-clang12-executorch / build
- pull / linux-jammy-py3-clang12-executorch / test (executorch, 1, 1, linux.2xlarge)
- name: OSS CI / pytorchbot / XLA
patterns:
@ -108,8 +92,8 @@
mandatory_checks_name:
- EasyCLA
- Lint
- pull / linux-focal-py3_8-clang9-xla / build
- pull / linux-focal-py3_8-clang9-xla / test (xla, 1, 1, linux.12xlarge)
- pull / linux-bionic-py3_8-clang8-xla / build
- pull / linux-bionic-py3_8-clang8-xla / test (xla, 1, 1, linux.12xlarge)
- name: Documentation
patterns:
@ -139,6 +123,9 @@
- name: PrimTorch
patterns:
- aten/src/ATen/native_functions.yaml
- aten/src/ATen/native/**
- test/**
- torch/_meta_registrations.py
- torch/_decomp/**
- torch/_refs/**
@ -332,7 +319,6 @@
- XiaobingSuper
- jgong5
- vfdev-5
- leslie-fang-intel
mandatory_checks_name:
- EasyCLA
- Lint
@ -351,21 +337,6 @@
- Lint
- pull
- name: x86 CPU quantization
patterns:
- torch/ao/quantization/quantizer/x86_inductor_quantizer.py
- torch/_inductor/fx_passes/quantization.py
- test/quantization/core/test_quantized_op.py
- test/inductor/test_mkldnn_pattern_matcher.py
- test/quantization/pt2e/test_x86inductor_quantizer.py
approved_by:
- leslie-fang-intel
- jgong5
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: Autocast
patterns:
- torch/amp/**

View File

@ -10,7 +10,6 @@ ciflow_push_tags:
- ciflow/mps
- ciflow/nightly
- ciflow/periodic
- ciflow/rocm
- ciflow/slow
- ciflow/trunk
- ciflow/unstable

View File

@ -1,5 +1,7 @@
blas=1.0
cmake=3.22.1
mkl=2022.1.0
mkl-include=2022.1.0
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0

View File

@ -5,7 +5,7 @@ cmake=3.22.*
typing-extensions=4.3.0
dataclasses=0.8
pip=22.2.2
pillow=10.0.1
pillow=9.2.0
pkg-config=0.29.2
wheel=0.37.1
# NB: This is intentionally held back because anaconda main doesn't

View File

@ -7,7 +7,7 @@ cmake=3.22.*
typing-extensions=4.3.0
dataclasses=0.8
pip=22.2.2
pillow=10.0.1
pillow=9.2.0
libuv=1.40.0
pkg-config=0.29.2
wheel=0.37.1

View File

@ -1,4 +1,3 @@
# iOS simulator requirements
coremltools==5.0b5
protobuf==3.20.2
optree==0.9.1

View File

@ -10,7 +10,6 @@ numba<=0.49.1; platform_machine != "arm64"
opt-einsum>=3.3
psutil==5.9.1
nvidia-ml-py==11.525.84
packaging==23.1
pygments==2.15.0
pytest==7.3.2
pytest-xdist==3.3.1
@ -26,5 +25,3 @@ sympy==1.11.1
pytest-cpp==2.3.0
rockset==1.0.3
z3-solver==4.12.2.0
tensorboard==2.13.0
optree==0.9.1

View File

@ -1,2 +1,2 @@
typing-extensions>=4.8.0
typing-extensions
jinja2

View File

@ -60,20 +60,12 @@ def build_triton(
build_conda: bool = False,
build_rocm: bool = False,
py_version: Optional[str] = None,
release: bool = False,
) -> Path:
env = os.environ.copy()
if "MAX_JOBS" not in env:
max_jobs = os.cpu_count() or 1
env["MAX_JOBS"] = str(max_jobs)
version_suffix = ""
if not release:
# Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8
# while release build should only include the version, i.e. 2.1.0
version_suffix = f"+{commit_hash[:10]}"
version += version_suffix
with TemporaryDirectory() as tmpdir:
triton_basedir = Path(tmpdir) / "triton"
triton_pythondir = triton_basedir / "python"
@ -84,18 +76,11 @@ def build_triton(
triton_repo = "https://github.com/openai/triton"
triton_pkg_name = "pytorch-triton"
check_call(["git", "clone", triton_repo], cwd=tmpdir)
if release:
ver, rev, patch = version.split(".")
check_call(
["git", "checkout", f"release/{ver}.{rev}.x"], cwd=triton_basedir
)
else:
check_call(["git", "checkout", commit_hash], cwd=triton_basedir)
check_call(["git", "checkout", commit_hash], cwd=triton_basedir)
if build_conda:
with open(triton_basedir / "meta.yaml", "w") as meta:
print(
f"package:\n name: torchtriton\n version: {version}\n",
f"package:\n name: torchtriton\n version: {version}+{commit_hash[:10]}\n",
file=meta,
)
print("source:\n path: .\n", file=meta)
@ -118,7 +103,7 @@ def build_triton(
patch_init_py(
triton_pythondir / "triton" / "__init__.py",
version=f"{version}",
version=f"{version}+{commit_hash[:10]}",
)
if py_version is None:
py_version = f"{sys.version_info.major}.{sys.version_info.minor}"
@ -141,21 +126,17 @@ def build_triton(
shutil.copy(conda_path, Path.cwd())
return Path.cwd() / conda_path.name
# change built wheel name and version
env["TRITON_WHEEL_NAME"] = triton_pkg_name
env["TRITON_WHEEL_VERSION_SUFFIX"] = version_suffix
patch_setup_py(
triton_pythondir / "setup.py",
name=triton_pkg_name,
version=f"{version}+{commit_hash[:10]}",
)
patch_init_py(
triton_pythondir / "triton" / "__init__.py",
version=f"{version}",
version=f"{version}+{commit_hash[:10]}",
)
if build_rocm:
# TODO: Remove me when ROCM triton is updated
patch_setup_py(
triton_pythondir / "setup.py",
name=triton_pkg_name,
version=f"{version}",
)
check_call("scripts/amd/setup_rocm_libs.sh", cwd=triton_basedir, shell=True)
print("ROCm libraries setup for triton installation...")
@ -176,14 +157,12 @@ def main() -> None:
from argparse import ArgumentParser
parser = ArgumentParser("Build Triton binaries")
parser.add_argument("--release", action="store_true")
parser.add_argument("--build-conda", action="store_true")
parser.add_argument("--build-rocm", action="store_true")
parser.add_argument("--py-version", type=str)
parser.add_argument("--commit-hash", type=str)
parser.add_argument("--triton-version", type=str, default=read_triton_version())
args = parser.parse_args()
build_triton(
build_rocm=args.build_rocm,
commit_hash=args.commit_hash
@ -192,7 +171,6 @@ def main() -> None:
version=args.triton_version,
build_conda=args.build_conda,
py_version=args.py_version,
release=args.release,
)

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python3
"""Check whether a PR has required labels."""
import sys
from typing import Any
from github_utils import gh_delete_comment, gh_post_pr_comment
@ -47,7 +46,7 @@ def main() -> None:
except Exception as e:
pass
sys.exit(0)
exit(0)
if __name__ == "__main__":

Binary file not shown.

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3
import argparse
import sys
from pathlib import Path
@ -9,11 +10,9 @@ import yaml
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
WORKFLOWS = REPO_ROOT / ".github" / "workflows"
EXPECTED_GROUP_PREFIX = (
"${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}"
)
EXPECTED_GROUP = (
EXPECTED_GROUP_PREFIX + "-${{ github.event_name == 'workflow_dispatch' }}"
"${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}"
"-${{ github.event_name == 'workflow_dispatch' }}"
)
@ -27,8 +26,15 @@ def should_check(filename: Path) -> bool:
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Ensure all relevant GitHub actions jobs will be cancelled based on a concurrency key"
)
args = parser.parse_args()
files = list(WORKFLOWS.glob("*.yml"))
errors_found = False
files = [f for f in WORKFLOWS.glob("*.yml") if should_check(f)]
files = [f for f in files if should_check(f)]
names = set()
for filename in files:
with open(filename) as f:
@ -40,18 +46,7 @@ if __name__ == "__main__":
errors_found = True
names.add(name)
actual = data.get("concurrency", {})
if filename.name == "create_release.yml":
if not actual.get("group", "").startswith(EXPECTED_GROUP_PREFIX):
print(
f"'concurrency' incorrect or not found in '{filename.relative_to(REPO_ROOT)}'",
file=sys.stderr,
)
print(
f"concurrency group should start with {EXPECTED_GROUP_PREFIX} but found {actual.get('group', None)}",
file=sys.stderr,
)
errors_found = True
elif not actual.get("group", "").startswith(EXPECTED_GROUP):
if not actual.get("group", "").startswith(EXPECTED_GROUP):
print(
f"'concurrency' incorrect or not found in '{filename.relative_to(REPO_ROOT)}'",
file=sys.stderr,

View File

@ -62,9 +62,9 @@ SUPPORTED_PERIODICAL_MODES: Dict[str, Callable[[Optional[str]], bool]] = {
}
# The link to the published list of disabled jobs
DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=jbbJUxI_SSZFssBBGCU6ybH9sxHitHLY"
DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json"
# and unstable jobs
UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=hUtTalgnWb1m3AtJyVLUdu7DBrnddRkp"
UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json"
# Some constants used to handle disabled and unstable jobs
JOB_NAME_SEP = "/"
@ -410,17 +410,16 @@ def process_jobs(
if target_job in (TEST_JOB_NAME, BUILD_AND_TEST_JOB_NAME):
target_cfg = m.group("cfg")
# NB: There can be multiple unstable configurations, i.e. inductor, inductor_huggingface
test_matrix = _filter_jobs(
return _filter_jobs(
test_matrix=test_matrix,
issue_type=issue_type,
target_cfg=target_cfg,
)
else:
warnings.warn(
f"Found a matching {issue_type.value} issue {target_url} for {workflow} / {job_name}, "
+ f"but the name {target_job_cfg} is invalid"
)
warnings.warn(
f"Found a matching {issue_type.value} issue {target_url} for {workflow} / {job_name}, "
+ f"but the name {target_job_cfg} is invalid"
)
# Found no matching target, return the same input test matrix
return test_matrix

View File

@ -10,19 +10,13 @@ architectures:
* Latest ROCM
"""
import os
from typing import Dict, List, Optional, Tuple
CUDA_ARCHES = ["11.8", "12.1"]
CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1"}
CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8"}
ROCM_ARCHES = ["5.6", "5.7"]
ROCM_ARCHES = ["5.5", "5.6"]
CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
@ -31,80 +25,6 @@ CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
CPU_AARCH64_ARCH = ["cpu-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"11.8": (
"nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"12.1": (
"nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
}
def get_nccl_submodule_version() -> str:
from pathlib import Path
nccl_version_mk = (
Path(__file__).absolute().parent.parent.parent
/ "third_party"
/ "nccl"
/ "nccl"
/ "makefiles"
/ "version.mk"
)
if not nccl_version_mk.exists():
raise RuntimeError(
"Please make sure that nccl submodule is checked out when importing this script"
)
with nccl_version_mk.open("r") as f:
content = f.read()
d = {}
for l in content.split("\n"):
if not l.startswith("NCCL_"):
continue
(k, v) = l.split(":=")
d[k.strip()] = v.strip()
return f"{d['NCCL_MAJOR']}.{d['NCCL_MINOR']}.{d['NCCL_PATCH']}"
def get_nccl_wheel_version(arch_version: str) -> str:
import re
requirements = map(
str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
)
return [x for x in requirements if x.startswith("nvidia-nccl-cu")][0].split("==")[1]
def validate_nccl_dep_consistency(arch_version: str) -> None:
wheel_ver = get_nccl_wheel_version(arch_version)
submodule_ver = get_nccl_submodule_version()
if wheel_ver != submodule_ver:
raise RuntimeError(
f"NCCL submodule version {submodule_ver} differs from wheel version {wheel_ver}"
)
def arch_type(arch_version: str) -> str:
if arch_version in CUDA_ARCHES:
return "cuda"
@ -118,29 +38,23 @@ def arch_type(arch_version: str) -> str:
return "cpu"
# This can be updated to the release version when cutting release branch, i.e. 2.1
DEFAULT_TAG = os.getenv("RELEASE_VERSION_TAG", "main")
WHEEL_CONTAINER_IMAGES = {
**{
gpu_arch: f"pytorch/manylinux-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
gpu_arch: f"pytorch/manylinux-builder:cuda{gpu_arch}"
for gpu_arch in CUDA_ARCHES
},
**{
gpu_arch: f"pytorch/manylinux-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
gpu_arch: f"pytorch/manylinux-builder:rocm{gpu_arch}"
for gpu_arch in ROCM_ARCHES
},
"cpu": f"pytorch/manylinux-builder:cpu-{DEFAULT_TAG}",
"cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
"cpu-aarch64": f"pytorch/manylinuxaarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
"cpu": "pytorch/manylinux-builder:cpu",
"cpu-cxx11-abi": "pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi",
"cpu-aarch64": "pytorch/manylinuxaarch64-builder:cpu-aarch64",
}
CONDA_CONTAINER_IMAGES = {
**{
gpu_arch: f"pytorch/conda-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in CUDA_ARCHES
},
"cpu": f"pytorch/conda-builder:cpu-{DEFAULT_TAG}",
**{gpu_arch: f"pytorch/conda-builder:cuda{gpu_arch}" for gpu_arch in CUDA_ARCHES},
"cpu": "pytorch/conda-builder:cpu",
}
PRE_CXX11_ABI = "pre-cxx11"
@ -150,38 +64,26 @@ DEBUG = "debug"
LIBTORCH_CONTAINER_IMAGES: Dict[Tuple[str, str], str] = {
**{
(
gpu_arch,
PRE_CXX11_ABI,
): f"pytorch/manylinux-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
(gpu_arch, PRE_CXX11_ABI): f"pytorch/manylinux-builder:cuda{gpu_arch}"
for gpu_arch in CUDA_ARCHES
},
**{
(
gpu_arch,
CXX11_ABI,
): f"pytorch/libtorch-cxx11-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
(gpu_arch, CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cuda{gpu_arch}"
for gpu_arch in CUDA_ARCHES
},
**{
(
gpu_arch,
PRE_CXX11_ABI,
): f"pytorch/manylinux-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
(gpu_arch, PRE_CXX11_ABI): f"pytorch/manylinux-builder:rocm{gpu_arch}"
for gpu_arch in ROCM_ARCHES
},
**{
(
gpu_arch,
CXX11_ABI,
): f"pytorch/libtorch-cxx11-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
(gpu_arch, CXX11_ABI): f"pytorch/libtorch-cxx11-builder:rocm{gpu_arch}"
for gpu_arch in ROCM_ARCHES
},
("cpu", PRE_CXX11_ABI): f"pytorch/manylinux-builder:cpu-{DEFAULT_TAG}",
("cpu", CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}",
("cpu", PRE_CXX11_ABI): "pytorch/manylinux-builder:cpu",
("cpu", CXX11_ABI): "pytorch/libtorch-cxx11-builder:cpu",
}
FULL_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12"]
FULL_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11"]
def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
@ -288,6 +190,7 @@ def generate_wheels_matrix(
os: str,
arches: Optional[List[str]] = None,
python_versions: Optional[List[str]] = None,
gen_special_an_non_special_wheel: bool = True,
) -> List[Dict[str, str]]:
package_type = "wheel"
if os == "linux" or os == "linux-aarch64":
@ -321,8 +224,9 @@ def generate_wheels_matrix(
else arch_version
)
# 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
if arch_version in ["12.1", "11.8"] and os == "linux":
# special 12.1 wheels package without dependencies
# dependency downloaded via pip install
if arch_version == "12.1" and os == "linux":
ret.append(
{
"python_version": python_version,
@ -334,36 +238,41 @@ def generate_wheels_matrix(
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version], # fmt: skip
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace( # noqa: B950
"pytorch_extra_install_requirements": "nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'",
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-with-pypi-cudnn".replace( # noqa: B950
".", "_"
),
}
)
else:
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"devtoolset": "cxx11-abi"
if arch_version == "cpu-cxx11-abi"
else "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
".", "_"
),
"pytorch_extra_install_requirements":
PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"] # fmt: skip
if os != "linux" else "",
}
)
if not gen_special_an_non_special_wheel:
continue
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"devtoolset": "cxx11-abi"
if arch_version == "cpu-cxx11-abi"
else "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
".", "_"
),
}
)
return ret
validate_nccl_dep_consistency("12.1")
validate_nccl_dep_consistency("11.8")

View File

@ -60,7 +60,7 @@ class BinaryBuildWorkflow:
branches: str = "nightly"
# Mainly for macos
cross_compile_arm64: bool = False
macos_runner: str = "macos-12-xl"
xcode_version: str = ""
def __post_init__(self) -> None:
if self.abi_version:
@ -125,9 +125,7 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
package_type="libtorch",
abi_version=generate_binary_build_matrix.CXX11_ABI,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.LINUX,
generate_binary_build_matrix.CXX11_ABI,
libtorch_variants=["shared-with-deps"],
OperatingSystem.LINUX, generate_binary_build_matrix.CXX11_ABI
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@ -139,9 +137,7 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
package_type="libtorch",
abi_version=generate_binary_build_matrix.PRE_CXX11_ABI,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.LINUX,
generate_binary_build_matrix.PRE_CXX11_ABI,
libtorch_variants=["shared-with-deps"],
OperatingSystem.LINUX, generate_binary_build_matrix.PRE_CXX11_ABI
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@ -158,6 +154,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
OperatingSystem.LINUX,
arches=["11.8", "12.1"],
python_versions=["3.8"],
gen_special_an_non_special_wheel=False,
),
branches="main",
),
@ -215,9 +212,7 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
package_type="libtorch",
abi_version=generate_binary_build_matrix.RELEASE,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.RELEASE,
libtorch_variants=["shared-with-deps"],
OperatingSystem.WINDOWS, generate_binary_build_matrix.RELEASE
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@ -229,9 +224,7 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
package_type="libtorch",
abi_version=generate_binary_build_matrix.DEBUG,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.DEBUG,
libtorch_variants=["shared-with-deps"],
OperatingSystem.WINDOWS, generate_binary_build_matrix.DEBUG
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@ -301,39 +294,20 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
package_type="libtorch",
abi_version=generate_binary_build_matrix.CXX11_ABI,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.MACOS,
generate_binary_build_matrix.CXX11_ABI,
libtorch_variants=["shared-with-deps"],
OperatingSystem.MACOS, generate_binary_build_matrix.CXX11_ABI
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
isolated_workflow=True,
),
),
BinaryBuildWorkflow(
os=OperatingSystem.MACOS_ARM64,
package_type="libtorch",
abi_version=generate_binary_build_matrix.CXX11_ABI,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.MACOS,
generate_binary_build_matrix.CXX11_ABI,
libtorch_variants=["shared-with-deps"],
),
cross_compile_arm64=False,
macos_runner="macos-13-xlarge",
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
isolated_workflow=True,
),
),
BinaryBuildWorkflow(
os=OperatingSystem.MACOS_ARM64,
package_type="wheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.MACOS_ARM64
),
cross_compile_arm64=False,
macos_runner="macos-13-xlarge",
cross_compile_arm64=True,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
isolated_workflow=True,

View File

@ -1,42 +0,0 @@
#!/usr/bin/env python3
"""Generates a matrix for docker releases through github actions
Will output a condensed version of the matrix. Will include the following:
* CUDA version short
* CUDA full version
* CUDNN version short
* Image type either runtime or devel
* Platform linux/arm64,linux/amd64
"""
import json
from typing import Dict, List
import generate_binary_build_matrix
DOCKER_IMAGE_TYPES = ["runtime", "devel"]
def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]:
ret: List[Dict[str, str]] = []
for cuda, version in generate_binary_build_matrix.CUDA_ARCHES_FULL_VERSION.items():
for image in DOCKER_IMAGE_TYPES:
ret.append(
{
"cuda": cuda,
"cuda_full_version": version,
"cudnn_version": generate_binary_build_matrix.CUDA_ARCHES_CUDNN_VERSION[
cuda
],
"image_type": image,
"platform": "linux/arm64,linux/amd64",
}
)
return {"include": ret}
if __name__ == "__main__":
build_matrix = generate_docker_matrix()
print(json.dumps(build_matrix))

View File

@ -111,7 +111,7 @@ def fetch_jobs(url: str, headers: Dict[str, str]) -> List[Dict[str, str]]:
# running.
def find_job_id_name(args: Any) -> Tuple[str, str]:
def find_job_id(args: Any) -> str:
# From https://docs.github.com/en/actions/learn-github-actions/environment-variables
PYTORCH_REPO = os.environ.get("GITHUB_REPOSITORY", "pytorch/pytorch")
PYTORCH_GITHUB_API = f"https://api.github.com/repos/{PYTORCH_REPO}"
@ -130,28 +130,15 @@ def find_job_id_name(args: Any) -> Tuple[str, str]:
for job in jobs:
if job["runner_name"] == args.runner_name:
return (job["id"], job["name"])
return job["id"]
raise RuntimeError(f"Can't find job id for runner {args.runner_name}")
def set_output(name: str, val: Any) -> None:
if os.getenv("GITHUB_OUTPUT"):
with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
print(f"{name}={val}", file=env)
print(f"setting {name}={val}")
else:
print(f"::set-output name={name}::{val}")
def main() -> None:
args = parse_args()
try:
# Get both the job ID and job name because we have already spent a request
# here to get the job info
job_id, job_name = find_job_id_name(args)
set_output("job-id", job_id)
set_output("job-name", job_name)
print(find_job_id(args))
except Exception as e:
print(repr(e), file=sys.stderr)
print(f"workflow-{args.workflow_run_id}")

View File

@ -5,15 +5,12 @@ import os
import warnings
from dataclasses import dataclass
from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, cast, Dict, List, Optional, Tuple
from urllib.error import HTTPError
from urllib.parse import quote
from urllib.request import Request, urlopen
GITHUB_API_URL = "https://api.github.com"
@dataclass
class GitHubComment:
body_text: str
@ -29,20 +26,16 @@ def gh_fetch_url_and_headers(
url: str,
*,
headers: Optional[Dict[str, str]] = None,
data: Union[Optional[Dict[str, Any]], str] = None,
data: Optional[Dict[str, Any]] = None,
method: Optional[str] = None,
reader: Callable[[Any], Any] = lambda x: x.read(),
) -> Tuple[Any, Any]:
if headers is None:
headers = {}
token = os.environ.get("GITHUB_TOKEN")
if token is not None and url.startswith(f"{GITHUB_API_URL}/"):
if token is not None and url.startswith("https://api.github.com/"):
headers["Authorization"] = f"token {token}"
data_ = None
if data is not None:
data_ = data.encode() if isinstance(data, str) else json.dumps(data).encode()
data_ = json.dumps(data).encode() if data is not None else None
try:
with urlopen(Request(url, headers=headers, data=data_, method=method)) as conn:
return conn.headers, reader(conn)
@ -64,7 +57,7 @@ def gh_fetch_url(
url: str,
*,
headers: Optional[Dict[str, str]] = None,
data: Union[Optional[Dict[str, Any]], str] = None,
data: Optional[Dict[str, Any]] = None,
method: Optional[str] = None,
reader: Callable[[Any], Any] = lambda x: x.read(),
) -> Any:
@ -132,7 +125,7 @@ def gh_post_pr_comment(
org: str, repo: str, pr_num: int, comment: str, dry_run: bool = False
) -> List[Dict[str, Any]]:
return _gh_post_comment(
f"{GITHUB_API_URL}/repos/{org}/{repo}/issues/{pr_num}/comments",
f"https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/comments",
comment,
dry_run,
)
@ -142,14 +135,14 @@ def gh_post_commit_comment(
org: str, repo: str, sha: str, comment: str, dry_run: bool = False
) -> List[Dict[str, Any]]:
return _gh_post_comment(
f"{GITHUB_API_URL}/repos/{org}/{repo}/commits/{sha}/comments",
f"https://api.github.com/repos/{org}/{repo}/commits/{sha}/comments",
comment,
dry_run,
)
def gh_delete_comment(org: str, repo: str, comment_id: int) -> None:
url = f"{GITHUB_API_URL}/repos/{org}/{repo}/issues/comments/{comment_id}"
url = f"https://api.github.com/repos/{org}/{repo}/issues/comments/{comment_id}"
gh_fetch_url(url, method="DELETE")
@ -160,7 +153,7 @@ def gh_fetch_merge_base(org: str, repo: str, base: str, head: str) -> str:
# https://docs.github.com/en/rest/commits/commits?apiVersion=2022-11-28#compare-two-commits
try:
json_data = gh_fetch_url(
f"{GITHUB_API_URL}/repos/{org}/{repo}/compare/{base}...{head}",
f"https://api.github.com/repos/{org}/{repo}/compare/{base}...{head}",
headers={"Accept": "application/vnd.github.v3+json"},
reader=json.load,
)
@ -174,18 +167,3 @@ def gh_fetch_merge_base(org: str, repo: str, base: str, head: str) -> str:
warnings.warn(f"Failed to get merge base for {base}...{head}: {error}")
return merge_base
def gh_update_pr_state(org: str, repo: str, pr_num: int, state: str = "open") -> None:
url = f"{GITHUB_API_URL}/repos/{org}/{repo}/pulls/{pr_num}"
try:
gh_fetch_url(url, method="PATCH", data={"state": state})
except HTTPError as err:
# When trying to open the pull request, error 422 means that the branch
# has been deleted and the API couldn't re-open it
if err.code == 422 and state == "open":
warnings.warn(
f"Failed to open {pr_num} because its head branch has been deleted: {err}"
)
else:
raise

.github/scripts/gql_mocks.json generated vendored Normal file

File diff suppressed because one or more lines are too long

Binary file not shown.

View File

@ -38,12 +38,6 @@ def parse_args() -> argparse.Namespace:
required=True,
help="A unique job identifier that should be the same for all runs of job",
)
parser.add_argument(
"--sha", required="--upload" in sys.argv, help="SHA of the commit"
) # Only required for upload
parser.add_argument(
"--test_config", required="--upload" in sys.argv, help="The test config"
) # Only required for upload
parser.add_argument(
"--shard", required="--upload" in sys.argv, help="The shard id"
) # Only required for upload
@ -90,8 +84,6 @@ def main() -> None:
pr_identifier=pr_identifier,
repo=repo,
job_identifier=args.job_identifier,
sha=args.sha,
test_config=args.test_config,
shard=args.shard,
cache_dir=cache_dir,
bucket=args.bucket,

View File

@ -56,8 +56,6 @@ def upload_pytest_cache(
pr_identifier: PRIdentifier,
repo: GithubRepo,
job_identifier: str,
sha: str,
test_config: str,
shard: str,
cache_dir: Path,
temp_dir: Path,
@ -81,11 +79,25 @@ def upload_pytest_cache(
if not bucket:
bucket = BUCKET
# Upload the cache
obj_key_prefix = _get_s3_key_prefix(
pr_identifier, repo, job_identifier, sha, test_config, shard
# Merge the current cache with any caches from previous runs before uploading
# We only need to merge it with the cache for the same shard (which will have already been downloaded if it exists)
# since the other shards will handle themselves
shard_cache_path = _get_temp_cache_dir_path(
temp_dir, pr_identifier, repo, job_identifier, shard
)
zip_file_path = zip_folder(cache_dir, temp_dir / ZIP_UPLOAD / obj_key_prefix)
if shard_cache_path.is_dir():
_merge_pytest_caches(shard_cache_path, cache_dir)
#
# Upload the cache
#
obj_key_prefix = _get_s3_key_prefix(pr_identifier, repo, job_identifier, shard)
# This doesn't include the zip file extension. That'll get added later
zip_file_path = temp_dir / ZIP_UPLOAD / obj_key_prefix
zip_file_path = zip_folder(cache_dir, zip_file_path)
obj_key = f"{obj_key_prefix}{os.path.splitext(zip_file_path)[1]}" # Keep the new file extension
upload_file_to_s3(zip_file_path, bucket, obj_key)
@ -124,22 +136,38 @@ def download_pytest_cache(
)
for downloaded_zip in downloads:
# Unzip into random folder, then merge with the current cache
cache_dir_for_shard = (
temp_dir / UNZIPPED_CACHES / os.urandom(16).hex() / PYTEST_CACHE_DIR_NAME
# the file name of the zip is the shard id
shard = os.path.splitext(os.path.basename(downloaded_zip))[0]
cache_dir_for_shard = _get_temp_cache_dir_path(
temp_dir, pr_identifier, repo, job_identifier, shard
)
unzip_folder(downloaded_zip, cache_dir_for_shard)
print(f"Merging cache from {downloaded_zip}")
print(
f"Merging cache for job_identifier `{job_identifier}`, shard `{shard}` into `{dest_cache_dir}`"
)
_merge_pytest_caches(cache_dir_for_shard, dest_cache_dir)
def _get_temp_cache_dir_path(
temp_dir: Path,
pr_identifier: PRIdentifier,
repo: GithubRepo,
job_identifier: str,
shard: str,
) -> Path:
return (
temp_dir
/ UNZIPPED_CACHES
/ _get_s3_key_prefix(pr_identifier, repo, job_identifier, shard)
/ PYTEST_CACHE_DIR_NAME
)
def _get_s3_key_prefix(
pr_identifier: PRIdentifier,
repo: GithubRepo,
job_identifier: str,
sha: str = "",
test_config: str = "",
shard: str = "",
) -> str:
"""
@ -148,10 +176,6 @@ def _get_s3_key_prefix(
"""
prefix = f"{PYTEST_CACHE_KEY_PREFIX}/{repo.owner}/{repo.name}/{pr_identifier}/{sanitize_for_s3(job_identifier)}"
if sha:
prefix += f"/{sha}"
if test_config:
prefix += f"/{sanitize_for_s3(test_config)}"
if shard:
prefix += f"/{shard}"

.github/scripts/rockset_mocks.json vendored Normal file

File diff suppressed because it is too large

Binary file not shown.

View File

@ -1,64 +0,0 @@
import argparse
import subprocess
from typing import Dict
import generate_binary_build_matrix
def tag_image(
image: str,
default_tag: str,
release_version: str,
dry_run: str,
tagged_images: Dict[str, bool],
) -> None:
if image in tagged_images:
return
release_image = image.replace(f"-{default_tag}", f"-{release_version}")
print(f"Tagging {image} to {release_image} , dry_run: {dry_run}")
if dry_run == "disabled":
subprocess.check_call(["docker", "pull", image])
subprocess.check_call(["docker", "tag", image, release_image])
subprocess.check_call(["docker", "push", release_image])
tagged_images[image] = True
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument(
"--version",
help="Version to tag",
type=str,
default="2.2",
)
parser.add_argument(
"--dry-run",
help="No Runtime Error check",
type=str,
choices=["enabled", "disabled"],
default="enabled",
)
options = parser.parse_args()
tagged_images: Dict[str, bool] = dict()
platform_images = [
generate_binary_build_matrix.WHEEL_CONTAINER_IMAGES,
generate_binary_build_matrix.LIBTORCH_CONTAINER_IMAGES,
generate_binary_build_matrix.CONDA_CONTAINER_IMAGES,
]
default_tag = generate_binary_build_matrix.DEFAULT_TAG
for platform_image in platform_images: # type: ignore[attr-defined]
for arch in platform_image.keys(): # type: ignore[attr-defined]
tag_image(
platform_image[arch], # type: ignore[index]
default_tag,
options.version,
options.dry_run,
tagged_images,
)
if __name__ == "__main__":
main()

View File

@ -102,30 +102,6 @@ MOCKED_DISABLED_UNSTABLE_JOBS = {
"manywheel-py3_8-cuda11_8-build",
"",
],
"inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor)": [
"pytorchbot",
"107079",
"https://github.com/pytorch/pytorch/issues/107079",
"inductor",
"cuda12.1-py3.10-gcc9-sm86",
"test (inductor)",
],
"inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor_huggingface)": [
"pytorchbot",
"109153",
"https://github.com/pytorch/pytorch/issues/109153",
"inductor",
"cuda12.1-py3.10-gcc9-sm86",
"test (inductor_huggingface)",
],
"inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor_huggingface_dynamic)": [
"pytorchbot",
"109154",
"https://github.com/pytorch/pytorch/issues/109154",
"inductor",
"cuda12.1-py3.10-gcc9-sm86",
"test (inductor_huggingface_dynamic)",
],
}
MOCKED_PR_INFO = {
@ -593,37 +569,6 @@ class TestConfigFilter(TestCase):
"expected": '{"include": [{"config": "default", "unstable": "unstable"}]}',
"description": "Both binary build and test jobs are unstable",
},
{
"workflow": "inductor",
"job_name": "cuda12.1-py3.10-gcc9-sm86 / build",
"test_matrix": """
{ include: [
{ config: "inductor" },
{ config: "inductor_huggingface", shard: 1 },
{ config: "inductor_huggingface", shard: 2 },
{ config: "inductor_timm", shard: 1 },
{ config: "inductor_timm", shard: 2 },
{ config: "inductor_torchbench" },
{ config: "inductor_huggingface_dynamic" },
{ config: "inductor_torchbench_dynamic" },
{ config: "inductor_distributed" },
]}
""",
"expected": """
{ "include": [
{ "config": "inductor", "unstable": "unstable" },
{ "config": "inductor_huggingface", "shard": 1, "unstable": "unstable" },
{ "config": "inductor_huggingface", "shard": 2, "unstable": "unstable" },
{ "config": "inductor_timm", "shard": 1 },
{ "config": "inductor_timm", "shard": 2 },
{ "config": "inductor_torchbench" },
{ "config": "inductor_huggingface_dynamic", "unstable": "unstable" },
{ "config": "inductor_torchbench_dynamic" },
{ "config": "inductor_distributed" }
]}
""",
"description": "Marking multiple unstable configurations",
},
]
for case in testcases:
@ -632,7 +577,7 @@ class TestConfigFilter(TestCase):
test_matrix = yaml.safe_load(case["test_matrix"])
filtered_test_matrix = mark_unstable_jobs(workflow, job_name, test_matrix)
self.assertEqual(json.loads(case["expected"]), filtered_test_matrix)
self.assertEqual(case["expected"], json.dumps(filtered_test_matrix))
@mock.patch("subprocess.check_output")
def test_perform_misc_tasks(self, mocked_subprocess: Any) -> None:

View File

@ -7,12 +7,11 @@
# GraphQL queries in trymerge.py, please make sure to delete `gql_mocks.json`
# And re-run the test locally with one's PAT
import gzip
import json
import os
import warnings
from hashlib import sha256
from typing import Any, Dict, List, Optional
from typing import Any, cast, Dict, List, Optional
from unittest import main, mock, skip, TestCase
from urllib.error import HTTPError
@ -20,20 +19,18 @@ from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from trymerge import (
categorize_checks,
DRCI_CHECKRUN_NAME,
find_matching_merge_rule,
FlakyRule,
get_classifications,
get_drci_classifications,
get_rockset_results,
gh_get_team_members,
gh_graphql,
GitHubPR,
JobCheckState,
is_broken_trunk,
main as trymerge_main,
MandatoryChecksMissingError,
MergeRule,
PostCommentError,
RE_GHSTACK_DESC,
read_merge_rules,
remove_job_name_suffix,
validate_revert,
@ -42,10 +39,6 @@ from trymerge import (
if "GIT_REMOTE_URL" not in os.environ:
os.environ["GIT_REMOTE_URL"] = "https://github.com/pytorch/pytorch"
GQL_MOCKS = "gql_mocks.json.gz"
ROCKSET_MOCKS = "rockset_mocks.json.gz"
DRCI_MOCKS = "drci_mocks.json.gz"
def mock_query(
fallback_function: Any,
@ -58,11 +51,11 @@ def mock_query(
def get_mocked_queries() -> Any:
if not os.path.exists(gql_db_fname):
return {}
with gzip.open(gql_db_fname, encoding="utf-8", mode="rt") as f:
with open(gql_db_fname, encoding="utf-8") as f:
return json.load(f)
def save_mocked_queries(obj: Any) -> None:
with gzip.open(gql_db_fname, encoding="utf-8", mode="wt") as f:
with open(gql_db_fname, encoding="utf-8", mode="w") as f:
json.dump(obj, f, indent=2)
f.write("\n")
@ -75,20 +68,19 @@ def mock_query(
try:
rc = fallback_function(*args)
except HTTPError as err:
if err.code == 401 or err.code == 403:
if err.code == 401:
err_msg = f"If you are seeing this message during workflow run, please make sure to update {file_name}"
err_msg += f" locally, by deleting it and running {os.path.basename(__file__)} with"
err_msg += " GitHub Personal Access Token passed via GITHUB_TOKEN,"
err_msg += " the rockset api key passed via ROCKSET_API_KEY,"
err_msg += " and drci api key passed via DRCI_BOT_KEY environment variables"
err_msg += f" locally, by deleting it and running {os.path.basename(__file__)} with "
err_msg += " GitHub Personal Access Token passed via GITHUB_TOKEN environment variable"
err_msg += (
" the rockset api key passed via ROCKSET_API_KEY environment variable"
)
if (
os.getenv("GITHUB_TOKEN") is None
or os.getenv("ROCKSET_API_KEY") is None
or os.getenv("DRCI_BOT_KEY") is None
):
err_msg = (
"Failed to update cached queries as GITHUB_TOKEN or ROCKSET_API_KEY or DRCI_BOT_KEY "
+ "is not defined. "
"Failed to update cached GraphQL queries as GITHUB_TOKEN or ROCKSET_API_KEY is not defined."
+ err_msg
)
raise RuntimeError(err_msg) from err
@ -108,29 +100,19 @@ def mocked_gh_graphql(query: str, **kwargs: Any) -> Any:
def gh_graphql_wrapper(query: str, kwargs: Any) -> Any:
return gh_graphql(query, **kwargs)
return mock_query(gh_graphql_wrapper, GQL_MOCKS, key_function, query, kwargs)
return mock_query(gh_graphql_wrapper, "gql_mocks.json", key_function, query, kwargs)
def mocked_rockset_results(head_sha: str, merge_base: str, num_retries: int = 3) -> Any:
return mock_query(
get_rockset_results,
ROCKSET_MOCKS,
"rockset_mocks.json",
lambda x, y: f"{x} {y}",
head_sha,
merge_base,
)
def mocked_drci_classifications(pr_num: int, project: str, num_retries: int = 3) -> Any:
return mock_query(
get_drci_classifications,
DRCI_MOCKS,
lambda x, y: f"{x} {y}",
pr_num,
project,
)
def mock_parse_args(revert: bool = False, force: bool = False) -> Any:
class Object:
def __init__(self) -> None:
@ -207,18 +189,6 @@ def mocked_read_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule
],
ignore_flaky_failures=True,
),
MergeRule(
name="xla",
patterns=[".github/ci_commit_pins/xla.txt"],
approved_by=["pytorchbot"],
mandatory_checks_name=[
"Lint",
"EasyCLA",
"pull / linux-focal-py3_8-clang9-xla / build",
"pull / linux-focal-py3_8-clang9-xla / test (xla, 1, 1, linux.12xlarge)",
],
ignore_flaky_failures=True,
),
]
@ -226,6 +196,16 @@ def mocked_read_merge_rules_raise(repo: Any, org: str, project: str) -> List[Mer
raise RuntimeError("testing")
def empty_flaky_rules() -> List[FlakyRule]:
return []
def xla_is_flaky_rules() -> List[FlakyRule]:
return [
FlakyRule("xla", ["FAILED: Build did NOT complete successfully"]),
]
def xla_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule]:
return [
MergeRule(
@ -237,7 +217,6 @@ def xla_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule]:
"EasyCLA",
"pull / linux-bionic-py3_8-clang8-xla / build",
"pull / linux-bionic-py3_8-clang8-xla / test (xla, 1, 1, linux.4xlarge)",
"inductor / cuda11.8-py3.10-gcc7-sm86 / test (inductor_torchbench_dynamic, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
],
ignore_flaky_failures=False,
),
@ -259,11 +238,9 @@ class DummyGitRepo(GitRepo):
return "super awsome commit message"
@mock.patch("trymerge.read_flaky_rules", side_effect=empty_flaky_rules)
@mock.patch("trymerge.get_rockset_results", side_effect=empty_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch(
"trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
)
class TestTryMerge(TestCase):
def test_merge_rules_valid(self, *args: Any) -> None:
"Test that merge_rules.yaml can be parsed"
@ -274,7 +251,7 @@ class TestTryMerge(TestCase):
@mock.patch("trymerge.read_merge_rules", side_effect=mocked_read_merge_rules)
def test_match_rules(self, *args: Any) -> None:
"Tests that PR passes merge rules"
pr = GitHubPR("pytorch", "pytorch", 109999)
pr = GitHubPR("pytorch", "pytorch", 77700)
repo = DummyGitRepo()
self.assertTrue(find_matching_merge_rule(pr, repo) is not None)
@ -327,9 +304,14 @@ class TestTryMerge(TestCase):
def test_internal_changes(self, *args: Any) -> None:
"Tests that PR with internal changes is detected"
pr = GitHubPR("pytorch", "pytorch", 110140)
pr = GitHubPR("pytorch", "pytorch", 73969)
self.assertTrue(pr.has_internal_changes())
def test_checksuites_pagination(self, *args: Any) -> None:
"Tests that PR with lots of checksuits can be fetched"
pr = GitHubPR("pytorch", "pytorch", 73811)
self.assertEqual(len(pr.get_checkrun_conclusions()), 76)
def test_comments_pagination(self, *args: Any) -> None:
"Tests that PR with 50+ comments can be fetched"
pr = GitHubPR("pytorch", "pytorch", 31093)
@ -341,9 +323,7 @@ class TestTryMerge(TestCase):
# see https://gist.github.com/malfet/9b93bc7eeddeaf1d84546efc4f0c577f
pr = GitHubPR("pytorch", "pytorch", 68111)
self.assertGreater(len(pr.get_comments()), 20)
# NS(09/27/2023): GitHub seems to recycle older checkruns
# https://github.com/pytorch/pytorch/pull/68111/checks shows 0 runs
# self.assertGreater(len(pr.get_checkrun_conclusions()), 3)
self.assertGreater(len(pr.get_checkrun_conclusions()), 3)
self.assertGreater(pr.get_commit_count(), 60)
def test_gql_retrieve_checksuites(self, *args: Any) -> None:
@ -388,16 +368,14 @@ class TestTryMerge(TestCase):
def test_get_checkruns_many_runs(self, *args: Any) -> None:
"""Tests that all checkruns can be fetched"""
pr = GitHubPR("pytorch", "pytorch", 105260)
pr = GitHubPR("pytorch", "pytorch", 77700)
conclusions = pr.get_checkrun_conclusions()
self.assertEqual(len(conclusions), 221)
self.assertTrue(
"pull / linux-docs / build-docs-cpp-false" in conclusions.keys()
)
self.assertEqual(len(conclusions), 79)
self.assertTrue("pull / linux-docs / build-docs (cpp)" in conclusions.keys())
def test_cancelled_gets_ignored(self, *args: Any) -> None:
"""Tests that cancelled workflow does not override existing successfull status"""
pr = GitHubPR("pytorch", "pytorch", 110367)
pr = GitHubPR("pytorch", "pytorch", 82169)
conclusions = pr.get_checkrun_conclusions()
lint_checks = [name for name in conclusions.keys() if "Lint" in name]
self.assertTrue(len(lint_checks) > 0)
@ -545,7 +523,108 @@ class TestTryMerge(TestCase):
for case in test_cases:
self.assertEqual(case["expected"], remove_job_name_suffix(case["name"]))
def test_get_merge_base(self, *args: Any) -> None:
def test_is_broken_trunk(self, *args: Any) -> None:
test_cases: List[Dict[str, Any]] = [
{
"head_job": None,
"base_jobs": {
"job_a": {
"conclusion": "success",
"failure_captures": ["a", "b"],
},
"job_b": {
"conclusion": "failure",
"failure_captures": ["a", "b"],
},
},
"expected": False,
"description": "Invalid input - head job",
},
{
"head_job": {
"conclusion": "failure",
"failure_captures": ["a", "b"],
},
"base_jobs": None,
"expected": False,
"description": "Invalid input - base jobs",
},
{
"head_job": {
"conclusion": "failure",
"failure_captures": ["a", "b"],
},
"base_jobs": {},
"expected": False,
"description": "Invalid input - empty base jobs",
},
{
"head_job": {
"conclusion": "failure",
"failure_captures": ["x", "y"],
},
"base_jobs": {
"job_a": {
"conclusion": "success",
"failure_captures": ["a", "b"],
},
"job_b": {
"conclusion": "failure",
"failure_captures": ["x", "y"],
},
},
"expected": True,
"description": "Found a match",
},
{
"head_job": {
"conclusion": "success",
"failure_captures": ["x", "y"],
},
"base_jobs": {
"job_a": {
"conclusion": "success",
"failure_captures": ["a", "b"],
},
"job_b": {
"conclusion": "failure",
"failure_captures": ["x", "y"],
},
},
"expected": False,
"description": "Not found - different conclusion",
},
{
"head_job": {
"conclusion": "failure",
"failure_captures": ["a", "b"],
},
"base_jobs": {
"job_a": {
"conclusion": "success",
"failure_captures": ["a", "b"],
},
"job_b": {
"conclusion": "failure",
"failure_captures": ["x", "y"],
},
},
"expected": False,
"description": "Not found - different captured failures",
},
]
for case in test_cases:
self.assertEqual(
case["expected"], is_broken_trunk(case["head_job"], case["base_jobs"])
)
def test_get_merge_base(
self,
mock_gh_graphql: Any,
mock_get_rockset_results: Any,
mock_read_flaky_rules: Any,
) -> None:
pr = GitHubPR("pytorch", "pytorch", 104121)
mock_merge_base = "mocked-sha"
@ -563,130 +642,57 @@ class TestTryMerge(TestCase):
@mock.patch("trymerge.get_rockset_results", side_effect=mocked_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(
"trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
)
class TestBypassFailures(TestCase):
def test_get_classifications(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 109584)
flaky_rules = [
# Try a regex rule
FlakyRule("distributed", ["##\\[error\\]The operation [wW]as .+"])
]
pr = GitHubPR("pytorch", "pytorch", 92863)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
checks, pr.last_commit()["oid"], pr.get_merge_base(), flaky_rules, []
)
self.assertTrue(
checks[
"pull / linux-focal-py3.11-clang10 / test (dynamo, 1, 2, linux.2xlarge)"
"pull / linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.4xlarge)"
].classification
== "BROKEN_TRUNK"
)
self.assertTrue(
checks[
"trunk / win-vs2019-cpu-py3 / test (default, 2, 3, windows.4xlarge.nonephemeral)"
"pull / linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)"
].classification
== "FLAKY"
)
self.assertTrue(
checks[
"pull / linux-jammy-py3.8-gcc11 / test (distributed, 1, 2, linux.2xlarge)"
].classification
== "FLAKY"
)
self.assertTrue(
checks[
"pull / linux-focal-cuda11.8-py3.10-gcc9 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)"
].classification
== "FLAKY"
)
# Set the threshold greater than or equal to the number of ok failures
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=6
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 4)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
# No threshold set; defaults to -1, which ignores all flaky and broken trunk failures
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 4)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
# Set the threshold lower than the number of ok failures
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=1
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 6)
self.assertTrue(len(ignorable["FLAKY"]) == 4)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 2)
# Set the threshold to 0 like when ignore_flaky_failures is on
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=0
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 6)
self.assertTrue(len(ignorable["FLAKY"]) == 4)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 2)
def test_get_classifications_flaky_fullname(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 110362)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
def test_get_classifications_invalid_cancel(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 110367)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 0)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0)
self.assertTrue(len(ignorable["UNSTABLE"]) == 3)
def test_get_classifications_similar_failures(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 109750)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(failed) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
def test_get_classifications_unstable(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 104312)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
checks, pr.last_commit()["oid"], pr.get_merge_base(), [], []
)
workflow_name = "linux-bionic-cuda12.1-py3.10-gcc9-bazel-test"
job_name = "build-and-test (default, 1, 1, linux.4xlarge.nvidia.gpu, unstable)"
@ -700,6 +706,19 @@ class TestBypassFailures(TestCase):
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["UNSTABLE"]) == 1)
def test_get_classifications_pending_unstable(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 105998)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
checks, pr.last_commit()["oid"], pr.get_merge_base(), [], []
)
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=1
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 3)
self.assertTrue(len(ignorable["UNSTABLE"]) == 3)
def test_get_classifications_broken_trunk(self, *args: Any) -> None:
# The mock merge base is the actual value returned by gh_fetch_merge_base
test_cases = [
@ -707,13 +726,13 @@ class TestBypassFailures(TestCase):
# This PR had one broken trunk failure but it was run on a different shard
# than the one on the base commit. This should still count as broken trunk
"pr_num": 104214,
"related_failure_count": 0,
"mock_merge_base": "436d035dc74db9c703297a62163b0cad0c546665",
"unrelated_failure_count": 1,
},
{
# This PR had one broken trunk failure and it used ghstack
"pr_num": 105145,
"related_failure_count": 0,
"mock_merge_base": "194fe1d12f9860734cc28ed21bdabda2fbb06336",
"unrelated_failure_count": 1,
},
{
@ -722,81 +741,112 @@ class TestBypassFailures(TestCase):
# keep the failure record from the merge base so that it can
# be used to detect broken trunk
"pr_num": 107160,
"related_failure_count": 0,
"mock_merge_base": "a5d841ef01e615e2a654fb12cf0cd08697d12ccf",
"unrelated_failure_count": 4,
},
{
# This PR used Dr.CI broken trunk classification
"pr_num": 111253,
"related_failure_count": 1,
"unrelated_failure_count": 2,
},
]
for case in test_cases:
pr_num = case["pr_num"]
related_failure_count = case["related_failure_count"]
mock_merge_base = case["mock_merge_base"]
unrelated_failure_count = case["unrelated_failure_count"]
pr = GitHubPR("pytorch", "pytorch", pr_num)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pr = GitHubPR("pytorch", "pytorch", cast(int, pr_num))
with mock.patch(
"trymerge.gh_fetch_merge_base", return_value=mock_merge_base
) as mocked_gh_fetch_merge_base:
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
checks, pr.last_commit()["oid"], pr.get_merge_base(), [], []
)
pending, failed, _ = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == related_failure_count)
pending, failed, _ = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
# When the ok_failed_checks_threshold is set to 0, the broken trunk failure
# won't be ignored
pending, failed, _ = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=0
)
self.assertTrue(len(pending) == 0)
self.assertTrue(
len(failed) == unrelated_failure_count + related_failure_count
)
# When the ok_failed_checks_threshold is set to 0, the broken trunk failure
# won't be ignored
pending, failed, _ = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=0
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == unrelated_failure_count)
def test_ignore_current(self, *args: Any) -> None:
# Test various interactions of the failure classifier to ensure that the
# ignore-current classification is applied only after the others: flaky,
# unstable, and broken trunk. Only genuinely new failures should remain in
# the ignore-current list, which is used to record force merges that had
# actual failures
flaky = "pull / linux-focal-cuda11.8-py3.10-gcc9 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)"
flaky_rules = [
FlakyRule("distributed", ["##\\[error\\]The operation was canceled."])
]
flaky = (
"pull / linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)"
)
broken_trunk = (
"pull / linux-focal-py3.11-clang10 / test (dynamo, 1, 2, linux.2xlarge)"
"pull / linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.4xlarge)"
)
pr = GitHubPR("pytorch", "pytorch", 109584)
pr = GitHubPR("pytorch", "pytorch", 92863)
checks = pr.get_checkrun_conclusions()
# With no broken trunk or flaky rules, all failures are ignored when ic is used
checks = get_classifications(
checks, pr.last_commit()["oid"], None, [], [broken_trunk, flaky]
)
self.assertTrue(checks[flaky].classification == "IGNORE_CURRENT_CHECK")
self.assertTrue(checks[broken_trunk].classification == "IGNORE_CURRENT_CHECK")
_, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["IGNORE_CURRENT_CHECK"]) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 0)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0)
# A known flaky failure takes precedence over ignore current (the merge base
# must be set here to get the results from Rockset, which also classifies
# the broken trunk failure)
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
pr.last_commit()["oid"],
pr.get_merge_base(),
flaky_rules,
[broken_trunk, flaky],
)
self.assertTrue(checks[flaky].classification == "FLAKY")
self.assertTrue(checks[broken_trunk].classification == "BROKEN_TRUNK")
_, failed, ignorable = categorize_checks(checks, list(checks.keys()))
_, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["IGNORE_CURRENT_CHECK"]) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 4)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
# Broken trunk takes precedence over ignore current (no flaky rule is set here)
checks = get_classifications(
checks,
pr.last_commit()["oid"],
pr.get_merge_base(),
[],
[broken_trunk, flaky],
)
self.assertTrue(checks[flaky].classification == "IGNORE_CURRENT_CHECK")
self.assertTrue(checks[broken_trunk].classification == "BROKEN_TRUNK")
_, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["IGNORE_CURRENT_CHECK"]) == 1)
self.assertTrue(len(ignorable["FLAKY"]) == 0)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
@mock.patch("trymerge.read_flaky_rules", side_effect=xla_is_flaky_rules)
@mock.patch("trymerge.read_merge_rules", side_effect=xla_merge_rules)
def test_dont_ignore_flaky_failures(self, *args: Any) -> None:
"""
Regression test for https://github.com/pytorch/test-infra/issues/4126
"""
pr = GitHubPR("pytorch", "pytorch", 105312)
"""Regression test for https://github.com/pytorch/test-infra/issues/4126"""
pr = GitHubPR("pytorch", "pytorch", 100369)
repo = DummyGitRepo()
# Check that failure is classified as flaky but still raises exception
with warnings.catch_warnings(record=True) as w, self.assertRaises(RuntimeError):
@ -811,97 +861,14 @@ class TestBypassFailures(TestCase):
@mock.patch("trymerge.get_rockset_results", side_effect=mocked_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch("trymerge.get_drci_classifications", return_value={})
class TestBypassFailuresOnSandCastle(TestCase):
def test_get_classifications(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 111467)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
def test_get_classifications_drci_checkrun_not_found(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 111467)
# No summary
checks = pr.get_checkrun_conclusions()
checks[DRCI_CHECKRUN_NAME] = JobCheckState(
DRCI_CHECKRUN_NAME,
"",
"NEUTRAL",
None,
1,
"",
None,
)
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 2)
# Empty summary
checks = pr.get_checkrun_conclusions()
checks[DRCI_CHECKRUN_NAME] = JobCheckState(
DRCI_CHECKRUN_NAME,
"",
"NEUTRAL",
None,
1,
"",
"",
)
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 2)
# No Dr.CI checkrun
checks = pr.get_checkrun_conclusions()
del checks[DRCI_CHECKRUN_NAME]
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 2)
@mock.patch("trymerge.get_rockset_results", side_effect=mocked_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(
"trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
)
class TestGitHubPRGhstackDependencies(TestCase):
class TestGitHubPRGhstackDependencies2(TestCase):
def test_pr_dependencies(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 106068)
msg = pr.gen_commit_message(filter_ghstack=True)
self.assertEqual(
msg,
f"{pr.get_title()} (#106068)\n\n{RE_GHSTACK_DESC.sub('', pr.get_body())}\n"
"Pull Request resolved: https://github.com/pytorch/pytorch/pull/106068\n"
"Approved by: https://github.com/ezyang, https://github.com/fegin\n",
assert msg == (
"[FSDP] Break up `_post_backward_hook` into smaller funcs (#106068)\n\n\nDifferential Revision: ["
"D47852461](https://our.internmc.facebook.com/intern/diff/D47852461)\nPull Request resolved: "
"https://github.com/pytorch/pytorch/pull/106068\nApproved by: \n"
)
def test_pr_dependencies_ghstack(self, *args: Any) -> None:
@ -909,13 +876,13 @@ class TestGitHubPRGhstackDependencies(TestCase):
pr1 = GitHubPR("pytorch", "pytorch", 106033)
pr2 = GitHubPR("pytorch", "pytorch", 106034)
pr = GitHubPR("pytorch", "pytorch", 106068)
msg = pr.gen_commit_message(filter_ghstack=True, ghstack_deps=[pr0, pr1, pr2])
self.assertEqual(
msg,
f"{pr.get_title()} (#106068)\n\n{RE_GHSTACK_DESC.sub('', pr.get_body())}\n"
"Pull Request resolved: https://github.com/pytorch/pytorch/pull/106068\n"
"Approved by: https://github.com/ezyang, https://github.com/fegin\n"
"ghstack dependencies: #106032, #106033, #106034\n",
assert msg == (
"[FSDP] Break up `_post_backward_hook` into smaller funcs (#106068)\n\n\nDifferential Revision: ["
"D47852461](https://our.internmc.facebook.com/intern/diff/D47852461)\nPull Request resolved: "
"https://github.com/pytorch/pytorch/pull/106068\nApproved by: \n"
"ghstack dependencies: #106032, #106033, #106034\n"
)
@skip(
@ -964,7 +931,7 @@ class TestGitHubPRGhstackDependencies(TestCase):
mock_repo.cherry_pick.assert_any_call("rev2")
mock_repo.cherry_pick.assert_any_call("rev123")
self.assertTrue(mock.call("rev1") not in mock_repo.cherry_pick.call_args_list)
assert mock.call("rev1") not in mock_repo.cherry_pick.call_args_list
# Verify the first call
message = mock_repo.amend_commit_message.call_args_list[0].args[0]
@ -977,8 +944,8 @@ class TestGitHubPRGhstackDependencies(TestCase):
"dependencies: #106032, #106033\n"
)
self.assertTrue(message.startswith(prefix))
self.assertTrue(message.endswith(suffix))
assert message.startswith(prefix)
assert message.endswith(suffix)
# Verify the second call
mock_repo.amend_commit_message.assert_any_call(
View File
@ -30,7 +30,6 @@ from github_utils import (
gh_fetch_url,
gh_post_commit_comment,
gh_post_pr_comment,
gh_update_pr_state,
GitHubComment,
)
@ -62,7 +61,6 @@ class JobCheckState(NamedTuple):
classification: Optional[str]
job_id: Optional[int]
title: Optional[str]
summary: Optional[str]
JobNameToStateDict = Dict[str, JobCheckState]
@ -76,6 +74,29 @@ class WorkflowCheckState:
self.jobs: JobNameToStateDict = {}
class FlakyRule:
def __init__(self, name: str, captures: List[str]):
self.name = re.compile(name)
self.captures = [re.compile(r) for r in captures]
def matches(self, job: Optional[Dict[str, Any]]) -> bool:
return (
job is not None
and self.name.search(job.get("name", "")) is not None
and job.get("failure_captures") is not None
and all(
any(
r.search(capture) is not None
for capture in job.get("failure_captures", [])
)
for r in self.captures
)
)
def __repr__(self) -> str:
return f"FlakyRule[name='{self.name}', captures={self.captures}]"
GH_PR_REVIEWS_FRAGMENT = """
fragment PRReviews on PullRequestReviewConnection {
nodes {
@ -120,7 +141,6 @@ fragment PRCheckSuites on CheckSuiteConnection {
detailsUrl
databaseId
title
summary
}
pageInfo {
endCursor
@ -312,7 +332,6 @@ query ($owner: String!, $name: String!, $number: Int!, $cs_cursor: String, $cr_c
detailsUrl
databaseId
title
summary
}
pageInfo {
endCursor
@ -437,7 +456,6 @@ MERGE_RULE_PATH = Path(".github") / "merge_rules.yaml"
ROCKSET_MERGES_COLLECTION = "merges"
ROCKSET_MERGES_WORKSPACE = "commons"
REMOTE_MAIN_BRANCH = "origin/main"
DRCI_CHECKRUN_NAME = "Dr.CI"
INTERNAL_CHANGES_CHECKRUN_NAME = "Meta Internal-Only Changes Check"
HAS_NO_CONNECTED_DIFF_TITLE = (
"There is no internal Diff connected, this can be merged now"
@ -551,7 +569,6 @@ def add_workflow_conclusions(
classification=None,
job_id=checkrun_node["databaseId"],
title=checkrun_node["title"],
summary=checkrun_node["summary"],
)
if bool(checkruns["pageInfo"]["hasNextPage"]):
@ -582,7 +599,6 @@ def add_workflow_conclusions(
classification=None,
job_id=None,
title=None,
summary=None,
)
for job_name, job in no_workflow_obj.jobs.items():
res[job_name] = job
@ -908,7 +924,6 @@ class GitHubPR:
classification=None,
job_id=None,
title=None,
summary=None,
)
return self.conclusions
@ -1246,6 +1261,13 @@ def read_merge_rules(
return [MergeRule(**x) for x in rc]
@lru_cache(maxsize=None)
def read_flaky_rules() -> List[FlakyRule]:
# NOTE: This is currently hardcoded but can be extended to support per-repo rules
FLAKY_RULES_URL = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/flaky-rules.json"
return _get_flaky_rules(FLAKY_RULES_URL)
def find_matching_merge_rule(
pr: GitHubPR,
repo: Optional[GitRepo] = None,
@ -1276,15 +1298,25 @@ def find_matching_merge_rule(
reject_reason = f"No rule found to match PR. Please [report]{issue_link} this issue to DevX team."
rules = read_merge_rules(repo, pr.org, pr.project)
flaky_rules = read_flaky_rules()
if not rules:
reject_reason = f"Rejecting the merge as no rules are defined for the repository in {MERGE_RULE_PATH}"
raise RuntimeError(reject_reason)
checks = pr.get_checkrun_conclusions()
base_rev = None
try:
# is allowed to fail if git is not available
base_rev = pr.get_merge_base()
except Exception as e:
print(
f"Failed fetching base git revision for {pr.pr_num}. Skipping additional classifications.\n"
f"{type(e)}\n{e}"
)
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
pr.last_commit()["oid"],
base_rev,
flaky_rules,
ignore_current_checks=ignore_current_checks,
)
@ -1435,6 +1467,11 @@ def checks_to_markdown_bullets(
]
@retries_decorator(rc=[])
def _get_flaky_rules(url: str) -> List[FlakyRule]:
return [FlakyRule(**rule) for rule in gh_fetch_json_list(url)]
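
A hedged sketch of what this buys callers: with @retries_decorator(rc=[]), an exhausted fetch presumably yields the rc fallback of [], so classification degrades gracefully instead of raising (the job dict below is hypothetical):

rules = read_flaky_rules()  # lru_cache'd; assumed to be [] if every retry failed
job = {
    "name": "test (distributed, 1, 2, linux.2xlarge)",
    "failure_captures": ["##[error]The operation was canceled."],
}
is_flaky_job = any(rule.matches(job) for rule in rules)  # False when rules == []
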
@retries_decorator()
def save_merge_record(
collection: str,
@ -1538,27 +1575,6 @@ where
return []
@retries_decorator()
def get_drci_classifications(pr_num: int, project: str = "pytorch") -> Any:
"""
Query the HUD API to find similar failures and decide whether they are flaky
"""
# NB: This doesn't work internally atm because this requires making an
# external API call to HUD
failures = gh_fetch_url(
f"https://hud.pytorch.org/api/drci/drci?prNumber={pr_num}",
data=f"repo={project}",
headers={
"Authorization": os.getenv("DRCI_BOT_KEY", ""),
"Accept": "application/vnd.github.v3+json",
},
method="POST",
reader=json.load,
)
return failures.get(str(pr_num), {}) if failures else {}
REMOVE_JOB_NAME_SUFFIX_REGEX = re.compile(r", [0-9]+, [0-9]+, .+\)$")
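
To make the suffix regex concrete, a small illustration of the normalization it enables (the remove_job_name_suffix wrapper is shown in the next hunk; the job name below is illustrative):

name = "pull / linux-focal-py3.8-gcc7 / test (default, 1, 2, linux.2xlarge)"
# Shard index, shard count, and runner are stripped so head and merge-base
# shards of the same job compare equal:
print(REMOVE_JOB_NAME_SUFFIX_REGEX.sub(")", name))
# -> pull / linux-focal-py3.8-gcc7 / test (default)
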
@ -1567,86 +1583,78 @@ def remove_job_name_suffix(name: str, replacement: str = ")") -> str:
def is_broken_trunk(
name: str,
drci_classifications: Any,
head_job: Optional[Dict[str, Any]], base_jobs: Optional[Dict[str, Dict[str, Any]]]
) -> bool:
if not name or not drci_classifications:
if not head_job or not base_jobs:
return False
# Consult the list of broken trunk failures from Dr.CI
return any(
name == broken_trunk["name"]
for broken_trunk in drci_classifications.get("BROKEN_TRUNK", [])
)
def is_flaky(
name: str,
drci_classifications: Any,
) -> bool:
if not name or not drci_classifications:
return False
# Consult the list of flaky failures from Dr.CI
return any(name == flaky["name"] for flaky in drci_classifications.get("FLAKY", []))
def is_invalid_cancel(
name: str,
conclusion: Optional[str],
drci_classifications: Any,
) -> bool:
"""
After https://github.com/pytorch/test-infra/pull/4579, invalid cancelled
signals have been removed from HUD and Dr.CI. The same needs to be done
here for consistency
"""
if (
not name
or not drci_classifications
or not conclusion
or conclusion.upper() != "CANCELLED"
):
return False
# If a job is cancelled and not listed as a failure by Dr.CI, it's an
# invalid signal and can be ignored
return all(
name != failure["name"] for failure in drci_classifications.get("FAILED", [])
head_job["conclusion"] == base_job["conclusion"]
and head_job["failure_captures"] == base_job["failure_captures"]
for base_job in base_jobs.values()
)
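
A minimal sketch of the rewritten check above, reusing the dict shapes from the test_is_broken_trunk cases earlier in this diff (values are illustrative):

head_job = {"conclusion": "failure", "failure_captures": ["x", "y"]}
base_jobs = {
    "job_a": {"conclusion": "success", "failure_captures": ["a", "b"]},
    "job_b": {"conclusion": "failure", "failure_captures": ["x", "y"]},
}
assert is_broken_trunk(head_job, base_jobs)  # job_b fails the same way on trunk
assert not is_broken_trunk(head_job, {})     # no base jobs -> not broken trunk
assert not is_broken_trunk(None, base_jobs)  # no head job -> not broken trunk
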
def get_classifications(
pr_num: int,
project: str,
checks: Dict[str, JobCheckState],
head_sha: str,
merge_base: Optional[str],
flaky_rules: List[FlakyRule],
ignore_current_checks: Optional[List[str]],
) -> Dict[str, JobCheckState]:
# Get the failure classification from Dr.CI, which is the source of truth
# going forward. It's preferable to call the Dr.CI API directly first to get
# the latest results and to update the Dr.CI PR comment
drci_classifications = get_drci_classifications(pr_num=pr_num, project=project)
print(f"From Dr.CI API: {json.dumps(drci_classifications)}")
# Group by job name without the shard id and suffix to correctly identify broken
# trunk failures, e.g. linux-bionic-cuda12.1-py3.10-gcc9-sm86 / test (default)
head_sha_jobs: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(dict)
merge_base_jobs: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(dict)
# NB: if the latest results from Dr.CI are not available, i.e. when calling from
# SandCastle, we fall back to any results we can find in the Dr.CI check run summary
if (
not drci_classifications
and DRCI_CHECKRUN_NAME in checks
and checks[DRCI_CHECKRUN_NAME]
and checks[DRCI_CHECKRUN_NAME].summary
):
drci_summary = checks[DRCI_CHECKRUN_NAME].summary
try:
print(f"From Dr.CI checkrun summary: {drci_summary}")
drci_classifications = json.loads(str(drci_summary))
except json.JSONDecodeError as error:
warn("Invalid Dr.CI checkrun summary")
drci_classifications = {}
if merge_base is not None:
def insert(
d: Dict[str, Dict[str, Dict[str, Any]]],
key: str,
val: Dict[str, Any],
overwrite_failed_run_attempt: bool,
) -> None:
key_no_suffix = remove_job_name_suffix(key)
if key not in d[key_no_suffix]:
d[key_no_suffix][key] = val
return
# When overwrite_failed_run_attempt is set to True, always overwrite
# the job with the result from the latest attempt. This option is for
# jobs from the pull request head_sha where the latest retry is used
# when merging
#
# When overwrite_failed_run_attempt is False, only overwrite the job
# with the result from the latest attempt if the latest retry failed.
# This option is for jobs from the merge_base, where we want to keep
# failure records for broken trunk detection
if d[key_no_suffix][key]["id"] < val["id"] and (
overwrite_failed_run_attempt or not is_passing_status(val["conclusion"])
):
d[key_no_suffix][key] = val
rockset_results = get_rockset_results(head_sha, merge_base)
for rockset_result in rockset_results:
name = f"{rockset_result['workflow_name']} / {rockset_result['name']}"
if rockset_result["head_sha"] == head_sha:
insert(
head_sha_jobs,
name,
rockset_result,
overwrite_failed_run_attempt=True,
)
else:
insert(
merge_base_jobs,
name,
rockset_result,
overwrite_failed_run_attempt=False,
)
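
As an illustration of what the loop above builds (two hypothetical attempts of one head job; only the fields the helper reads are shown):

# rockset_results = [
#     {"workflow_name": "pull", "name": "test (default, 1, 2, linux.2xlarge)",
#      "head_sha": head_sha, "id": 1, "conclusion": "failure", "failure_captures": ["x"]},
#     {"workflow_name": "pull", "name": "test (default, 1, 2, linux.2xlarge)",
#      "head_sha": head_sha, "id": 2, "conclusion": "success", "failure_captures": []},
# ]
# Because overwrite_failed_run_attempt=True for head_sha jobs, the id=2 retry
# wins, leaving:
# head_sha_jobs["pull / test (default)"] == {
#     "pull / test (default, 1, 2, linux.2xlarge)": <the id=2 row>,
# }
# For merge-base jobs the flag is False, so a later passing retry does not
# overwrite a recorded failure (only later failures do).
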
checks_with_classifications = checks.copy()
for name, check in checks.items():
if check.status == "SUCCESS" or check.status == "NEUTRAL":
if check.status == "SUCCESS":
continue
if "unstable" in name:
@ -1657,13 +1665,13 @@ def get_classifications(
"UNSTABLE",
check.job_id,
check.title,
check.summary,
)
continue
# NB: It's important to note that when it comes to ghstack and broken trunk classification,
# Dr.CI uses the base of the whole stack
if is_broken_trunk(name, drci_classifications):
name_no_suffix = remove_job_name_suffix(name)
head_sha_job = head_sha_jobs.get(name_no_suffix, {}).get(name)
if is_broken_trunk(head_sha_job, merge_base_jobs.get(name_no_suffix)):
checks_with_classifications[name] = JobCheckState(
check.name,
check.url,
@ -1671,34 +1679,12 @@ def get_classifications(
"BROKEN_TRUNK",
check.job_id,
check.title,
check.summary,
)
continue
elif is_flaky(name, drci_classifications):
elif any(rule.matches(head_sha_job) for rule in flaky_rules):
checks_with_classifications[name] = JobCheckState(
check.name,
check.url,
check.status,
"FLAKY",
check.job_id,
check.title,
check.summary,
)
continue
elif is_invalid_cancel(name, check.status, drci_classifications):
# NB: Create a new category here for invalid cancelled signals because
# there are usually many of them when they happen, so they shouldn't
# be counted toward the ignorable-failures threshold
checks_with_classifications[name] = JobCheckState(
check.name,
check.url,
check.status,
"INVALID_CANCEL",
check.job_id,
check.title,
check.summary,
check.name, check.url, check.status, "FLAKY", check.job_id, check.title
)
continue
@ -1710,7 +1696,6 @@ def get_classifications(
"IGNORE_CURRENT_CHECK",
check.job_id,
check.title,
check.summary,
)
return checks_with_classifications
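
Putting it together, a hedged end-to-end sketch of the release-branch call sequence (mirroring the call sites in merge and the tests above; the threshold semantics are taken from the test comments):

checks = pr.get_checkrun_conclusions()
checks = get_classifications(
    checks, pr.last_commit()["oid"], pr.get_merge_base(), read_flaky_rules(), []
)
# Default threshold (-1) ignores every FLAKY/BROKEN_TRUNK failure; 0 ignores
# none; a positive N ignores them only while their combined count stays <= N.
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
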
@ -1804,7 +1789,6 @@ def try_revert(
if not dry_run:
pr.add_numbered_label("reverted")
gh_post_commit_comment(pr.org, pr.project, commit_sha, revert_msg)
gh_update_pr_state(pr.org, pr.project, pr.pr_num)
def prefix_with_github_url(suffix_str: str) -> str:
@ -1880,8 +1864,6 @@ def categorize_checks(
# ignored anyway. This avoids having to wait for scarce resources
# like ROCm, which is also frequently in unstable mode
pending_checks.append((checkname, url, job_id))
elif classification == "INVALID_CANCEL":
continue
elif not is_passing_status(check_runs[checkname].status):
target = (
ignorable_failed_checks[classification]
@ -1927,8 +1909,7 @@ def merge(
ignore_current: bool = False,
) -> None:
initial_commit_sha = pr.last_commit()["oid"]
pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}"
print(f"Attempting merge of {initial_commit_sha} ({pr_link})")
print(f"Attempting merge of {initial_commit_sha}")
if MERGE_IN_PROGRESS_LABEL not in pr.get_labels():
gh_add_labels(pr.org, pr.project, pr.pr_num, [MERGE_IN_PROGRESS_LABEL])
@ -1993,6 +1974,7 @@ def merge(
start_time = time.time()
last_exception = ""
elapsed_time = 0.0
flaky_rules = read_flaky_rules()
ignore_current_checks = [
x[0] for x in ignore_current_checks_info
] # convert to List[str] for convenience
@ -2025,9 +2007,10 @@ def merge(
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
pr.last_commit()["oid"],
pr.get_merge_base(),
flaky_rules,
ignore_current_checks=ignore_current_checks,
)
pending, failing, _ = categorize_checks(
View File
@ -51,7 +51,7 @@ def post_already_uptodate(
def rebase_onto(
pr: GitHubPR, repo: GitRepo, onto_branch: str, dry_run: bool = False
) -> bool:
) -> None:
branch = f"pull/{pr.pr_num}/head"
remote_url = f"https://github.com/{pr.info['headRepository']['nameWithOwner']}.git"
refspec = f"{branch}:{pr.head_ref()}"
@ -68,7 +68,6 @@ def rebase_onto(
push_result = repo._run_git("push", "-f", remote_url, refspec)
if "Everything up-to-date" in push_result:
post_already_uptodate(pr, repo, onto_branch, dry_run)
return False
else:
gh_post_comment(
pr.org,
@ -79,21 +78,18 @@ def rebase_onto(
+ "git pull --rebase`)",
dry_run=dry_run,
)
return True
def rebase_ghstack_onto(
pr: GitHubPR, repo: GitRepo, onto_branch: str, dry_run: bool = False
) -> bool:
) -> None:
if (
subprocess.run(
[sys.executable, "-m", "ghstack", "--help"],
capture_output=True,
check=False,
[sys.executable, "-m", "ghstack", "--help"], capture_output=True
).returncode
!= 0
):
subprocess.run([sys.executable, "-m", "pip", "install", "ghstack"], check=True)
subprocess.run([sys.executable, "-m", "pip", "install", "ghstack"])
orig_ref = f"{re.sub(r'/head$', '/orig', pr.head_ref())}"
repo.fetch(orig_ref, orig_ref)
@ -119,9 +115,8 @@ def rebase_ghstack_onto(
if dry_run:
print("Don't know how to dry-run ghstack")
return False
else:
ghstack_result = subprocess.run(["ghstack"], capture_output=True, check=True)
ghstack_result = subprocess.run(["ghstack"], capture_output=True)
push_result = ghstack_result.stdout.decode("utf-8")
print(push_result)
if ghstack_result.returncode != 0:
@ -171,8 +166,6 @@ def rebase_ghstack_onto(
in push_result
):
post_already_uptodate(pr, repo, onto_branch, dry_run)
return False
return True
def additional_rebase_failure_info(e: Exception) -> str:
@ -229,10 +222,9 @@ def main() -> None:
try:
if pr.is_ghstack_pr():
with git_config_guard(repo):
rc = rebase_ghstack_onto(pr, repo, onto_branch, dry_run=args.dry_run)
rebase_ghstack_onto(pr, repo, onto_branch, dry_run=args.dry_run)
else:
rc = rebase_onto(pr, repo, onto_branch, dry_run=args.dry_run)
sys.exit(0 if rc else 1)
rebase_onto(pr, repo, onto_branch, dry_run=args.dry_run)
except Exception as e:
msg = f"Rebase failed due to {e}"
View File
@ -114,8 +114,7 @@ def main() -> None:
# query to see if a pr already exists
params = {
"q": f"is:pr is:open in:title author:pytorchupdatebot repo:{OWNER}/{REPO} {args.repo_name} hash update",
"sort": "created",
"q": f"is:pr is:open in:title author:pytorchmergebot repo:{OWNER}/{REPO} {args.repo_name} hash update"
}
response = git_api("/search/issues", params)
if response["total_count"] != 0:
View File
@ -8,7 +8,7 @@
# NOTE: If testing pytorch/builder changes, you can change this variable to control which pytorch/builder reference
# the binary builds will check out
{%- set builder_repo = "pytorch/builder" -%}
{%- set builder_branch = "release/2.2" -%}
{%- set builder_branch = "release/2.1" -%}
{%- macro concurrency(build_environment) -%}
concurrency:
@ -36,10 +36,9 @@ concurrency:
{%- macro setup_ec2_windows() -%}
!{{ display_ec2_information() }}
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.2
continue-on-error: true
uses: seemethere/add-github-ssh-key@v1
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
View File
@ -7,7 +7,6 @@
name: !{{ build_environment }}
{%- endblock %}
on:
push:
{%- if branches == "nightly" %}
@ -56,14 +55,12 @@ jobs:
uses: ./.github/workflows/_binary-build-linux.yml
with:!{{ upload.binary_env_as_input(config) }}
{%- if "aarch64" in build_environment %}
runs_on: linux.arm64.2xlarge
runs_on: linux.t4g.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
{%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
runs_on: linux.24xlarge
{%- endif %}
build_name: !{{ config["build_name"] }}
build_environment: !{{ build_environment }}
{%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
{%- if config.pytorch_extra_install_requirements is defined %}
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
{%- endif %}
secrets:
@ -77,7 +74,7 @@ jobs:
build_name: !{{ config["build_name"] }}
build_environment: !{{ build_environment }}
{%- if "aarch64" in build_environment %}
runs_on: linux.arm64.2xlarge
runs_on: linux.t4g.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
{%- elif config["gpu_arch_type"] == "rocm" %}
runs_on: linux.rocm.gpu
@ -106,7 +103,7 @@ jobs:
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.2
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
docker-image: !{{ config["container_image"] }}
- name: Test Pytorch binary
View File
@ -58,12 +58,9 @@ jobs:
{%- for config in build_configs %}
!{{ config["build_name"] }}-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: !{{ macos_runner }}
runs-on: macos-12-xl
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config, true) }}
{%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
{%- endif %}
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -72,15 +69,11 @@ jobs:
- name: Install conda and dependencies
run: |
# Install conda manually; setup-miniconda messes with the path, which breaks the ruby steps we run later
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-x86_64.sh
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
!{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
!{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch, checkout_pr_head=False) }}
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
View File
@ -53,9 +53,6 @@
{%- macro upload_binaries(config, is_windows=False, has_test=True, use_s3=True) -%}
!{{ config["build_name"] }}-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
{%- if has_test %}
needs: !{{ config["build_name"] }}-test
{%- else %}
@ -68,7 +65,8 @@
{%- endif %}
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
{%- endmacro %}
View File
@ -59,9 +59,6 @@ jobs:
runs-on: windows.4xlarge.nonephemeral
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config, True) }}
{%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
{%- endif %}
steps:
!{{ common.setup_ec2_windows() }}
!{{ set_runner_specific_vars() }}
View File
@ -36,7 +36,7 @@ jobs:
keep-going: ${{ steps.filter.outputs.keep-going }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
with:
fetch-depth: 1
submodules: false
@ -58,25 +58,25 @@ jobs:
runs-on: ${{ matrix.runner }}
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.2
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.1
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.2
uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.1
with:
docker-image-name: ${{ inputs.docker-image-name }}
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.2
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
@ -140,5 +140,5 @@ jobs:
if: always()
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.2
uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.1
if: always()
View File
@ -36,7 +36,7 @@ jobs:
keep-going: ${{ steps.filter.outputs.keep-going }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
with:
fetch-depth: 1
submodules: false
@ -58,25 +58,25 @@ jobs:
runs-on: ${{ matrix.runner }}
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.2
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.1
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.2
uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.1
with:
docker-image-name: ${{ inputs.docker-image-name }}
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.2
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
@ -157,7 +157,7 @@ jobs:
# run gradle buildRelease
(echo "./.circleci/scripts/build_android_gradle.sh" | docker exec \
-e BUILD_ENVIRONMENT="pytorch-linux-focal-py3-clang9-android-ndk-r21e-gradle-build" \
-e BUILD_ENVIRONMENT="pytorch-linux-focal-py3-clang7-android-ndk-r19c-gradle-build" \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e PR_NUMBER \
@ -185,5 +185,5 @@ jobs:
if: always()
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.2
uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.1
if: always()
View File
@ -41,7 +41,7 @@ jobs:
reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
with:
fetch-depth: 1
submodules: false
@ -63,30 +63,30 @@ jobs:
runs-on: ${{ matrix.runner }}
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.2
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.1
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.2
uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.1
with:
docker-image-name: ${{ inputs.docker-image-name }}
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.2
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.2
uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.1
if: ${{ inputs.cuda-version != 'cpu' }}
- name: Output disk space left
@ -120,7 +120,6 @@ jobs:
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
PYTORCH_RETRY_TEST_CASES: 1
PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
REENABLED_ISSUES: ${{ needs.filter.outputs.reenabled-issues }}
@ -148,7 +147,6 @@ jobs:
-e GITHUB_JOB \
-e GITHUB_RUN_NUMBER \
-e GITHUB_RUN_ATTEMPT \
-e JOB_ID \
-e GIT_DEFAULT_BRANCH="$GIT_DEFAULT_BRANCH" \
-e SHARD_NUMBER \
-e NUM_TEST_SHARDS \
@ -186,7 +184,7 @@ jobs:
shell: bash
if: always() && steps.test.conclusion
run: |
cat test/**/*_toprint.log || true
cat test/**/*.log || true
- name: Chown workspace
uses: ./.github/actions/chown-workspace
@ -199,5 +197,5 @@ jobs:
file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }}
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.2
uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.1
if: always()
Some files were not shown because too many files have changed in this diff.