[nccl-pg] print broadcast nccl unique id duration (#123963)

Summary: Print NCCL PG broadcast nccl unique id duration for measurement. Differential Revision: D56048059 Pull Request resolved: https://github.com/pytorch/pytorch/pull/123963 Approved by: https://github.com/wconstab
[nccl-pg] Pass pg name and desc to NCCL communicator (#124149)
2025-10-23 14:59:34 +08:00 · 2024-04-16 17:03:25 -07:00 · 2024-04-16 15:08:38 -07:00 · 2024-04-16 15:08:38 -07:00 · 2024-04-16 15:08:22 -07:00 · 2024-04-16 13:48:35 -07:00
14040 changed files with 676912 additions and 450845 deletions
--- a/.bazelignore
+++ b/.bazelignore
@ -1,4 +1,3 @@
 # We do not use this library in our Bazel build. It contains an
 # infinitely recursing symlink that makes Bazel very unhappy.
 third_party/ittapi/
-third_party/opentelemetry-cpp
--- a/.ci/docker/README.md
+++ b/.ci/docker/README.md
@ -19,7 +19,6 @@ See `build.sh` for valid build environments (it's the giant switch).
 * `ubuntu` -- Dockerfile for Ubuntu image for CPU build and test jobs
 * `ubuntu-cuda` -- Dockerfile for Ubuntu image with CUDA support for nvidia-docker
 * `ubuntu-rocm` -- Dockerfile for Ubuntu image with ROCm support
-* `ubuntu-xpu` -- Dockerfile for Ubuntu image with XPU support

 ## Usage

--- a/.ci/docker/aotriton_version.txt
+++ b/.ci/docker/aotriton_version.txt
@ -1,5 +0,0 @@
-0.6b
-manylinux_2_17
-rocm6
-04b5df8c8123f90cba3ede7e971e6fbc6040d506
-3db6ecbc915893ff967abd6e1b43bd5f54949868873be60dc802086c3863e648
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -71,8 +71,6 @@ if [[ "$image" == *cuda* && "$UBUNTU_VERSION" != "22.04" ]]; then
  DOCKERFILE="${OS}-cuda/Dockerfile"
 elif [[ "$image" == *rocm* ]]; then
  DOCKERFILE="${OS}-rocm/Dockerfile"
-elif [[ "$image" == *xpu* ]]; then
-  DOCKERFILE="${OS}-xpu/Dockerfile"
 elif [[ "$image" == *cuda*linter* ]]; then
  # Use a separate Dockerfile for linter to keep a small image size
  DOCKERFILE="linter-cuda/Dockerfile"
@ -84,30 +82,16 @@ fi
 # CMake 3.18 is needed to support CUDA17 language variant
 CMAKE_VERSION=3.18.5

-_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
-_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
+_UCX_COMMIT=00bcc6bb18fc282eb160623b4c0d300147f579af
+_UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea

 # It's annoying to rename jobs every time you want to rewrite a
 # configuration, so we hardcode everything here rather than do it
 # from scratch
 case "$image" in
-  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.4.0
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    ;;
-  pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
+  pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
    CUDA_VERSION=12.1.1
-    CUDNN_VERSION=9
+    CUDNN_VERSION=8
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
@ -119,24 +103,9 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.4.0
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
-    ;;
-  pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks)
+  pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks)
    CUDA_VERSION=12.1.1
-    CUDNN_VERSION=9
+    CUDNN_VERSION=8
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
@ -149,39 +118,9 @@ case "$image" in
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.1.1
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.12
-    GCC_VERSION=9
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
-    ;;
-  pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.4.0
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.12
-    GCC_VERSION=9
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
-    ;;
-  pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
+  pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9)
    CUDA_VERSION=11.8.0
-    CUDNN_VERSION=9
+    CUDNN_VERSION=8
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
@ -193,37 +132,9 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.4.0
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    ;;
-  pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
+  pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
    CUDA_VERSION=12.1.1
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    ;;
-  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.4.0
-    CUDNN_VERSION=9
+    CUDNN_VERSION=8
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
@ -291,7 +202,7 @@ case "$image" in
    PROTOBUF=yes
    DB=yes
    VISION=yes
-    ROCM_VERSION=6.0
+    ROCM_VERSION=5.6
    NINJA_VERSION=1.9.0
    CONDA_CMAKE=yes
    TRITON=yes
@ -302,18 +213,7 @@ case "$image" in
    PROTOBUF=yes
    DB=yes
    VISION=yes
-    ROCM_VERSION=6.1
-    NINJA_VERSION=1.9.0
-    CONDA_CMAKE=yes
-    TRITON=yes
-    ;;
-  pytorch-linux-jammy-xpu-2024.0-py3)
-    ANACONDA_PYTHON_VERSION=3.8
-    GCC_VERSION=11
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    XPU_VERSION=0.5
+    ROCM_VERSION=5.7
    NINJA_VERSION=1.9.0
    CONDA_CMAKE=yes
    TRITON=yes
@ -330,10 +230,10 @@ case "$image" in
    DOCS=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12)
+  pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12)
    ANACONDA_PYTHON_VERSION=3.8
    CUDA_VERSION=11.8
-    CUDNN_VERSION=9
+    CUDNN_VERSION=8
    CLANG_VERSION=12
    PROTOBUF=yes
    DB=yes
@ -365,7 +265,6 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    DOCS=yes
-    UNINSTALL_DILL=yes
    ;;
  pytorch-linux-jammy-py3-clang12-executorch)
    ANACONDA_PYTHON_VERSION=3.10
@ -373,13 +272,6 @@ case "$image" in
    CONDA_CMAKE=yes
    EXECUTORCH=yes
    ;;
-  pytorch-linux-jammy-py3.12-halide)
-    CUDA_VERSION=12.4
-    ANACONDA_PYTHON_VERSION=3.12
-    GCC_VERSION=11
-    CONDA_CMAKE=yes
-    HALIDE=yes
-    ;;
  pytorch-linux-focal-linter)
    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
    # We will need to update mypy version eventually, but that's for another day. The task
@ -387,26 +279,11 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    CONDA_CMAKE=yes
    ;;
-  pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
+  pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter)
    ANACONDA_PYTHON_VERSION=3.9
    CUDA_VERSION=11.8
    CONDA_CMAKE=yes
    ;;
-  pytorch-linux-jammy-aarch64-py3.10-gcc11)
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=11
-    ACL=yes
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    CONDA_CMAKE=yes
-    # snadampal: skipping sccache due to the following issue
-    # https://github.com/pytorch/pytorch/issues/121559
-    SKIP_SCCACHE_INSTALL=yes
-    # snadampal: skipping llvm src build install because the current version
-    # from pytorch/llvm:9.0.1 is x86 specific
-    SKIP_LLVM_SRC_BUILD_INSTALL=yes
-    ;;
  *)
    # Catch-all for builds that are not hardcoded.
    PROTOBUF=yes
@ -454,7 +331,7 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
 #when using cudnn version 8 install it separately from cuda
 if [[ "$image" == *cuda*  && ${OS} == "ubuntu" ]]; then
  IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
-  if [[ ${CUDNN_VERSION} == 9 ]]; then
+  if [[ ${CUDNN_VERSION} == 8 ]]; then
    IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
  fi
 fi
@ -497,17 +374,12 @@ docker build \
       --build-arg "DOCS=${DOCS}" \
       --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
       --build-arg "EXECUTORCH=${EXECUTORCH}" \
-       --build-arg "HALIDE=${HALIDE}" \
-       --build-arg "XPU_VERSION=${XPU_VERSION}" \
-       --build-arg "ACL=${ACL:-}" \
-       --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
-       --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
       -f $(dirname ${DOCKERFILE})/Dockerfile \
       -t "$tmp_tag" \
       "$@" \
       .

-# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
+# NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
 # for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
 # find the correct image. As a result, here we have to replace the
 #   "$UBUNTU_VERSION" == "18.04-rc"
--- a/.ci/docker/centos-rocm/Dockerfile
+++ b/.ci/docker/centos-rocm/Dockerfile
@ -62,7 +62,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV
+# (optional) Install vision packages like OpenCV and ffmpeg
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -77,9 +77,6 @@ RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh
 RUN rm install_rocm_magma.sh
-COPY ./common/install_amdsmi.sh install_amdsmi.sh
-RUN bash ./install_amdsmi.sh
-RUN rm install_amdsmi.sh
 ENV PATH /opt/rocm/bin:$PATH
 ENV PATH /opt/rocm/hcc/bin:$PATH
 ENV PATH /opt/rocm/hip/bin:$PATH
@ -113,13 +110,6 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt

-# Install AOTriton (Early fail)
-COPY ./aotriton_version.txt aotriton_version.txt
-COPY ./common/common_utils.sh common_utils.sh
-COPY ./common/install_aotriton.sh install_aotriton.sh
-RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
-ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
-
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-d4b3e5cc607e97afdba79dc90f8ef968142f347c
+ca6322dcfc51b209a06b76d160bd95d81d58f15c
--- a/.ci/docker/ci_commit_pins/halide.txt
+++ b/.ci/docker/ci_commit_pins/halide.txt
@ -1 +0,0 @@
-340136fec6d3ebc73e7a19eba1663e9b0ba8ab2d
--- a/.ci/docker/ci_commit_pins/huggingface.txt
+++ b/.ci/docker/ci_commit_pins/huggingface.txt
@ -1 +1 @@
-243e186efbf7fb93328dd6b34927a4e8c8f24395
+6c26faa159b79a42d7fa46cb66e2d21523351987
--- a/.ci/docker/ci_commit_pins/triton-rocm.txt
+++ b/.ci/docker/ci_commit_pins/triton-rocm.txt
@ -1 +1 @@
-01cbe5045a6898c9a925f01435c8277b2fe6afcc
+dafe1459823b9549417ed95e9720f1b594fab329
--- a/.ci/docker/ci_commit_pins/triton-xpu.txt
+++ b/.ci/docker/ci_commit_pins/triton-xpu.txt
@ -1 +0,0 @@
-aac14a3b93f11d781d1d5ebc5400b15ae8df5185
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-45fff310c891f5a92d55445adf8cc9d29df5841e
+bcad9dabe15021c53b6a88296e9d7a210044f108
--- a/.ci/docker/common/install_acl.sh
+++ b/.ci/docker/common/install_acl.sh
@ -1,16 +0,0 @@
-set -euo pipefail
-
-readonly version=v24.04
-readonly src_host=https://review.mlplatform.org/ml
-readonly src_repo=ComputeLibrary
-
-# Clone ACL
-[[ ! -d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git
-cd ${src_repo}
-
-git checkout $version
-
-# Build with scons
-scons -j8  Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \
-  os=linux arch=armv8a build=native multi_isa=1 \
-  fixed_format_kernels=1 openmp=1 cppthreads=0
--- a/.ci/docker/common/install_amdsmi.sh
+++ b/.ci/docker/common/install_amdsmi.sh
@ -1,5 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-cd /opt/rocm/share/amd_smi && pip install .
--- a/.ci/docker/common/install_aotriton.sh
+++ b/.ci/docker/common/install_aotriton.sh
@ -1,23 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
-
-TARBALL='aotriton.tar.bz2'
-# This read command alwasy returns with exit code 1
-read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
-ARCH=$(uname -m)
-AOTRITON_INSTALL_PREFIX="$1"
-AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}.tar.bz2"
-
-cd "${AOTRITON_INSTALL_PREFIX}"
-# Must use -L to follow redirects
-curl -L --retry 3 -o "${TARBALL}" "${AOTRITON_URL}"
-ACTUAL_SHA256=$(sha256sum "${TARBALL}" | cut -d " " -f 1)
-if [ "${SHA256}" != "${ACTUAL_SHA256}" ]; then
-  echo -n "Error: The SHA256 of downloaded tarball is ${ACTUAL_SHA256},"
-  echo " which does not match the expected value ${SHA256}."
-  exit
-fi
-tar xf "${TARBALL}" && rm -rf "${TARBALL}"
--- a/.ci/docker/common/install_base.sh
+++ b/.ci/docker/common/install_base.sh
@ -3,7 +3,7 @@
 set -ex

 install_ubuntu() {
-  # NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
+  # NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
  # for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
  # find the correct image. As a result, here we have to check for
  #   "$UBUNTU_VERSION" == "18.04"*
@ -61,7 +61,6 @@ install_ubuntu() {
    ${maybe_libiomp_dev} \
    libyaml-dev \
    libz-dev \
-    libjemalloc2 \
    libjpeg-dev \
    libasound2-dev \
    libsndfile-dev \
@ -75,7 +74,6 @@ install_ubuntu() {
    libtool \
    vim \
    unzip \
-    gpg-agent \
    gdb

  # Should resolve issues related to various apt package repository cert issues
@ -113,6 +111,7 @@ install_centos() {
    glibc-devel \
    glibc-headers \
    glog-devel \
+    hiredis-devel \
    libstdc++-devel \
    libsndfile-devel \
    make \
@ -152,7 +151,7 @@ wget https://ossci-linux.s3.amazonaws.com/valgrind-${VALGRIND_VERSION}.tar.bz2
 tar -xjf valgrind-${VALGRIND_VERSION}.tar.bz2
 cd valgrind-${VALGRIND_VERSION}
 ./configure --prefix=/usr/local
-make -j$[$(nproc) - 2]
+make -j6
 sudo make install
 cd ../../
 rm -rf valgrind_build
--- a/.ci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@ -9,19 +9,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
  MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1)
  MINOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 2)

-if [[ $(uname -m) == "aarch64" ]]; then
-  BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download"
  case "$MAJOR_PYTHON_VERSION" in
-    3)
-      CONDA_FILE="Miniforge3-Linux-aarch64.sh"
+    2)
+      CONDA_FILE="Miniconda2-latest-Linux-x86_64.sh"
    ;;
-    *)
-      echo "Unsupported ANACONDA_PYTHON_VERSION: $ANACONDA_PYTHON_VERSION"
-      exit 1
-      ;;
-  esac
-else
-  case "$MAJOR_PYTHON_VERSION" in
    3)
      CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh"
    ;;
@ -30,7 +21,6 @@ else
      exit 1
      ;;
  esac
-fi

  mkdir -p /opt/conda
  chown jenkins:jenkins /opt/conda
@ -57,39 +47,15 @@ fi
  # Uncomment the below when resolved to track the latest conda update
  # as_jenkins conda update -y -n base conda

-  if [[ $(uname -m) == "aarch64" ]]; then
-    export SYSROOT_DEP="sysroot_linux-aarch64=2.17"
-  else
-    export SYSROOT_DEP="sysroot_linux-64=2.17"
-  fi
-
  # Install correct Python version
-  # Also ensure sysroot is using a modern GLIBC to match system compilers
-  as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
-             python="$ANACONDA_PYTHON_VERSION" \
-             ${SYSROOT_DEP}
-
-  # libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
-  # which is provided in libstdcxx 12 and up.
-  conda_install libstdcxx-ng=12.3.0 -c conda-forge
+  as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION"

  # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
-  if [[ $(uname -m) == "aarch64" ]]; then
-    CONDA_COMMON_DEPS="astunparse pyyaml setuptools openblas==0.3.25=*openmp* ninja==1.11.1 scons==4.5.2"
-
-    if [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then
-      conda_install numpy=1.24.4 ${CONDA_COMMON_DEPS}
-    else
-      conda_install numpy=1.26.2 ${CONDA_COMMON_DEPS}
-    fi
+  CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
+  if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ]; then
+    conda_install numpy=1.23.5 ${CONDA_COMMON_DEPS}
  else
-    CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
-
-    if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then
-      conda_install numpy=1.26.0 ${CONDA_COMMON_DEPS}
-    else
-      conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
-    fi
+    conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
  fi

  # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
@ -123,5 +89,14 @@ fi
    pip_install -r /opt/conda/requirements-docs.txt
  fi

+  # HACK HACK HACK
+  # gcc-9 for ubuntu-18.04 from http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu
+  # Pulls libstdc++6 13.1.0-8ubuntu1~18.04 which is too new for conda
+  # So remove libstdc++6.so.3.29 installed by https://anaconda.org/anaconda/libstdcxx-ng/files?version=11.2.0
+  # Same is true for gcc-12 from Ubuntu-22.04
+  if grep -e [12][82].04.[623] /etc/issue >/dev/null; then
+    rm /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/lib/libstdc++.so.6
+  fi
+
  popd
 fi
--- a/.ci/docker/common/install_cudnn.sh
+++ b/.ci/docker/common/install_cudnn.sh
@ -1,22 +1,27 @@
 #!/bin/bash

-if [[ -n "${CUDNN_VERSION}" ]]; then
+if [[ ${CUDNN_VERSION} == 8 ]]; then
    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-    mkdir tmp_cudnn
-    pushd tmp_cudnn
-    if [[ ${CUDA_VERSION:0:2} == "12" ]]; then
-        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"
-    elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
-        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
+    mkdir tmp_cudnn && cd tmp_cudnn
+    CUDNN_NAME="cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive"
+    if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
+        CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive"
+        curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
+    elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
+        CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
+        curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz
    else
-        print "Unsupported CUDA version ${CUDA_VERSION}"
-        exit 1
+        curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/${CUDNN_NAME}.tar.xz
    fi
-    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
+
    tar xf ${CUDNN_NAME}.tar.xz
+    cp -a ${CUDNN_NAME}/include/* /usr/include/
    cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
+    cp -a ${CUDNN_NAME}/include/* /usr/include/x86_64-linux-gnu/
+
    cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/
-    popd
+    cp -a ${CUDNN_NAME}/lib/* /usr/lib/x86_64-linux-gnu/
+    cd ..
    rm -rf tmp_cudnn
    ldconfig
 fi
--- a/.ci/docker/common/install_cusparselt.sh
+++ b/.ci/docker/common/install_cusparselt.sh
@ -1,26 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
-mkdir tmp_cusparselt && cd tmp_cusparselt
-
-if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then
-    arch_path='sbsa'
-    export TARGETARCH=${TARGETARCH:-$(uname -m)}
-    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
-        arch_path='x86_64'
-    fi
-    CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.5.2.1-archive"
-    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
-elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
-    CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
-    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
-fi
-
-tar xf ${CUSPARSELT_NAME}.tar.xz
-cp -a ${CUSPARSELT_NAME}/include/* /usr/local/cuda/include/
-cp -a ${CUSPARSELT_NAME}/lib/* /usr/local/cuda/lib64/
-cd ..
-rm -rf tmp_cusparselt
-ldconfig
--- a/.ci/docker/common/install_db.sh
+++ b/.ci/docker/common/install_db.sh
@ -4,6 +4,11 @@ set -ex

 install_ubuntu() {
  apt-get update
+  apt-get install -y --no-install-recommends \
+          libhiredis-dev \
+          libleveldb-dev \
+          liblmdb-dev \
+          libsnappy-dev

  # Cleanup
  apt-get autoclean && apt-get clean
@ -15,6 +20,12 @@ install_centos() {
  # See http://fedoraproject.org/wiki/EPEL
  yum --enablerepo=extras install -y epel-release

+  yum install -y \
+      hiredis-devel \
+      leveldb-devel \
+      lmdb-devel \
+      snappy-devel
+
  # Cleanup
  yum clean all
  rm -rf /var/cache/yum
--- a/.ci/docker/common/install_executorch.sh
+++ b/.ci/docker/common/install_executorch.sh
@ -48,6 +48,7 @@ setup_executorch() {

  install_flatc_from_source
  pip_install .
+  build_executorch_runner "cmake"

  # Make sure that all the newly generate files are owned by Jenkins
  chown -R jenkins .
--- a/.ci/docker/common/install_halide.sh
+++ b/.ci/docker/common/install_halide.sh
@ -1,46 +0,0 @@
-#!/bin/bash
-set -ex
-
-source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
-
-COMMIT=$(get_pinned_commit halide)
-test -n "$COMMIT"
-
-# activate conda to populate CONDA_PREFIX
-test -n "$ANACONDA_PYTHON_VERSION"
-eval "$(conda shell.bash hook)"
-conda activate py_$ANACONDA_PYTHON_VERSION
-
-if [ -n "${UBUNTU_VERSION}" ];then
-    apt update
-    apt-get install -y lld liblld-15-dev libpng-dev libjpeg-dev libgl-dev \
-                  libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev
-fi
-
-conda_install numpy scipy imageio cmake ninja
-
-git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git
-cmake -DCMAKE_BUILD_TYPE=Release \
-        -DLLVM_ENABLE_PROJECTS="clang" \
-        -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
-        -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_ENABLE_ASSERTIONS=ON \
-        -DLLVM_ENABLE_EH=ON -DLLVM_ENABLE_RTTI=ON -DLLVM_BUILD_32_BITS=OFF \
-        -S llvm-project/llvm -B llvm-build -G Ninja
-cmake --build llvm-build
-cmake --install llvm-build --prefix llvm-install
-export LLVM_ROOT=`pwd`/llvm-install
-export LLVM_CONFIG=$LLVM_ROOT/bin/llvm-config
-
-git clone https://github.com/halide/Halide.git
-pushd Halide
-git checkout ${COMMIT} && git submodule update --init --recursive
-pip_install -r requirements.txt
-cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
-cmake --build build
-test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
-cmake --install build --prefix ${CONDA_PREFIX}
-chown -R jenkins ${CONDA_PREFIX}
-popd
-rm -rf Halide llvm-build llvm-project llvm-install
-
-python -c "import halide"  # check for errors
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -26,19 +26,18 @@ pip_install \
  pytest-cov==4.0.0 \
  pytest-subtests==0.10.0 \
  tabulate==0.9.0 \
-  transformers==4.36.2
+  transformers==4.32.1

 pip_install coloredlogs packaging
+retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.17.0.dev20231005006

-pip_install onnxruntime==1.18
-pip_install onnx==1.16.0
-# pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps
-pip_install onnxscript==0.1.0.dev20240523 --no-deps
+pip_install -i https://test.pypi.org/simple/ onnx==1.15.0rc2
+pip_install onnxscript==0.1.0.dev20231128 --no-deps

 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
 IMPORT_SCRIPT_FILENAME="/tmp/onnx_import_script.py"
-as_jenkins echo 'import transformers; transformers.AutoModel.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3");' > "${IMPORT_SCRIPT_FILENAME}"
+as_jenkins echo 'import transformers; transformers.AutoModel.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2");' > "${IMPORT_SCRIPT_FILENAME}"

 # Need a PyTorch version for transformers to work
 pip_install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
--- a/.ci/docker/common/install_openssl.sh
+++ b/.ci/docker/common/install_openssl.sh
@ -9,8 +9,7 @@ tar xf "${OPENSSL}.tar.gz"
 cd "${OPENSSL}"
 ./config --prefix=/opt/openssl -d '-Wl,--enable-new-dtags,-rpath,$(LIBRPATH)'
 # NOTE: openssl install errors out when built with the -j option
-NPROC=$[$(nproc) - 2]
-make -j${NPROC}; make install_sw
+make -j6; make install_sw
 # Link the ssl libraries to the /usr/lib folder.
 sudo ln -s /opt/openssl/lib/lib* /usr/lib
 cd ..
--- a/.ci/docker/common/install_protobuf.sh
+++ b/.ci/docker/common/install_protobuf.sh
@ -2,18 +2,55 @@

 set -ex

-pb_dir="/usr/temp_pb_install_dir"
-mkdir -p $pb_dir
+# This function installs protobuf 3.17
+install_protobuf_317() {
+  pb_dir="/usr/temp_pb_install_dir"
+  mkdir -p $pb_dir

-# On the nvidia/cuda:9-cudnn7-devel-centos7 image we need this symlink or
-# else it will fail with
-#   g++: error: ./../lib64/crti.o: No such file or directory
-ln -s /usr/lib64 "$pb_dir/lib64"
+  # On the nvidia/cuda:9-cudnn7-devel-centos7 image we need this symlink or
+  # else it will fail with
+  #   g++: error: ./../lib64/crti.o: No such file or directory
+  ln -s /usr/lib64 "$pb_dir/lib64"

-curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3
+  curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3
+  tar -xvz -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz
+  # -j6 to balance memory usage and speed.
+  # naked `-j` seems to use too much memory.
+  pushd "$pb_dir" && ./configure && make -j6 && make -j6 check && sudo make -j6 install && sudo ldconfig
+  popd
+  rm -rf $pb_dir
+}

-tar -xvz --no-same-owner -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz
-NPROC=$[$(nproc) - 2]
-pushd "$pb_dir" && ./configure && make -j${NPROC} && make -j${NPROC} check && sudo make -j${NRPOC} install && sudo ldconfig
-popd
-rm -rf $pb_dir
+install_ubuntu() {
+  # Ubuntu 14.04 has cmake 2.8.12 as the default option, so we will
+  # install cmake3 here and use cmake3.
+  apt-get update
+  if [[ "$UBUNTU_VERSION" == 14.04 ]]; then
+    apt-get install -y --no-install-recommends cmake3
+  fi
+
+  # Cleanup
+  apt-get autoclean && apt-get clean
+  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+  install_protobuf_317
+}
+
+install_centos() {
+  install_protobuf_317
+}
+
+# Install base packages depending on the base OS
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  ubuntu)
+    install_ubuntu
+    ;;
+  centos)
+    install_centos
+    ;;
+  *)
+    echo "Unable to determine OS..."
+    exit 1
+    ;;
+esac
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -6,6 +6,9 @@ ver() {
    printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
 }

+# Map ROCm version to AMDGPU version
+declare -A AMDGPU_VERSIONS=( ["5.0"]="21.50" ["5.1.1"]="22.10.1" ["5.2"]="22.20" )
+
 install_ubuntu() {
    apt-get update
    if [[ $UBUNTU_VERSION == 18.04 ]]; then
@ -23,14 +26,31 @@ install_ubuntu() {
    apt-get install -y libc++1
    apt-get install -y libc++abi1

-    # Add amdgpu repository
-    UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
-    echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
+    if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
+        # Add amdgpu repository
+        UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
+        local amdgpu_baseurl
+        if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
+        else
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/ubuntu"
+        fi
+        echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
+    fi
+
+    ROCM_REPO="ubuntu"
+    if [[ $(ver $ROCM_VERSION) -lt $(ver 4.2) ]]; then
+        ROCM_REPO="xenial"
+    fi
+
+    if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
+        ROCM_REPO="${UBUNTU_VERSION_NAME}"
+    fi

    # Add rocm repository
    wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
    local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
-    echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/rocm.list
+    echo "deb [arch=amd64] ${rocm_baseurl} ${ROCM_REPO} main" > /etc/apt/sources.list.d/rocm.list
    apt-get update --allow-insecure-repositories

    DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
@ -39,29 +59,27 @@ install_ubuntu() {
                   rocm-libs \
                   rccl \
                   rocprofiler-dev \
-                   roctracer-dev \
-                   amd-smi-lib
-
-    if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then
-        DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
-    fi
+                   roctracer-dev

    # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
    # search for all unversioned packages
    # if search fails it will abort this script; use true to avoid case where search fails
-    MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
-    if [[ "x${MIOPENHIPGFX}" = x ]]; then
-      echo "miopen-hip-gfx package not available" && exit 1
+    if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
+        MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
+        if [[ "x${MIOPENHIPGFX}" = x ]]; then
+          echo "miopen-hip-gfx package not available" && exit 1
+        else
+          DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
+        fi
    else
-      DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
+        MIOPENKERNELS=$(apt-cache search --names-only miopenkernels | awk '{print $1}' | grep -F -v . || true)
+        if [[ "x${MIOPENKERNELS}" = x ]]; then
+          echo "miopenkernels package not available" && exit 1
+        else
+          DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENKERNELS}
+        fi
    fi

-    # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
-    for kdb in /opt/rocm/share/miopen/db/*.kdb
-    do
-        sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
-    done
-
    # Cleanup
    apt-get autoclean && apt-get clean
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
@ -77,19 +95,25 @@ install_centos() {
  yum install -y epel-release
  yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r`

-  # Add amdgpu repository
-  local amdgpu_baseurl
-  if [[ $OS_VERSION == 9 ]]; then
-      amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.0/main/x86_64"
-  else
-      amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
+  if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
+      # Add amdgpu repository
+      local amdgpu_baseurl
+      if [[ $OS_VERSION == 9 ]]; then
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/9.0/main/x86_64"
+      else
+        if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
+        else
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/7.9/main/x86_64"
+        fi
+      fi
+      echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
+      echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
+      echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
+      echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
+      echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
+      echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo
  fi
-  echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
-  echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
-  echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
-  echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
-  echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
-  echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo

  local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}"
  echo "[ROCm]" > /etc/yum.repos.d/rocm.repo
@ -107,24 +131,26 @@ install_centos() {
                   rocm-libs \
                   rccl \
                   rocprofiler-dev \
-                   roctracer-dev \
-                   amd-smi-lib
+                   roctracer-dev

  # precompiled miopen kernels; search for all unversioned packages
  # if search fails it will abort this script; use true to avoid case where search fails
-  MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
-  if [[ "x${MIOPENHIPGFX}" = x ]]; then
-    echo "miopen-hip-gfx package not available" && exit 1
+  if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
+      MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
+      if [[ "x${MIOPENHIPGFX}" = x ]]; then
+        echo "miopen-hip-gfx package not available" && exit 1
+      else
+        yum install -y ${MIOPENHIPGFX}
+      fi
  else
-    yum install -y ${MIOPENHIPGFX}
+      MIOPENKERNELS=$(yum -q search miopenkernels | grep miopenkernels- | awk '{print $1}'| grep -F kdb. || true)
+      if [[ "x${MIOPENKERNELS}" = x ]]; then
+        echo "miopenkernels package not available" && exit 1
+      else
+        yum install -y ${MIOPENKERNELS}
+      fi
  fi

-  # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
-  for kdb in /opt/rocm/share/miopen/db/*.kdb
-  do
-      sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
-  done
-
  # Cleanup
  yum clean all
  rm -rf /var/cache/yum
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -7,7 +7,7 @@ git clone https://bitbucket.org/icl/magma.git
 pushd magma

 # Version 2.7.2 + ROCm related updates
-git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6
+git checkout 823531632140d0edcb7e77c3edc0e837421471c5

 cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
 echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -13,11 +13,8 @@ conda_reinstall() {
 }

 if [ -n "${ROCM_VERSION}" ]; then
-  TRITON_REPO="https://github.com/openai/triton"
+  TRITON_REPO="https://github.com/ROCmSoftwarePlatform/triton"
  TRITON_TEXT_FILE="triton-rocm"
-elif [ -n "${XPU_VERSION}" ]; then
-  TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
-  TRITON_TEXT_FILE="triton-xpu"
 else
  TRITON_REPO="https://github.com/openai/triton"
  TRITON_TEXT_FILE="triton"
@ -67,6 +64,5 @@ if [ -n "${CONDA_CMAKE}" ]; then
  # latest numpy version, which fails ASAN tests with the following import error: Numba
  # needs NumPy 1.20 or less.
  conda_reinstall cmake="${CMAKE_VERSION}"
-  # Note that we install numpy with pip as conda might not have the version we want
-  pip_install --force-reinstall numpy=="${NUMPY_VERSION}"
+  conda_reinstall numpy="${NUMPY_VERSION}"
 fi
--- a/.ci/docker/common/install_ucc.sh
+++ b/.ci/docker/common/install_ucc.sh
@ -36,12 +36,7 @@ function install_ucc() {
  git submodule update --init --recursive

  ./autogen.sh
-  # We only run distributed tests on Tesla M60 and A10G
-  NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
-  ./configure --prefix=$UCC_HOME          \
-    --with-ucx=$UCX_HOME                  \
-    --with-cuda=$with_cuda                \
-    --with-nvcc-gencode="${NVCC_GENCODE}"
+  ./configure --prefix=$UCC_HOME --with-ucx=$UCX_HOME --with-cuda=$with_cuda
  time make -j
  sudo make install

--- a/.ci/docker/common/install_vision.sh
+++ b/.ci/docker/common/install_vision.sh
@ -5,7 +5,8 @@ set -ex
 install_ubuntu() {
  apt-get update
  apt-get install -y --no-install-recommends \
-          libopencv-dev
+          libopencv-dev \
+          libavcodec-dev

  # Cleanup
  apt-get autoclean && apt-get clean
@ -18,7 +19,8 @@ install_centos() {
  yum --enablerepo=extras install -y epel-release

  yum install -y \
-      opencv-devel
+      opencv-devel \
+      ffmpeg-devel

  # Cleanup
  yum clean all
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -1,114 +0,0 @@
-#!/bin/bash
-set -xe
-
-
-# Intel® software for general purpose GPU capabilities.
-# Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
-
-# Users should update to the latest version as it becomes available
-
-function install_ubuntu() {
-    apt-get update -y
-    apt-get install -y gpg-agent wget
-
-    # Set up the repository. To do this, download the key to the system keyring
-    wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
-        | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
-    wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
-        | gpg --dearmor --output /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
-
-    # Add the signed entry to APT sources and configure the APT client to use the Intel repository
-    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \
-        https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
-        | tee /etc/apt/sources.list.d/intel-gpu-jammy.list
-    echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
-        https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
-        | tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
-
-    # Update the packages list and repository index
-    apt-get update
-
-    # The xpu-smi packages
-    apt-get install -y flex bison xpu-smi
-    # Compute and Media Runtimes
-    apt-get install -y \
-        intel-opencl-icd intel-level-zero-gpu level-zero \
-        intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
-        libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
-        libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
-        mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
-    # Development Packages
-    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
-    # Install Intel Support Packages
-    if [ -n "$XPU_VERSION" ]; then
-        apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION}
-    else
-        apt-get install -y intel-for-pytorch-gpu-dev
-    fi
-
-    # Cleanup
-    apt-get autoclean && apt-get clean
-    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-}
-
-function install_centos() {
-    dnf install -y 'dnf-command(config-manager)'
-    dnf config-manager --add-repo \
-        https://repositories.intel.com/gpu/rhel/8.6/production/2328/unified/intel-gpu-8.6.repo
-    # To add the EPEL repository needed for DKMS
-    dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
-        # https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
-
-    # Create the YUM repository file in the /temp directory as a normal user
-    tee > /tmp/oneAPI.repo << EOF
-[oneAPI]
-name=Intel® oneAPI repository
-baseurl=https://yum.repos.intel.com/oneapi
-enabled=1
-gpgcheck=1
-repo_gpgcheck=1
-gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-EOF
-
-    # Move the newly created oneAPI.repo file to the YUM configuration directory /etc/yum.repos.d
-    mv /tmp/oneAPI.repo /etc/yum.repos.d
-
-    # The xpu-smi packages
-    dnf install -y flex bison xpu-smi
-    # Compute and Media Runtimes
-    dnf install -y \
-        intel-opencl intel-media intel-mediasdk libmfxgen1 libvpl2\
-        level-zero intel-level-zero-gpu mesa-dri-drivers mesa-vulkan-drivers \
-        mesa-vdpau-drivers libdrm mesa-libEGL mesa-libgbm mesa-libGL \
-        mesa-libxatracker libvpl-tools intel-metrics-discovery \
-        intel-metrics-library intel-igc-core intel-igc-cm \
-        libva libva-utils intel-gmmlib libmetee intel-gsc intel-ocloc hwinfo clinfo
-    # Development packages
-    dnf install -y --refresh \
-        intel-igc-opencl-devel level-zero-devel intel-gsc-devel libmetee-devel \
-        level-zero-devel
-    # Install Intel® oneAPI Base Toolkit
-    dnf install intel-basekit -y
-
-    # Cleanup
-    dnf clean all
-    rm -rf /var/cache/yum
-    rm -rf /var/lib/yum/yumdb
-    rm -rf /var/lib/yum/history
-}
-
-
-# The installation depends on the base OS
-ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
-case "$ID" in
-    ubuntu)
-        install_ubuntu
-    ;;
-    centos)
-        install_centos
-    ;;
-    *)
-        echo "Unable to determine OS..."
-        exit 1
-    ;;
-esac
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -15,7 +15,7 @@ click
 #Pinned versions:
 #test that import:

-coremltools==5.0b5 ; python_version < "3.12"
+coremltools==5.0b5
 #Description: Apple framework for ML integration
 #Pinned versions: 5.0b5
 #test that import:
@ -25,11 +25,6 @@ coremltools==5.0b5 ; python_version < "3.12"
 #Pinned versions:
 #test that import:

-dill==0.3.7
-#Description: dill extends pickle with serializing and de-serializing for most built-ins
-#Pinned versions: 0.3.7
-#test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py
-
 expecttest==0.1.6
 #Description: method for writing tests where test framework auto populates
 # the expected output based on previous runs
@ -52,11 +47,6 @@ junitparser==2.1.1
 #Pinned versions: 2.1.1
 #test that import:

-lark==0.12.0
-#Description: parser
-#Pinned versions: 0.12.0
-#test that import:
-
 librosa>=0.6.2 ; python_version < "3.11"
 #Description: A python package for music and audio analysis
 #Pinned versions: >=0.6.2
@ -76,7 +66,7 @@ librosa>=0.6.2 ; python_version < "3.11"
 #Description: A testing library that allows you to replace parts of your
 #system under test with mock objects
 #Pinned versions:
-#test that import: test_modules.py, test_nn.py,
+#test that import: test_module_init.py, test_modules.py, test_nn.py,
 #test_testing.py

 #MonkeyType # breaks pytorch-xla-linux-bionic-py3.7-clang8
@ -85,10 +75,10 @@ librosa>=0.6.2 ; python_version < "3.11"
 #Pinned versions:
 #test that import:

-mypy==1.10.0
+mypy==1.7.0
 # Pin MyPy version because new errors are likely to appear with each release
 #Description: linter
-#Pinned versions: 1.10.0
+#Pinned versions: 1.7.0
 #test that import: test_typing.py, test_type_hints.py

 networkx==2.8.8
@ -134,9 +124,9 @@ opt-einsum==3.3
 #Pinned versions: 3.3
 #test that import: test_linalg.py

-optree==0.11.0
+optree==0.9.1
 #Description: A library for tree manipulation
-#Pinned versions: 0.11.0
+#Pinned versions: 0.9.1
 #test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
 #test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
 #common_utils.py, test_eager_transforms.py, test_python_dispatch.py,
@ -147,9 +137,9 @@ optree==0.11.0
 #test_pointwise_ops.py, test_dtensor_ops.py, test_torchinductor.py, test_fx.py,
 #test_fake_tensor.py, test_mps.py

-pillow==10.3.0
+pillow==10.0.1
 #Description:  Python Imaging Library fork
-#Pinned versions: 10.3.0
+#Pinned versions: 10.0.1
 #test that import:

 protobuf==3.20.2
@ -172,6 +162,11 @@ pytest-xdist==3.3.1
 #Pinned versions:
 #test that import:

+pytest-shard==0.1.2
+#Description: plugin for splitting up tests in pytest
+#Pinned versions:
+#test that import:
+
 pytest-flakefinder==1.1.0
 #Description: plugin for rerunning tests a fixed number of times in pytest
 #Pinned versions: 1.1.0
@ -228,11 +223,12 @@ scikit-image==0.20.0 ; python_version >= "3.10"
 #Pinned versions: 0.20.3
 #test that import:

-scipy==1.10.1 ; python_version <= "3.11"
-scipy==1.12.0 ; python_version == "3.12"
+scipy==1.6.3 ; python_version < "3.10"
+scipy==1.8.1 ; python_version == "3.10"
+scipy==1.10.1 ; python_version == "3.11"
 # Pin SciPy because of failing distribution tests (see #60347)
 #Description: scientific python
-#Pinned versions: 1.10.1
+#Pinned versions: 1.6.3
 #test that import: test_unary_ufuncs.py, test_torch.py,test_tensor_creation_ops.py
 #test_spectral_ops.py, test_sparse_csr.py, test_reductions.py,test_nn.py
 #test_linalg.py, test_binary_ufuncs.py
@ -247,8 +243,7 @@ tb-nightly==2.13.0a20230426
 #Pinned versions:
 #test that import:

-# needed by torchgen utils
-typing-extensions
+#typing-extensions
 #Description: type hints for python
 #Pinned versions:
 #test that import:
@ -263,10 +258,9 @@ unittest-xml-reporting<=3.2.0,>=2.0.0
 #Pinned versions:
 #test that import:

-#lintrunner is supported on aarch64-linux only from 0.12.4 version
-lintrunner==0.12.5
+lintrunner==0.10.7
 #Description: all about linters!
-#Pinned versions: 0.12.5
+#Pinned versions: 0.10.7
 #test that import:

 rockset==1.0.3
@ -274,14 +268,14 @@ rockset==1.0.3
 #Pinned versions: 1.0.3
 #test that import:

-ghstack==0.8.0
+ghstack==0.7.1
 #Description: ghstack tool
-#Pinned versions: 0.8.0
+#Pinned versions: 0.7.1
 #test that import:

-jinja2==3.1.4
+jinja2==3.1.2
 #Description: jinja2 template engine
-#Pinned versions: 3.1.4
+#Pinned versions: 3.1.2
 #test that import:

 pytest-cpp==2.3.0
@ -299,16 +293,8 @@ tensorboard==2.13.0
 #Pinned versions:
 #test that import: test_tensorboard

-pywavelets==1.4.1 ; python_version < "3.12"
-pywavelets==1.5.0 ; python_version >= "3.12"
+pywavelets==1.4.1
 #Description: This is a requirement of scikit-image, we need to pin
 # it here because 1.5.0 conflicts with numpy 1.21.2 used in CI
 #Pinned versions: 1.4.1
 #test that import:
-
-lxml==5.0.0.
-#Description: This is a requirement of unittest-xml-reporting
-
-# Python-3.9 binaries
-
-PyGithub==2.3.0
--- a/.ci/docker/triton_version.txt
+++ b/.ci/docker/triton_version.txt
@ -1 +1 @@
-3.0.0
+2.1.0
--- a/.ci/docker/ubuntu-cuda/Dockerfile
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@ -56,7 +56,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV
+# (optional) Install vision packages like OpenCV and ffmpeg
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -103,14 +103,6 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt

-ARG HALIDE
-# Build and install halide
-COPY ./common/install_halide.sh install_halide.sh
-COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/halide.txt halide.txt
-RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
-RUN rm install_halide.sh common_utils.sh halide.txt
-
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
@ -147,20 +139,13 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
 ARG CUDNN_VERSION
 ARG CUDA_VERSION
 COPY ./common/install_cudnn.sh install_cudnn.sh
-RUN if [ -n "${CUDNN_VERSION}" ]; then bash install_cudnn.sh; fi
+RUN if [ "${CUDNN_VERSION}" -eq 8 ]; then bash install_cudnn.sh; fi
 RUN rm install_cudnn.sh

-# Install CUSPARSELT
-ARG CUDA_VERSION
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
-RUN bash install_cusparselt.sh
-RUN rm install_cusparselt.sh
-
 # Delete /usr/local/cuda-11.X/cuda-11.X symlinks
 RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
 RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
 RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi
-RUN if [ -h /usr/local/cuda-12.4/cuda-12.4 ]; then rm /usr/local/cuda-12.4/cuda-12.4; fi

 USER jenkins
 CMD ["bash"]
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@ -53,7 +53,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV
+# (optional) Install vision packages like OpenCV and ffmpeg
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -78,11 +78,6 @@ ENV MAGMA_HOME /opt/rocm/magma
 ENV LANG C.UTF-8
 ENV LC_ALL C.UTF-8

-# Install amdsmi
-COPY ./common/install_amdsmi.sh install_amdsmi.sh
-RUN bash ./install_amdsmi.sh
-RUN rm install_amdsmi.sh
-
 # (optional) Install non-default CMake version
 ARG CMAKE_VERSION
 COPY ./common/install_cmake.sh install_cmake.sh
@ -105,13 +100,6 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt

-# Install AOTriton
-COPY ./aotriton_version.txt aotriton_version.txt
-COPY ./common/common_utils.sh common_utils.sh
-COPY ./common/install_aotriton.sh install_aotriton.sh
-RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
-ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
-
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@ -1,118 +0,0 @@
-ARG UBUNTU_VERSION
-
-FROM ubuntu:${UBUNTU_VERSION}
-
-ARG UBUNTU_VERSION
-
-ENV DEBIAN_FRONTEND noninteractive
-
-ARG CLANG_VERSION
-
-# Install common dependencies (so that this step can be cached separately)
-COPY ./common/install_base.sh install_base.sh
-RUN bash ./install_base.sh && rm install_base.sh
-
-# Install clang
-ARG LLVMDEV
-COPY ./common/install_clang.sh install_clang.sh
-RUN bash ./install_clang.sh && rm install_clang.sh
-
-# Install user
-COPY ./common/install_user.sh install_user.sh
-RUN bash ./install_user.sh && rm install_user.sh
-
-# Install katex
-ARG KATEX
-COPY ./common/install_docs_reqs.sh install_docs_reqs.sh
-RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
-
-# Install conda and other packages (e.g., numpy, pytest)
-ARG ANACONDA_PYTHON_VERSION
-ARG CONDA_CMAKE
-ARG DOCS
-ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
-ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
-ENV DOCS=$DOCS
-COPY requirements-ci.txt requirements-docs.txt /opt/conda/
-COPY ./common/install_conda.sh install_conda.sh
-COPY ./common/common_utils.sh common_utils.sh
-RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt
-
-# Install gcc
-ARG GCC_VERSION
-COPY ./common/install_gcc.sh install_gcc.sh
-RUN bash ./install_gcc.sh && rm install_gcc.sh
-
-# Install lcov for C++ code coverage
-COPY ./common/install_lcov.sh install_lcov.sh
-RUN  bash ./install_lcov.sh && rm install_lcov.sh
-
-COPY ./common/install_openssl.sh install_openssl.sh
-RUN bash ./install_openssl.sh
-ENV OPENSSL_ROOT_DIR /opt/openssl
-ENV OPENSSL_DIR /opt/openssl
-RUN rm install_openssl.sh
-
-ARG INDUCTOR_BENCHMARKS
-COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
-COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/huggingface.txt huggingface.txt
-COPY ci_commit_pins/timm.txt timm.txt
-RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
-
-# Install XPU Dependencies
-ARG XPU_VERSION
-COPY ./common/install_xpu.sh install_xpu.sh
-RUN bash ./install_xpu.sh && rm install_xpu.sh
-
-ARG TRITON
-# Install triton, this needs to be done before sccache because the latter will
-# try to reach out to S3, which docker build runners don't have access
-COPY ./common/install_triton.sh install_triton.sh
-COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/triton-xpu.txt triton-xpu.txt
-COPY triton_version.txt triton_version.txt
-RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
-RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt
-
-# (optional) Install database packages like LMDB and LevelDB
-ARG DB
-COPY ./common/install_db.sh install_db.sh
-RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
-RUN rm install_db.sh
-ENV INSTALLED_DB ${DB}
-
-# (optional) Install vision packages like OpenCV
-ARG VISION
-COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
-RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
-RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
-ENV INSTALLED_VISION ${VISION}
-
-# (optional) Install non-default CMake version
-ARG CMAKE_VERSION
-COPY ./common/install_cmake.sh install_cmake.sh
-RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
-RUN rm install_cmake.sh
-
-# (optional) Install non-default Ninja version
-ARG NINJA_VERSION
-COPY ./common/install_ninja.sh install_ninja.sh
-RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
-RUN rm install_ninja.sh
-
-# Install ccache/sccache (do this last, so we get priority in PATH)
-COPY ./common/install_cache.sh install_cache.sh
-ENV PATH /opt/cache/bin:$PATH
-RUN bash ./install_cache.sh && rm install_cache.sh
-
-# Include BUILD_ENVIRONMENT environment variable in image
-ARG BUILD_ENVIRONMENT
-ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
-
-# Install LLVM dev version (Defined in the pytorch/builder github repository)
-COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
-
-USER jenkins
-CMD ["bash"]
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -37,7 +37,6 @@ COPY requirements-ci.txt requirements-docs.txt /opt/conda/
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
 RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt
-RUN if [ -n "${UNINSTALL_DILL}" ]; then pip uninstall -y dill; fi

 # Install gcc
 ARG GCC_VERSION
@ -80,7 +79,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV
+# (optional) Install vision packages like OpenCV and ffmpeg
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -155,33 +154,16 @@ COPY ci_commit_pins/executorch.txt executorch.txt
 RUN if [ -n "${EXECUTORCH}" ]; then bash ./install_executorch.sh; fi
 RUN rm install_executorch.sh common_utils.sh executorch.txt

-ARG HALIDE
-# Build and install halide
-COPY ./common/install_halide.sh install_halide.sh
-COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/halide.txt halide.txt
-RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
-RUN rm install_halide.sh common_utils.sh halide.txt
-
 ARG ONNX
 # Install ONNX dependencies
 COPY ./common/install_onnx.sh ./common/common_utils.sh ./
 RUN if [ -n "${ONNX}" ]; then bash ./install_onnx.sh; fi
 RUN rm install_onnx.sh common_utils.sh

-# (optional) Build ACL
-ARG ACL
-COPY ./common/install_acl.sh install_acl.sh
-RUN if [ -n "${ACL}" ]; then bash ./install_acl.sh; fi
-RUN rm install_acl.sh
-ENV INSTALLED_ACL ${ACL}
-
 # Install ccache/sccache (do this last, so we get priority in PATH)
-ARG SKIP_SCCACHE_INSTALL
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
-RUN if [ -z "${SKIP_SCCACHE_INSTALL}" ]; then bash ./install_cache.sh; fi
-RUN rm install_cache.sh
+RUN bash ./install_cache.sh && rm install_cache.sh

 # Add jni.h for java host build
 COPY ./common/install_jni.sh install_jni.sh
@ -198,9 +180,7 @@ ARG BUILD_ENVIRONMENT
 ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}

 # Install LLVM dev version (Defined in the pytorch/builder github repository)
-ARG SKIP_LLVM_SRC_BUILD_INSTALL
 COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
-RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi

 # AWS specific CUDA build guidance
 ENV TORCH_CUDA_ARCH_LIST Maxwell
--- a/.ci/onnx/common.sh
+++ b/.ci/onnx/common.sh
@ -1,9 +1,5 @@
-#!/bin/bash
-
 set -ex

-source "$(dirname "${BASH_SOURCE[0]}")/../pytorch/common_utils.sh"
-
 LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)
 TEST_DIR="$ROOT_DIR/test"
--- a/.ci/onnx/test.sh
+++ b/.ci/onnx/test.sh
@ -3,20 +3,6 @@
 # shellcheck source=./common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common.sh"

-# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
-WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
-cleanup_workspace() {
-  echo "sudo may print the following warning message that can be ignored. The chown command will still run."
-  echo "    sudo: setrlimit(RLIMIT_STACK): Operation not permitted"
-  echo "For more details refer to https://github.com/sudo-project/sudo/issues/42"
-  sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace
-}
-# Disable shellcheck SC2064 as we want to parse the original owner immediately.
-# shellcheck disable=SC2064
-trap_add cleanup_workspace EXIT
-sudo chown -R jenkins /var/lib/jenkins/workspace
-git config --global --add safe.directory /var/lib/jenkins/workspace
-
 if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
  # TODO: This can be removed later once vision is also part of the Docker image
  pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -28,8 +28,6 @@ echo "Environment variables:"
 env

 if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
-  # Use jemalloc during compilation to mitigate https://github.com/pytorch/pytorch/issues/116289
-  export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
  echo "NVCC version:"
  nvcc --version
 fi
@ -44,7 +42,15 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
  fi
 fi

-if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
+if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* ]]; then
+  echo "Caffe2 build is ON"
+  export BUILD_CAFFE2=ON
+fi
+
+if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
+  export ATEN_THREADING=TBB
+  export USE_TBB=1
+elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
  export ATEN_THREADING=NATIVE
 fi

@ -73,35 +79,7 @@ if ! which conda; then
    export USE_MKLDNN=0
  fi
 else
-  # CMAKE_PREFIX_PATH precedences
-  # 1. $CONDA_PREFIX, if defined. This follows the pytorch official build instructions.
-  # 2. /opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}, if ANACONDA_PYTHON_VERSION defined.
-  #    This is for CI, which defines ANACONDA_PYTHON_VERSION but not CONDA_PREFIX.
-  # 3. $(conda info --base). The fallback value of pytorch official build
-  #    instructions actually refers to this.
-  #    Commonly this is /opt/conda/
-  if [[ -v CONDA_PREFIX ]]; then
-    export CMAKE_PREFIX_PATH=${CONDA_PREFIX}
-  elif [[ -v ANACONDA_PYTHON_VERSION ]]; then
-    export CMAKE_PREFIX_PATH="/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}"
-  else
-    # already checked by `! which conda`
-    CMAKE_PREFIX_PATH="$(conda info --base)"
-    export CMAKE_PREFIX_PATH
-  fi
-
-  # Workaround required for MKL library linkage
-  # https://github.com/pytorch/pytorch/issues/119557
-  if [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then
-    export CMAKE_LIBRARY_PATH="/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/lib/"
-    export CMAKE_INCLUDE_PATH="/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/include/"
-  fi
-fi
-
-if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
-  export USE_MKLDNN=1
-  export USE_MKLDNN_ACL=1
-  export ACL_ROOT_DIR=/ComputeLibrary
+  export CMAKE_PREFIX_PATH=/opt/conda
 fi

 if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then
@ -173,12 +151,6 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
  python tools/amd_build/build_amd.py
 fi

-if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/compiler/latest/env/vars.sh
-  export USE_XPU=1
-fi
-
 # sccache will fail for CUDA builds if all cores are used for compiling
 # gcc 7 with sccache seems to have intermittent OOM issue if all cores are used
 if [ -z "$MAX_JOBS" ]; then
@ -230,24 +202,6 @@ if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]
  export BUILD_STATIC_RUNTIME_BENCHMARK=ON
 fi

-# Do not change workspace permissions for ROCm CI jobs
-# as it can leave workspace with bad permissions for cancelled jobs
-if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
-  # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
-  WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
-  cleanup_workspace() {
-    echo "sudo may print the following warning message that can be ignored. The chown command will still run."
-    echo "    sudo: setrlimit(RLIMIT_STACK): Operation not permitted"
-    echo "For more details refer to https://github.com/sudo-project/sudo/issues/42"
-    sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace
-  }
-  # Disable shellcheck SC2064 as we want to parse the original owner immediately.
-  # shellcheck disable=SC2064
-  trap_add cleanup_workspace EXIT
-  sudo chown -R jenkins /var/lib/jenkins/workspace
-  git config --global --add safe.directory /var/lib/jenkins/workspace
-fi
-
 if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
  set -e

@ -273,22 +227,15 @@ else
  ( ! get_exit_code python setup.py clean bad_argument )

  if [[ "$BUILD_ENVIRONMENT" != *libtorch* ]]; then
+
    # rocm builds fail when WERROR=1
    # XLA test build fails when WERROR=1
    # set only when building other architectures
    # or building non-XLA tests.
    if [[ "$BUILD_ENVIRONMENT" != *rocm*  &&
          "$BUILD_ENVIRONMENT" != *xla* ]]; then
-      if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
-        # Install numpy-2.0 release candidate for builds
-        # Which should be backward compatible with Numpy-1.X
-        python -mpip install --pre numpy==2.0.0rc1
-      fi
      WERROR=1 python setup.py bdist_wheel
    else
-      if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
-        source .ci/pytorch/install_cache_xla.sh
-      fi
      python setup.py bdist_wheel
    fi
    pip_install_whl "$(echo dist/*.whl)"
@ -330,7 +277,7 @@ else
    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
    mkdir -p "$CUSTOM_OP_BUILD"
    pushd "$CUSTOM_OP_BUILD"
-    cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
+    cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
@ -343,7 +290,7 @@ else
    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
    mkdir -p "$JIT_HOOK_BUILD"
    pushd "$JIT_HOOK_BUILD"
-    cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
+    cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
@ -355,7 +302,7 @@ else
    python --version
    mkdir -p "$CUSTOM_BACKEND_BUILD"
    pushd "$CUSTOM_BACKEND_BUILD"
-    cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
+    cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
@ -386,8 +333,4 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
  python tools/stats/export_test_times.py
 fi

-# snadampal: skipping it till sccache support added for aarch64
-# https://github.com/pytorch/pytorch/issues/121559
-if [[ "$BUILD_ENVIRONMENT" != *aarch64* ]]; then
-  print_sccache_stats
-fi
+print_sccache_stats
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -158,11 +158,6 @@ function install_torchvision() {
  fi
 }

-function install_tlparse() {
-  pip_install --user "tlparse==0.3.7"
-  PATH="$(python -m site --user-base)/bin:$PATH"
-}
-
 function install_torchrec_and_fbgemm() {
  local torchrec_commit
  torchrec_commit=$(get_pinned_commit torchrec)
--- a/.ci/pytorch/docs-test.sh
+++ b/.ci/pytorch/docs-test.sh
@ -6,4 +6,4 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
 echo "Testing pytorch docs"

 cd docs
-TERM=vt100 make doctest
+make doctest
--- a/.ci/pytorch/install_cache_xla.sh
+++ b/.ci/pytorch/install_cache_xla.sh
@ -1,37 +0,0 @@
-#!/bin/bash
-
-# Script for installing sccache on the xla build job, which uses xla's docker
-# image and doesn't have sccache installed on it.  This is mostly copied from
-# .ci/docker/install_cache.sh.  Changes are: removing checks that will always
-# return the same thing, ex checks for for rocm, CUDA, and changing the path
-# where sccache is installed, and not changing /etc/environment.
-
-set -ex
-
-install_binary() {
-  echo "Downloading sccache binary from S3 repo"
-  curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache
-}
-
-mkdir -p /tmp/cache/bin
-mkdir -p /tmp/cache/lib
-export PATH="/tmp/cache/bin:$PATH"
-
-install_binary
-chmod a+x /tmp/cache/bin/sccache
-
-function write_sccache_stub() {
-  # Unset LD_PRELOAD for ps because of asan + ps issues
-  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
-  # shellcheck disable=SC2086
-  # shellcheck disable=SC2059
-  printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n  exec sccache $(which $1) \"\$@\"\nelse\n  exec $(which $1) \"\$@\"\nfi" > "/tmp/cache/bin/$1"
-  chmod a+x "/tmp/cache/bin/$1"
-}
-
-write_sccache_stub cc
-write_sccache_stub c++
-write_sccache_stub gcc
-write_sccache_stub g++
-write_sccache_stub clang
-write_sccache_stub clang++
--- a/.ci/pytorch/macos-common.sh
+++ b/.ci/pytorch/macos-common.sh
@ -9,7 +9,7 @@ sysctl -a | grep machdep.cpu

 # These are required for both the build job and the test job.
 # In the latter to test cpp extensions.
-export MACOSX_DEPLOYMENT_TARGET=11.1
+export MACOSX_DEPLOYMENT_TARGET=11.0
 export CXX=clang++
 export CC=clang

--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -149,8 +149,6 @@ test_jit_hooks() {
  assert_git_not_dirty
 }

-install_tlparse
-
 if [[ $NUM_TEST_SHARDS -gt 1 ]]; then
  test_python_shard "${SHARD_NUMBER}"
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
--- a/.ci/pytorch/multigpu-test.sh
+++ b/.ci/pytorch/multigpu-test.sh
@ -18,7 +18,6 @@ time python test/run_test.py --verbose -i distributed/test_c10d_gloo
 time python test/run_test.py --verbose -i distributed/test_c10d_nccl
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
-time python test/run_test.py --verbose -i distributed/test_cuda_p2p
 time python test/run_test.py --verbose -i distributed/test_store
 time python test/run_test.py --verbose -i distributed/test_pg_wrapper
 time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent
@ -35,6 +34,7 @@ time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/test
 # functional collective tests
 time python test/run_test.py --verbose -i distributed/test_functional_api

+
 # DTensor tests
 time python test/run_test.py --verbose -i distributed/_tensor/test_random_ops
 time python test/run_test.py --verbose -i distributed/_tensor/test_dtensor_compile
@ -46,17 +46,9 @@ time python test/run_test.py --verbose -i distributed/test_device_mesh
 time python test/run_test.py --verbose -i distributed/tensor/parallel/test_ddp_2d_parallel
 time python test/run_test.py --verbose -i distributed/tensor/parallel/test_fsdp_2d_parallel
 time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples
-time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state
-
-# FSDP2 tests
-time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh
-
-# Pipelining composability tests
-time python test/run_test.py --verbose -i distributed/pipelining/test_composability.py

 # Other tests
 time python test/run_test.py --verbose -i test_cuda_primary_ctx
-time python test/run_test.py --verbose -i test_optim -- -k test_forloop_goes_right_direction_multigpu
-time python test/run_test.py --verbose -i test_optim -- -k test_mixed_device_dtype
+time python test/run_test.py --verbose -i test_optim -- -k optimizers_with_varying_tensors
 time python test/run_test.py --verbose -i test_foreach -- -k test_tensors_grouping
 assert_git_not_dirty
--- a/.ci/pytorch/perf_test/compare_with_baseline.py
+++ b/.ci/pytorch/perf_test/compare_with_baseline.py
@ -59,16 +59,16 @@ print("sample mean: ", sample_mean)
 print("sample sigma: ", sample_sigma)

 if math.isnan(sample_mean):
-    raise Exception("""Error: sample mean is NaN""")  # noqa: TRY002
+    raise Exception("""Error: sample mean is NaN""")
 elif math.isnan(sample_sigma):
-    raise Exception("""Error: sample sigma is NaN""")  # noqa: TRY002
+    raise Exception("""Error: sample sigma is NaN""")

 z_value = (sample_mean - mean) / sigma

 print("z-value: ", z_value)

 if z_value >= 3:
-    raise Exception(  # noqa: TRY002
+    raise Exception(
        f"""\n
 z-value >= 3, there is high chance of perf regression.\n
 To reproduce this regression, run
--- a/.ci/pytorch/python_doc_push_script.sh
+++ b/.ci/pytorch/python_doc_push_script.sh
@ -26,8 +26,8 @@ echo "error: python_doc_push_script.sh: version (arg2) not specified"
 fi

 # Argument 1: Where to copy the built documentation to
-# (pytorch_docs/$install_path)
-install_path="${1:-${DOCS_INSTALL_PATH:-${DOCS_VERSION}}}"
+# (pytorch.github.io/$install_path)
+install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
 if [ -z "$install_path" ]; then
 echo "error: python_doc_push_script.sh: install_path (arg1) not specified"
  exit 1
@ -68,8 +68,8 @@ build_docs () {
 }


-git clone https://github.com/pytorch/docs pytorch_docs -b "$branch" --depth 1
-pushd pytorch_docs
+git clone https://github.com/pytorch/pytorch.github.io -b "$branch" --depth 1
+pushd pytorch.github.io

 export LC_ALL=C
 export PATH=/opt/conda/bin:$PATH
@ -105,7 +105,6 @@ if [ "$is_main_doc" = true ]; then
    echo undocumented objects found:
    cat build/coverage/python.txt
    echo "Make sure you've updated relevant .rsts in docs/source!"
-    echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
    exit 1
  fi
 else
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -6,27 +6,6 @@

 set -ex

-# shellcheck source=./common.sh
-source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
-
-# Do not change workspace permissions for ROCm CI jobs
-# as it can leave workspace with bad permissions for cancelled jobs
-if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
-  # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
-  WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
-  cleanup_workspace() {
-    echo "sudo may print the following warning message that can be ignored. The chown command will still run."
-    echo "    sudo: setrlimit(RLIMIT_STACK): Operation not permitted"
-    echo "For more details refer to https://github.com/sudo-project/sudo/issues/42"
-    sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace
-  }
-  # Disable shellcheck SC2064 as we want to parse the original owner immediately.
-  # shellcheck disable=SC2064
-  trap_add cleanup_workspace EXIT
-  sudo chown -R jenkins /var/lib/jenkins/workspace
-  git config --global --add safe.directory /var/lib/jenkins/workspace
-fi
-
 echo "Environment variables:"
 env

@ -39,10 +18,6 @@ BUILD_DIR="build"
 BUILD_RENAMED_DIR="build_renamed"
 BUILD_BIN_DIR="$BUILD_DIR"/bin

-#Set Default values for these variables in case they are not set
-SHARD_NUMBER="${SHARD_NUMBER:=1}"
-NUM_TEST_SHARDS="${NUM_TEST_SHARDS:=1}"
-
 export VALGRIND=ON
 # export TORCH_INDUCTOR_INSTALL_GXX=ON
 if [[ "$BUILD_ENVIRONMENT" == *clang9* ]]; then
@ -111,6 +86,9 @@ if [[ -n $TESTS_TO_INCLUDE ]]; then
  INCLUDE_CLAUSE="--include $TESTS_TO_INCLUDE"
 fi

+# shellcheck source=./common.sh
+source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
+
 echo "Environment variables"
 env

@ -146,10 +124,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then
  # mainly used so that we're not spending extra cycles testing cpu
  # devices on expensive gpu machines
  export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"
-elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
-  export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
-  # setting PYTHON_TEST_EXTRA_OPTION
-  export PYTHON_TEST_EXTRA_OPTION="--xpu"
 fi

 if [[ "$TEST_CONFIG" == *crossref* ]]; then
@ -157,22 +131,11 @@ if [[ "$TEST_CONFIG" == *crossref* ]]; then
 fi

 if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
-  # regression in ROCm 6.0 on MI50 CI runners due to hipblaslt; remove in 6.1
-  export VALGRIND=OFF
  # Print GPU info
  rocminfo
  rocminfo | grep -E 'Name:.*\sgfx|Marketing'
 fi

-if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
-  # Source Intel oneAPI envrioment script to enable xpu runtime related libraries
-  # refer to https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/2024-0/use-the-setvars-and-oneapi-vars-scripts-with-linux.html
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/compiler/latest/env/vars.sh
-  # Check XPU status before testing
-  xpu-smi discovery
-fi
-
 if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
  # JIT C++ extensions require ninja.
  pip_install --user "ninja==1.10.2"
@ -181,13 +144,6 @@ if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
  export PATH="$HOME/.local/bin:$PATH"
 fi

-if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
-  # TODO: revisit this once the CI is stabilized on aarch64 linux
-  export VALGRIND=OFF
-fi
-
-install_tlparse
-
 # DANGER WILL ROBINSON.  The LD_PRELOAD here could cause you problems
 # if you're not careful.  Check this if you made some changes and the
 # ASAN test is not working
@ -234,6 +190,8 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
    export LD_PRELOAD=/usr/lib/llvm-15/lib/clang/15.0.7/lib/linux/libclang_rt.asan-x86_64.so
    # Disable valgrind for asan
    export VALGRIND=OFF
+    # Increase stack size, because ASAN red zones use more stack
+    ulimit -s 81920

    (cd test && python -c "import torch; print(torch.__version__, torch.version.git_version)")
    echo "The next four invocations are expected to crash; if they don't that means ASAN/UBSAN is misconfigured"
@ -264,18 +222,6 @@ elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then
  export ATEN_CPU_CAPABILITY=avx2
 fi

-# temp workarounds for https://github.com/pytorch/pytorch/issues/126692, remove when fixed
-if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
-  pushd test
-  CUDA_VERSION=$(python -c "import torch; print(torch.version.cuda)")
-  if [ "$CUDA_VERSION" == "12.4" ]; then
-    ISCUDA124="cu124"
-  else
-    ISCUDA124=""
-  fi
-  popd
-fi
-
 test_python_legacy_jit() {
  time python test/run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose
  assert_git_not_dirty
@ -289,14 +235,14 @@ test_python_shard() {

  # Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly
  # shellcheck disable=SC2086
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose

  assert_git_not_dirty
 }

 test_python() {
  # shellcheck disable=SC2086
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose
  assert_git_not_dirty
 }

@ -307,13 +253,33 @@ test_dynamo_shard() {
    exit 1
  fi
  python tools/dynamo/verify_dynamo.py
-  # PLEASE DO NOT ADD ADDITIONAL EXCLUDES HERE.
-  # Instead, use @skipIfTorchDynamo on your tests.
+  # Temporarily disable test_fx for dynamo pending the investigation on TTS
+  # regression in https://github.com/pytorch/torchdynamo/issues/784
  time python test/run_test.py --dynamo \
-    --exclude-inductor-tests \
    --exclude-jit-executor \
    --exclude-distributed-tests \
-    --exclude-torch-export-tests \
+    --exclude \
+      test_autograd \
+      test_jit \
+      test_proxy_tensor \
+      test_quantization \
+      test_public_bindings \
+      test_dataloader \
+      test_reductions \
+      test_namedtensor \
+      test_namedtuple_return_api \
+      profiler/test_profiler \
+      profiler/test_profiler_tree \
+      test_overrides \
+      test_python_dispatch \
+      test_fx \
+      test_package \
+      test_legacy_vmap \
+      test_custom_ops \
+      test_content_store \
+      export/test_db \
+      functorch/test_dims \
+      functorch/test_aotdispatch \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose
  assert_git_not_dirty
@ -322,24 +288,11 @@ test_dynamo_shard() {
 test_inductor_distributed() {
  # Smuggle a few multi-gpu tests here so that we don't have to request another large node
  echo "Testing multi_gpu tests in test_torchinductor"
-  python test/run_test.py -i inductor/test_torchinductor.py -k test_multi_gpu --verbose
-  python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_cuda_device --verbose
-  python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose
-  python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
-  python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
-  python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
-  python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose
+  pytest test/inductor/test_torchinductor.py -k test_multi_gpu
+  pytest test/inductor/test_aot_inductor.py -k test_non_default_cuda_device
+  pytest test/inductor/test_aot_inductor.py -k test_replicate_on_devices
+  pytest test/distributed/_tensor/test_dtensor_compile.py
+  pytest test/distributed/tensor/parallel/test_fsdp_2d_parallel.py

  # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
  # with if required # gpus aren't available
@ -351,32 +304,12 @@ test_inductor() {
  python tools/dynamo/verify_dynamo.py
  python test/run_test.py --inductor --include test_modules test_ops test_ops_gradients test_torch --verbose
  # Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state
-  python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor --verbose
+  python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo --verbose

  # docker build uses bdist_wheel which does not work with test_aot_inductor
  # TODO: need a faster way to build
-  if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
-      BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
-      CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
-  fi
-}
-
-test_inductor_cpp_wrapper_abi_compatible() {
-  export TORCHINDUCTOR_ABI_COMPATIBLE=1
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-
-  echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
-  # cpu stack allocation causes segfault and needs more investigation
-  PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper
-  python test/run_test.py --include inductor/test_cuda_cpp_wrapper
-
-  TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
-    --training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
-    --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
-  python benchmarks/dynamo/check_accuracy.py \
-    --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
-    --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_timm_training.csv"
+  BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
+  CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aot_inductor
 }

 # "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@ -456,8 +389,8 @@ test_perf_for_dashboard() {
            --output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_cuda_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]] && [[ "$mode" == "inference" ]]; then
-        TORCHINDUCTOR_CPP_WRAPPER=1 python "benchmarks/dynamo/$suite.py" \
-            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
+        python "benchmarks/dynamo/$suite.py" \
+            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs --cpp-wrapper "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_cpp_wrapper_${suite}_${dtype}_${mode}_cuda_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *freezing_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
@ -471,7 +404,7 @@ test_perf_for_dashboard() {
            --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then
-        TORCHINDUCTOR_ABI_COMPATIBLE=1 python "benchmarks/dynamo/$suite.py" \
+        python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_aot_inductor_${suite}_${dtype}_${mode}_cuda_${target}.csv"
      fi
@ -480,17 +413,6 @@ test_perf_for_dashboard() {
            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_max_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv"
      fi
-      if [[ "$DASHBOARD_TAG" == *cudagraphs_low_precision-true* ]] && [[ "$mode" == "inference" ]]; then
-        # TODO: This has a new dtype called quant and the benchmarks script needs to be updated to support this.
-        # The tentative command is as follows. It doesn't work now, but it's ok because we only need mock data
-        # to fill the dashboard.
-        python "benchmarks/dynamo/$suite.py" \
-          "${target_flag[@]}" --"$mode" --quant --backend "$backend" "$@" \
-          --output "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_cuda_${target}.csv" || true
-        # Copy cudagraph results as mock data, easiest choice?
-        cp "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_cuda_${target}.csv" \
-          "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_cuda_${target}.csv"
-      fi
    done
  done
 }
@ -526,11 +448,6 @@ test_single_dynamo_benchmark() {
    test_perf_for_dashboard "$suite" \
      "${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}"
  else
-    if [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
-      # Test AOTInductor with the ABI-compatible mode on CI
-      # This can be removed once the ABI-compatible mode becomes default.
-      export TORCHINDUCTOR_ABI_COMPATIBLE=1
-    fi
    python "benchmarks/dynamo/$suite.py" \
      --ci --accuracy --timing --explain \
      "${DYNAMO_BENCHMARK_FLAGS[@]}" \
@ -538,23 +455,13 @@ test_single_dynamo_benchmark() {
      --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
    python benchmarks/dynamo/check_accuracy.py \
      --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/${TEST_CONFIG}_${name}.csv"
+      --expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
    python benchmarks/dynamo/check_graph_breaks.py \
      --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/${TEST_CONFIG}_${name}.csv"
+      --expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
  fi
 }

-test_inductor_micro_benchmark() {
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
-}
-
-test_inductor_halide() {
-  python test/run_test.py --include inductor/test_halide.py --verbose
-  assert_git_not_dirty
-}
-
 test_dynamo_benchmark() {
  # Usage: test_dynamo_benchmark huggingface 0
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
@ -570,11 +477,7 @@ test_dynamo_benchmark() {
    test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
  else
    if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
-      if [[ "${TEST_CONFIG}" == *freezing* ]]; then
-        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 --freezing "$@"
-      else
-        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
-      fi
+      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
    elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
    else
@ -588,37 +491,20 @@ test_inductor_torchbench_smoketest_perf() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

-  # Test some models in the cpp wrapper mode
-  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-    --bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-    --bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-    --bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-  python benchmarks/dynamo/check_accuracy.py \
-    --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
-    --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_torchbench_inference.csv"
-
  python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
    --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
    --output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
  # The threshold value needs to be actively maintained to make this check useful
  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4

-  TORCHINDUCTOR_ABI_COMPATIBLE=1 python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \
+  python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \
    --export-aot-inductor --only nanogpt --output "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv"
  # The threshold value needs to be actively maintained to make this check useful
  # The perf number of nanogpt seems not very stable, e.g.
  # https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314,
  # and thus we lower its threshold to reduce flakiness. If this continues to be a problem,
  # we switch to use some other model.
-  # Use 4.7 for cuda 12.4, change back to 4.9 after fixing https://github.com/pytorch/pytorch/issues/126692
-  if [ "$CUDA_VERSION" == "12.4" ]; then
-    THRESHOLD=4.7
-  else
-    THRESHOLD=4.9
-  fi
-  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t $THRESHOLD
+  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.9

  # Check memory compression ratio for a few models
  for test in hf_Albert timm_vision_transformer; do
@ -630,65 +516,6 @@ test_inductor_torchbench_smoketest_perf() {
      "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \
      --expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
  done
-
-  # Perform some "warm-start" runs for a few huggingface models.
-  for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
-    python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
-      --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
-    python benchmarks/dynamo/check_accuracy.py \
-      --actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_huggingface_training.csv"
-  done
-}
-
-test_inductor_torchbench_cpu_smoketest_perf(){
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-
-  #set jemalloc
-  JEMALLOC_LIB="/usr/lib/x86_64-linux-gnu/libjemalloc.so.2"
-  IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
-  export LD_PRELOAD="$JEMALLOC_LIB":"$IOMP_LIB":"$LD_PRELOAD"
-  export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
-  export KMP_AFFINITY=granularity=fine,compact,1,0
-  export KMP_BLOCKTIME=1
-  CORES=$(lscpu | grep Core | awk '{print $4}')
-  export OMP_NUM_THREADS=$CORES
-  end_core=$(( CORES-1 ))
-
-  MODELS_SPEEDUP_TARGET=benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv
-
-  grep -v '^ *#' < "$MODELS_SPEEDUP_TARGET" | while IFS=',' read -r -a model_cfg
-  do
-    local model_name=${model_cfg[0]}
-    local data_type=${model_cfg[1]}
-    local speedup_target=${model_cfg[4]}
-    if [[ ${model_cfg[3]} == "cpp" ]]; then
-      export TORCHINDUCTOR_CPP_WRAPPER=1
-    else
-      unset TORCHINDUCTOR_CPP_WRAPPER
-    fi
-    local output_name="$TEST_REPORTS_DIR/inductor_inference_${model_cfg[0]}_${model_cfg[1]}_${model_cfg[2]}_${model_cfg[3]}_cpu_smoketest.csv"
-
-    if [[ ${model_cfg[2]} == "dynamic" ]]; then
-      taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \
-        --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" --dynamic-shapes \
-        --dynamic-batch-only --freezing --timeout 9000 --backend=inductor --output "$output_name"
-    else
-      taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \
-        --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" \
-        --freezing --timeout 9000 --backend=inductor --output "$output_name"
-    fi
-    cat "$output_name"
-    # The threshold value needs to be actively maintained to make this check useful.
-    python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target"
-  done
-}
-
-test_torchbench_gcp_smoketest(){
-  pushd "${TORCHBENCHPATH}"
-  python test.py -v
-  popd
 }

 test_python_gloo_with_tls() {
@ -722,6 +549,7 @@ test_aten() {
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libmkldnn* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libnccl* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libtorch* "$TEST_BASE_DIR"
+  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libtbb* "$TEST_BASE_DIR"

  ls "$TEST_BASE_DIR"
  aten/tools/run_tests.sh "$TEST_BASE_DIR"
@ -746,6 +574,21 @@ test_without_numpy() {
  popd
 }

+# pytorch extensions require including torch/extension.h which includes all.h
+# which includes utils.h which includes Parallel.h.
+# So you can call for instance parallel_for() from your extension,
+# but the compilation will fail because of Parallel.h has only declarations
+# and definitions are conditionally included Parallel.h(see last lines of Parallel.h).
+# I tried to solve it #39612 and #39881 by including Config.h into Parallel.h
+# But if Pytorch is built with TBB it provides Config.h
+# that has AT_PARALLEL_NATIVE_TBB=1(see #3961 or #39881) and it means that if you include
+# torch/extension.h which transitively includes Parallel.h
+# which transitively includes tbb.h which is not available!
+if [[ "${BUILD_ENVIRONMENT}" == *tbb* ]]; then
+  sudo mkdir -p /usr/include/tbb
+  sudo cp -r "$PWD"/third_party/tbb/include/tbb/* /usr/include/tbb
+fi
+
 test_libtorch() {
  local SHARD="$1"

@ -759,6 +602,7 @@ test_libtorch() {
    ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
+    ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libnvfuser* "$TORCH_BIN_DIR"

    export CPP_TESTS_DIR="${TORCH_BIN_DIR}"
@ -820,19 +664,6 @@ test_libtorch_api() {
  fi
 }

-test_xpu_bin(){
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-
-  for xpu_case in "${BUILD_BIN_DIR}"/*{xpu,sycl}*; do
-    if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then
-      case_name=$(basename "$xpu_case")
-      echo "Testing ${case_name} ..."
-      "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml
-    fi
-  done
-}
-
 test_aot_compilation() {
  echo "Testing Ahead of Time compilation"
  ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
@ -895,6 +726,7 @@ test_rpc() {
  # test reporting process to function as expected.
  ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
  ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
+  ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"

  CPP_TESTS_DIR="${TORCH_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_cpp_rpc
 }
@ -1072,8 +904,7 @@ test_bazel() {

    tools/bazel test --config=cpu-only --test_timeout=480 --test_output=all --test_tag_filters=-gpu-required --test_filter=-*CUDA :all_tests
  else
-    # Increase the test timeout to 480 like CPU tests because modules_test frequently timeout
-    tools/bazel test --test_timeout=480 --test_output=errors \
+    tools/bazel test --test_output=errors \
      //:any_test \
      //:autograd_test \
      //:dataloader_test \
@ -1168,17 +999,14 @@ test_docs_test() {
 }

 test_executorch() {
-  echo "Install torchvision and torchaudio"
-  install_torchvision
-  install_torchaudio
-
  pushd /executorch

-  # NB: We need to build ExecuTorch runner here and not inside the Docker image
-  # because it depends on PyTorch
-  # shellcheck disable=SC1091
-  source .ci/scripts/utils.sh
-  build_executorch_runner "cmake"
+  echo "Install torchvision and torchaudio"
+  # TODO(huydhn): Switch this to the pinned commits on ExecuTorch once they are
+  # there.  These libraries need to be built here, and not part of the Docker
+  # image because they require the target version of torch to be installed first
+  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git"
+  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git"

  echo "Run ExecuTorch regression tests for some models"
  # NB: This is a sample model, more can be added here
@ -1196,33 +1024,11 @@ test_executorch() {
  assert_git_not_dirty
 }

-test_linux_aarch64(){
-  python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
-       test_transformers test_multiprocessing test_numpy_interop --verbose
-
-  # Dynamo tests
-  python test/run_test.py --include dynamo/test_compile dynamo/test_backends dynamo/test_comptime dynamo/test_config \
-       dynamo/test_functions dynamo/test_fx_passes_pre_grad dynamo/test_interop dynamo/test_model_output dynamo/test_modules \
-       dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles --verbose
-
-  # Inductor tests
-  python test/run_test.py --include inductor/test_torchinductor inductor/test_benchmark_fusion inductor/test_codecache \
-       inductor/test_config inductor/test_control_flow inductor/test_coordinate_descent_tuner inductor/test_fx_fusion \
-       inductor/test_group_batch_fusion inductor/test_inductor_freezing inductor/test_inductor_utils \
-       inductor/test_inplacing_pass inductor/test_kernel_benchmark inductor/test_layout_optim \
-       inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \
-       inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
-       inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \
-       inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes --verbose
-}
-
 if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
  (cd test && python -c "import torch; print(torch.__config__.show())")
  (cd test && python -c "import torch; print(torch.__config__.parallel_info())")
 fi
-if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
-  test_linux_aarch64
-elif [[ "${TEST_CONFIG}" == *backward* ]]; then
+if [[ "${TEST_CONFIG}" == *backward* ]]; then
  test_forward_backward_compatibility
  # Do NOT add tests after bc check tests, see its comment.
 elif [[ "${TEST_CONFIG}" == *xla* ]]; then
@ -1247,10 +1053,6 @@ elif [[ "$TEST_CONFIG" == deploy ]]; then
  test_torch_deploy
 elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
-elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
-  test_inductor_halide
-elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
-  test_inductor_micro_benchmark
 elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
  install_torchvision
  id=$((SHARD_NUMBER-1))
@ -1273,14 +1075,6 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
  if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
    checkout_install_torchbench hf_Bert hf_Albert nanogpt timm_vision_transformer
    PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
-  elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
-    checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_gcn \
-      llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
-      shufflenet_v2_x1_0 hf_GPT2
-    PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
-  elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
-    checkout_install_torchbench
-    TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest
  else
    checkout_install_torchbench
    # Do this after checkout_install_torchbench to ensure we clobber any
@ -1290,33 +1084,24 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
    fi
    PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
  fi
-elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then
-  install_torchvision
-  test_inductor_cpp_wrapper_abi_compatible
 elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
  install_torchvision
  test_inductor
  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
+  test_without_numpy
  install_torchvision
  test_dynamo_shard 1
  test_aten
-elif [[ "${TEST_CONFIG}" == *dynamo* && $SHARD_NUMBER -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
+elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
  install_torchvision
-  test_dynamo_shard "${SHARD_NUMBER}"
-elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
-  install_torchvision
-  test_python_shard "$SHARD_NUMBER"
-  test_aten
+  test_dynamo_shard 2
 elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
  test_without_numpy
  install_torchvision
  test_python_shard 1
  test_aten
  test_libtorch 1
-  if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
-    test_xpu_bin
-  fi
 elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
  install_torchvision
  test_python_shard 2
@ -1337,11 +1122,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
  test_libtorch
 elif [[ "${TEST_CONFIG}" = docs_test ]]; then
  test_docs_test
-elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
+elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
  install_torchvision
  test_python
  test_aten
-  test_xpu_bin
 else
  install_torchvision
  install_monkeytype
--- a/.ci/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@ -16,23 +16,24 @@ set PATH=C:\Program Files\CMake\bin;C:\Program Files\7-Zip;C:\ProgramData\chocol

 set INSTALLER_DIR=%SCRIPT_HELPERS_DIR%\installation-helpers

+
+call %INSTALLER_DIR%\install_mkl.bat
+if errorlevel 1 exit /b
+if not errorlevel 0 exit /b
+
 call %INSTALLER_DIR%\install_magma.bat
-if errorlevel 1 goto fail
-if not errorlevel 0 goto fail
+if errorlevel 1 exit /b
+if not errorlevel 0 exit /b

 call %INSTALLER_DIR%\install_sccache.bat
-if errorlevel 1 goto fail
-if not errorlevel 0 goto fail
+if errorlevel 1 exit /b
+if not errorlevel 0 exit /b

 :: Miniconda has been installed as part of the Windows AMI with all the dependencies.
 :: We just need to activate it here
 call %INSTALLER_DIR%\activate_miniconda3.bat
-if errorlevel 1 goto fail
-if not errorlevel 0 goto fail
-
-call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0
-if errorlevel 1 goto fail
-if not errorlevel 0 goto fail
+if errorlevel 1 exit /b
+if not errorlevel 0 exit /b

 :: Override VS env here
 pushd .
@ -41,8 +42,8 @@ if "%VC_VERSION%" == "" (
 ) else (
    call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64 -vcvars_ver=%VC_VERSION%
 )
-if errorlevel 1 goto fail
-if not errorlevel 0 goto fail
+if errorlevel 1 exit /b
+if not errorlevel 0 exit /b
@echo on
 popd

@ -52,12 +53,12 @@ set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION%

 if x%CUDA_VERSION:.=%==x%CUDA_VERSION% (
    echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.'
-    goto fail
+    exit /b 1
 )
 rem version transformer, for example 10.1 to 10_1.
 if x%CUDA_VERSION:.=%==x%CUDA_VERSION% (
    echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.'
-    goto fail
+    exit /b 1
 )
 set VERSION_SUFFIX=%CUDA_VERSION:.=_%
 set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%
@ -88,8 +89,8 @@ set SCCACHE_IGNORE_SERVER_IO_ERROR=1
 sccache --stop-server
 sccache --start-server
 sccache --zero-stats
-set CMAKE_C_COMPILER_LAUNCHER=sccache
-set CMAKE_CXX_COMPILER_LAUNCHER=sccache
+set CC=sccache-cl
+set CXX=sccache-cl

 set CMAKE_GENERATOR=Ninja

@ -101,8 +102,8 @@ if "%USE_CUDA%"=="1" (
  :: CMake requires a single command as CUDA_NVCC_EXECUTABLE, so we push the wrappers
  :: randomtemp.exe and sccache.exe into a batch file which CMake invokes.
  curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.4/randomtemp.exe --output %TMP_DIR_WIN%\bin\randomtemp.exe
-  if errorlevel 1 goto fail
-  if not errorlevel 0 goto fail
+  if errorlevel 1 exit /b
+  if not errorlevel 0 exit /b
  echo @"%TMP_DIR_WIN%\bin\randomtemp.exe" "%TMP_DIR_WIN%\bin\sccache.exe" "%CUDA_PATH%\bin\nvcc.exe" %%* > "%TMP_DIR%/bin/nvcc.bat"
  cat %TMP_DIR%/bin/nvcc.bat
  set CUDA_NVCC_EXECUTABLE=%TMP_DIR%/bin/nvcc.bat
@ -114,8 +115,8 @@ if "%USE_CUDA%"=="1" (
 set

 python setup.py bdist_wheel
-if errorlevel 1 goto fail
-if not errorlevel 0 goto fail
+if errorlevel 1 exit /b
+if not errorlevel 0 exit /b
 sccache --show-stats
 python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
 (
@ -135,8 +136,3 @@ python -c "import os, glob; os.system('python -mpip install --no-index --no-deps

 sccache --show-stats --stats-format json | jq .stats > sccache-stats-%BUILD_ENVIRONMENT%-%OUR_GITHUB_JOB_ID%.json
 sccache --stop-server
-
-exit /b 0
-
-:fail
-exit /b 1
--- a/.ci/pytorch/win-test-helpers/installation-helpers/install_mkl.bat
+++ b/.ci/pytorch/win-test-helpers/installation-helpers/install_mkl.bat
@ -0,0 +1,14 @@
+if "%REBUILD%"=="" (
+  if "%BUILD_ENVIRONMENT%"=="" (
+    curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z --output %TMP_DIR_WIN%\mkl.7z
+  ) else (
+    aws s3 cp s3://ossci-windows/mkl_2020.2.254.7z %TMP_DIR_WIN%\mkl.7z --quiet
+  )
+  if errorlevel 1 exit /b
+  if not errorlevel 0 exit /b
+  7z x -aoa %TMP_DIR_WIN%\mkl.7z -o%TMP_DIR_WIN%\mkl
+  if errorlevel 1 exit /b
+  if not errorlevel 0 exit /b
+)
+set CMAKE_INCLUDE_PATH=%TMP_DIR_WIN%\mkl\include
+set LIB=%TMP_DIR_WIN%\mkl\lib;%LIB%
--- a/.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat
+++ b/.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat
@ -1,13 +1,18 @@
 mkdir %TMP_DIR_WIN%\bin

 if "%REBUILD%"=="" (
-  IF EXIST %TMP_DIR_WIN%\bin\sccache.exe (
+  :check_sccache
+  %TMP_DIR_WIN%\bin\sccache.exe --show-stats || (
    taskkill /im sccache.exe /f /t || ver > nul
    del %TMP_DIR_WIN%\bin\sccache.exe || ver > nul
+    del %TMP_DIR_WIN%\bin\sccache-cl.exe || ver > nul
+    if "%BUILD_ENVIRONMENT%"=="" (
+      curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output %TMP_DIR_WIN%\bin\sccache.exe
+      curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache-cl.exe --output %TMP_DIR_WIN%\bin\sccache-cl.exe
+    ) else (
+      aws s3 cp s3://ossci-windows/sccache.exe %TMP_DIR_WIN%\bin\sccache.exe
+      aws s3 cp s3://ossci-windows/sccache-cl.exe %TMP_DIR_WIN%\bin\sccache-cl.exe
+    )
+    goto :check_sccache
  )
-  if "%BUILD_ENVIRONMENT%"=="" (
-    curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache-v0.7.4.exe --output %TMP_DIR_WIN%\bin\sccache.exe
-  ) else (
-    aws s3 cp s3://ossci-windows/sccache-v0.7.4.exe %TMP_DIR_WIN%\bin\sccache.exe
-  )
-)
+)
--- a/.circleci/README.md
+++ b/.circleci/README.md
@ -1,4 +1,468 @@
 Warning
 =======

-PyTorch migration from CircleCI to github actions has been completed. All continuous integration & deployment workflows are defined in  `.github/workflows` folder
+Contents may be out of date. Our CircleCI workflows are gradually being migrated to GitHub Actions.
+
+Structure of CI
+===============
+
+setup job:
+1. Does a git checkout
+2. Persists CircleCI scripts (everything in `.circleci`) into a workspace.  Why?
+   We don't always do a Git checkout on all subjobs, but we usually
+   still want to be able to call scripts one way or another in a subjob.
+   Persisting files this way lets us have access to them without doing a
+   checkout.  This workspace is conventionally mounted on `~/workspace`
+   (this is distinguished from `~/project`, which is the conventional
+   working directory that CircleCI will default to starting your jobs
+   in.)
+3. Write out the commit message to `.circleci/COMMIT_MSG`.  This is so
+   we can determine in subjobs if we should actually run the jobs or
+   not, even if there isn't a Git checkout.
+
+
+CircleCI configuration generator
+================================
+
+One may no longer make changes to the `.circleci/config.yml` file directly.
+Instead, one must edit these Python scripts or files in the `verbatim-sources/` directory.
+
+
+Usage
+----------
+
+1. Make changes to these scripts.
+2. Run the `regenerate.sh` script in this directory and commit the script changes and the resulting change to `config.yml`.
+
+You'll see a build failure on GitHub if the scripts don't agree with the checked-in version.
+
+
+Motivation
+----------
+
+These scripts establish a single, authoritative source of documentation for the CircleCI configuration matrix.
+The documentation, in the form of diagrams, is automatically generated and cannot drift out of sync with the YAML content.
+
+Furthermore, consistency is enforced within the YAML config itself, by using a single source of data to generate
+multiple parts of the file.
+
+* Facilitates one-off culling/enabling of CI configs for testing PRs on special targets
+
+Also see https://github.com/pytorch/pytorch/issues/17038
+
+
+Future direction
+----------------
+
+### Declaring sparse config subsets
+See comment [here](https://github.com/pytorch/pytorch/pull/17323#pullrequestreview-206945747):
+
+In contrast with a full recursive tree traversal of configuration dimensions,
+> in the future I think we actually want to decrease our matrix somewhat and have only a few mostly-orthogonal builds that taste as many different features as possible on PRs, plus a more complete suite on every PR and maybe an almost full suite nightly/weekly (we don't have this yet). Specifying PR jobs in the future might be easier to read with an explicit list when we come to this.
+----------------
+----------------
+
+# How do the binaries / nightlies / releases work?
+
+### What is a binary?
+
+A binary or package (used interchangeably) is a pre-built collection of c++ libraries, header files, python bits, and other files. We build these and distribute them so that users do not need to install from source.
+
+A **binary configuration** is a collection of
+
+* release or nightly
+    * releases are stable, nightlies are beta and built every night
+* python version
+    * linux: 3.7m (mu is wide unicode or something like that. It usually doesn't matter but you should know that it exists)
+    * macos: 3.7, 3.8
+    * windows: 3.7, 3.8
+* cpu or cuda version
+    * cpu, cuda 9.0, cuda 10.0
+    * The supported cuda versions occasionally change
+* operating system
+    * Linux - these are all built on CentOS. There haven't been any problems in the past building on CentOS and using on Ubuntu
+    * MacOS
+    * Windows - these are built on Azure pipelines
+* devtoolset version (gcc compiler version)
+    * This only matters on Linux cause only Linux uses gcc. tldr is gcc made a backwards incompatible change from gcc 4.8 to gcc 5, because it had to change how it implemented std::vector and std::string
+
+### Where are the binaries?
+
+The binaries are built in CircleCI. There are nightly binaries built every night at 9pm PST (midnight EST) and release binaries corresponding to Pytorch releases, usually every few months.
+
+We have 3 types of binary packages
+
+* pip packages - nightlies are stored on s3 (pip install -f \<a s3 url\>). releases are stored in a pip repo (pip install torch) (ask Soumith about this)
+* conda packages - nightlies and releases are both stored in a conda repo. Nightly packages have a '_nightly' suffix
+* libtorch packages - these are zips of all the c++ libraries, header files, and sometimes dependencies. These are c++ only
+    * shared with dependencies (the only supported option for Windows)
+    * static with dependencies
+    * shared without dependencies
+    * static without dependencies
+
+All binaries are built in CircleCI workflows except Windows. There are checked-in workflows (committed into the .circleci/config.yml) to build the nightlies every night. Releases are built by manually pushing a PR that builds the suite of release binaries (overwrite the config.yml to build the release)
+
+# CircleCI structure of the binaries
+
+Some quick vocab:
+
+* A **workflow** is a CircleCI concept; it is a DAG of '**jobs**'. ctrl-f 'workflows' on https://github.com/pytorch/pytorch/blob/main/.circleci/config.yml to see the workflows.
+* **jobs** are a sequence of '**steps**'
+* **steps** are usually just a bash script or a builtin CircleCI command. *All steps run in new environments, environment variables declared in one script DO NOT persist to following steps*
+* CircleCI has a **workspace**, which is essentially a cache between steps of the *same job* in which you can store artifacts between steps.
+
+## How are the workflows structured?
+
+The nightly binaries have 3 workflows. We have one job (actually 3 jobs:  build, test, and upload) per binary configuration
+
+1. binary_builds
+    1. every day midnight EST
+    2. linux: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/linux-binary-build-defaults.yml
+    3. macos: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/macos-binary-build-defaults.yml
+    4. For each binary configuration, e.g. linux_conda_3.7_cpu there is a
+        1. binary_linux_conda_3.7_cpu_build
+            1. Builds the build. On linux jobs this uses the 'docker executor'.
+            2. Persists the package to the workspace
+        2. binary_linux_conda_3.7_cpu_test
+            1. Loads the package to the workspace
+            2. Spins up a docker image (on Linux), mapping the package and code repos into the docker
+            3. Runs some smoke tests in the docker
+            4. (Actually, for macos this is a step rather than a separate job)
+        3. binary_linux_conda_3.7_cpu_upload
+            1. Logs in to aws/conda
+            2. Uploads the package
+2. update_s3_htmls
+    1. every day 5am EST
+    2. https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/binary_update_htmls.yml
+    3. See below for what these are for and why they're needed
+    4. Three jobs that each examine the current contents of aws and the conda repo and update some html files in s3
+3. binarysmoketests
+    1. every day
+    2. https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml
+    3. For each binary configuration, e.g. linux_conda_3.7_cpu there is a
+        1. smoke_linux_conda_3.7_cpu
+            1. Downloads the package from the cloud, e.g. using the official pip or conda instructions
+            2. Runs the smoke tests
+
+## How are the jobs structured?
+
+The jobs are in https://github.com/pytorch/pytorch/tree/main/.circleci/verbatim-sources. Jobs are made of multiple steps. There are some shared steps used by all the binaries/smokes. Steps of these jobs are all delegated to scripts in https://github.com/pytorch/pytorch/tree/main/.circleci/scripts .
+
+* Linux jobs: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/linux-binary-build-defaults.yml
+    * binary_linux_build.sh
+    * binary_linux_test.sh
+    * binary_linux_upload.sh
+* MacOS jobs: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/macos-binary-build-defaults.yml
+    * binary_macos_build.sh
+    * binary_macos_test.sh
+    * binary_macos_upload.sh
+* Update html jobs: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/binary_update_htmls.yml
+    * These delegate from the pytorch/builder repo
+    * https://github.com/pytorch/builder/blob/main/cron/update_s3_htmls.sh
+    * https://github.com/pytorch/builder/blob/main/cron/upload_binary_sizes.sh
+* Smoke jobs (both linux and macos): https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml
+    * These delegate from the pytorch/builder repo
+    * https://github.com/pytorch/builder/blob/main/run_tests.sh
+    * https://github.com/pytorch/builder/blob/main/smoke_test.sh
+    * https://github.com/pytorch/builder/blob/main/check_binary.sh
+* Common shared code (shared across linux and macos): https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/nightly-binary-build-defaults.yml
+    * binary_checkout.sh - checks out pytorch/builder repo. Right now this also checks out pytorch/pytorch, but it shouldn't. pytorch/pytorch should just be shared through the workspace. This can handle being run before binary_populate_env.sh
+    * binary_populate_env.sh - parses BUILD_ENVIRONMENT into the separate env variables that make up a binary configuration. Also sets lots of default values, the date, the version strings, the location of folders in s3, all sorts of things. This generally has to be run before other steps.
+    * binary_install_miniconda.sh - Installs miniconda, cross platform. Also hacks this for the update_binary_sizes job that doesn't have the right env variables
+    * binary_run_in_docker.sh - Takes a bash script file (the actual test code) from a hardcoded location, spins up a docker image, and runs the script inside the docker image
+
+### **Why do the steps all refer to scripts?**
+
+CircleCI creates a final yaml file by inlining every <<* segment, so if we were to keep all the code in the config.yml itself then the config size would go over 4 MB and cause infra problems.
+
+### **What is binary_run_in_docker for?**
+
+So, CircleCI has several executor types: macos, machine, and docker are the ones we use. The 'machine' executor gives you two cores on some linux vm. The 'docker' executor gives you considerably more cores (nproc was 32 instead of 2 back when I tried in February). Since the dockers are faster, we try to run everything that we can in dockers. Thus
+
+* linux build jobs use the docker executor. Running them on the docker executor was at least 2x faster than running them on the machine executor
+* linux test jobs use the machine executor in order for them to properly interface with GPUs since docker executors cannot execute with attached GPUs
+* linux upload jobs use the machine executor. The upload jobs are so short that it doesn't really matter what they use
+* linux smoke test jobs use the machine executor for the same reason as the linux test jobs
+
+binary_run_in_docker.sh is a way to share the docker start-up code between the binary test jobs and the binary smoke test jobs
+
+### **Why does binary_checkout also checkout pytorch? Why shouldn't it?**
+
+We want all the nightly binary jobs to run on the exact same git commit, so we wrote our own checkout logic to ensure that the same commit was always picked. Later circleci changed that to use a single pytorch checkout and persist it through the workspace (they did this because our config file was too big, so they wanted to take a lot of the setup code into scripts, but the scripts needed the code repo to exist to be called, so they added a prereq step called 'setup' to check out the code and persist the needed scripts to the workspace). The changes to the binary jobs were not properly tested, so they all broke because the pytorch code they relied on no longer existed. We hotfixed the problem by adding the pytorch checkout back to binary_checkout, so now there are two checkouts of pytorch on the binary jobs. This problem still needs to be fixed, but it takes careful tracing of which code is being called where.
+
+# Code structure of the binaries (circleci agnostic)
+
+## Overview
+
+The code that runs the binaries lives in two places, in the normal [github.com/pytorch/pytorch](http://github.com/pytorch/pytorch), but also in [github.com/pytorch/builder](http://github.com/pytorch/builder), which is a repo that defines how all the binaries are built. The relevant code is
+
+
+```
+# All code needed to set-up environments for build code to run in,
+# but only code that is specific to the current CI system
+pytorch/pytorch
+- .circleci/                # Folder that holds all circleci related stuff
+  - config.yml              # GENERATED file that actually controls all circleci behavior
+  - verbatim-sources        # Used to generate job/workflow sections in ^
+  - scripts/                # Code needed to prepare circleci environments for binary build scripts
+- setup.py                  # Builds pytorch. This is wrapped in pytorch/builder
+- cmake files               # used in normal building of pytorch
+# All code needed to prepare a binary build, given an environment
+# with all the right variables/packages/paths.
+pytorch/builder
+# Given an installed binary and a proper python env, runs some checks
+# to make sure the binary was built the proper way. Checks things like
+# the library dependencies, symbols present, etc.
+- check_binary.sh
+# Given an installed binary, runs python tests to make sure everything
+# is in order. These should be de-duped. Right now they both run smoke
+# tests, but are called from different places. Usually just call some
+# import statements, but also has overlap with check_binary.sh above
+- run_tests.sh
+- smoke_test.sh
+# Folders that govern how packages are built. See paragraphs below
+- conda/
+  - build_pytorch.sh          # Entrypoint. Delegates to proper conda build folder
+  - switch_cuda_version.sh    # Switches the active CUDA installation in Docker
+  - pytorch-nightly/          # Build-folder
+- manywheel/
+  - build_cpu.sh              # Entrypoint for cpu builds
+  - build.sh                  # Entrypoint for CUDA builds
+  - build_common.sh           # Actual build script that ^^ call into
+- wheel/
+  - build_wheel.sh            # Entrypoint for wheel builds
+- windows/
+  - build_pytorch.bat         # Entrypoint for wheel builds on Windows
+```
+
+Every type of package has an entrypoint build script that handles all the important logic.
+
+## Conda
+
+Linux, MacOS and Windows use the same code flow for the conda builds.
+
+Conda packages are built with conda-build, see https://conda.io/projects/conda-build/en/latest/resources/commands/conda-build.html
+
+Basically, you pass `conda build` a build folder (pytorch-nightly/ above) that contains a build script and a meta.yaml. The meta.yaml specifies what python environment to build the package in and what dependencies the resulting package should have, and the build script gets called in that env to build the thing.
+tl;dr on conda-build is
+
+1. Creates a brand new conda environment, based off of deps in the meta.yaml
+    1. Note that environment variables do not get passed into this build env unless they are specified in the meta.yaml
+    2. If the build fails this environment will stick around. You can activate it for much easier debugging. The “General Python” section below explains what exactly a python “environment” is.
+2. Calls build.sh in the environment
+3. Copies the finished package to a new conda env, also specified by the meta.yaml
+4. Runs some simple import tests (if specified in the meta.yaml)
+5. Saves the finished package as a tarball
+
+The build.sh we use is essentially a wrapper around `python setup.py build`, but it also manually copies in some of our dependent libraries into the resulting tarball and messes with some rpaths.
+
+The entrypoint file `builder/conda/build_pytorch.sh` is complicated because
+
+* It works for Linux, MacOS and Windows
+    * The mac builds used to create their own environments, since they all used to be on the same machine. There’s now a lot of extra logic to handle conda envs. This extra machinery could be removed
+* It used to handle testing too, which adds more logic messing with python environments too. This extra machinery could be removed.
+
+## Manywheels (linux pip and libtorch packages)
+
+Manywheels are pip packages for linux distros. Note that these manywheels are not actually manylinux compliant.
+
+`builder/manywheel/build_cpu.sh` and `builder/manywheel/build.sh` (for CUDA builds) just set different env vars and then call into `builder/manywheel/build_common.sh`
+
+The entrypoint file `builder/manywheel/build_common.sh` is really really complicated because
+
+* This used to handle building for several different python versions at the same time. The loops have been removed, but there are still unnecessary folders and movements here and there.
+    * The script is never used this way anymore. This extra machinery could be removed.
+* This used to handle testing the pip packages too. This is why there’s testing code at the end that messes with python installations and stuff
+    * The script is never used this way anymore. This extra machinery could be removed.
+* This also builds libtorch packages
+    * This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python specific stuff in this file.
+* There is a lot of messing with rpaths. This is necessary, but could be made much much simpler if the above issues were fixed.
+
+## Wheels (MacOS pip and libtorch packages)
+
+The entrypoint file `builder/wheel/build_wheel.sh` is complicated because
+
+* The mac builds used to all run on one machine (we didn’t have autoscaling mac machines till circleci). So this script handled siloing itself by setting-up and tearing-down its build env and siloing itself into its own build directory.
+    * The script is never used this way anymore. This extra machinery could be removed.
+* This also builds libtorch packages
+    * Ditto the comment above. This should definitely be separated out.
+
+Note that the MacOS Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda.
+
+## Windows Wheels (Windows pip and libtorch packages)
+
+The entrypoint file `builder/windows/build_pytorch.bat` is complicated because
+
+* This used to handle building for several different python versions at the same time. This is why there are loops everywhere
+    * The script is never used this way anymore. This extra machinery could be removed.
+* This used to handle testing the pip packages too. This is why there’s testing code at the end that messes with python installations and stuff
+    * The script is never used this way anymore. This extra machinery could be removed.
+* This also builds libtorch packages
+    * This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python specific stuff in this file.
+
+Note that the Windows Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda.
+
+## General notes
+
+### Note on run_tests.sh, smoke_test.sh, and check_binary.sh
+
+* These should all be consolidated
+* These must run on all OS types: MacOS, Linux, and Windows
+* These all run smoke tests at the moment. They inspect the packages some, maybe run a few import statements. They DO NOT run the python tests nor the cpp tests. The idea is that python tests on main and PR merges will catch all breakages. All these tests have to do is make sure the special binary machinery didn’t mess anything up.
+* There are separate run_tests.sh and smoke_test.sh because one used to be called by the smoke jobs and one used to be called by the binary test jobs (see circleci structure section above). This is still true actually, but these could be united into a single script that runs these checks, given an installed pytorch package.
+
+### Note on libtorch
+
+Libtorch packages are built in the wheel build scripts: manywheel/build_*.sh for linux and build_wheel.sh for mac. There are several things wrong with this
+
+* It’s confusing. Most of those scripts deal with python specifics.
+* The extra conditionals everywhere severely complicate the wheel build scripts
+* The process for building libtorch is different from the official instructions (a plain call to cmake, or a call to a script)
+
+### Note on docker images / Dockerfiles
+
+All linux builds occur in docker images. The docker images are
+
+* pytorch/conda-cuda
+    * Has ALL CUDA versions installed. The script pytorch/builder/conda/switch_cuda_version.sh sets /usr/local/cuda to a symlink to e.g. /usr/local/cuda-10.0 to enable different CUDA builds
+    * Also used for cpu builds
+* pytorch/manylinux-cuda90
+* pytorch/manylinux-cuda100
+    * Also used for cpu builds
+
+The Dockerfiles are available in pytorch/builder, but there is no circleci job or script to build these docker images, and they cannot be run locally (unless you have the correct local packages/paths). Only Soumith can build them right now.
+
+### General Python
+
+* This is still a good explanation of python installations https://caffe2.ai/docs/faq.html#why-do-i-get-import-errors-in-python-when-i-try-to-use-caffe2
+
+# How to manually rebuild the binaries
+
+tl;dr make a PR that looks like https://github.com/pytorch/pytorch/pull/21159
+
+Sometimes we want to push a change to main and then rebuild all of today's binaries after that change. As of May 30, 2019 there isn't a way to manually run a workflow in the UI. You can manually re-run a workflow, but it will use the exact same git commits as the first run and will not include any changes. So we have to make a PR and then force circleci to run the binary workflow instead of the normal tests. The above PR is an example of how to do this; essentially you copy-paste the binarybuilds workflow steps into the default workflow steps. If you need to point the builder repo to a different commit then you'd need to change https://github.com/pytorch/pytorch/blob/main/.circleci/scripts/binary_checkout.sh#L42-L45 to check out what you want.
+
+## How to test changes to the binaries via .circleci
+
+Writing PRs that test the binaries is annoying, since the default circleci jobs that run on PRs are not the jobs that you want to run. Likely, changes to the binaries will touch something under .circleci/ and require that .circleci/config.yml be regenerated (.circleci/config.yml controls all .circleci behavior, and is generated using `.circleci/regenerate.sh` in python 3.7). But you also need to manually hardcode the binary jobs that you want to test into the .circleci/config.yml workflow, so you should actually make at least two commits, one for your changes and one to temporarily hardcode jobs. See https://github.com/pytorch/pytorch/pull/22928 as an example of how to do this.
+
+```sh
+# Make your changes
+touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml
+# Regenerate the yaml, has to be in python 3.7
+.circleci/regenerate.sh
+# Make a commit
+git add .circleci *
+git commit -m "My real changes"
+git push origin my_branch
+# Now hardcode the jobs that you want in the .circleci/config.yml workflows section
+# Also eliminate ensure-consistency and should_run_job checks
+# e.g. https://github.com/pytorch/pytorch/commit/2b3344bfed8772fe86e5210cc4ee915dee42b32d
+# Make a commit you won't keep
+git add .circleci
+git commit -m "[DO NOT LAND] testing binaries for above changes"
+git push origin my_branch
+# Now you need to make some changes to the first commit.
+git rebase -i HEAD~2 # mark the first commit as 'edit'
+# Make the changes
+touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml
+.circleci/regenerate.sh
+# Amend the commit and continue the rebase
+git add .circleci
+git commit --amend
+git rebase --continue
+# Update the PR, need to force since the commits are different now
+git push origin my_branch --force
+```
+
+The advantage of this flow is that you can make new changes to the base commit and regenerate the .circleci without having to re-write which binary jobs you want to test on. The downside is that all updates will be force pushes.
+
+## How to build a binary locally
+
+### Linux
+
+You can build Linux binaries locally easily using docker.
+
+```sh
+# Run the docker
+# Use the correct docker image, pytorch/conda-cuda used here as an example
+#
+# -v path/to/foo:path/to/bar makes path/to/foo on your local machine (the
+#    machine that you're running the command on) accessible to the docker
+#    container at path/to/bar. So if you then run `touch path/to/bar/baz`
+#    in the docker container then you will see path/to/foo/baz on your local
+#    machine. You could also clone the pytorch and builder repos in the docker.
+#
+# If you know how, add ccache as a volume too and speed up everything
+docker run \
+    -v your/pytorch/repo:/pytorch \
+    -v your/builder/repo:/builder \
+    -v where/you/want/packages/to/appear:/final_pkgs \
+    -it pytorch/conda-cuda /bin/bash
+# Export whatever variables are important to you. All variables that you'd
+# possibly need are in .circleci/scripts/binary_populate_env.sh
+# You should probably always export at least these 3 variables
+export PACKAGE_TYPE=conda
+export DESIRED_PYTHON=3.7
+export DESIRED_CUDA=cpu
+# Call the entrypoint
+# `|& tee foo.log` just copies all stdout and stderr output to foo.log
+# The builds generate lots of output so you probably need this when
+# building locally.
+/builder/conda/build_pytorch.sh |& tee build_output.log
+```
+
+**Building CUDA binaries on docker**
+
+You can build CUDA binaries on CPU only machines, but you can only run CUDA binaries on CUDA machines. This means that you can build a CUDA binary on a docker on your laptop if you so choose (though it’s gonna take a long time).
+
+For Facebook employees, ask about beefy machines that have docker support and use those instead of your laptop; it will be 5x as fast.
+
+### MacOS
+
+There’s no easy way to generate reproducible hermetic MacOS environments. If you have a Mac laptop then you can try emulating the .circleci environments as much as possible, but you probably have packages in /usr/local/, possibly installed by brew, that will probably interfere with the build. If you’re trying to repro an error on a Mac build in .circleci and you can’t seem to repro locally, then my best advice is actually to iterate on .circleci    :/
+
+But if you want to try, then I’d recommend
+
+```sh
+# Create a new terminal
+# Clear your LD_LIBRARY_PATH and trim as much out of your PATH as you
+# know how to do
+# Install a new miniconda
+# First remove any other python or conda installation from your PATH
+# Always install miniconda 3, even if building for Python <3
+new_conda="$HOME/my_new_conda"
+conda_sh="$HOME/install_miniconda.sh"
+curl -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
+chmod +x "$conda_sh"
+"$conda_sh" -b -p "$new_conda"
+rm -f "$conda_sh"
+export PATH="$new_conda/bin:$PATH"
+# Create a clean python env
+# All MacOS builds use conda to manage the python env and dependencies
+# that are built with, even the pip packages
+conda create -yn binary python=2.7
+conda activate binary
+# Export whatever variables are important to you. All variables that you'd
+# possibly need are in .circleci/scripts/binary_populate_env.sh
+# You should probably always export at least these 3 variables
+export PACKAGE_TYPE=conda
+export DESIRED_PYTHON=3.7
+export DESIRED_CUDA=cpu
+# Call the entrypoint you want
+path/to/builder/wheel/build_wheel.sh
+```
+
+N.B. installing a brand new miniconda is important. This has to do with how conda installations work. See the “General Python” section above, but tldr; is that
+
+1. You make the ‘conda’ command accessible by prepending `path/to/conda_root/bin` to your PATH.
+2. You make a new env and activate it, which then also gets prepended to your PATH. Now you have `path/to/conda_root/envs/new_env/bin:path/to/conda_root/bin:$PATH`
+3. Now say you (or some code that you ran) call python executable `foo`
+    1. if you installed `foo` in `new_env`, then `path/to/conda_root/envs/new_env/bin/foo` will get called, as expected.
+    2. But if you forgot to install `foo` in `new_env` but happened to previously install it in your root conda env (called ‘base’), then unix/linux will still find `path/to/conda_root/bin/foo` . This is dangerous, since `foo` can be a different version than you want; `foo` can even be for an incompatible python version!
+
+Newer conda versions and proper python hygiene can prevent this, but just install a new miniconda to be safe.
+
+### Windows
+
+TODO: fill in
--- a/test/distributed/pipelining/init.py
+++ b/test/distributed/pipelining/init.py
--- a/aten/src/ATen/native/vulkan/api/StringUtil.cpp
+++ b/aten/src/ATen/native/vulkan/api/StringUtil.cpp
--- a/.circleci/cimodel/data/binary_build_data.py
+++ b/.circleci/cimodel/data/binary_build_data.py
@ -0,0 +1,198 @@
+"""
+This module models the tree of configuration variants
+for "smoketest" builds.
+
+Each subclass of ConfigNode represents a layer of the configuration hierarchy.
+These tree nodes encapsulate the logic for whether a branch of the hierarchy
+should be "pruned".
+"""
+
+from collections import OrderedDict
+
+import cimodel.data.dimensions as dimensions
+
+from cimodel.lib.conf_tree import ConfigNode
+
+
+LINKING_DIMENSIONS = [
+    "shared",
+    "static",
+]
+
+
+DEPS_INCLUSION_DIMENSIONS = [
+    "with-deps",
+    "without-deps",
+]
+
+
+def get_processor_arch_name(gpu_version):
+    return (
+        "cpu"
+        if not gpu_version
+        else (
+            "cu" + gpu_version.strip("cuda")
+            if gpu_version.startswith("cuda")
+            else gpu_version
+        )
+    )
+
+
+CONFIG_TREE_DATA = OrderedDict()
+
+# GCC config variants:
+#
+# All the nightlies (except libtorch with new gcc ABI) are built with devtoolset7,
+# which can only build with old gcc ABI. It is better than devtoolset3
+# because it understands avx512, which is needed for good fbgemm performance.
+#
+# Libtorch with new gcc ABI is built with gcc 5.4 on Ubuntu 16.04.
+LINUX_GCC_CONFIG_VARIANTS = OrderedDict(
+    manywheel=["devtoolset7"],
+    conda=["devtoolset7"],
+    libtorch=[
+        "devtoolset7",
+        "gcc5.4_cxx11-abi",
+    ],
+)
+
+WINDOWS_LIBTORCH_CONFIG_VARIANTS = [
+    "debug",
+    "release",
+]
+
+
+class TopLevelNode(ConfigNode):
+    def __init__(self, node_name, config_tree_data, smoke):
+        super().__init__(None, node_name)
+
+        self.config_tree_data = config_tree_data
+        self.props["smoke"] = smoke
+
+    def get_children(self):
+        return [
+            OSConfigNode(self, x, c, p) for (x, (c, p)) in self.config_tree_data.items()
+        ]
+
+
+class OSConfigNode(ConfigNode):
+    def __init__(self, parent, os_name, gpu_versions, py_tree):
+        super().__init__(parent, os_name)
+
+        self.py_tree = py_tree
+        self.props["os_name"] = os_name
+        self.props["gpu_versions"] = gpu_versions
+
+    def get_children(self):
+        return [PackageFormatConfigNode(self, k, v) for k, v in self.py_tree.items()]
+
+
+class PackageFormatConfigNode(ConfigNode):
+    def __init__(self, parent, package_format, python_versions):
+        super().__init__(parent, package_format)
+
+        self.props["python_versions"] = python_versions
+        self.props["package_format"] = package_format
+
+    def get_children(self):
+        if self.find_prop("os_name") == "linux":
+            return [
+                LinuxGccConfigNode(self, v)
+                for v in LINUX_GCC_CONFIG_VARIANTS[self.find_prop("package_format")]
+            ]
+        elif (
+            self.find_prop("os_name") == "windows"
+            and self.find_prop("package_format") == "libtorch"
+        ):
+            return [
+                WindowsLibtorchConfigNode(self, v)
+                for v in WINDOWS_LIBTORCH_CONFIG_VARIANTS
+            ]
+        else:
+            return [ArchConfigNode(self, v) for v in self.find_prop("gpu_versions")]
+
+
+class LinuxGccConfigNode(ConfigNode):
+    def __init__(self, parent, gcc_config_variant):
+        super().__init__(parent, "GCC_CONFIG_VARIANT=" + str(gcc_config_variant))
+
+        self.props["gcc_config_variant"] = gcc_config_variant
+
+    def get_children(self):
+        gpu_versions = self.find_prop("gpu_versions")
+
+        # XXX devtoolset7 on CUDA 9.0 is temporarily disabled
+        # see https://github.com/pytorch/pytorch/issues/20066
+        if self.find_prop("gcc_config_variant") == "devtoolset7":
+            gpu_versions = filter(lambda x: x != "cuda_90", gpu_versions)
+
+        # XXX disabling conda rocm build since docker images are not there
+        if self.find_prop("package_format") == "conda":
+            gpu_versions = filter(
+                lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions
+            )
+
+        # XXX libtorch rocm build  is temporarily disabled
+        if self.find_prop("package_format") == "libtorch":
+            gpu_versions = filter(
+                lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions
+            )
+
+        return [ArchConfigNode(self, v) for v in gpu_versions]
+
+
+class WindowsLibtorchConfigNode(ConfigNode):
+    def __init__(self, parent, libtorch_config_variant):
+        super().__init__(
+            parent, "LIBTORCH_CONFIG_VARIANT=" + str(libtorch_config_variant)
+        )
+
+        self.props["libtorch_config_variant"] = libtorch_config_variant
+
+    def get_children(self):
+        return [ArchConfigNode(self, v) for v in self.find_prop("gpu_versions")]
+
+
+class ArchConfigNode(ConfigNode):
+    def __init__(self, parent, gpu):
+        super().__init__(parent, get_processor_arch_name(gpu))
+
+        self.props["gpu"] = gpu
+
+    def get_children(self):
+        return [PyVersionConfigNode(self, v) for v in self.find_prop("python_versions")]
+
+
+class PyVersionConfigNode(ConfigNode):
+    def __init__(self, parent, pyver):
+        super().__init__(parent, pyver)
+
+        self.props["pyver"] = pyver
+
+    def get_children(self):
+        package_format = self.find_prop("package_format")
+        os_name = self.find_prop("os_name")
+
+        has_libtorch_variants = package_format == "libtorch" and os_name == "linux"
+        linking_variants = LINKING_DIMENSIONS if has_libtorch_variants else []
+
+        return [LinkingVariantConfigNode(self, v) for v in linking_variants]
+
+
+class LinkingVariantConfigNode(ConfigNode):
+    def __init__(self, parent, linking_variant):
+        super().__init__(parent, linking_variant)
+
+    def get_children(self):
+        return [
+            DependencyInclusionConfigNode(self, v) for v in DEPS_INCLUSION_DIMENSIONS
+        ]
+
+
+class DependencyInclusionConfigNode(ConfigNode):
+    def __init__(self, parent, deps_variant):
+        super().__init__(parent, deps_variant)
+
+        self.props["libtorch_variant"] = "-".join(
+            [self.parent.get_label(), self.get_label()]
+        )
--- a/.circleci/cimodel/data/binary_build_definitions.py
+++ b/.circleci/cimodel/data/binary_build_definitions.py
@ -0,0 +1,275 @@
+from collections import OrderedDict
+
+import cimodel.data.binary_build_data as binary_build_data
+
+import cimodel.data.simple.util.branch_filters as branch_filters
+import cimodel.lib.conf_tree as conf_tree
+import cimodel.lib.miniutils as miniutils
+
+
+class Conf:
+    def __init__(
+        self,
+        os,
+        gpu_version,
+        pydistro,
+        parms,
+        smoke,
+        libtorch_variant,
+        gcc_config_variant,
+        libtorch_config_variant,
+    ):
+        self.os = os
+        self.gpu_version = gpu_version
+        self.pydistro = pydistro
+        self.parms = parms
+        self.smoke = smoke
+        self.libtorch_variant = libtorch_variant
+        self.gcc_config_variant = gcc_config_variant
+        self.libtorch_config_variant = libtorch_config_variant
+
+    def gen_build_env_parms(self):
+        elems = (
+            [self.pydistro]
+            + self.parms
+            + [binary_build_data.get_processor_arch_name(self.gpu_version)]
+        )
+        if self.gcc_config_variant is not None:
+            elems.append(str(self.gcc_config_variant))
+        if self.libtorch_config_variant is not None:
+            elems.append(str(self.libtorch_config_variant))
+        return elems
+
+    def gen_docker_image(self):
+        if self.gcc_config_variant == "gcc5.4_cxx11-abi":
+            if self.gpu_version is None:
+                return miniutils.quote("pytorch/libtorch-cxx11-builder:cpu")
+            else:
+                return miniutils.quote(
+                    f"pytorch/libtorch-cxx11-builder:{self.gpu_version}"
+                )
+        if self.pydistro == "conda":
+            if self.gpu_version is None:
+                return miniutils.quote("pytorch/conda-builder:cpu")
+            else:
+                return miniutils.quote(f"pytorch/conda-builder:{self.gpu_version}")
+
+        docker_word_substitution = {
+            "manywheel": "manylinux",
+            "libtorch": "manylinux",
+        }
+
+        docker_distro_prefix = miniutils.override(
+            self.pydistro, docker_word_substitution
+        )
+
+        # The cpu nightlies are built on the pytorch/manylinux-cuda102 docker image
+        # TODO cuda images should consolidate into tag-base images similar to rocm
+        alt_docker_suffix = (
+            "cuda102"
+            if not self.gpu_version
+            else (
+                "rocm:" + self.gpu_version.strip("rocm")
+                if self.gpu_version.startswith("rocm")
+                else self.gpu_version
+            )
+        )
+        docker_distro_suffix = (
+            alt_docker_suffix
+            if self.pydistro != "conda"
+            else ("cuda" if alt_docker_suffix.startswith("cuda") else "rocm")
+        )
+        return miniutils.quote(
+            "pytorch/" + docker_distro_prefix + "-" + docker_distro_suffix
+        )
+
+    def get_name_prefix(self):
+        return "smoke" if self.smoke else "binary"
+
+    def gen_build_name(self, build_or_test, nightly):
+        parts = [self.get_name_prefix(), self.os] + self.gen_build_env_parms()
+
+        if nightly:
+            parts.append("nightly")
+
+        if self.libtorch_variant:
+            parts.append(self.libtorch_variant)
+
+        if not self.smoke:
+            parts.append(build_or_test)
+
+        joined = "_".join(parts)
+        return joined.replace(".", "_")
+
+    def gen_workflow_job(self, phase, upload_phase_dependency=None, nightly=False):
+        job_def = OrderedDict()
+        job_def["name"] = self.gen_build_name(phase, nightly)
+        job_def["build_environment"] = miniutils.quote(
+            " ".join(self.gen_build_env_parms())
+        )
+        if self.smoke:
+            job_def["requires"] = [
+                "update_s3_htmls",
+            ]
+            job_def["filters"] = branch_filters.gen_filter_dict(
+                branches_list=["postnightly"],
+            )
+        else:
+            filter_branch = r"/.*/"
+            job_def["filters"] = branch_filters.gen_filter_dict(
+                branches_list=[filter_branch],
+                tags_list=[branch_filters.RC_PATTERN],
+            )
+        if self.libtorch_variant:
+            job_def["libtorch_variant"] = miniutils.quote(self.libtorch_variant)
+        if phase == "test":
+            if not self.smoke:
+                job_def["requires"] = [self.gen_build_name("build", nightly)]
+            if not (self.smoke and self.os == "macos") and self.os != "windows":
+                job_def["docker_image"] = self.gen_docker_image()
+
+            # fix this. only works on cuda not rocm
+            if self.os != "windows" and self.gpu_version:
+                job_def["use_cuda_docker_runtime"] = miniutils.quote("1")
+        else:
+            if self.os == "linux" and phase != "upload":
+                job_def["docker_image"] = self.gen_docker_image()
+
+        if phase == "test":
+            if self.gpu_version:
+                if self.os == "windows":
+                    job_def["executor"] = "windows-with-nvidia-gpu"
+                else:
+                    job_def["resource_class"] = "gpu.medium"
+
+        os_name = miniutils.override(self.os, {"macos": "mac"})
+        job_name = "_".join([self.get_name_prefix(), os_name, phase])
+        return {job_name: job_def}
+
+    def gen_upload_job(self, phase, requires_dependency):
+        """Generate binary_upload job for configuration
+
+          Output looks similar to:
+
+        - binary_upload:
+            name: binary_linux_manywheel_3_7m_cu113_devtoolset7_nightly_upload
+            context: org-member
+            requires: binary_linux_manywheel_3_7m_cu113_devtoolset7_nightly_test
+            filters:
+              branches:
+                only:
+                  - nightly
+              tags:
+                only: /v[0-9]+(\\.[0-9]+)*-rc[0-9]+/
+            package_type: manywheel
+            upload_subfolder: cu113
+        """
+        return {
+            "binary_upload": OrderedDict(
+                {
+                    "name": self.gen_build_name(phase, nightly=True),
+                    "context": "org-member",
+                    "requires": [
+                        self.gen_build_name(requires_dependency, nightly=True)
+                    ],
+                    "filters": branch_filters.gen_filter_dict(
+                        branches_list=["nightly"],
+                        tags_list=[branch_filters.RC_PATTERN],
+                    ),
+                    "package_type": self.pydistro,
+                    "upload_subfolder": binary_build_data.get_processor_arch_name(
+                        self.gpu_version,
+                    ),
+                }
+            )
+        }
+
+
+def get_root(smoke, name):
+    return binary_build_data.TopLevelNode(
+        name,
+        binary_build_data.CONFIG_TREE_DATA,
+        smoke,
+    )
+
+
+def gen_build_env_list(smoke):
+    root = get_root(smoke, "N/A")
+    config_list = conf_tree.dfs(root)
+
+    newlist = []
+    for c in config_list:
+        conf = Conf(
+            c.find_prop("os_name"),
+            c.find_prop("gpu"),
+            c.find_prop("package_format"),
+            [c.find_prop("pyver")],
+            c.find_prop("smoke")
+            and not (c.find_prop("os_name") == "macos_arm64"),  # don't test arm64
+            c.find_prop("libtorch_variant"),
+            c.find_prop("gcc_config_variant"),
+            c.find_prop("libtorch_config_variant"),
+        )
+        newlist.append(conf)
+
+    return newlist
+
+
+def predicate_exclude_macos(config):
+    return config.os == "linux" or config.os == "windows"
+
+
+def get_nightly_uploads():
+    configs = gen_build_env_list(False)
+    mylist = []
+    for conf in configs:
+        phase_dependency = "test" if predicate_exclude_macos(conf) else "build"
+        mylist.append(conf.gen_upload_job("upload", phase_dependency))
+
+    return mylist
+
+
+def get_post_upload_jobs():
+    return [
+        {
+            "update_s3_htmls": {
+                "name": "update_s3_htmls",
+                "context": "org-member",
+                "filters": branch_filters.gen_filter_dict(
+                    branches_list=["postnightly"],
+                ),
+            },
+        },
+    ]
+
+
+def get_nightly_tests():
+    configs = gen_build_env_list(False)
+    filtered_configs = filter(predicate_exclude_macos, configs)
+
+    tests = []
+    for conf_options in filtered_configs:
+        yaml_item = conf_options.gen_workflow_job("test", nightly=True)
+        tests.append(yaml_item)
+
+    return tests
+
+
+def get_jobs(toplevel_key, smoke):
+    jobs_list = []
+    configs = gen_build_env_list(smoke)
+    phase = "build" if toplevel_key == "binarybuilds" else "test"
+    for build_config in configs:
+        # don't test for macos_arm64 as it's cross compiled
+        if phase != "test" or build_config.os != "macos_arm64":
+            jobs_list.append(build_config.gen_workflow_job(phase, nightly=True))
+
+    return jobs_list
+
+
+def get_binary_build_jobs():
+    return get_jobs("binarybuilds", False)
+
+
+def get_binary_smoke_test_jobs():
+    return get_jobs("binarysmoketests", True)
--- a/.circleci/cimodel/data/dimensions.py
+++ b/.circleci/cimodel/data/dimensions.py
@ -0,0 +1,19 @@
+PHASES = ["build", "test"]
+
+CUDA_VERSIONS = [
+    "102",
+    "113",
+    "116",
+    "117",
+]
+
+ROCM_VERSIONS = [
+    "4.3.1",
+    "4.5.2",
+]
+
+ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS]
+
+GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS
+
+STANDARD_PYTHON_VERSIONS = ["3.7", "3.8", "3.9", "3.10"]
--- a/.circleci/cimodel/data/pytorch_build_data.py
+++ b/.circleci/cimodel/data/pytorch_build_data.py
@ -0,0 +1,296 @@
+from cimodel.lib.conf_tree import ConfigNode
+
+
+# Subtree handed to TopLevelNode; defined empty in this module.
+CONFIG_TREE_DATA = []
+
+
+def get_major_pyver(dotted_version):
+    parts = dotted_version.split(".")
+    return "py" + parts[0]
+
+
+class TreeConfigNode(ConfigNode):
+    def __init__(self, parent, node_name, subtree):
+        super().__init__(parent, self.modify_label(node_name))
+        self.subtree = subtree
+        self.init2(node_name)
+
+    def modify_label(self, label):
+        return label
+
+    def init2(self, node_name):
+        pass
+
+    def get_children(self):
+        return [self.child_constructor()(self, k, v) for (k, v) in self.subtree]
+
+
+class TopLevelNode(TreeConfigNode):
+    """Root of the pytorch build config tree; it has no parent."""
+
+    def __init__(self, node_name, subtree):
+        super().__init__(None, node_name, subtree)
+
+    # noinspection PyMethodMayBeStatic
+    def child_constructor(self):
+        # Children of the root are distro nodes.
+        return DistroConfigNode
+
+
+class DistroConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["distro_name"] = node_name
+
+    def child_constructor(self):
+        distro = self.find_prop("distro_name")
+
+        next_nodes = {
+            "xenial": XenialCompilerConfigNode,
+            "bionic": BionicCompilerConfigNode,
+        }
+        return next_nodes[distro]
+
+
+class PyVerConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["pyver"] = node_name
+        self.props["abbreviated_pyver"] = get_major_pyver(node_name)
+        if node_name == "3.9":
+            self.props["abbreviated_pyver"] = "py3.9"
+
+    # noinspection PyMethodMayBeStatic
+    def child_constructor(self):
+        return ExperimentalFeatureConfigNode
+
+
+class ExperimentalFeatureConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["experimental_feature"] = node_name
+
+    def child_constructor(self):
+        experimental_feature = self.find_prop("experimental_feature")
+
+        next_nodes = {
+            "asan": AsanConfigNode,
+            "xla": XlaConfigNode,
+            "mps": MPSConfigNode,
+            "vulkan": VulkanConfigNode,
+            "parallel_tbb": ParallelTBBConfigNode,
+            "crossref": CrossRefConfigNode,
+            "dynamo": DynamoConfigNode,
+            "parallel_native": ParallelNativeConfigNode,
+            "onnx": ONNXConfigNode,
+            "libtorch": LibTorchConfigNode,
+            "important": ImportantConfigNode,
+            "build_only": BuildOnlyConfigNode,
+            "shard_test": ShardTestConfigNode,
+            "cuda_gcc_override": CudaGccOverrideConfigNode,
+            "pure_torch": PureTorchConfigNode,
+            "slow_gradcheck": SlowGradcheckConfigNode,
+        }
+        return next_nodes[experimental_feature]
+
+
+class SlowGradcheckConfigNode(TreeConfigNode):
+    """Sets the "is_slow_gradcheck" prop to True, regardless of the node name."""
+
+    def init2(self, node_name):
+        self.props["is_slow_gradcheck"] = True
+
+    def child_constructor(self):
+        # Further experimental features may be nested below this node.
+        return ExperimentalFeatureConfigNode
+
+
+class PureTorchConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "PURE_TORCH=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_pure_torch"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class XlaConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "XLA=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_xla"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class MPSConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "MPS=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_mps"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class AsanConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "Asan=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_asan"] = node_name
+
+    def child_constructor(self):
+        return ExperimentalFeatureConfigNode
+
+
+class ONNXConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "Onnx=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_onnx"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class VulkanConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "Vulkan=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_vulkan"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class ParallelTBBConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "PARALLELTBB=" + str(label)
+
+    def init2(self, node_name):
+        self.props["parallel_backend"] = "paralleltbb"
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class CrossRefConfigNode(TreeConfigNode):
+    """Stores the node name in the "is_crossref" prop."""
+
+    def init2(self, node_name):
+        self.props["is_crossref"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class DynamoConfigNode(TreeConfigNode):
+    """Stores the node name in the "is_dynamo" prop."""
+
+    def init2(self, node_name):
+        self.props["is_dynamo"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class ParallelNativeConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "PARALLELNATIVE=" + str(label)
+
+    def init2(self, node_name):
+        self.props["parallel_backend"] = "parallelnative"
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class LibTorchConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "BUILD_TEST_LIBTORCH=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_libtorch"] = node_name
+
+    def child_constructor(self):
+        return ExperimentalFeatureConfigNode
+
+
+class CudaGccOverrideConfigNode(TreeConfigNode):
+    """Stores the node name in the "cuda_gcc_override" prop."""
+
+    def init2(self, node_name):
+        self.props["cuda_gcc_override"] = node_name
+
+    def child_constructor(self):
+        # Further experimental features may be nested below this node.
+        return ExperimentalFeatureConfigNode
+
+
+class BuildOnlyConfigNode(TreeConfigNode):
+    """Stores the node name in the "build_only" prop."""
+
+    def init2(self, node_name):
+        self.props["build_only"] = node_name
+
+    def child_constructor(self):
+        # Further experimental features may be nested below this node.
+        return ExperimentalFeatureConfigNode
+
+
+class ShardTestConfigNode(TreeConfigNode):
+    """Stores the node name in the "shard_test" prop."""
+
+    def init2(self, node_name):
+        self.props["shard_test"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class ImportantConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "IMPORTANT=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_important"] = node_name
+
+    def get_children(self):
+        return []
+
+
+class XenialCompilerConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return label or "<unspecified>"
+
+    def init2(self, node_name):
+        self.props["compiler_name"] = node_name
+
+    # noinspection PyMethodMayBeStatic
+    def child_constructor(self):
+        return (
+            XenialCompilerVersionConfigNode
+            if self.props["compiler_name"]
+            else PyVerConfigNode
+        )
+
+
+class BionicCompilerConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return label or "<unspecified>"
+
+    def init2(self, node_name):
+        self.props["compiler_name"] = node_name
+
+    # noinspection PyMethodMayBeStatic
+    def child_constructor(self):
+        return (
+            BionicCompilerVersionConfigNode
+            if self.props["compiler_name"]
+            else PyVerConfigNode
+        )
+
+
+class XenialCompilerVersionConfigNode(TreeConfigNode):
+    """Stores the node name in the "compiler_version" prop."""
+
+    def init2(self, node_name):
+        self.props["compiler_version"] = node_name
+
+    # noinspection PyMethodMayBeStatic
+    def child_constructor(self):
+        return PyVerConfigNode
+
+
+class BionicCompilerVersionConfigNode(TreeConfigNode):
+    """Stores the node name in the "compiler_version" prop."""
+
+    def init2(self, node_name):
+        self.props["compiler_version"] = node_name
+
+    # noinspection PyMethodMayBeStatic
+    def child_constructor(self):
+        return PyVerConfigNode
--- a/.circleci/cimodel/data/pytorch_build_definitions.py
+++ b/.circleci/cimodel/data/pytorch_build_definitions.py
@ -0,0 +1,382 @@
+from collections import OrderedDict
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+import cimodel.data.dimensions as dimensions
+import cimodel.lib.conf_tree as conf_tree
+import cimodel.lib.miniutils as miniutils
+from cimodel.data.pytorch_build_data import CONFIG_TREE_DATA, TopLevelNode
+from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN
+from cimodel.data.simple.util.docker_constants import gen_docker_image
+
+
+@dataclass
+class Conf:
+    """
+    A single Linux build configuration.
+
+    Holds the pieces that make up job names and docker image names and
+    produces the workflow job entries for a given phase.
+    """
+
+    distro: str
+    parms: List[str]
+    # Name pieces appended to job names only; get_parms(True), which is used
+    # for the docker image name, leaves them out.
+    parms_list_ignored_for_docker_image: Optional[List[str]] = None
+    pyver: Optional[str] = None
+    cuda_version: Optional[str] = None
+    rocm_version: Optional[str] = None
+    # TODO expand this to cover all the USE_* that we want to test for
+    #  tensorrt, leveldb, lmdb, redis, opencv, mkldnn, ideep, etc.
+    # (from https://github.com/pytorch/pytorch/pull/17323#discussion_r259453608)
+    is_xla: bool = False
+    is_vulkan: bool = False
+    is_pure_torch: bool = False
+    # When set, get_workflow_jobs() iterates these phases instead of dimensions.PHASES.
+    restrict_phases: Optional[List[str]] = None
+    gpu_resource: Optional[str] = None
+    dependent_tests: List = field(default_factory=list)
+    # When set, docker image and build-job dependency are taken from this config.
+    parent_build: Optional["Conf"] = None
+    is_libtorch: bool = False
+    is_important: bool = False
+    parallel_backend: Optional[str] = None
+    build_only: bool = False
+
+    @staticmethod
+    def is_test_phase(phase):
+        """Return True when the phase name contains the substring "test"."""
+        return "test" in phase
+
+    # TODO: Eliminate the special casing for docker paths
+    # In the short term, we *will* need to support special casing as docker images are merged for caffe2 and pytorch
+    def get_parms(self, for_docker):
+        """
+        Return the ordered list of name pieces for this configuration.
+
+        With ``for_docker`` true, the xla/vulkan/libtorch/pure_torch/
+        parallel-backend markers and ``parms_list_ignored_for_docker_image``
+        are omitted.
+        """
+        leading = []
+        # We just don't run non-important jobs on pull requests;
+        # previously we also named them in a way to make it obvious
+        # if self.is_important and not for_docker:
+        #    leading.append("AAA")
+        leading.append("pytorch")
+        if self.is_xla and not for_docker:
+            leading.append("xla")
+        if self.is_vulkan and not for_docker:
+            leading.append("vulkan")
+        if self.is_libtorch and not for_docker:
+            leading.append("libtorch")
+        if self.is_pure_torch and not for_docker:
+            leading.append("pure_torch")
+        if self.parallel_backend is not None and not for_docker:
+            leading.append(self.parallel_backend)
+
+        cuda_parms = []
+        if self.cuda_version:
+            # CUDA versions starting with "11." are paired with cudnn8, all others with cudnn7.
+            cudnn = "cudnn8" if self.cuda_version.startswith("11.") else "cudnn7"
+            cuda_parms.extend(["cuda" + self.cuda_version, cudnn])
+        if self.rocm_version:
+            cuda_parms.extend([f"rocm{self.rocm_version}"])
+        result = leading + ["linux", self.distro] + cuda_parms + self.parms
+        if not for_docker and self.parms_list_ignored_for_docker_image is not None:
+            result = result + self.parms_list_ignored_for_docker_image
+        return result
+
+    def gen_docker_image_path(self):
+        """Return the quoted docker image path of the parent build, or of this config if there is none."""
+        parms_source = self.parent_build or self
+        base_build_env_name = "-".join(parms_source.get_parms(True))
+        image_name, _ = gen_docker_image(base_build_env_name)
+        return miniutils.quote(image_name)
+
+    def gen_docker_image_requires(self):
+        """Return the quoted docker requirement name of the parent build, or of this config if there is none."""
+        parms_source = self.parent_build or self
+        base_build_env_name = "-".join(parms_source.get_parms(True))
+        _, requires = gen_docker_image(base_build_env_name)
+        return miniutils.quote(requires)
+
+    def get_build_job_name_pieces(self, build_or_test):
+        """Return the non-docker name pieces followed by the phase name."""
+        return self.get_parms(False) + [build_or_test]
+
+    def gen_build_name(self, build_or_test):
+        """Return the job name: pieces joined with "_", with "." and "-" also turned into "_"."""
+        return (
+            ("_".join(map(str, self.get_build_job_name_pieces(build_or_test))))
+            .replace(".", "_")
+            .replace("-", "_")
+        )
+
+    def get_dependents(self):
+        """Return the dependent test configs, or an empty list when there are none."""
+        return self.dependent_tests or []
+
+    def gen_workflow_params(self, phase):
+        """Return an OrderedDict of job parameters for ``phase``."""
+        parameters = OrderedDict()
+        build_job_name_pieces = self.get_build_job_name_pieces(phase)
+
+        build_env_name = "-".join(map(str, build_job_name_pieces))
+        parameters["build_environment"] = miniutils.quote(build_env_name)
+        parameters["docker_image"] = self.gen_docker_image_path()
+        if Conf.is_test_phase(phase) and self.gpu_resource:
+            parameters["use_cuda_docker_runtime"] = miniutils.quote("1")
+        if Conf.is_test_phase(phase):
+            # Later assignments win: a ROCm config overrides the GPU class,
+            # which in turn overrides the "large" default.
+            resource_class = "large"
+            if self.gpu_resource:
+                resource_class = "gpu." + self.gpu_resource
+            if self.rocm_version is not None:
+                resource_class = "pytorch/amd-gpu"
+            parameters["resource_class"] = resource_class
+        if phase == "build" and self.rocm_version is not None:
+            parameters["resource_class"] = "xlarge"
+        # "filters" is not a declared field; it only exists when it has been
+        # assigned on the instance from outside (instantiate_configs does so
+        # for the docs parent build).
+        if hasattr(self, "filters"):
+            parameters["filters"] = self.filters
+        if self.build_only:
+            parameters["build_only"] = miniutils.quote(str(int(True)))
+        return parameters
+
+    def gen_workflow_job(self, phase):
+        """
+        Return ``{template name: job definition}`` for ``phase``.
+
+        Test phases use the "pytorch_linux_test" template and require the
+        build job; other phases use "pytorch_linux_build" and require the
+        docker image job.
+        """
+        job_def = OrderedDict()
+        job_def["name"] = self.gen_build_name(phase)
+
+        if Conf.is_test_phase(phase):
+            # TODO When merging the caffe2 and pytorch jobs, it might be convenient for a while to make a
+            #  caffe2 test job dependent on a pytorch build job. This way we could quickly dedup the repeated
+            #  build of pytorch in the caffe2 build job, and just run the caffe2 tests off of a completed
+            #  pytorch build job (from https://github.com/pytorch/pytorch/pull/17323#discussion_r259452641)
+
+            dependency_build = self.parent_build or self
+            job_def["requires"] = [dependency_build.gen_build_name("build")]
+            job_name = "pytorch_linux_test"
+        else:
+            job_name = "pytorch_linux_build"
+            job_def["requires"] = [self.gen_docker_image_requires()]
+
+        if not self.is_important:
+            job_def["filters"] = gen_filter_dict()
+        # Parameters are merged last, so a "filters" entry coming from
+        # gen_workflow_params replaces the default filter set just above.
+        job_def.update(self.gen_workflow_params(phase))
+
+        return {job_name: job_def}
+
+
+# TODO This is a hack to special case some configs just for the workflow list
+class HiddenConf:
+    """Minimal job description carrying only a name, a parent build and filters."""
+
+    def __init__(self, name, parent_build=None, filters=None):
+        self.filters = filters
+        self.parent_build = parent_build
+        self.name = name
+
+    def gen_build_name(self, _):
+        """Return the fixed job name; the phase argument is ignored."""
+        return self.name
+
+    def gen_workflow_job(self, phase):
+        """Return a one-entry mapping from the job name to its requires/filters."""
+        job_key = self.gen_build_name(phase)
+        required_build = self.parent_build.gen_build_name("build")
+        job_body = {"requires": [required_build], "filters": self.filters}
+        return {job_key: job_body}
+
+
+class DocPushConf:
+    """Describes a ``pytorch_doc_push`` workflow job."""
+
+    def __init__(self, name, parent_build=None, branch="master"):
+        self.branch = branch
+        self.parent_build = parent_build
+        self.name = name
+
+    def gen_workflow_job(self, phase):
+        """Build the job mapping; ``phase`` is accepted but not used."""
+        push_filters = gen_filter_dict(branches_list=["nightly"], tags_list=RC_PATTERN)
+        job_body = {
+            "name": self.name,
+            "branch": self.branch,
+            "requires": [self.parent_build],
+            "context": "org-member",
+            "filters": push_filters,
+        }
+        return {"pytorch_doc_push": job_body}
+
+
+def gen_docs_configs(xenial_parent_config):
+    """
+    Return the doc build/push configs, in order: python build, python push,
+    cpp build, cpp push. Both build configs depend on ``xenial_parent_config``.
+    """
+
+    def doc_build(job_name):
+        # Each build gets its own freshly generated filter dict.
+        return HiddenConf(
+            job_name,
+            parent_build=xenial_parent_config,
+            filters=gen_filter_dict(
+                branches_list=["master", "main", "nightly"], tags_list=RC_PATTERN
+            ),
+        )
+
+    return [
+        doc_build("pytorch_python_doc_build"),
+        DocPushConf(
+            "pytorch_python_doc_push",
+            parent_build="pytorch_python_doc_build",
+            branch="site",
+        ),
+        doc_build("pytorch_cpp_doc_build"),
+        DocPushConf(
+            "pytorch_cpp_doc_push",
+            parent_build="pytorch_cpp_doc_build",
+            branch="master",
+        ),
+    ]
+
+
+def get_root():
+    """Build the top-level node, labelled "PyTorch Builds", from ``CONFIG_TREE_DATA``."""
+    return TopLevelNode("PyTorch Builds", CONFIG_TREE_DATA)
+
+
+def gen_tree():
+    """Return the result of ``conf_tree.dfs`` applied to the root node."""
+    return conf_tree.dfs(get_root())
+
+
+def instantiate_configs(only_slow_gradcheck):
+    """
+    Build one ``Conf`` per node returned by ``conf_tree.dfs`` on the root.
+
+    Args:
+        only_slow_gradcheck: when true, keep only the nodes flagged
+            ``is_slow_gradcheck``; when false, keep only the others.
+
+    Returns:
+        The list of ``Conf`` objects, in traversal order.
+    """
+    config_list = []
+
+    root = get_root()
+    found_configs = conf_tree.dfs(root)
+    for fc in found_configs:
+        restrict_phases = None
+        distro_name = fc.find_prop("distro_name")
+        compiler_name = fc.find_prop("compiler_name")
+        compiler_version = fc.find_prop("compiler_version")
+        is_xla = fc.find_prop("is_xla") or False
+        is_asan = fc.find_prop("is_asan") or False
+        is_crossref = fc.find_prop("is_crossref") or False
+        is_dynamo = fc.find_prop("is_dynamo") or False
+        is_onnx = fc.find_prop("is_onnx") or False
+        is_pure_torch = fc.find_prop("is_pure_torch") or False
+        is_vulkan = fc.find_prop("is_vulkan") or False
+        is_slow_gradcheck = fc.find_prop("is_slow_gradcheck") or False
+        parms_list_ignored_for_docker_image = []
+
+        # Skip the node when the two flags differ.
+        if only_slow_gradcheck ^ is_slow_gradcheck:
+            continue
+
+        python_version = None
+        if compiler_name == "cuda" or compiler_name == "android":
+            # cuda/android names use the abbreviated python version.
+            python_version = fc.find_prop("pyver")
+            parms_list = [fc.find_prop("abbreviated_pyver")]
+        else:
+            parms_list = ["py" + fc.find_prop("pyver")]
+
+        cuda_version = None
+        rocm_version = None
+        if compiler_name == "cuda":
+            cuda_version = fc.find_prop("compiler_version")
+
+        elif compiler_name == "rocm":
+            rocm_version = fc.find_prop("compiler_version")
+            restrict_phases = ["build", "test1", "test2", "caffe2_test"]
+
+        elif compiler_name == "android":
+            android_ndk_version = fc.find_prop("compiler_version")
+            # TODO: do we need clang to compile host binaries like protoc?
+            parms_list.append("clang5")
+            parms_list.append("android-ndk-" + android_ndk_version)
+            android_abi = fc.find_prop("android_abi")
+            parms_list_ignored_for_docker_image.append(android_abi)
+            restrict_phases = ["build"]
+
+        elif compiler_name:
+            # Any other compiler: name and version are concatenated (e.g. "gcc" + "5.4").
+            gcc_version = compiler_name + (fc.find_prop("compiler_version") or "")
+            parms_list.append(gcc_version)
+
+        if is_asan:
+            parms_list.append("asan")
+            python_version = fc.find_prop("pyver")
+            # Replace the python piece with the abbreviated form.
+            parms_list[0] = fc.find_prop("abbreviated_pyver")
+
+        if is_crossref:
+            parms_list_ignored_for_docker_image.append("crossref")
+
+        if is_dynamo:
+            parms_list_ignored_for_docker_image.append("dynamo")
+
+        if is_onnx:
+            parms_list.append("onnx")
+            python_version = fc.find_prop("pyver")
+            # Replace the python piece with the abbreviated form.
+            parms_list[0] = fc.find_prop("abbreviated_pyver")
+            restrict_phases = ["build", "ort_test1", "ort_test2"]
+
+        if cuda_version:
+            cuda_gcc_version = fc.find_prop("cuda_gcc_override") or "gcc7"
+            parms_list.append(cuda_gcc_version)
+
+        is_libtorch = fc.find_prop("is_libtorch") or False
+        is_important = fc.find_prop("is_important") or False
+        parallel_backend = fc.find_prop("parallel_backend") or None
+        build_only = fc.find_prop("build_only") or False
+        shard_test = fc.find_prop("shard_test") or False
+        # TODO: fix pure_torch python test packaging issue.
+        if shard_test:
+            restrict_phases = ["build"] if restrict_phases is None else restrict_phases
+            # NOTE(review): if restrict_phases already lists "test1"/"test2"
+            # (as the rocm branch above sets), this adds them a second time --
+            # confirm whether shard_test is ever combined with such configs.
+            restrict_phases.extend(["test1", "test2"])
+        # Overrides any phase restriction chosen above.
+        if build_only or is_pure_torch:
+            restrict_phases = ["build"]
+
+        if is_slow_gradcheck:
+            parms_list_ignored_for_docker_image.append("old")
+            parms_list_ignored_for_docker_image.append("gradcheck")
+
+        gpu_resource = None
+        if cuda_version and cuda_version != "10":
+            gpu_resource = "medium"
+
+        c = Conf(
+            distro_name,
+            parms_list,
+            parms_list_ignored_for_docker_image,
+            python_version,
+            cuda_version,
+            rocm_version,
+            is_xla,
+            is_vulkan,
+            is_pure_torch,
+            restrict_phases,
+            gpu_resource,
+            is_libtorch=is_libtorch,
+            is_important=is_important,
+            parallel_backend=parallel_backend,
+            build_only=build_only,
+        )
+
+        # run docs builds on "pytorch-linux-xenial-py3.7-gcc5.4". Docs builds
+        # should run on a CPU-only build that runs on all PRs.
+        # XXX should this be updated to a more modern build?
+        if (
+            distro_name == "xenial"
+            and fc.find_prop("pyver") == "3.7"
+            and cuda_version is None
+            and parallel_backend is None
+            and not is_vulkan
+            and not is_pure_torch
+            and compiler_name == "gcc"
+            and fc.find_prop("compiler_version") == "5.4"
+        ):
+            # "filters" is attached dynamically; Conf.gen_workflow_params
+            # picks it up through hasattr().
+            c.filters = gen_filter_dict(branches_list=r"/.*/", tags_list=RC_PATTERN)
+            c.dependent_tests = gen_docs_configs(c)
+
+        config_list.append(c)
+
+    return config_list
+
+
+def get_workflow_jobs(only_slow_gradcheck=False):
+    """
+    Return the workflow job entries for every instantiated configuration:
+    one per phase, followed by one "test" entry per dependent config.
+    """
+    jobs = []
+    for build_conf in instantiate_configs(only_slow_gradcheck):
+        for phase in build_conf.restrict_phases or dimensions.PHASES:
+            # TODO why does this not have a test?
+            skip_phase = Conf.is_test_phase(phase) and build_conf.cuda_version == "10"
+            if not skip_phase:
+                jobs.append(build_conf.gen_workflow_job(phase))
+
+        # TODO convert to recursion
+        jobs.extend(
+            dependent.gen_workflow_job("test")
+            for dependent in build_conf.get_dependents()
+        )
+
+    return jobs
--- a/test/dynamo_expected_failures/ActivationCheckpointingTests.test_cond_with_kwargs
+++ b/test/dynamo_expected_failures/ActivationCheckpointingTests.test_cond_with_kwargs
--- a/.circleci/cimodel/data/simple/docker_definitions.py
+++ b/.circleci/cimodel/data/simple/docker_definitions.py
@ -0,0 +1,39 @@
+from collections import OrderedDict
+
+from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN
+
+from cimodel.lib.miniutils import quote
+
+
+# NOTE: All hardcoded docker image builds have been migrated to GHA
+IMAGE_NAMES = []
+
+# Image kept when get_workflow_jobs() is called with only_slow_gradcheck=True.
+# It is meant to be one of the entries of IMAGE_NAMES above (which is
+# currently empty) and should match the "slow_gradcheck" entry in
+# pytorch_build_data.py
+SLOW_GRADCHECK_IMAGE_NAME = "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
+
+
+def get_workflow_jobs(images=IMAGE_NAMES, only_slow_gradcheck=False):
+    """Generates a list of docker image build definitions
+
+    Args:
+        images: iterable of image names; a leading "docker-" is removed
+            from each name before it is used.
+        only_slow_gradcheck: when true, only the image whose name equals
+            SLOW_GRADCHECK_IMAGE_NAME is kept.
+
+    Returns:
+        A list of ``OrderedDict({"docker_build_job": parameters})`` entries.
+    """
+    prefix = "docker-"
+    ret = []
+    for image_name in images:
+        # Remove the prefix as a whole. str.lstrip() treats its argument as a
+        # set of characters and would also eat leading letters of the actual
+        # image name (e.g. "docker-centos" -> "ntos").
+        if image_name.startswith(prefix):
+            image_name = image_name[len(prefix) :]
+        # Compare by value: "is not" tests object identity, which is not
+        # guaranteed for equal strings.
+        if only_slow_gradcheck and image_name != SLOW_GRADCHECK_IMAGE_NAME:
+            continue
+
+        parameters = OrderedDict(
+            {
+                "name": quote(f"docker-{image_name}"),
+                "image_name": quote(image_name),
+            }
+        )
+        if image_name == "pytorch-linux-xenial-py3.7-gcc5.4":
+            # pushing documentation on tags requires CircleCI to also
+            # build all the dependencies on tags, including this docker image
+            parameters["filters"] = gen_filter_dict(
+                branches_list=r"/.*/", tags_list=RC_PATTERN
+            )
+        ret.append(OrderedDict({"docker_build_job": parameters}))
+    return ret
--- a/.circleci/cimodel/data/simple/ios_definitions.py
+++ b/.circleci/cimodel/data/simple/ios_definitions.py
@ -0,0 +1,100 @@
+import cimodel.lib.miniutils as miniutils
+from cimodel.data.simple.util.branch_filters import gen_filter_dict_exclude
+from cimodel.data.simple.util.versions import MultiPartVersion
+
+# Xcode version passed to every IOSJob in WORKFLOW_DATA; its parts end up in the job names.
+XCODE_VERSION = MultiPartVersion([12, 5, 1])
+
+
+class ArchVariant:
+    """An architecture name with an optional custom build suffix."""
+
+    def __init__(self, name, custom_build_name=""):
+        self.custom_build_name = custom_build_name
+        self.name = name
+
+    def render(self):
+        """Join name and (non-empty) suffix with "-" and turn underscores into dashes."""
+        pieces = [self.name]
+        if len(self.custom_build_name) > 0:
+            pieces.append(self.custom_build_name)
+        return "-".join(pieces).replace("_", "-")
+
+
+def get_platform(arch_variant_name):
+    """Return "SIMULATOR" for the "x86_64" architecture and "OS" for anything else."""
+    if arch_variant_name == "x86_64":
+        return "SIMULATOR"
+    return "OS"
+
+
+class IOSJob:
+    """An iOS build job for one Xcode version and architecture variant."""
+
+    def __init__(
+        self, xcode_version, arch_variant, is_org_member_context=True, extra_props=None
+    ):
+        self.extra_props = extra_props
+        self.is_org_member_context = is_org_member_context
+        self.arch_variant = arch_variant
+        self.xcode_version = xcode_version
+
+    def gen_name_parts(self):
+        """Return ["ios", <dash-joined version>, <rendered variant>]."""
+        versioned = ["ios"] + self.xcode_version.render_dots_or_parts("-")
+        return versioned + [self.arch_variant.render()]
+
+    def gen_job_name(self):
+        """Return the name parts joined with "-"."""
+        return "-".join(self.gen_name_parts())
+
+    def gen_tree(self):
+        """Return a one-element list holding the "pytorch_ios_build" job mapping."""
+        arch_name = self.arch_variant.name
+        platform_name = get_platform(arch_name)
+
+        props_dict = {}
+        props_dict["name"] = self.gen_job_name()
+        props_dict["build_environment"] = self.gen_job_name()
+        props_dict["ios_arch"] = arch_name
+        props_dict["ios_platform"] = platform_name
+
+        if self.is_org_member_context:
+            props_dict["context"] = "org-member"
+
+        # Extra props may override any of the entries set above.
+        if self.extra_props:
+            props_dict.update(self.extra_props)
+
+        props_dict["filters"] = gen_filter_dict_exclude()
+
+        return [{"pytorch_ios_build": props_dict}]
+
+
+# iOS jobs rendered by get_workflow_jobs(). The commented-out entries are
+# variants that are currently disabled.
+WORKFLOW_DATA = [
+    IOSJob(
+        XCODE_VERSION,
+        ArchVariant("x86_64"),
+        is_org_member_context=False,
+        extra_props={"lite_interpreter": miniutils.quote(str(int(True)))},
+    ),
+    # IOSJob(XCODE_VERSION, ArchVariant("arm64"), extra_props={
+    #     "lite_interpreter": miniutils.quote(str(int(True)))}),
+    # IOSJob(XCODE_VERSION, ArchVariant("arm64", "metal"), extra_props={
+    #     "use_metal": miniutils.quote(str(int(True))),
+    #     "lite_interpreter": miniutils.quote(str(int(True)))}),
+    # IOSJob(XCODE_VERSION, ArchVariant("arm64", "custom-ops"), extra_props={
+    #     "op_list": "mobilenetv2.yaml",
+    #     "lite_interpreter": miniutils.quote(str(int(True)))}),
+    IOSJob(
+        XCODE_VERSION,
+        ArchVariant("x86_64", "coreml"),
+        is_org_member_context=False,
+        extra_props={
+            "use_coreml": miniutils.quote(str(int(True))),
+            "lite_interpreter": miniutils.quote(str(int(True))),
+        },
+    ),
+    # IOSJob(XCODE_VERSION, ArchVariant("arm64", "coreml"), extra_props={
+    #     "use_coreml": miniutils.quote(str(int(True))),
+    #     "lite_interpreter": miniutils.quote(str(int(True)))}),
+]
+
+
+def get_workflow_jobs():
+    """Return the rendered job tree of every entry in WORKFLOW_DATA, in order."""
+    trees = []
+    for job in WORKFLOW_DATA:
+        trees.append(job.gen_tree())
+    return trees
--- a/.circleci/cimodel/data/simple/macos_definitions.py
+++ b/.circleci/cimodel/data/simple/macos_definitions.py
@ -0,0 +1,54 @@
+class MacOsJob:
+    """A macOS build and/or test job for one OS version."""
+
+    def __init__(self, os_version, is_build=False, is_test=False, extra_props=tuple()):
+        # The default for extra_props is a tuple (of key/value pairs) because
+        # mutable argument defaults are not recommended; it is turned into a
+        # dict here.
+        self.extra_props = dict(extra_props)
+        self.is_test = is_test
+        self.is_build = is_build
+        self.os_version = os_version
+
+    def gen_tree(self):
+        """Return a one-element list mapping the job name to its properties."""
+        base_parts = ["pytorch", "macos", self.os_version, "py3"]
+
+        name_parts = list(base_parts)
+        # Only extra props with a truthy value contribute to the name.
+        name_parts.extend(key for key, enabled in self.extra_props.items() if enabled)
+        if self.is_build:
+            name_parts.append("build")
+        if self.is_test:
+            name_parts.append("test")
+
+        # Falsy pieces are dropped before joining.
+        full_job_name = "_".join([piece for piece in name_parts if piece])
+
+        # A test job depends on the plain build job of the same OS version.
+        build_job_name = "_".join(base_parts + ["build"])
+        job_dependencies = [build_job_name] if self.is_test else []
+
+        # Yes we name the job after itself, it needs a non-empty value in here
+        # for the YAML output to work.
+        return [{full_job_name: {"requires": job_dependencies, "name": full_job_name}}]
+
+
+# macOS jobs rendered by get_workflow_jobs().
+WORKFLOW_DATA = [
+    MacOsJob("10_15", is_build=True),
+    MacOsJob("10_13", is_build=True),
+    MacOsJob(
+        "10_13",
+        is_build=False,
+        is_test=True,
+    ),
+    MacOsJob(
+        "10_13",
+        is_build=True,
+        is_test=True,
+        # Passed as a tuple of items; MacOsJob converts it back to a dict.
+        extra_props=tuple({"lite_interpreter": True}.items()),
+    ),
+]
+
+
+def get_workflow_jobs():
+    """Return the rendered job tree of every entry in WORKFLOW_DATA, in order."""
+    trees = []
+    for job in WORKFLOW_DATA:
+        trees.append(job.gen_tree())
+    return trees
--- a/.circleci/cimodel/data/simple/mobile_definitions.py
+++ b/.circleci/cimodel/data/simple/mobile_definitions.py
@ -0,0 +1,51 @@
+"""
+PyTorch Mobile PR builds (use linux host toolchain + mobile build options)
+"""
+
+import cimodel.data.simple.util.branch_filters
+import cimodel.lib.miniutils as miniutils
+
+
+class MobileJob:
+    """A build-only mobile job that uses the "pytorch_linux_build" template."""
+
+    def __init__(
+        self, docker_image, docker_requires, variant_parts, is_master_only=False
+    ):
+        self.is_master_only = is_master_only
+        self.variant_parts = variant_parts
+        self.docker_requires = docker_requires
+        self.docker_image = docker_image
+
+    def gen_tree(self):
+        """Return a one-element list holding the job mapping."""
+        name_pieces = [
+            "pytorch",
+            "linux",
+            "xenial",
+            "py3",
+            "clang5",
+            "mobile",
+        ] + self.variant_parts
+
+        # Same pieces, two separators: "_" for the job name, "-" for the
+        # build environment.
+        job_name = "_".join(name_pieces)
+        env_name = "-".join(name_pieces)
+
+        props_dict = {}
+        props_dict["build_environment"] = env_name
+        props_dict["build_only"] = miniutils.quote(str(int(True)))
+        props_dict["docker_image"] = self.docker_image
+        props_dict["requires"] = self.docker_requires
+        props_dict["name"] = job_name
+
+        if self.is_master_only:
+            branch_filters = cimodel.data.simple.util.branch_filters
+            props_dict["filters"] = branch_filters.gen_filter_dict()
+
+        return [{"pytorch_linux_build": props_dict}]
+
+
+# No mobile jobs are currently defined.
+WORKFLOW_DATA = []
+
+
+def get_workflow_jobs():
+    """Return the rendered job tree of every entry in WORKFLOW_DATA, in order."""
+    trees = []
+    for job in WORKFLOW_DATA:
+        trees.append(job.gen_tree())
+    return trees
--- a/.circleci/cimodel/data/simple/nightly_ios.py
+++ b/.circleci/cimodel/data/simple/nightly_ios.py
@ -0,0 +1,96 @@
+import cimodel.data.simple.ios_definitions as ios_definitions
+import cimodel.lib.miniutils as miniutils
+
+
+class IOSNightlyJob:
+    """A nightly iOS binary job: either a build for one variant or an upload."""
+
+    def __init__(self, variant, is_full_jit=False, is_upload=False):
+        self.is_upload = is_upload
+        self.is_full_jit = is_full_jit
+        self.variant = variant
+
+    def get_phase_name(self):
+        """Return "upload" for upload jobs and "build" otherwise."""
+        if self.is_upload:
+            return "upload"
+        return "build"
+
+    def get_common_name_pieces(self, sep):
+        """
+        Return the name pieces shared by the job name and the build
+        environment; ``sep`` is forwarded to the Xcode version renderer.
+        """
+        pieces = ["ios"]
+        if self.is_full_jit:
+            pieces.append("full_jit")
+        pieces = pieces + ios_definitions.XCODE_VERSION.render_dots_or_parts(sep)
+        pieces = pieces + ["nightly", self.variant, "build"]
+        # Upload jobs carry the phase name as an extra suffix.
+        if self.is_upload:
+            pieces.append(self.get_phase_name())
+        return pieces
+
+    def gen_job_name(self):
+        """Return "pytorch" plus the common pieces, joined with "_"."""
+        return "_".join(["pytorch"] + self.get_common_name_pieces(None))
+
+    def gen_tree(self):
+        """Return a one-element list holding the "binary_ios_<phase>" job mapping."""
+        if self.is_full_jit:
+            build_configs = BUILD_CONFIGS_FULL_JIT
+        else:
+            build_configs = BUILD_CONFIGS
+
+        # An upload job waits for every build of the matching flavour.
+        extra_requires = []
+        if self.is_upload:
+            extra_requires = [cfg.gen_job_name() for cfg in build_configs]
+
+        env_pieces = ["libtorch"] + self.get_common_name_pieces(".")
+        props_dict = {
+            "build_environment": "-".join(env_pieces),
+            "requires": extra_requires,
+            "context": "org-member",
+            "filters": {"branches": {"only": "nightly"}},
+        }
+
+        if not self.is_upload:
+            props_dict.update(
+                {
+                    "ios_arch": self.variant,
+                    "ios_platform": ios_definitions.get_platform(self.variant),
+                    "name": self.gen_job_name(),
+                    "use_metal": miniutils.quote(str(int(True))),
+                    "use_coreml": miniutils.quote(str(int(True))),
+                }
+            )
+
+        if self.is_full_jit:
+            props_dict["lite_interpreter"] = miniutils.quote(str(int(False)))
+
+        template_name = "_".join(("binary", "ios", self.get_phase_name()))
+
+        return [{template_name: props_dict}]
+
+
+# Build jobs that the non-full-JIT upload job requires.
+BUILD_CONFIGS = [
+    IOSNightlyJob("x86_64"),
+    IOSNightlyJob("arm64"),
+]
+
+# Build jobs that the full-JIT upload job requires.
+BUILD_CONFIGS_FULL_JIT = [
+    IOSNightlyJob("x86_64", is_full_jit=True),
+    IOSNightlyJob("arm64", is_full_jit=True),
+]
+
+# All nightly iOS jobs: every build first, then the two upload jobs.
+WORKFLOW_DATA = (
+    BUILD_CONFIGS
+    + BUILD_CONFIGS_FULL_JIT
+    + [
+        IOSNightlyJob("binary", is_full_jit=False, is_upload=True),
+        IOSNightlyJob("binary", is_full_jit=True, is_upload=True),
+    ]
+)
+
+
+def get_workflow_jobs():
+    """Return the rendered job tree of every entry in WORKFLOW_DATA, in order."""
+    trees = []
+    for job in WORKFLOW_DATA:
+        trees.append(job.gen_tree())
+    return trees
--- a/test/dynamo_expected_failures/AotAutogradFallbackTests.test_aot_sequence_nr
+++ b/test/dynamo_expected_failures/AotAutogradFallbackTests.test_aot_sequence_nr
--- a/.circleci/cimodel/data/simple/util/branch_filters.py
+++ b/.circleci/cimodel/data/simple/util/branch_filters.py
@ -0,0 +1,36 @@
+# Default "only" branch list used by gen_filter_dict(). Entries wrapped in
+# slashes are regular-expression patterns.
+NON_PR_BRANCH_LIST = [
+    "main",
+    "master",
+    r"/ci-all\/.*/",
+    r"/release\/.*/",
+]
+
+# Branch patterns for pull requests ("gh/<...>/head" and "pull/<...>").
+PR_BRANCH_LIST = [
+    r"/gh\/.*\/head/",
+    r"/pull\/.*/",
+]
+
+# Pattern for release-candidate tags such as "v1.2.3-rc4".
+RC_PATTERN = r"/v[0-9]+(\.[0-9]+)*-rc[0-9]+/"
+
+# Default "ignore" branch list used by gen_filter_dict_exclude().
+MAC_IOS_EXCLUSION_LIST = ["nightly", "postnightly"]
+
+
+def gen_filter_dict(branches_list=NON_PR_BRANCH_LIST, tags_list=None):
+    """
+    Generates a filter dictionary for use with CircleCI's job filter
+
+    The result always has a "branches" -> "only" entry; a "tags" -> "only"
+    entry is added when ``tags_list`` is not None.
+    """
+    result = {"branches": {"only": branches_list}}
+    if tags_list is None:
+        return result
+
+    result["tags"] = {"only": tags_list}
+    return result
+
+
+def gen_filter_dict_exclude(branches_list=MAC_IOS_EXCLUSION_LIST):
+    """Return a filter dictionary that ignores the given branches."""
+    ignored = {"ignore": branches_list}
+    return {"branches": ignored}
--- a/.circleci/cimodel/data/simple/util/docker_constants.py
+++ b/.circleci/cimodel/data/simple/util/docker_constants.py
@ -0,0 +1,35 @@
+# Registry host that gen_docker_image() puts in front of every image path.
+AWS_DOCKER_HOST = "308535385114.dkr.ecr.us-east-1.amazonaws.com"
+
+
+def gen_docker_image(container_type):
+    """
+    Return ``(image path, requirement name)`` for a container type: the path
+    is "<host>/pytorch/<container_type>", the requirement "docker-<container_type>".
+    """
+    image_path = "/".join([AWS_DOCKER_HOST, "pytorch", container_type])
+    requirement = f"docker-{container_type}"
+    return image_path, requirement
+
+
+def gen_docker_image_requires(image_name):
+    """Return a one-element list with the "docker-<image_name>" requirement."""
+    requirement = "docker-{}".format(image_name)
+    return [requirement]
+
+
+# Each pair below is (image path, requirement name) as returned by
+# gen_docker_image() for the given container type.
+DOCKER_IMAGE_BASIC, DOCKER_REQUIREMENT_BASE = gen_docker_image(
+    "pytorch-linux-xenial-py3.7-gcc5.4"
+)
+
+DOCKER_IMAGE_CUDA_10_2, DOCKER_REQUIREMENT_CUDA_10_2 = gen_docker_image(
+    "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
+)
+
+DOCKER_IMAGE_GCC7, DOCKER_REQUIREMENT_GCC7 = gen_docker_image(
+    "pytorch-linux-xenial-py3.7-gcc7"
+)
+
+
+def gen_mobile_docker(specifier):
+    """
+    Return ``gen_docker_image`` for the container type
+    "pytorch-linux-xenial-py3-clang5-<specifier>".
+    """
+    return gen_docker_image("pytorch-linux-xenial-py3-clang5-" + specifier)
+
+
+# (image path, requirement name) pairs built through gen_mobile_docker().
+DOCKER_IMAGE_ASAN, DOCKER_REQUIREMENT_ASAN = gen_mobile_docker("asan")
+
+DOCKER_IMAGE_NDK, DOCKER_REQUIREMENT_NDK = gen_mobile_docker("android-ndk-r21e")
--- a/.circleci/cimodel/data/simple/util/versions.py
+++ b/.circleci/cimodel/data/simple/util/versions.py
@ -0,0 +1,36 @@
+from typing import Optional
+
+
+class MultiPartVersion:
+    """A version made of ordered parts, with an optional prefix for the first part."""
+
+    def __init__(self, parts, prefix=""):
+        self.prefix = prefix
+        self.parts = parts
+
+    def prefixed_parts(self):
+        """
+        Return the parts as strings, the first one carrying the prefix.
+        Without any parts, the result is just the prefix.
+        """
+        if not self.parts:
+            return [self.prefix]
+
+        first, rest = self.parts[0], self.parts[1:]
+        return [self.prefix + str(first)] + [str(part) for part in rest]
+
+    def render_dots_or_parts(self, sep: Optional[str] = None):
+        """
+        Return the prefixed parts as a list, or -- when ``sep`` is given --
+        a one-element list with the parts joined by ``sep``.
+        """
+        pieces = self.prefixed_parts()
+        if sep is not None:
+            return [sep.join(pieces)]
+        return pieces
+
+
+class CudaVersion(MultiPartVersion):
+    """A two-part (major, minor) version that uses the "cuda" prefix."""
+
+    def __init__(self, major, minor):
+        self.major, self.minor = major, minor
+
+        super().__init__([major, minor], "cuda")
+
+    def __str__(self):
+        # "<major>.<minor>", without the prefix.
+        return "{}.{}".format(self.major, self.minor)
--- a/test/dynamo_expected_failures/AutogradFunctionTests.test_graph_break_if_lifted_free_variable
+++ b/test/dynamo_expected_failures/AutogradFunctionTests.test_graph_break_if_lifted_free_variable
--- a/.circleci/cimodel/lib/conf_tree.py
+++ b/.circleci/cimodel/lib/conf_tree.py
@ -0,0 +1,111 @@
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+
+def X(val):
+    """
+    Compact way to write a leaf node: the value paired with an empty
+    list of children
+    """
+    no_children = []
+    return (val, no_children)
+
+
+def XImportant(name):
+    """Compact way to write an important (run on PRs) leaf node"""
+    important_child = ("important", [X(True)])
+    return (name, [important_child])
+
+
+@dataclass
+class Ver:
+    """
+    A product name paired with a version string (empty by default)
+    """
+
+    name: str
+    version: str = ""
+
+    def __str__(self):
+        # Name immediately followed by the version, with no separator.
+        rendered = self.name + self.version
+        return rendered
+
+
+@dataclass
+class ConfigNode:
+    """A node of the configuration tree: a parent link, a name and a dict of props."""
+
+    parent: Optional["ConfigNode"]
+    node_name: str
+    props: Dict[str, str] = field(default_factory=dict)
+
+    def get_label(self):
+        """Return the label of this node, which is its name."""
+        return self.node_name
+
+    # noinspection PyMethodMayBeStatic
+    def get_children(self):
+        """Return the children of this node; the base node has none."""
+        return []
+
+    def get_parents(self):
+        """Return the labels of all ancestors, outermost first."""
+        if not self.parent:
+            return []
+        ancestors = self.parent.get_parents()
+        return ancestors + [self.parent.get_label()]
+
+    def get_depth(self):
+        """Return the number of ancestors."""
+        return len(self.get_parents())
+
+    def get_node_key(self):
+        """Return the ancestor labels and this node's label joined with "%"."""
+        path = self.get_parents() + [self.get_label()]
+        return "%".join(path)
+
+    def find_prop(self, propname, searched=None):
+        """
+        Look the property up on this node and, failing that, on the
+        ancestors (nearest first). Returns None when no node has it.
+        The names of the visited nodes are appended to ``searched``.
+        """
+        searched = [] if searched is None else searched
+        searched.append(self.node_name)
+
+        if propname in self.props:
+            return self.props[propname]
+
+        if self.parent:
+            return self.parent.find_prop(propname, searched)
+
+        # raise Exception('Property "%s" does not exist anywhere in the tree! Searched: %s' % (propname, searched))
+        return None
+
+
+def dfs_recurse(
+    node,
+    leaf_callback=lambda x: None,
+    discovery_callback=lambda x, y, z: None,
+    child_callback=lambda x, y: None,
+    sibling_index=0,
+    sibling_count=1,
+):
+    """
+    Depth-first walk starting at ``node``.
+
+    ``discovery_callback(node, sibling_index, sibling_count)`` is called for
+    every node on arrival, ``child_callback(parent, child)`` right before
+    descending into each child, and ``leaf_callback(node)`` for every node
+    whose ``get_children()`` result is empty.
+    """
+    discovery_callback(node, sibling_index, sibling_count)
+
+    children = node.get_children()
+    if not children:
+        leaf_callback(node)
+        return
+
+    for position, child in enumerate(children):
+        child_callback(node, child)
+        dfs_recurse(
+            child,
+            leaf_callback=leaf_callback,
+            discovery_callback=discovery_callback,
+            child_callback=child_callback,
+            sibling_index=position,
+            sibling_count=len(children),
+        )
+
+
+def dfs(toplevel_config_node):
+    """Return the leaf nodes reachable from the given node, in depth-first order."""
+    leaves = []
+    dfs_recurse(toplevel_config_node, leaves.append)
+    return leaves
--- a/.circleci/cimodel/lib/miniutils.py
+++ b/.circleci/cimodel/lib/miniutils.py
@ -0,0 +1,10 @@
+def quote(s):
+    """Return ``s`` wrapped in a pair of double-quote characters."""
+    return '"' + s + '"'
+
+
+def sandwich(bread, jam):
+    """Return ``jam`` with ``bread`` concatenated on both sides (any operands supporting ``+``)."""
+    top_and_filling = bread + jam
+    return top_and_filling + bread
+
+
+def override(word, substitutions):
+    """Return the replacement registered for ``word`` in ``substitutions``, or ``word`` itself if none."""
+    if word in substitutions:
+        return substitutions[word]
+    return word
--- a/.circleci/cimodel/lib/miniyaml.py
+++ b/.circleci/cimodel/lib/miniyaml.py
@ -0,0 +1,51 @@
+from collections import OrderedDict
+
+import cimodel.lib.miniutils as miniutils
+
+
+LIST_MARKER = "- "
+INDENTATION_WIDTH = 2
+
+
+def is_dict(data):
+    """Return True only when ``data`` is exactly a dict or an OrderedDict."""
+    return type(data) in [dict, OrderedDict]
+
+
+def is_collection(data):
+    """Return True when ``data`` is a dict/OrderedDict or exactly a list."""
+    return is_dict(data) or type(data) is list
+
+
+def render(fh, data, depth, is_list_member=False):
+    """
+    PyYaml does not allow precise control over the quoting
+    behavior, especially for merge references.
+    Therefore, we use this custom YAML renderer.
+
+    Writes ``data`` to the file-like object ``fh``. ``depth`` is the nesting
+    level (each level is INDENTATION_WIDTH spaces). ``is_list_member`` marks
+    ``data`` as an element of a list, so it must be introduced by LIST_MARKER.
+
+    Plain dicts are emitted with sorted keys; OrderedDicts keep their own
+    order. Dict entries whose value is falsy are skipped entirely.
+    """
+
+    indentation = " " * INDENTATION_WIDTH * depth
+
+    if is_dict(data):
+        tuples = list(data.items())
+        if type(data) is not OrderedDict:
+            tuples.sort()
+
+        # The list marker belongs on the first key that is actually written,
+        # which is not necessarily the first key of the dict: entries with
+        # falsy values are skipped below.
+        marker_pending = is_list_member
+        for k, v in tuples:
+            if not v:
+                continue
+
+            if marker_pending:
+                key_prefix = LIST_MARKER
+                marker_pending = False
+            elif is_list_member:
+                # Later keys of the same list member line up under the first
+                # key, i.e. past the width of the list marker.
+                key_prefix = " " * len(LIST_MARKER)
+            else:
+                key_prefix = ""
+
+            trailing_whitespace = "\n" if is_collection(v) else " "
+            fh.write(indentation + key_prefix + k + ":" + trailing_whitespace)
+
+            render(fh, v, depth + 1 + int(is_list_member))
+
+    elif type(data) is list:
+        for v in data:
+            render(fh, v, depth, True)
+
+    else:
+        # use empty quotes to denote an empty string value instead of blank space
+        modified_data = miniutils.quote(data) if data == "" else data
+        list_member_prefix = indentation + LIST_MARKER if is_list_member else ""
+        fh.write(list_member_prefix + str(modified_data) + "\n")
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
--- a/.circleci/ensure-consistency.py
+++ b/.circleci/ensure-consistency.py
@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+
+import os
+import subprocess
+import sys
+import tempfile
+
+import generate_config_yml
+
+
+# File name of the committed config that the freshly generated output is compared against.
+CHECKED_IN_FILE = "config.yml"
+# Script name quoted in the error message as the way to regenerate the config.
+REGENERATION_SCRIPT = "regenerate.sh"
+
+# Name (last path component only) of the directory that contains this script.
+PARENT_DIR = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
+README_PATH = os.path.join(PARENT_DIR, "README.md")
+
+# Placeholders, in order: checked-in file, regeneration script, parent directory, README path.
+ERROR_MESSAGE_TEMPLATE = """
+The checked-in CircleCI "%s" file does not match what was generated by the scripts.
+Please re-run the "%s" script in the "%s" directory and commit the result. See "%s" for more information.
+"""
+
+
+def check_consistency():
+    """
+    Regenerate the config into a temporary file and compare it byte-for-byte
+    (via ``cmp``) with the checked-in file.
+
+    Exits the process with an explanatory message when the two differ. The
+    temporary file is always removed, including when generation itself fails.
+    """
+    fd, temp_filename = tempfile.mkstemp("-generated-config.yml")
+
+    try:
+        # Wrap the descriptor returned by mkstemp instead of reopening the
+        # path, so the descriptor is closed rather than leaked.
+        with os.fdopen(fd, "w") as fh:
+            generate_config_yml.stitch_sources(fh)
+
+        subprocess.check_call(["cmp", temp_filename, CHECKED_IN_FILE])
+    except subprocess.CalledProcessError:
+        sys.exit(
+            ERROR_MESSAGE_TEMPLATE
+            % (CHECKED_IN_FILE, REGENERATION_SCRIPT, PARENT_DIR, README_PATH)
+        )
+    finally:
+        os.remove(temp_filename)
+
+
+if __name__ == "__main__":
+    check_consistency()
--- a/.circleci/generate_config_yml.py
+++ b/.circleci/generate_config_yml.py
@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+
+"""
+This script is the source of truth for config.yml.
+Please see README.md in this directory for details.
+"""
+
+import os
+import shutil
+import sys
+from collections import namedtuple
+
+import cimodel.data.simple.docker_definitions
+import cimodel.data.simple.mobile_definitions
+import cimodel.data.simple.nightly_ios
+import cimodel.lib.miniutils as miniutils
+import cimodel.lib.miniyaml as miniyaml
+
+
+class File:
+    """
+    A config.yml section whose text is copied unchanged from a file
+    located under the "verbatim-sources" directory.
+    """
+
+    def __init__(self, filename):
+        self.filename = filename
+
+    def write(self, output_filehandle):
+        source_path = os.path.join("verbatim-sources", self.filename)
+        with open(source_path) as source:
+            shutil.copyfileobj(source, output_filehandle)
+
+
+class FunctionGen(namedtuple("FunctionGen", ["function", "depth"])):
+    # Pairs a callable (invoked with no arguments by the subclasses' write())
+    # with the depth handed to the YAML renderer.
+    __slots__ = ()
+
+
+class Treegen(FunctionGen):
+    """
+    A config.yml section produced by rendering the YAML tree
+    returned by ``function``.
+    """
+
+    def write(self, output_filehandle):
+        tree = self.function()
+        miniyaml.render(output_filehandle, tree, self.depth)
+
+
+class Listgen(FunctionGen):
+    """
+    A config.yml section produced by rendering the YAML list
+    returned by ``function``.
+    """
+
+    def write(self, output_filehandle):
+        entries = self.function()
+        miniyaml.render(output_filehandle, entries, self.depth)
+
+
+def horizontal_rule():
+    """Return a separator line made of 78 "#" characters."""
+    return "#" * 78
+
+
+class Header:
+    """
+    A comment banner: the title and optional summary lines, each prefixed
+    with "# ", enclosed between two horizontal rules.
+    """
+
+    def __init__(self, title, summary=None):
+        self.title = title
+        self.summary_lines = summary or []
+
+    def write(self, output_filehandle):
+        rule = [horizontal_rule()]
+        commented = ["# " + text for text in [self.title] + self.summary_lines]
+
+        for line in rule + commented + rule:
+            # Empty lines are dropped rather than written.
+            if line:
+                output_filehandle.write(line + "\n")
+
+
+def _for_all_items(items, functor) -> None:
+    """
+    Call ``functor(item_type, item)`` for every single-key dict found in
+    ``items``, descending into nested lists. Anything else is ignored.
+    """
+    if isinstance(items, list):
+        for item in items:
+            _for_all_items(item, functor)
+    if isinstance(items, dict) and len(items) == 1:
+        item_type, item = next(iter(items.items()))
+        functor(item_type, item)
+
+
+def filter_master_only_jobs(items):
+    """
+    Return the subset of ``items`` that should run on main/master.
+
+    A job is kept when its branch filter's "only" value mentions "main" or
+    "master", or when a kept job requires it, directly or transitively.
+    Kept jobs are returned as copies with their "filters" entry removed.
+    Job names are compared with surrounding double quotes stripped.
+    """
+
+    def _unquote(name):
+        return name.strip('"') if name is not None else None
+
+    def _is_main_or_master_item(item):
+        filters = item.get("filters", None)
+        branches = filters.get("branches", None) if filters is not None else None
+        branches_only = branches.get("only", None) if branches is not None else None
+        return (
+            ("main" in branches_only or "master" in branches_only)
+            if branches_only is not None
+            else False
+        )
+
+    master_deps = set()
+
+    def _save_requires_if_master(item_type, item):
+        requires = item.get("requires", None)
+        if not isinstance(requires, list):
+            return
+        # master_deps holds unquoted names, so the job's own name has to be
+        # unquoted before the membership test (as _do_filtering does).
+        item_name = _unquote(item.get("name", None))
+        if _is_main_or_master_item(item) or item_name in master_deps:
+            master_deps.update(n.strip('"') for n in requires)
+
+    def _do_filtering(items):
+        if isinstance(items, list):
+            rc = [_do_filtering(item) for item in items]
+            # Drop filtered-out jobs (None) and nested lists that became empty.
+            return [item for item in rc if item]
+        assert isinstance(items, dict) and len(items) == 1
+        item_type, item = next(iter(items.items()))
+        item_name = _unquote(item.get("name", None))
+        if not _is_main_or_master_item(item) and item_name not in master_deps:
+            return None
+        if "filters" in item:
+            item = item.copy()
+            item.pop("filters")
+        return {item_type: item}
+
+    # Rescan until no new dependency is discovered, so that chains of
+    # required jobs of any depth are picked up regardless of listing order.
+    while True:
+        known = len(master_deps)
+        _for_all_items(items, _save_requires_if_master)
+        if len(master_deps) == known:
+            break
+    return _do_filtering(items)
+
+
+def generate_required_docker_images(items):
+    """
+    Collect, as a set, every "requires" entry across ``items`` that starts
+    with "docker-" once double quotes have been removed from it.
+    """
+    required_docker_images = set()
+
+    def _requires_docker_image(item_type, item):
+        requires = item.get("requires", None)
+        if not isinstance(requires, list):
+            return
+        unquoted = (entry.replace('"', "") for entry in requires)
+        required_docker_images.update(
+            name for name in unquoted if name.startswith("docker-")
+        )
+
+    _for_all_items(items, _requires_docker_image)
+    return required_docker_images
+
+
+def gen_build_workflows_tree():
+    """
+    Build the "workflows" tree: a "build" workflow holding every job, plus a
+    "master_build" workflow when any job survives the main/master filter.
+    """
+    job_sources = (
+        cimodel.data.simple.mobile_definitions.get_workflow_jobs,
+        cimodel.data.simple.nightly_ios.get_workflow_jobs,
+    )
+    build_jobs = [source() for source in job_sources]
+
+    # sort for consistency
+    docker_image_names = sorted(generate_required_docker_images(build_jobs))
+    docker_jobs = cimodel.data.simple.docker_definitions.get_workflow_jobs(
+        docker_image_names
+    )
+    build_jobs.extend(docker_jobs)
+
+    master_build_jobs = filter_master_only_jobs(build_jobs)
+
+    workflows = {
+        "build": {
+            "when": r"<< pipeline.parameters.run_build >>",
+            "jobs": build_jobs,
+        },
+    }
+    if master_build_jobs:
+        workflows["master_build"] = {
+            "when": r"<< pipeline.parameters.run_master_build >>",
+            "jobs": master_build_jobs,
+        }
+    return {"workflows": workflows}
+
+
+# Order of this list matters to the generated config.yml.
+# Each entry exposes write(output_filehandle). File entries copy a file from
+# the "verbatim-sources" directory, Header entries emit a comment banner, and
+# the Treegen entry renders the generated workflows tree at depth 0.
+YAML_SOURCES = [
+    File("header-section.yml"),
+    File("commands.yml"),
+    File("nightly-binary-build-defaults.yml"),
+    Header("Build parameters"),
+    File("build-parameters/pytorch-build-params.yml"),
+    File("build-parameters/binary-build-params.yml"),
+    Header("Job specs"),
+    File("job-specs/binary-job-specs.yml"),
+    File("job-specs/job-specs-custom.yml"),
+    File("job-specs/binary_update_htmls.yml"),
+    File("job-specs/binary-build-tests.yml"),
+    File("job-specs/docker_jobs.yml"),
+    Header("Workflows"),
+    Treegen(gen_build_workflows_tree, 0),
+]
+
+
+def stitch_sources(output_filehandle):
+    """Write every section of YAML_SOURCES, in list order, to ``output_filehandle``."""
+    for section in YAML_SOURCES:
+        section.write(output_filehandle)
+
+
+if __name__ == "__main__":
+    stitch_sources(sys.stdout)
--- a/.circleci/regenerate.ps1
+++ b/.circleci/regenerate.ps1
@ -0,0 +1,5 @@
+# Regenerates config.yml from generate_config_yml.py, normalizing line endings to LF.
+cd $PSScriptRoot;
+$NewFile = New-TemporaryFile;
+# Use FullName (the absolute path of the temporary file). The bare Name would
+# resolve against the current directory, leaving the real temporary file
+# unused and never deleted.
+python generate_config_yml.py > $NewFile.FullName
+(Get-Content $NewFile.FullName -Raw).TrimEnd().Replace("`r`n","`n") | Set-Content config.yml -Force
+Remove-Item $NewFile.FullName
--- a/.circleci/regenerate.sh
+++ b/.circleci/regenerate.sh
@ -0,0 +1,17 @@
+#!/bin/bash -e
+
+# Regenerates config.yml by running generate_config_yml.py, first backing up
+# the current config.yml if git reports it as changed.
+
+# Allows this script to be invoked from any directory:
+cd "$(dirname "$0")"
+
+# Count the `git status -s` lines that mention " config.yml"; xargs trims the
+# whitespace that wc may put around the number.
+UNCOMMIT_CHANGE=$(git status -s | grep " config.yml" | wc -l | xargs)
+if [[ $UNCOMMIT_CHANGE != 0 ]]; then
+    OLD_FILE=$(mktemp)
+    cp config.yml "$OLD_FILE"
+    echo "Uncommitted change detected in .circleci/config.yml"
+    echo "It has been backed up to $OLD_FILE"
+fi
+
+# Generate into a temporary file first: with -e, a failing generator aborts
+# the script before config.yml is overwritten.
+# NOTE(review): $NEW_FILE is not removed afterwards.
+NEW_FILE=$(mktemp)
+./generate_config_yml.py > "$NEW_FILE"
+cp "$NEW_FILE" config.yml
+echo "New config generated in .circleci/config.yml"
--- a/.circleci/scripts/binary_checkout.sh
+++ b/.circleci/scripts/binary_checkout.sh
@ -0,0 +1,69 @@
+#!/bin/bash
+set -eux -o pipefail
+
+# Run the given command, retrying up to four more times with increasing
+# pauses (1, 2, 4, 8 seconds) until it succeeds.
+# "$@" keeps every argument intact as its own word, so arguments containing
+# whitespace (e.g. a quoted path) are not re-split on each attempt.
+retry () {
+    "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@") || (sleep 4 && "$@") || (sleep 8 && "$@")
+}
+
+
+# This step runs on multiple executors with different envfile locations
+if [[ "$(uname)" == Darwin ]]; then
+  # macos executor (builds and tests)
+  workdir="/Users/distiller/project"
+elif [[ "$OSTYPE" == "msys" ]]; then
+  # windows executor (builds and tests)
+  # /c/w is recreated as a symlink to the project directory to get a shorter path
+  rm -rf /c/w
+  ln -s "/c/Users/circleci/project" /c/w
+  workdir="/c/w"
+elif [[ -d "/home/circleci/project" ]]; then
+  # machine executor (binary tests)
+  workdir="/home/circleci/project"
+else
+  # docker executor (binary builds)
+  workdir="/"
+fi
+
+# It is very important that this stays in sync with binary_populate_env.sh
+if [[ "$OSTYPE" == "msys" ]]; then
+  # We need to make the paths as short as possible on Windows
+  export PYTORCH_ROOT="$workdir/p"
+  export BUILDER_ROOT="$workdir/b"
+else
+  export PYTORCH_ROOT="$workdir/pytorch"
+  export BUILDER_ROOT="$workdir/builder"
+fi
+
+# Try to extract PR number from branch if not already set
+# (the sed expression prints the digits following "pull/", or nothing when
+# the branch name does not contain that pattern)
+# NOTE(review): CIRCLE_BRANCH is expanded without a default, so under `set -u`
+# this aborts if it is unset -- confirm it is always provided.
+if [[ -z "${CIRCLE_PR_NUMBER:-}" ]]; then
+  CIRCLE_PR_NUMBER="$(echo ${CIRCLE_BRANCH} | sed -E -n 's/pull\/([0-9]*).*/\1/p')"
+fi
+
+# Clone the Pytorch branch
+retry git clone https://github.com/pytorch/pytorch.git "$PYTORCH_ROOT"
+pushd "$PYTORCH_ROOT"
+if [[ -n "${CIRCLE_PR_NUMBER:-}" ]]; then
+  # "smoke" binary build on PRs
+  # Fetch the PR head, then point a local branch named $CIRCLE_BRANCH at $CIRCLE_SHA1
+  git fetch --force origin "pull/${CIRCLE_PR_NUMBER}/head:remotes/origin/pull/${CIRCLE_PR_NUMBER}"
+  git reset --hard "$CIRCLE_SHA1"
+  git checkout -q -B "$CIRCLE_BRANCH"
+  git reset --hard "$CIRCLE_SHA1"
+elif [[ -n "${CIRCLE_SHA1:-}" ]]; then
+  # Scheduled workflows & "smoke" binary build on trunk on PR merges
+  # The default branch name is read from the "HEAD branch" line of `git remote show`
+  DEFAULT_BRANCH="$(git remote show $CIRCLE_REPOSITORY_URL | awk '/HEAD branch/ {print $NF}')"
+  git reset --hard "$CIRCLE_SHA1"
+  git checkout -q -B $DEFAULT_BRANCH
+else
+  echo "Can't tell what to checkout"
+  exit 1
+fi
+retry git submodule update --init --recursive
+echo "Using Pytorch from "
+git --no-pager log --max-count 1
+popd
+
+# Clone the Builder main repo
+retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT"
+pushd "$BUILDER_ROOT"
+echo "Using builder from "
+git --no-pager log --max-count 1
+popd
--- a/.circleci/scripts/binary_install_miniconda.sh
+++ b/.circleci/scripts/binary_install_miniconda.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Downloads the Miniconda installer and installs it, in batch mode, into
+# $MINICONDA_ROOT.
+
+set -eux -o pipefail
+
+# This step runs on multiple executors with different envfile locations
+if [[ "$(uname)" == Darwin ]]; then
+  envfile="/Users/distiller/project/env"
+elif [[ -d "/home/circleci/project" ]]; then
+  # machine executor (binary tests)
+  envfile="/home/circleci/project/env"
+else
+  # docker executor (binary builds)
+  envfile="/env"
+fi
+
+# TODO this is super hacky and ugly. Basically, the binary_update_html job does
+# not have an env file, since it does not call binary_populate_env.sh, since it
+# does not have a BUILD_ENVIRONMENT. So for this one case, which we detect by a
+# lack of an env file, we manually export the environment variables that we
+# need to install miniconda
+if [[ ! -f "$envfile" ]]; then
+  MINICONDA_ROOT="/home/circleci/project/miniconda"
+  workdir="/home/circleci/project"
+  # NOTE(review): the unquoted $* re-splits arguments that contain whitespace.
+  retry () {
+      $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+  }
+  export -f retry
+else
+  # Presumably defines workdir and MINICONDA_ROOT, which are used below -- verify
+  # against binary_populate_env.sh.
+  source "$envfile"
+fi
+
+conda_sh="$workdir/install_miniconda.sh"
+# The macOS installer is pinned to a specific release; the Linux one tracks "latest".
+if [[ "$(uname)" == Darwin ]]; then
+  curl --retry 3 --retry-all-errors -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-MacOSX-x86_64.sh
+else
+  curl --retry 3 --retry-all-errors -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+fi
+chmod +x "$conda_sh"
+"$conda_sh" -b -p "$MINICONDA_ROOT"
+rm -f "$conda_sh"
+
+# We can't actually add miniconda to the PATH in the envfile, because that
+# breaks 'unbuffer' in Mac jobs. This is probably because conda comes with
+# a tclsh, which then gets inserted before the tclsh needed in /usr/bin
--- a/.circleci/scripts/binary_linux_test.sh
+++ b/.circleci/scripts/binary_linux_test.sh
@ -96,13 +96,8 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
    conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
  )
 elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
-  if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
-    pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
-    retry pip install -q numpy protobuf typing-extensions
-  else
-    pip install "\$pkg"
-    retry pip install -q numpy protobuf typing-extensions
-  fi
+  pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
+  retry pip install -q numpy protobuf typing-extensions
 fi
 if [[ "$PACKAGE_TYPE" == libtorch ]]; then
  pkg="\$(ls /final_pkgs/*-latest.zip)"
--- a/.circleci/scripts/binary_macos_build.sh
+++ b/.circleci/scripts/binary_macos_build.sh
@ -4,6 +4,10 @@ set -eux -o pipefail
 source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
 mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"

+if [[ -z "${GITHUB_ACTIONS:-}" ]]; then
+  export PATH="${workdir:-${HOME}}/miniconda/bin:${PATH}"
+fi
+
 # Build
 export USE_PYTORCH_METAL_EXPORT=1
 export USE_COREML_DELEGATE=1
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@ -3,9 +3,17 @@ set -eux -o pipefail
 export TZ=UTC

 tagged_version() {
-  GIT_DIR="${workdir}/pytorch/.git"
+  # Grabs version from either the env variable CIRCLE_TAG
+  # or the pytorch git described version
+  if [[ "$OSTYPE" == "msys" &&  -z "${GITHUB_ACTIONS:-}" ]]; then
+    GIT_DIR="${workdir}/p/.git"
+  else
+    GIT_DIR="${workdir}/pytorch/.git"
+  fi
  GIT_DESCRIBE="git --git-dir ${GIT_DIR} describe --tags --match v[0-9]*.[0-9]*.[0-9]*"
-  if [[ ! -d "${GIT_DIR}" ]]; then
+  if [[ -n "${CIRCLE_TAG:-}" ]]; then
+    echo "${CIRCLE_TAG}"
+  elif [[ ! -d "${GIT_DIR}" ]]; then
    echo "Abort, abort! Git dir ${GIT_DIR} does not exists!"
    kill $$
  elif ${GIT_DESCRIBE} --exact >/dev/null; then
@ -50,8 +58,8 @@ fi
 PIP_UPLOAD_FOLDER='nightly/'
 # We put this here so that OVERRIDE_PACKAGE_VERSION below can read from it
 export DATE="$(date -u +%Y%m%d)"
-BASE_BUILD_VERSION="$(cat ${PYTORCH_ROOT}/version.txt|cut -da -f1).dev${DATE}"
-
+#TODO: We should be pulling semver version from the base version.txt
+BASE_BUILD_VERSION="2.2.0.dev$DATE"
 # Change BASE_BUILD_VERSION to git tag when on a git tag
 # Use 'git -C' to make doubly sure we're in the correct directory for checking
 # the git tag
@ -71,35 +79,6 @@ fi

 export PYTORCH_BUILD_NUMBER=1

-# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
-TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
-
-# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
-if [[ "$PACKAGE_TYPE" =~ .*wheel.* &&  -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
-  # Only linux Python < 3.13 are supported wheels for triton
-  TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
-  TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
-  if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
-      TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
-      TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
-  fi
-  export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}"
-fi
-
-# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package
-if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then
-    TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}"
-    if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
-        TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt)
-        TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}"
-    fi
-    if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
-        export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
-    else
-        export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}"
-    fi
-fi
-
 JAVA_HOME=
 BUILD_JNI=OFF
 if [[ "$PACKAGE_TYPE" == libtorch ]]; then
@ -145,13 +124,12 @@ if [[ "${OSTYPE}" == "msys" ]]; then
 else
  export DESIRED_DEVTOOLSET="${DESIRED_DEVTOOLSET:-}"
 fi
-
+export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}"
 export DATE="$DATE"
 export NIGHTLIES_DATE_PREAMBLE=1.14.0.dev
 export PYTORCH_BUILD_VERSION="$PYTORCH_BUILD_VERSION"
 export PYTORCH_BUILD_NUMBER="$PYTORCH_BUILD_NUMBER"
 export OVERRIDE_PACKAGE_VERSION="$PYTORCH_BUILD_VERSION"
-export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}"

 # TODO: We don't need this anymore IIUC
 export TORCH_PACKAGE_NAME='torch'
@ -184,6 +162,28 @@ if [[ "$(uname)" != Darwin ]]; then
 EOL
 fi

+if [[ -z "${GITHUB_ACTIONS:-}" ]]; then
+  cat >>"$envfile" <<EOL
+  export workdir="$workdir"
+  export MAC_PACKAGE_WORK_DIR="$workdir"
+  if [[ "$OSTYPE" == "msys" ]]; then
+    export PYTORCH_ROOT="$workdir/p"
+    export BUILDER_ROOT="$workdir/b"
+  else
+    export PYTORCH_ROOT="$workdir/pytorch"
+    export BUILDER_ROOT="$workdir/builder"
+  fi
+  export MINICONDA_ROOT="$workdir/miniconda"
+  export PYTORCH_FINAL_PACKAGE_DIR="$workdir/final_pkgs"
+
+  export CIRCLE_TAG="${CIRCLE_TAG:-}"
+  export CIRCLE_SHA1="$CIRCLE_SHA1"
+  export CIRCLE_PR_NUMBER="${CIRCLE_PR_NUMBER:-}"
+  export CIRCLE_BRANCH="$CIRCLE_BRANCH"
+  export CIRCLE_WORKFLOW_ID="$CIRCLE_WORKFLOW_ID"
+EOL
+fi
+
 echo 'retry () {' >> "$envfile"
 echo '    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)' >> "$envfile"
 echo '}' >> "$envfile"
--- a/.circleci/scripts/binary_run_in_docker.sh
+++ b/.circleci/scripts/binary_run_in_docker.sh
@ -0,0 +1,29 @@
+#!/bin/bash
+
+# This section is used in the binary_test and smoke_test jobs. It expects
+# 'binary_populate_env' to have populated /home/circleci/project/env and it
+# expects another section to populate /home/circleci/project/ci_test_script.sh
+# with the code to run in the docker
+
+# Expect all needed environment variables to be written to this file
+source /home/circleci/project/env
+echo "Running the following code in Docker"
+cat /home/circleci/project/ci_test_script.sh
+echo
+echo
+# Strict mode and command tracing are enabled only from this point on.
+set -eux -o pipefail
+
+# Expect actual code to be written to this file
+chmod +x /home/circleci/project/ci_test_script.sh
+
+# Host directories exposed inside the container: the project directory, the
+# final packages, and the pytorch and builder checkouts.
+VOLUME_MOUNTS="-v /home/circleci/project/:/circleci_stuff -v /home/circleci/project/final_pkgs:/final_pkgs -v ${PYTORCH_ROOT}:/pytorch -v ${BUILDER_ROOT}:/builder"
+# Run the docker
+# (detached; `id` receives the container id printed by `docker run -d`; the
+# CUDA variant differs only by `--gpus all`)
+if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then
+  export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --gpus all ${VOLUME_MOUNTS} -t -d "${DOCKER_IMAGE}")
+else
+  export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined ${VOLUME_MOUNTS} -t -d "${DOCKER_IMAGE}")
+fi
+
+# Execute the test script that was populated by an earlier section
+# COMMAND is single-quoted, so "$id" is expanded only when command.sh runs;
+# that is why `id` is exported above.
+export COMMAND='((echo "source /circleci_stuff/env && /circleci_stuff/ci_test_script.sh") | docker exec -i "$id" bash) 2>&1'
+echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
--- a/.circleci/scripts/build_android_gradle.sh
+++ b/.circleci/scripts/build_android_gradle.sh
@ -22,7 +22,7 @@ done < <(find /var/lib/jenkins/.gradle -type f -print0)

 # Patch pocketfft (as Android does not have aligned_alloc even if compiled with c++17
 if [ -f ~/workspace/third_party/pocketfft/pocketfft_hdronly.h ]; then
-  sed -i -e "s/__cplusplus >= 201703L/0/" ~/workspace/third_party/pocketfft/pocketfft_hdronly.h
+  sed -i -e "s/#if __cplusplus >= 201703L/#if 0/" ~/workspace/third_party/pocketfft/pocketfft_hdronly.h
 fi

 export GRADLE_LOCAL_PROPERTIES=~/workspace/android/local.properties
@ -40,7 +40,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *-gradle-custom-build* ]]; then
  # Install torch & torchvision - used to download & dump used ops from test model.
  retry pip install torch torchvision --progress-bar off

-  exec "$(dirname "${BASH_SOURCE[0]}")/../android/build_test_app_custom.sh" armeabi-v7a
+  exec "$(dirname "${BASH_SOURCE[0]}")/../../android/build_test_app_custom.sh" armeabi-v7a
 fi

 # Run default build
--- a/.circleci/scripts/setup_ci_environment.sh
+++ b/.circleci/scripts/setup_ci_environment.sh
@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+# Exit on the first failing command or pipeline stage, and trace commands.
+set -ex -o pipefail
+
+# Remove unnecessary sources
+sudo rm -f /etc/apt/sources.list.d/google-chrome.list
+sudo rm -f /etc/apt/heroku.list
+sudo rm -f /etc/apt/openjdk-r-ubuntu-ppa-xenial.list
+sudo rm -f /etc/apt/partner.list
+
+# To increase the network reliability, let apt decide which mirror is best to use
+# (rewrites the archive URLs in sources.list into mirror:// entries ending in mirrors.txt)
+sudo sed -i -e 's/http:\/\/.*archive/mirror:\/\/mirrors/' -e 's/\/ubuntu\//\/mirrors.txt/' /etc/apt/sources.list
+
+# Run the given command, trying up to five times in total until it succeeds.
+# "$@" keeps every argument intact as its own word, so arguments containing
+# whitespace are not re-split on each attempt.
+retry () {
+  "$@"  || "$@" || "$@" || "$@" || "$@"
+}
+
+# Method adapted from here: https://askubuntu.com/questions/875213/apt-get-to-retry-downloading
+# (with use of tee to avoid permissions problems)
+# This is better than retrying the whole apt-get command
+echo "APT::Acquire::Retries \"3\";" | sudo tee /etc/apt/apt.conf.d/80-retries
+
+retry sudo apt-get update -qq
+retry sudo apt-get -y install \
+  moreutils \
+  expect-dev
+
+echo "== DOCKER VERSION =="
+docker version
+
+# Install the AWS CLI only when no `aws` command is already available
+if ! command -v aws >/dev/null; then
+  retry sudo pip3 -q install awscli==1.19.64
+fi
+
+if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then
+  DRIVER_FN="NVIDIA-Linux-x86_64-515.76.run"
+  wget "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
+  # On installer failure, dump its log and still fail the step
+  sudo /bin/bash "$DRIVER_FN" -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
+  nvidia-smi
+
+  # Taken directly from https://github.com/NVIDIA/nvidia-docker
+  # Add the package repositories
+  distribution=$(. /etc/os-release;echo "$ID$VERSION_ID")
+  curl -s -L --retry 3 --retry-all-errors https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
+  curl -s -L --retry 3 --retry-all-errors "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list" | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
+
+  retry sudo apt-get update -qq
+  # Necessary to get the `--gpus` flag to function within docker
+  retry sudo apt-get install -y nvidia-container-toolkit
+  sudo systemctl restart docker
+else
+  # Explicitly remove nvidia docker apt repositories if not building for cuda
+  sudo rm -rf /etc/apt/sources.list.d/nvidia-docker.list
+fi
+
+# Append a NAME=VALUE line to the env file ($BASH_ENV, or /tmp/env when that
+# is unset). Values containing a space are wrapped in single quotes.
+add_to_env_file() {
+  local name=$1
+  local value=$2
+  # BASH_ENV should be set by CircleCI
+  local env_file="${BASH_ENV:-/tmp/env}"
+  if [[ "$value" == *" "* ]]; then
+    echo "${name}='${value}'" >> "$env_file"
+  else
+    echo "${name}=${value}" >> "$env_file"
+  fi
+}
+
+add_to_env_file CI_MASTER "${CI_MASTER:-}"
+add_to_env_file COMMIT_SOURCE "${CIRCLE_BRANCH:-}"
+add_to_env_file BUILD_ENVIRONMENT "${BUILD_ENVIRONMENT}"
+add_to_env_file CIRCLE_PULL_REQUEST "${CIRCLE_PULL_REQUEST}"
+
+
+# Compiler-cache settings are only recorded for build environments whose name ends in "-build"
+if [[ "${BUILD_ENVIRONMENT}" == *-build ]]; then
+  add_to_env_file SCCACHE_BUCKET ossci-compiler-cache-circleci-v2
+
+  # MAX_JOBS is the smaller of (CPU count - 1) and the memory-driven cap below
+  SCCACHE_MAX_JOBS=$(( $(nproc) - 1 ))
+  MEMORY_LIMIT_MAX_JOBS=8  # the "large" resource class on CircleCI has 32 CPU cores, if we use all of them we'll OOM
+  MAX_JOBS=$(( ${SCCACHE_MAX_JOBS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${SCCACHE_MAX_JOBS} ))
+  add_to_env_file MAX_JOBS "${MAX_JOBS}"
+
+  if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then
+    add_to_env_file TORCH_CUDA_ARCH_LIST 5.2
+  fi
+
+  # Command tracing is switched off (set +x) around the credential lines so
+  # the secret values are not echoed into the job log.
+  if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then
+    # This IAM user allows write access to S3 bucket for sccache & bazels3cache
+    set +x
+    add_to_env_file XLA_CLANG_CACHE_S3_BUCKET_NAME "${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}"
+    add_to_env_file AWS_ACCESS_KEY_ID "${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}"
+    add_to_env_file AWS_SECRET_ACCESS_KEY "${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}"
+    set -x
+  else
+    # This IAM user allows write access to S3 bucket for sccache
+    set +x
+    add_to_env_file XLA_CLANG_CACHE_S3_BUCKET_NAME "${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}"
+    add_to_env_file AWS_ACCESS_KEY_ID "${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}"
+    add_to_env_file AWS_SECRET_ACCESS_KEY "${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}"
+    set -x
+  fi
+fi
+
+# This IAM user only allows read-write access to ECR
+# These credentials are exported into the current shell only (not written to
+# the env file) and are used for the docker login below.
+set +x
+export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V4:-}
+export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V4:-}
+# Takes the 4th double-quote-delimited field of the output line containing "Account"
+export AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
+export AWS_REGION=us-east-1
+aws ecr get-login-password --region $AWS_REGION|docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com
+set -x
--- a/.circleci/scripts/setup_linux_system_environment.sh
+++ b/.circleci/scripts/setup_linux_system_environment.sh
@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# Stops background apt activity (apt-daily, unattended-upgrades) and warns if
+# any apt/dpkg process is still left running afterwards.
+set -eux -o pipefail
+
+# Set up CircleCI GPG keys for apt, if needed
+curl --retry 3 --retry-all-errors -s -L https://packagecloud.io/circleci/trusty/gpgkey | sudo apt-key add -
+
+# Stop background apt updates.  Hypothetically, the kill should not
+# be necessary, because stop is supposed to send a kill signal to
+# the process, but we've added it for good luck.  Also
+# hypothetically, it's supposed to be unnecessary to wait for
+# the process to block.  We also have that line for good luck.
+# If you like, try deleting them and seeing if it works.
+# (each command is followed by `|| true` so a failure does not abort the
+# script under `set -e`)
+sudo systemctl stop apt-daily.service || true
+sudo systemctl kill --kill-who=all apt-daily.service || true
+
+sudo systemctl stop unattended-upgrades.service || true
+sudo systemctl kill --kill-who=all unattended-upgrades.service || true
+
+# wait until `apt-get update` has been killed
+while systemctl is-active --quiet apt-daily.service
+do
+    sleep 1;
+done
+while systemctl is-active --quiet unattended-upgrades.service
+do
+    sleep 1;
+done
+
+# See if we actually were successful
+systemctl list-units --all | cat
+
+# For good luck, try even harder to kill apt-get
+sudo pkill apt-get || true
+
+# For even better luck, purge unattended-upgrades
+sudo apt-get purge -y unattended-upgrades || true
+
+cat /etc/apt/sources.list
+
+# For the bestest luck, kill again now
+sudo pkill apt || true
+sudo pkill dpkg || true
+
+# Try to detect if apt/dpkg is stuck
+# (the bracketed first letter keeps grep from matching its own process entry)
+if ps auxfww | grep '[a]pt'; then
+  echo "WARNING: There are leftover apt processes; subsequent apt update will likely fail"
+fi
+if ps auxfww | grep '[d]pkg'; then
+  echo "WARNING: There are leftover dpkg processes; subsequent apt update will likely fail"
+fi
--- a/.circleci/verbatim-sources/build-parameters/binary-build-params.yml
+++ b/.circleci/verbatim-sources/build-parameters/binary-build-params.yml
@ -0,0 +1,65 @@
+# Reusable (anchored) settings: declares string parameters and maps them onto
+# environment variables, the resource class, and the docker image to run in.
+# NOTE(review): which jobs merge this anchor is not visible here.
+binary_linux_build_params: &binary_linux_build_params
+  parameters:
+    build_environment:
+      type: string
+      default: ""
+    docker_image:
+      type: string
+      default: ""
+    libtorch_variant:
+      type: string
+      default: ""
+    resource_class:
+      type: string
+      default: "2xlarge+"
+  environment:
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+    LIBTORCH_VARIANT: << parameters.libtorch_variant >>
+    ANACONDA_USER: pytorch
+  resource_class: << parameters.resource_class >>
+  docker:
+    - image: << parameters.docker_image >>
+
+# Reusable (anchored) settings: unlike binary_linux_build_params, the docker
+# image is only exported as DOCKER_IMAGE rather than used as the executor
+# image, and a use_cuda_docker_runtime parameter is added.
+binary_linux_test_upload_params: &binary_linux_test_upload_params
+  parameters:
+    build_environment:
+      type: string
+      default: ""
+    docker_image:
+      type: string
+      default: ""
+    libtorch_variant:
+      type: string
+      default: ""
+    resource_class:
+      type: string
+      default: "medium"
+    use_cuda_docker_runtime:
+      type: string
+      default: ""
+  environment:
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+    DOCKER_IMAGE: << parameters.docker_image >>
+    USE_CUDA_DOCKER_RUNTIME: << parameters.use_cuda_docker_runtime >>
+    LIBTORCH_VARIANT: << parameters.libtorch_variant >>
+  resource_class: << parameters.resource_class >>
+
+# Reusable (anchored) settings: a single build_environment parameter exported
+# as BUILD_ENVIRONMENT.
+binary_mac_params: &binary_mac_params
+  parameters:
+    build_environment:
+      type: string
+      default: ""
+  environment:
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+
+# Reusable (anchored) settings: exports the build environment and the name of
+# the executor (JOB_EXECUTOR) as environment variables.
+binary_windows_params: &binary_windows_params
+  parameters:
+    build_environment:
+      type: string
+      default: ""
+    executor:
+      type: string
+      default: "windows-xlarge-cpu-with-nvidia-cuda"
+  environment:
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+    JOB_EXECUTOR: <<parameters.executor>>
--- a/.circleci/verbatim-sources/build-parameters/pytorch-build-params.yml
+++ b/.circleci/verbatim-sources/build-parameters/pytorch-build-params.yml
@ -0,0 +1,105 @@
+# Reusable parameter/environment bundle for PyTorch build and test jobs.
+pytorch_params: &pytorch_params
+  parameters:
+    resource_class:
+      default: 'large'
+      type: string
+    build_environment:
+      default: ''
+      type: string
+    docker_image:
+      default: ''
+      type: string
+    use_cuda_docker_runtime:
+      default: ''
+      type: string
+    build_only:
+      default: ''
+      type: string
+    ci_master:
+      default: ''
+      type: string
+  resource_class: <<parameters.resource_class>>
+  environment:
+    BUILD_ENVIRONMENT: <<parameters.build_environment>>
+    BUILD_ONLY: <<parameters.build_only>>
+    DOCKER_IMAGE: <<parameters.docker_image>>
+    USE_CUDA_DOCKER_RUNTIME: <<parameters.use_cuda_docker_runtime>>
+    # NOTE: CI_MASTER is fed by the pipeline-level `run_master_build`
+    # parameter, not by the job-level `ci_master` parameter declared above.
+    CI_MASTER: <<pipeline.parameters.run_master_build>>
+
+# Reusable parameter/environment bundle for iOS jobs. Each parameter is
+# forwarded to the job under the environment variable listed below.
+pytorch_ios_params: &pytorch_ios_params
+  parameters:
+    build_environment:
+      default: ''
+      type: string
+    ios_arch:
+      default: ''
+      type: string
+    ios_platform:
+      default: ''
+      type: string
+    op_list:
+      default: ''
+      type: string
+    # The three switches below are strings, not booleans ('0' / '1').
+    use_metal:
+      default: '0'
+      type: string
+    lite_interpreter:
+      default: '1'
+      type: string
+    use_coreml:
+      default: '0'
+      type: string
+  environment:
+    BUILD_ENVIRONMENT: <<parameters.build_environment>>
+    IOS_ARCH: <<parameters.ios_arch>>
+    IOS_PLATFORM: <<parameters.ios_platform>>
+    SELECTED_OP_LIST: <<parameters.op_list>>
+    USE_PYTORCH_METAL: <<parameters.use_metal>>
+    BUILD_LITE_INTERPRETER: <<parameters.lite_interpreter>>
+    USE_COREML_DELEGATE: <<parameters.use_coreml>>
+
+# Reusable parameter/environment bundle for Windows PyTorch jobs.
+# Version-like defaults are kept as quoted strings so YAML does not
+# reinterpret them as numbers.
+pytorch_windows_params: &pytorch_windows_params
+  parameters:
+    executor:
+      default: 'windows-xlarge-cpu-with-nvidia-cuda'
+      type: string
+    build_environment:
+      default: ''
+      type: string
+    test_name:
+      default: ''
+      type: string
+    cuda_version:
+      default: '10.1'
+      type: string
+    python_version:
+      default: '3.8'
+      type: string
+    vs_version:
+      default: '16.8.6'
+      type: string
+    vc_version:
+      default: '14.16'
+      type: string
+    vc_year:
+      default: '2019'
+      type: string
+    vc_product:
+      default: 'BuildTools'
+      type: string
+    use_cuda:
+      default: ''
+      type: string
+  environment:
+    # Values forwarded from the parameters above.
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+    CUDA_VERSION: << parameters.cuda_version >>
+    PYTHON_VERSION: << parameters.python_version >>
+    VS_VERSION: << parameters.vs_version >>
+    VC_VERSION: << parameters.vc_version >>
+    VC_YEAR: << parameters.vc_year >>
+    VC_PRODUCT: << parameters.vc_product >>
+    USE_CUDA: << parameters.use_cuda >>
+    JOB_BASE_NAME: << parameters.test_name >>
+    JOB_EXECUTOR: << parameters.executor >>
+    # Fixed values.
+    SCCACHE_BUCKET: 'ossci-compiler-cache'
+    TORCH_CUDA_ARCH_LIST: '5.2 7.5'
--- a/.circleci/verbatim-sources/commands.yml
+++ b/.circleci/verbatim-sources/commands.yml
@ -0,0 +1,134 @@
+commands:
+
+  # Sets DOCKER_TAG to the git object hash of `.ci/docker` at HEAD and
+  # appends the assignment to $BASH_ENV.
+  calculate_docker_image_tag:
+    description: 'Calculates the docker image tag'
+    steps:
+      - run:
+          name: 'Calculate docker image hash'
+          command: |
+            DOCKER_TAG=$(git rev-parse HEAD:.ci/docker)
+            echo "DOCKER_TAG=${DOCKER_TAG}" >> "${BASH_ENV}"
+
+  # Exports UPLOAD_CHANNEL through $BASH_ENV: `test` when the build was
+  # triggered by a tag (CIRCLE_TAG is non-empty), `nightly` otherwise.
+  designate_upload_channel:
+    description: 'inserts the correct upload channel into ${BASH_ENV}'
+    steps:
+      - run:
+          name: 'adding UPLOAD_CHANNEL to BASH_ENV'
+          command: |
+            our_upload_channel=nightly
+            # On tags upload to test instead
+            if [[ -n "${CIRCLE_TAG}" ]]; then
+              our_upload_channel=test
+            fi
+            echo "export UPLOAD_CHANNEL=${our_upload_channel}" >> ${BASH_ENV}
+
+  # Runs the system setup script. It is meant to run ahead of every
+  # CI-related script (installing the Git client, checking out code,
+  # setting up the CI env, building/testing).
+  setup_linux_system_environment:
+    steps:
+      - run:
+          name: 'Set Up System Environment'
+          command: .circleci/scripts/setup_linux_system_environment.sh
+          no_output_timeout: '1h'
+
+  # Runs the CI environment setup script; per the step name it is expected
+  # to be invoked after `attach_workspace`.
+  setup_ci_environment:
+    steps:
+      - run:
+          name: 'Set Up CI Environment After attach_workspace'
+          command: .circleci/scripts/setup_ci_environment.sh
+          no_output_timeout: '1h'
+
+  # Refreshes every Homebrew git checkout by hand (shallow fetch + hard reset
+  # to origin/master), then installs `moreutils` and `expect`.
+  brew_update:
+    description: 'Update Homebrew and install base formulae'
+    steps:
+      - run:
+          name: 'Update Homebrew'
+          no_output_timeout: '10m'
+          command: |
+            set -ex
+
+            # Update repositories manually.
+            # Running `brew update` produces a comparison between the
+            # current checkout and the updated checkout, which takes a
+            # very long time because the existing checkout is 2y old.
+            for path in $(find /usr/local/Homebrew -type d -name .git)
+            do
+            cd $path/..
+            git fetch --depth=1 origin
+            git reset --hard origin/master
+            done
+
+            export HOMEBREW_NO_AUTO_UPDATE=1
+
+            # Install expect and moreutils so that we can call `unbuffer` and `ts`.
+            # moreutils installs a `parallel` executable by default, which conflicts
+            # with the executable from the GNU `parallel`, so we must unlink GNU
+            # `parallel` first, and relink it afterwards.
+            brew unlink parallel
+            brew install moreutils
+            brew link parallel --overwrite
+            brew install expect
+
+  # Installs the formulae named by the `formulae` parameter with Homebrew
+  # auto-update disabled.
+  brew_install:
+    description: 'Install Homebrew formulae'
+    parameters:
+      formulae:
+        default: ''
+        type: string
+    steps:
+      - run:
+          name: Install << parameters.formulae >>
+          no_output_timeout: '10m'
+          command: |
+            set -ex
+            export HOMEBREW_NO_AUTO_UPDATE=1
+            brew install << parameters.formulae >>
+
+  # Homebrew preparation for macOS builds: refresh Homebrew, then install libomp.
+  run_brew_for_macos_build:
+    steps:
+      - brew_update
+      - brew_install:
+          formulae: 'libomp'
+
+  # Homebrew preparation for iOS builds: refresh Homebrew, then install libtool.
+  run_brew_for_ios_build:
+    steps:
+      - brew_update
+      - brew_install:
+          formulae: 'libtool'
+
+  # For pull-request builds outside the `nightly` branch whose
+  # BUILD_ENVIRONMENT contains "xla" or "gcc5": look up the PR's base branch
+  # through the GitHub API and merge its tip into the checked-out commit.
+  # Every other build only prints a "skipping" message.
+  optional_merge_target_branch:
+    steps:
+      - run:
+          name: '(Optional) Merge target branch'
+          no_output_timeout: '10m'
+          command: |
+            if [[ -n "$CIRCLE_PULL_REQUEST" && "$CIRCLE_BRANCH" != "nightly" ]]; then
+              PR_NUM=$(basename $CIRCLE_PULL_REQUEST)
+              CIRCLE_PR_BASE_BRANCH=$(curl -s https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/$PR_NUM | jq -r '.base.ref')
+              if [[ "${BUILD_ENVIRONMENT}" == *"xla"* || "${BUILD_ENVIRONMENT}" == *"gcc5"* ]] ; then
+                set -x
+                git config --global user.email "circleci.ossci@gmail.com"
+                git config --global user.name "CircleCI"
+                git config remote.origin.url https://github.com/pytorch/pytorch.git
+                git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master
+                git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=100 --quiet
+                # PRs generated from ghstack has format CIRCLE_PR_BASE_BRANCH=gh/xxx/1234/base
+                if [[ "${CIRCLE_PR_BASE_BRANCH}" == "gh/"* ]]; then
+                  CIRCLE_PR_BASE_BRANCH=master
+                fi
+                export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/$CIRCLE_PR_BASE_BRANCH`
+                echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET}
+                export GIT_COMMIT=${CIRCLE_SHA1}
+                echo "GIT_COMMIT: " ${GIT_COMMIT}
+                git checkout -f ${GIT_COMMIT}
+                git reset --hard ${GIT_COMMIT}
+                git merge --allow-unrelated-histories --no-edit --no-ff ${GIT_MERGE_TARGET}
+                echo "Merged $CIRCLE_PR_BASE_BRANCH branch before building in environment $BUILD_ENVIRONMENT"
+                set +x
+              else
+                echo "No need to merge with $CIRCLE_PR_BASE_BRANCH, skipping..."
+              fi
+            else
+              echo "This is not a pull request, skipping..."
+            fi
--- a/.circleci/verbatim-sources/header-section.yml
+++ b/.circleci/verbatim-sources/header-section.yml
@ -0,0 +1,41 @@
+# WARNING: DO NOT EDIT THIS FILE DIRECTLY!!!
+# See the README.md in this directory.
+
+# IMPORTANT: To update Docker image version, please follow
+# the instructions at
+# https://github.com/pytorch/pytorch/wiki/Docker-image-build-on-CircleCI
+
+version: 2.1
+
+# Pipeline-level boolean switches. Only `run_build` defaults to true.
+parameters:
+  run_binary_tests:
+    default: false
+    type: boolean
+  run_build:
+    default: true
+    type: boolean
+  run_master_build:
+    default: false
+    type: boolean
+  run_slow_gradcheck_build:
+    default: false
+    type: boolean
+
+# Named Windows machine executors. All of them use bash.exe as the shell.
+executors:
+  # GPU machine on the NVIDIA-enabled Windows Server 2019 image.
+  windows-with-nvidia-gpu:
+    machine:
+      image: windows-server-2019-nvidia:previous
+      resource_class: windows.gpu.nvidia.medium
+      shell: bash.exe
+
+  # The two executors below share one image and differ only in resource class.
+  windows-xlarge-cpu-with-nvidia-cuda:
+    machine:
+      image: windows-server-2019-vs2019:stable
+      resource_class: windows.xlarge
+      shell: bash.exe
+
+  windows-medium-cpu-with-nvidia-cuda:
+    machine:
+      image: windows-server-2019-vs2019:stable
+      resource_class: windows.medium
+      shell: bash.exe
--- a/.circleci/verbatim-sources/job-specs/binary-build-tests.yml
+++ b/.circleci/verbatim-sources/job-specs/binary-build-tests.yml
@ -0,0 +1,14 @@
+
+# TODO: There is currently no testing for libtorch.
+#  binary_linux_libtorch_3.6m_cpu_test:
+#    environment:
+#      BUILD_ENVIRONMENT: "libtorch 3.6m cpu"
+#    resource_class: gpu.nvidia.small
+#    <<: *binary_linux_test
+#
+#  binary_linux_libtorch_3.6m_cu90_test:
+#    environment:
+#      BUILD_ENVIRONMENT: "libtorch 3.6m cu90"
+#    resource_class: gpu.nvidia.small
+#    <<: *binary_linux_test
+#
--- a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml
+++ b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml
@ -0,0 +1,44 @@
+jobs:
+  # Builds and then tests the iOS binaries on a macOS executor by sourcing
+  # the build/test scripts from the checkout, and finally persists the
+  # `ios` path of the workspace.
+  binary_ios_build:
+    <<: *pytorch_ios_params
+    macos:
+      xcode: '12.5.1'
+    steps:
+      - attach_workspace:
+          at: ~/workspace
+      - checkout
+      - run_brew_for_ios_build
+      - run:
+          name: Build
+          no_output_timeout: '1h'
+          command: |
+            script="/Users/distiller/project/.circleci/scripts/binary_ios_build.sh"
+            cat "$script"
+            source "$script"
+      - run:
+          name: Test
+          no_output_timeout: '30m'
+          command: |
+            script="/Users/distiller/project/.circleci/scripts/binary_ios_test.sh"
+            cat "$script"
+            source "$script"
+      - persist_to_workspace:
+          root: /Users/distiller/workspace/
+          paths: ios
+
+  # Uploads the iOS binaries: attaches the workspace, checks out the repo,
+  # then prints and sources the upload script.
+  binary_ios_upload:
+    <<: *pytorch_ios_params
+    macos:
+      xcode: '12.5.1'
+    steps:
+      - attach_workspace:
+          at: ~/workspace
+      - checkout
+      - run_brew_for_ios_build
+      - run:
+          name: Upload
+          no_output_timeout: '1h'
+          command: |
+            script="/Users/distiller/project/.circleci/scripts/binary_ios_upload.sh"
+            cat "$script"
+            source "$script"
--- a/.circleci/verbatim-sources/job-specs/binary_update_htmls.yml
+++ b/.circleci/verbatim-sources/job-specs/binary_update_htmls.yml
@ -0,0 +1,53 @@
+
+  # update_s3_htmls job
+  # These jobs create html files for every cpu/cu## folder in s3. The html
+  # files just store the names of all the files in that folder (which are
+  # binary files (.whl files)). This is to allow pip installs of the latest
+  # version in a folder without having to know the latest date. Pip has a flag
+  # -f that you can pass an html file listing a bunch of packages, and pip will
+  # then install the one with the most recent version.
+  update_s3_htmls: &update_s3_htmls
+    machine:
+      image: ubuntu-2004:202104-01
+    resource_class: medium
+    steps:
+    - checkout
+    - setup_linux_system_environment
+    - run:
+        <<: *binary_checkout
+    # N.B. we do not run binary_populate_env. The only variable we need is
+    # PIP_UPLOAD_FOLDER (which is 'nightly/' for the nightlies and '' for
+    # releases, and sometimes other things for special cases). Instead we
+    # expect PIP_UPLOAD_FOLDER to be passed directly in the env. This is
+    # because, unlike all the other binary jobs, these jobs only get run once,
+    # in a separate workflow. They are not a step in other binary jobs like
+    # build, test, upload.
+    #
+    # You could attach this to every job, or include it in the upload step if
+    # you wanted. You would need to add binary_populate_env in this case to
+    # make sure it has the same upload folder as the job it's attached to. This
+    # function is idempotent, so it won't hurt anything; it's just a little
+    # unnecessary.
+    - run:
+        name: define PIP_UPLOAD_FOLDER
+        command: |
+          our_upload_folder=nightly/
+          # On tags upload to test instead
+          if [[ -n "${CIRCLE_TAG}" ]]; then
+            our_upload_folder=test/
+          fi
+          echo "export PIP_UPLOAD_FOLDER=${our_upload_folder}" >> ${BASH_ENV}
+    - run:
+        name: Update s3 htmls
+        no_output_timeout: "1h"
+        command: |
+          # Tracing stays off while the AWS credentials are written and sourced.
+          set +x
+          echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" >> /home/circleci/project/env
+          echo "declare -x \"AWS_SECRET_ACCESS_KEY=${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}\"" >> /home/circleci/project/env
+          source /home/circleci/project/env
+          set -eux -o pipefail
+          # Run the given command, retrying up to four more times with
+          # increasing sleeps. "$@" keeps each argument intact (unquoted $*
+          # would re-split arguments containing whitespace or globs).
+          retry () {
+              "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@") || (sleep 4 && "$@") || (sleep 8 && "$@")
+          }
+          retry pip install awscli==1.6
+          "/home/circleci/project/builder/cron/update_s3_htmls.sh"
--- a/.circleci/verbatim-sources/job-specs/docker_jobs.yml
+++ b/.circleci/verbatim-sources/job-specs/docker_jobs.yml
@ -0,0 +1,56 @@
+  # Builds the CI docker image named by `image_name`, unless an image with
+  # the current DOCKER_TAG already exists in ECR (in which case the job halts).
+  docker_build_job:
+      parameters:
+        image_name:
+          type: string
+          default: ""
+      machine:
+        image: ubuntu-2004:202104-01
+      resource_class: large
+      environment:
+        IMAGE_NAME: << parameters.image_name >>
+        # Enable 'docker manifest'
+        DOCKER_CLI_EXPERIMENTAL: "enabled"
+        DOCKER_BUILDKIT: 1
+      steps:
+        - checkout
+        - calculate_docker_image_tag
+        - run:
+            name: Check if image should be built
+            command: |
+              set +x
+              export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_DOCKER_BUILDER_V1}
+              export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_DOCKER_BUILDER_V1}
+              export AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
+              export AWS_REGION=us-east-1
+              aws ecr get-login-password --region $AWS_REGION|docker login --username AWS \
+                       --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com
+              set -x
+              # Check if image already exists, if it does then skip building it
+              if docker manifest inspect "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${IMAGE_NAME}:${DOCKER_TAG}"; then
+                circleci-agent step halt
+                # circleci-agent step halt doesn't actually halt the step so we need to
+                # explicitly exit the step here ourselves before it causes too much trouble
+                exit 0
+              fi
+              # Covers the case where a previous tag doesn't exist for the tree
+              # this is only really applicable on trees that don't have `.ci/docker` at its merge base, i.e. nightly
+              if ! git rev-parse "$(git merge-base HEAD << pipeline.git.base_revision >>):.ci/docker"; then
+                echo "Directory '.ci/docker' not found in tree << pipeline.git.base_revision >>, you should probably rebase onto a more recent commit"
+                exit 1
+              fi
+              # Must resolve the same `.ci/docker` path that DOCKER_TAG is computed from
+              # (see calculate_docker_image_tag), otherwise the comparison below is meaningless.
+              PREVIOUS_DOCKER_TAG=$(git rev-parse "$(git merge-base HEAD << pipeline.git.base_revision >>):.ci/docker")
+              # If no image exists but the hash is the same as the previous hash then we should error out here
+              if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
+                echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
+                echo "       contact the PyTorch team to restore the original images"
+                exit 1
+              fi
+        - run:
+            name: build_docker_image_<< parameters.image_name >>
+            no_output_timeout: "1h"
+            command: |
+              set +x
+              export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_DOCKER_BUILDER_V1}
+              export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_DOCKER_BUILDER_V1}
+              set -x
+              cd .ci/docker && ./build_docker.sh
--- a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml
+++ b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml
@ -0,0 +1,745 @@
+  # Writes a ~/.netrc with the pytorchbot credentials, then runs `git push`
+  # from /tmp/workspace for the branch given by the `branch` parameter.
+  pytorch_doc_push:
+    machine:
+      image: ubuntu-2004:202104-01
+    resource_class: medium
+    parameters:
+      branch:
+        default: 'main'
+        type: string
+    steps:
+      - attach_workspace:
+          at: /tmp/workspace
+      - run:
+          name: Generate netrc
+          command: |
+            # set credentials for https pushing
+            cat > ~/.netrc \<<DONE
+              machine github.com
+              login pytorchbot
+              password ${GITHUB_PYTORCHBOT_TOKEN}
+            DONE
+      - run:
+          name: Docs push
+          command: |
+            pushd /tmp/workspace
+            git push -u origin "<< parameters.branch >>"
+
+  # macOS build with CROSS_COMPILE_ARM64=1. Installs sccache, exports the
+  # sccache S3 credentials with tracing off, runs macos-build.sh, then
+  # persists miniconda3 and stores the `dist` directory as an artifact.
+  pytorch_macos_10_15_py3_build:
+    macos:
+      xcode: '12.3.0'
+    environment:
+      BUILD_ENVIRONMENT: pytorch-macos-10.15-py3-arm64-build
+    steps:
+      - checkout
+      - run_brew_for_macos_build
+      - run:
+          name: Build
+          no_output_timeout: '1h'
+          command: |
+            set -e
+            export CROSS_COMPILE_ARM64=1
+            export JOB_BASE_NAME=$CIRCLE_JOB
+
+            # Install sccache
+            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo chmod +x /usr/local/bin/sccache
+            export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
+
+            # This IAM user allows write access to S3 bucket for sccache
+            set +x
+            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4}
+            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}
+            set -x
+
+            chmod a+x .ci/pytorch/macos-build.sh
+            unbuffer .ci/pytorch/macos-build.sh 2>&1 | ts
+
+      - persist_to_workspace:
+          root: /Users/distiller/workspace/
+          paths:
+            - miniconda3
+      - store_artifacts:
+          path: /Users/distiller/project/dist
+
+  # macOS build on Xcode 12.0. Installs sccache, exports the sccache S3
+  # credentials with tracing off, runs macos-build.sh and persists miniconda3.
+  pytorch_macos_10_13_py3_build:
+    macos:
+      xcode: '12.0'
+    environment:
+      BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-build
+    steps:
+      - checkout
+      - run_brew_for_macos_build
+      - run:
+          name: Build
+          no_output_timeout: '1h'
+          command: |
+            set -e
+            export JOB_BASE_NAME=$CIRCLE_JOB
+
+            # Install sccache
+            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo chmod +x /usr/local/bin/sccache
+            export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
+
+            # This IAM user allows write access to S3 bucket for sccache
+            set +x
+            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4}
+            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}
+            set -x
+
+            chmod a+x .ci/pytorch/macos-build.sh
+            unbuffer .ci/pytorch/macos-build.sh 2>&1 | ts
+
+      - persist_to_workspace:
+          root: /Users/distiller/workspace/
+          paths:
+            - miniconda3
+
+  # Parameterized macOS build job: installs sccache, bootstraps Miniconda
+  # into ~/workspace when it is not already there, runs macos-build.sh and,
+  # when `build-generates-artifacts` is true, zips the build outputs.
+  mac_build:
+    parameters:
+      build-environment:
+        type: string
+        description: Top-level label for what's being built/tested.
+      xcode-version:
+        type: string
+        default: "13.3.1"
+        description: What xcode version to build with.
+      build-generates-artifacts:
+        type: boolean
+        default: true
+        description: if the build generates build artifacts
+      python-version:
+        type: string
+        default: "3.8"
+    macos:
+      xcode: << parameters.xcode-version >>
+    resource_class: medium
+    environment:
+      BUILD_ENVIRONMENT: << parameters.build-environment >>
+      AWS_REGION: us-east-1
+    steps:
+
+      - checkout
+      - run_brew_for_macos_build
+
+      - run:
+          name: Install sccache
+          command: |
+            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo chmod +x /usr/local/bin/sccache
+            echo "export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${BASH_ENV}"
+            echo "export SCCACHE_S3_KEY_PREFIX=${GITHUB_WORKFLOW}" >> "${BASH_ENV}"
+
+            set +x
+            echo "export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4}" >> "${BASH_ENV}"
+            echo "export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}" >> "${BASH_ENV}"
+            set -x
+
+      - run:
+          name: Get workflow job id
+          command: |
+            echo "export OUR_GITHUB_JOB_ID=${CIRCLE_WORKFLOW_JOB_ID}" >> "${BASH_ENV}"
+
+      - run:
+          name: Build
+          command: |
+            set -x
+
+            git submodule sync
+            git submodule update --init --recursive --depth 1 --jobs 0
+
+            export PATH="/usr/local/bin:$PATH"
+            export WORKSPACE_DIR="${HOME}/workspace"
+            mkdir -p "${WORKSPACE_DIR}"
+            MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-py38_4.12.0-MacOSX-x86_64.sh"
+            # Both operands are quoted so an empty or whitespace-containing
+            # parameter value cannot turn this into a malformed `[` expression.
+            if [ "<< parameters.python-version >>" = "3.9.12" ]; then
+              MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-MacOSX-x86_64.sh"
+            fi
+
+            # If a local installation of conda doesn't exist, we download and install conda
+            if [ ! -d "${WORKSPACE_DIR}/miniconda3" ]; then
+              mkdir -p "${WORKSPACE_DIR}"
+              curl --retry 3 ${MINICONDA_URL} -o "${WORKSPACE_DIR}"/miniconda3.sh
+              bash "${WORKSPACE_DIR}"/miniconda3.sh -b -p "${WORKSPACE_DIR}"/miniconda3
+            fi
+            export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
+            # shellcheck disable=SC1091
+            source "${WORKSPACE_DIR}"/miniconda3/bin/activate
+
+            brew link --force libomp
+
+            echo "export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${BASH_ENV}"
+            .ci/pytorch/macos-build.sh
+
+      # Only archive build outputs when the caller says the build produces them.
+      - when:
+          condition: << parameters.build-generates-artifacts >>
+          steps:
+            - run:
+                name: Archive artifacts into zip
+                command: |
+                  zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .additional_ci_files
+                  cp artifacts.zip /Users/distiller/workspace
+
+      - persist_to_workspace:
+          root: /Users/distiller/workspace/
+          paths:
+            - miniconda3
+            - artifacts.zip
+
+      - store_artifacts:
+          path: /Users/distiller/project/artifacts.zip
+
+  # Parameterized macOS test job: unpacks artifacts.zip from the attached
+  # workspace, installs the wheel from dist/, runs macos-test.sh, then
+  # copies the test reports into a per-build folder and persists them.
+  mac_test:
+    parameters:
+      build-environment:
+        type: string
+      xcode-version:
+        type: string
+      test-config:
+        default: "default"
+        type: string
+      shard-number:
+        type: string
+      num-test-shards:
+        type: string
+
+    macos:
+      xcode: << parameters.xcode-version >>
+    environment:
+      BUILD_ENVIRONMENT: << parameters.build-environment >>
+      GIT_DEFAULT_BRANCH: "master"
+      TEST_CONFIG: << parameters.test-config >>
+      SHARD_NUMBER: << parameters.shard-number >>
+      NUM_TEST_SHARDS: << parameters.num-test-shards >>
+    steps:
+      - checkout
+      - attach_workspace:
+          at: ~/workspace
+      - run_brew_for_macos_build
+      - run:
+          name: Test
+          no_output_timeout: '2h'
+          command: |
+            set -x
+
+            git submodule sync --recursive
+            git submodule update --init --recursive
+
+            mv ~/workspace/artifacts.zip .
+            unzip artifacts.zip
+
+            export IN_CI=1
+
+            COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
+
+            export PATH="/usr/local/bin:$PATH"
+            export WORKSPACE_DIR="${HOME}/workspace"
+            mkdir -p "${WORKSPACE_DIR}"
+
+            export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
+            source "${WORKSPACE_DIR}"/miniconda3/bin/activate
+
+            # sanitize the input commit message and PR body here:
+
+            # trim all new lines from commit messages to avoid issues with batch environment
+            # variable copying. see https://github.com/pytorch/pytorch/pull/80043#issuecomment-1167796028
+            COMMIT_MESSAGES="${COMMIT_MESSAGES//[$'\n\r']}"
+
+            # then trim all special characters like single and double quotes to avoid unescaped inputs to
+            # wreak havoc internally
+            export COMMIT_MESSAGES="${COMMIT_MESSAGES//[\'\"]}"
+
+            python3 -mpip install dist/*.whl
+            .ci/pytorch/macos-test.sh
+      - run:
+          name: Copy files for uploading test stats
+          command: |
+            # copy into a parent folder test-reports because we can't use CIRCLEI_BUILD_NUM in path when persisting to workspace
+            mkdir -p test-reports/test-reports_${CIRCLE_BUILD_NUM}/test/test-reports
+            cp -r test/test-reports test-reports/test-reports_${CIRCLE_BUILD_NUM}/test/test-reports
+      - store_test_results:
+          path: test/test-reports
+      - persist_to_workspace:
+          root: /Users/distiller/project/
+          paths:
+            - test-reports
+
+  # Uploads the test reports persisted by earlier jobs via
+  # `tools.stats.upload_test_stats`. Exits successfully without uploading
+  # when the upload credentials are not available.
+  upload_test_stats:
+    machine: # executor type
+      image: ubuntu-2004:202010-01 # recommended linux image - includes Ubuntu 20.04, docker 19.03.13, docker-compose 1.27.4
+    steps:
+      - checkout
+      - attach_workspace:
+          at: ~/workspace
+      - run:
+          name: upload
+          command: |
+            set -e
+            # Tracing is enabled only after the credential check, and is switched
+            # off again around the exports, so secret values never reach the log.
+            if [ -z "${AWS_ACCESS_KEY_FOR_OSSCI_ARTIFACT_UPLOAD}" ]; then
+              echo "No credentials found, cannot upload test stats (are you on a fork?)"
+              exit 0
+            fi
+            set -x
+            cp -r ~/workspace/test-reports/* ~/project
+            pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
+            set +x
+            export AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_FOR_OSSCI_ARTIFACT_UPLOAD}
+            export AWS_SECRET_ACCESS_KEY=${AWS_SECRET_KEY_FOR_OSSCI_ARTIFACT_UPLOAD}
+            set -x
+            # i dont know how to get the run attempt number for reruns so default to 1
+            python3 -m tools.stats.upload_test_stats --workflow-run-id "${CIRCLE_WORKFLOW_JOB_ID}" --workflow-run-attempt 1 --head-branch "<< pipeline.git.branch >>" --circleci
+  # macOS test job on Xcode 12.0: attaches the workspace, runs
+  # macos-test.sh and publishes test/test-reports as test results.
+  pytorch_macos_10_13_py3_test:
+    macos:
+      xcode: '12.0'
+    environment:
+      BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test
+    steps:
+      - checkout
+      - attach_workspace:
+          at: ~/workspace
+      - run_brew_for_macos_build
+      - run:
+          name: Test
+          no_output_timeout: '1h'
+          command: |
+            set -e
+            export JOB_BASE_NAME=$CIRCLE_JOB
+
+            chmod a+x .ci/pytorch/macos-test.sh
+            unbuffer .ci/pytorch/macos-test.sh 2>&1 | ts
+      - store_test_results:
+          path: test/test-reports
+
+  # macOS lite-interpreter job: sets BUILD_LITE_INTERPRETER=1 and runs
+  # macos-lite-interpreter-build-test.sh from the project checkout.
+  pytorch_macos_10_13_py3_lite_interpreter_build_test:
+    macos:
+      xcode: '12.0'
+    environment:
+      BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test
+    steps:
+      - checkout
+      - attach_workspace:
+          at: ~/workspace
+      - run_brew_for_macos_build
+      - run:
+          name: Test
+          no_output_timeout: '1h'
+          command: |
+            set -e
+            export BUILD_LITE_INTERPRETER=1
+            export JOB_BASE_NAME=$CIRCLE_JOB
+            chmod a+x ${HOME}/project/.ci/pytorch/macos-lite-interpreter-build-test.sh
+            unbuffer ${HOME}/project/.ci/pytorch/macos-lite-interpreter-build-test.sh 2>&1 | ts
+      - store_test_results:
+          path: test/test-reports
+
+  pytorch_android_gradle_build:
+    environment:
+      BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c"
+      PYTHON_VERSION: "3.7"
+    resource_class: large
+    machine:
+      image: ubuntu-2004:202104-01
+    steps:
+    - checkout
+    - calculate_docker_image_tag
+    - setup_linux_system_environment
+    - setup_ci_environment
+    - run:
+        name: pytorch android gradle build
+        no_output_timeout: "1h"
+        command: |
+          set -eux
+          docker_image_commit=${DOCKER_IMAGE}:build-${DOCKER_TAG}-${CIRCLE_SHA1}
+
+          docker_image_libtorch_android_x86_32=${docker_image_commit}-android-x86_32
+          docker_image_libtorch_android_x86_64=${docker_image_commit}-android-x86_64
+          docker_image_libtorch_android_arm_v7a=${docker_image_commit}-android-arm-v7a
+          docker_image_libtorch_android_arm_v8a=${docker_image_commit}-android-arm-v8a
+
+          echo "docker_image_commit: "${docker_image_commit}
+          echo "docker_image_libtorch_android_x86_32: "${docker_image_libtorch_android_x86_32}
+          echo "docker_image_libtorch_android_x86_64: "${docker_image_libtorch_android_x86_64}
+          echo "docker_image_libtorch_android_arm_v7a: "${docker_image_libtorch_android_arm_v7a}
+          echo "docker_image_libtorch_android_arm_v8a: "${docker_image_libtorch_android_arm_v8a}
+
+          # x86_32
+          time docker pull ${docker_image_libtorch_android_x86_32} >/dev/null
+          export id_x86_32=$(docker run --env-file "${BASH_ENV}" -e GRADLE_OFFLINE=1 --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${docker_image_libtorch_android_x86_32})
+
+          export COMMAND='((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "$id_x86_32" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          # arm-v7a
+          time docker pull ${docker_image_libtorch_android_arm_v7a} >/dev/null
+          export id_arm_v7a=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${docker_image_libtorch_android_arm_v7a})
+
+          export COMMAND='((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "$id_arm_v7a" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          mkdir -p ~/workspace/build_android_install_arm_v7a
+          docker cp $id_arm_v7a:/var/lib/jenkins/workspace/build_android/install ~/workspace/build_android_install_arm_v7a
+
+          # x86_64
+          time docker pull ${docker_image_libtorch_android_x86_64} >/dev/null
+          export id_x86_64=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${docker_image_libtorch_android_x86_64})
+
+          export COMMAND='((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "$id_x86_64" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          mkdir -p ~/workspace/build_android_install_x86_64
+          docker cp $id_x86_64:/var/lib/jenkins/workspace/build_android/install ~/workspace/build_android_install_x86_64
+
+          # arm-v8a
+          time docker pull ${docker_image_libtorch_android_arm_v8a} >/dev/null
+          export id_arm_v8a=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${docker_image_libtorch_android_arm_v8a})
+
+          export COMMAND='((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "$id_arm_v8a" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          mkdir -p ~/workspace/build_android_install_arm_v8a
+          docker cp $id_arm_v8a:/var/lib/jenkins/workspace/build_android/install ~/workspace/build_android_install_arm_v8a
+
+          docker cp ~/workspace/build_android_install_arm_v7a $id_x86_32:/var/lib/jenkins/workspace/build_android_install_arm_v7a
+          docker cp ~/workspace/build_android_install_x86_64 $id_x86_32:/var/lib/jenkins/workspace/build_android_install_x86_64
+          docker cp ~/workspace/build_android_install_arm_v8a $id_x86_32:/var/lib/jenkins/workspace/build_android_install_arm_v8a
+
+          # run gradle buildRelease
+          export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec -u jenkins -i "$id_x86_32" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          mkdir -p ~/workspace/build_android_artifacts
+          docker cp $id_x86_32:/var/lib/jenkins/workspace/android/artifacts.tgz ~/workspace/build_android_artifacts/
+
+          output_image=$docker_image_libtorch_android_x86_32-gradle
+          docker commit "$id_x86_32" ${output_image}
+          time docker push ${output_image}
+    - store_artifacts:
+        path: ~/workspace/build_android_artifacts/artifacts.tgz
+        destination: artifacts.tgz
+
+  # Pulls the `<commit image>-android-x86_32-gradle` image for this commit,
+  # runs .circleci/scripts/publish_android_snapshot.sh inside a container
+  # started from it, then commits and pushes that container as
+  # `...-gradle-publish-snapshot`.
+  pytorch_android_publish_snapshot:
+    environment:
+      BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-publish-snapshot
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c"
+      PYTHON_VERSION: "3.7"
+    resource_class: large
+    machine:
+      image: ubuntu-2004:202104-01
+    steps:
+    - checkout
+    - calculate_docker_image_tag
+    - setup_linux_system_environment
+    - setup_ci_environment
+    - run:
+        # The step name describes what this job does (publish), not the
+        # gradle build job it was copied from.
+        name: pytorch android publish snapshot
+        no_output_timeout: "1h"
+        command: |
+          set -eux
+          docker_image_commit=${DOCKER_IMAGE}:build-${DOCKER_TAG}-${CIRCLE_SHA1}
+
+          docker_image_libtorch_android_x86_32_gradle=${docker_image_commit}-android-x86_32-gradle
+
+          echo "docker_image_commit: "${docker_image_commit}
+          echo "docker_image_libtorch_android_x86_32_gradle: "${docker_image_libtorch_android_x86_32_gradle}
+
+          # x86_32
+          time docker pull ${docker_image_libtorch_android_x86_32_gradle} >/dev/null
+          # Exported because command.sh runs in a child bash that reads $id_x86_32.
+          export id_x86_32=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${docker_image_libtorch_android_x86_32_gradle})
+
+          # Single quotes keep every ${...} literal here; they are expanded only
+          # when command.sh runs, and the resulting export lines are piped to
+          # the container's bash on stdin.
+          export COMMAND='((echo "sudo chown -R jenkins workspace" && echo "export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" && echo "export SONATYPE_NEXUS_USERNAME=${SONATYPE_NEXUS_USERNAME}" && echo "export SONATYPE_NEXUS_PASSWORD=${SONATYPE_NEXUS_PASSWORD}" && echo "export ANDROID_SIGN_KEY=${ANDROID_SIGN_KEY}" && echo "export ANDROID_SIGN_PASS=${ANDROID_SIGN_PASS}" && echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/publish_android_snapshot.sh") | docker exec -u jenkins -i "$id_x86_32" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          output_image=${docker_image_libtorch_android_x86_32_gradle}-publish-snapshot
+          docker commit "$id_x86_32" ${output_image}
+          time docker push ${output_image}
+
+  # x86_32-only Android gradle build: runs
+  # .circleci/scripts/build_android_gradle.sh inside the `-android-x86_32`
+  # image for this commit, copies out artifacts.tgz, and pushes the resulting
+  # container as `...-android-x86_32-gradle`.
+  pytorch_android_gradle_build-x86_32:
+    environment:
+      BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-only-x86_32
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c"
+      PYTHON_VERSION: "3.7"
+    resource_class: large
+    machine:
+      image: ubuntu-2004:202104-01
+    steps:
+    # A single checkout, in the same position as in the sibling jobs.
+    - checkout
+    - calculate_docker_image_tag
+    - setup_linux_system_environment
+    - setup_ci_environment
+    - run:
+        name: pytorch android gradle build only x86_32 (for PR)
+        no_output_timeout: "1h"
+        command: |
+          set -e
+          docker_image_libtorch_android_x86_32=${DOCKER_IMAGE}:build-${DOCKER_TAG}-${CIRCLE_SHA1}-android-x86_32
+          echo "docker_image_libtorch_android_x86_32: "${docker_image_libtorch_android_x86_32}
+
+          # x86
+          time docker pull ${docker_image_libtorch_android_x86_32} >/dev/null
+          # Exported because command.sh runs in a child bash that reads $id.
+          export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${docker_image_libtorch_android_x86_32})
+
+          # Single quotes defer ${BUILD_ENVIRONMENT} expansion until command.sh runs.
+          export COMMAND='((echo "export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" && echo "export GRADLE_OFFLINE=1" && echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          mkdir -p ~/workspace/build_android_x86_32_artifacts
+          docker cp $id:/var/lib/jenkins/workspace/android/artifacts.tgz ~/workspace/build_android_x86_32_artifacts/
+
+          output_image=${docker_image_libtorch_android_x86_32}-gradle
+          docker commit "$id" ${output_image}
+          time docker push ${output_image}
+    - store_artifacts:
+        path: ~/workspace/build_android_x86_32_artifacts/artifacts.tgz
+        destination: artifacts.tgz
+
+  # iOS build job on a macOS executor. Steps: manual git checkout with retries,
+  # run_brew_for_ios_build, Setup Fastlane, Build, Run Build Test, and
+  # Run Simulator Tests.
+  pytorch_ios_build:
+    <<: *pytorch_ios_params
+    macos:
+      xcode: "12.5.1"
+    steps:
+      # Hand-rolled replacement for the built-in checkout step so that the
+      # clone can be retried.
+      - run:
+          name: checkout with retry
+          command: |
+            checkout() {
+              set -ex
+              # Workaround old docker images with incorrect $HOME
+              # check https://github.com/docker/docker/issues/2968 for details
+              if [ "${HOME}" = "/" ]
+                then
+                export HOME=$(getent passwd $(id -un) | cut -d: -f6)
+              fi
+
+              mkdir -p ~/.ssh
+
+              # Pin github.com's SSH host key (appended on every attempt).
+              # NOTE(review): this looks like GitHub's RSA host key from before
+              # the March 2023 rotation - verify it is still accepted.
+              echo 'github.com ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==
+              ' >> ~/.ssh/known_hosts
+
+              # use git+ssh instead of https
+              git config --global url."ssh://git@github.com".insteadOf "https://github.com" || true
+              git config --global gc.auto 0 || true
+
+              echo 'Cloning git repository'
+              mkdir -p '/Users/distiller/project'
+              cd '/Users/distiller/project'
+              git clone "$CIRCLE_REPOSITORY_URL" .
+              echo 'Checking out branch'
+              git checkout --force -B "$CIRCLE_BRANCH" "$CIRCLE_SHA1"
+              git --no-pager log --no-color -n 1 --format='HEAD is now at %h %s'
+            }
+
+            # Runs the given command, retrying up to four more times after
+            # pauses of 1, 2, 4 and 8 seconds.
+            # NOTE(review): checkout is invoked on the left of `||`, where bash
+            # ignores errexit, so a retry is presumably driven only by the
+            # function's final exit status - confirm this is intended.
+            retry () {
+              $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+            }
+            retry checkout
+      - run_brew_for_ios_build
+      # Installs bundler, then runs `bundle install` in ios/TestApp
+      # (presumably this is what provides fastlane).
+      - run:
+          name: Setup Fastlane
+          command: |
+            set -e
+            cd /Users/distiller/project/ios/TestApp
+            # install fastlane
+            sudo gem install bundler
+            bundle install
+          no_output_timeout: "1h"
+      # Builds PyTorch for iOS: installs Miniconda and the Python build
+      # dependencies, syncs submodules, then runs scripts/build_ios.sh with
+      # IOS_ARCH / IOS_PLATFORM and the feature flags read from the environment.
+      - run:
+          name: Build
+          no_output_timeout: "1h"
+          command: |
+            set -e
+            PROJ_ROOT=/Users/distiller/project
+            export TCLLIBPATH="/usr/local/lib"
+
+            # Install conda
+            curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-MacOSX-x86_64.sh
+            chmod +x ~/conda.sh
+            /bin/bash ~/conda.sh -b -p ~/anaconda
+            # "~" is not expanded inside double quotes; use ${HOME} so PATH
+            # holds a real directory rather than a literal "~".
+            export PATH="${HOME}/anaconda/bin:${PATH}"
+            source ~/anaconda/bin/activate
+
+            # Install dependencies
+            # Runs the given command, retrying up to four more times after
+            # pauses of 1, 2, 4 and 8 seconds.
+            retry () {
+                $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+            }
+
+            retry conda install numpy ninja pyyaml mkl mkl-include setuptools cmake requests typing-extensions --yes
+
+            # sync submodules
+            cd ${PROJ_ROOT}
+            git submodule sync
+            git submodule update --init --recursive --depth 1 --jobs 0
+
+            # export
+            export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
+
+            # run build script
+            chmod a+x ${PROJ_ROOT}/scripts/build_ios.sh
+            echo "IOS_ARCH: ${IOS_ARCH}"
+            echo "IOS_PLATFORM: ${IOS_PLATFORM}"
+            echo "USE_PYTORCH_METAL: ${USE_METAL}"
+            echo "BUILD_LITE_INTERPRETER: ${BUILD_LITE_INTERPRETER}"
+            echo "USE_COREML_DELEGATE: ${USE_COREML_DELEGATE}"
+
+            # check the custom build flag
+            echo "SELECTED_OP_LIST: ${SELECTED_OP_LIST}"
+            if [ -n "${SELECTED_OP_LIST}" ]; then
+                # Turn the bare file name into a path under custom_build/.
+                export SELECTED_OP_LIST="${PROJ_ROOT}/ios/TestApp/custom_build/${SELECTED_OP_LIST}"
+            fi
+            export IOS_ARCH=${IOS_ARCH}
+            export IOS_PLATFORM=${IOS_PLATFORM}
+            export USE_COREML_DELEGATE=${USE_COREML_DELEGATE}
+            # Metal is only enabled for non-simulator builds. The variable is
+            # quoted so an empty value is compared as a string instead of
+            # making `[` fail with "unary operator expected".
+            if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then
+              export USE_PYTORCH_METAL=${USE_METAL}
+            fi
+            unbuffer ${PROJ_ROOT}/scripts/build_ios.sh 2>&1 | ts
+      # Builds the TestApp Xcode project against the install tree produced by
+      # the Build step, via scripts/xcode_build.rb.
+      - run:
+          name: Run Build Test
+          no_output_timeout: "30m"
+          command: |
+            set -e
+            PROJ_ROOT=/Users/distiller/project
+            # run the ruby build script
+            if ! [ -x "$(command -v xcodebuild)" ]; then
+              echo 'Error: xcodebuild is not installed.'
+              exit 1
+            fi
+            # Test the command directly: under `set -e` a separate `$?` check
+            # on the following line is never reached when the command fails,
+            # so the failure message would not be printed.
+            if ! ruby ${PROJ_ROOT}/scripts/xcode_build.rb -i ${PROJ_ROOT}/build_ios/install -x ${PROJ_ROOT}/ios/TestApp/TestApp.xcodeproj -p ${IOS_PLATFORM}; then
+              echo 'xcodebuild failed!'
+              exit 1
+            fi
+      # Generates test models and runs one TestApp test target with fastlane.
+      # Exits successfully without doing anything unless IOS_PLATFORM is
+      # SIMULATOR.
+      - run:
+          name: Run Simulator Tests
+          no_output_timeout: "2h"
+          command: |
+            set -e
+            # All variables in `[ ]` tests are quoted so an empty or unset value
+            # is compared as a string instead of making `[` fail with
+            # "unary operator expected".
+            if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then
+              echo "not SIMULATOR build, skip it."
+              exit 0
+            fi
+            PROJ_ROOT=/Users/distiller/project
+            source ~/anaconda/bin/activate
+            # use the pytorch nightly build to generate models
+            pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
+            # generate models for different backends
+            cd ${PROJ_ROOT}/ios/TestApp/benchmark
+            mkdir -p ../models
+            if [ "${USE_COREML_DELEGATE}" == 1 ]; then
+              pip install coremltools==5.0b5 protobuf==3.20.1
+              python coreml_backend.py
+            else
+              cd "${PROJ_ROOT}"
+              python test/mobile/model_test/gen_test_model.py ios-test
+            fi
+            cd "${PROJ_ROOT}/ios/TestApp/benchmark"
+            if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then
+              echo "Setting up the TestApp for LiteInterpreter"
+              ruby setup.rb --lite 1
+            else
+              echo "Setting up the TestApp for Full JIT"
+              ruby setup.rb
+            fi
+            cd "${PROJ_ROOT}/ios/TestApp"
+            # instruments -s -devices
+            # Pick the single test that matches the build flavour.
+            if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then
+              if [ "${USE_COREML_DELEGATE}" == 1 ]; then
+                fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML
+              else
+                fastlane scan --only_testing TestAppTests/TestAppTests/testLiteInterpreter
+              fi
+            else
+              fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT
+            fi
+  # Bazel build job: copies the checked-out project into a container started
+  # from ${DOCKER_IMAGE}:${DOCKER_TAG}, runs .ci/pytorch/build.sh there and,
+  # unless BUILD_ONLY is set, pushes the container as the
+  # `build-<tag>-bazel-<sha>` image.
+  pytorch_linux_bazel_build:
+    <<: *pytorch_params
+    machine:
+      image: ubuntu-2004:202104-01
+    steps:
+    - checkout
+    - calculate_docker_image_tag
+    - setup_linux_system_environment
+    - setup_ci_environment
+    - run:
+        name: Bazel Build
+        no_output_timeout: "1h"
+        command: |
+          set -e
+          # Pull Docker image and run build
+          echo "DOCKER_IMAGE: "${DOCKER_IMAGE}:${DOCKER_TAG}
+          time docker pull ${DOCKER_IMAGE}:${DOCKER_TAG} >/dev/null
+          # Each docker flag is passed once (the original repeated the
+          # --cap-add/--security-opt pair). Exported because command.sh runs in
+          # a child bash that reads $id.
+          export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}:${DOCKER_TAG})
+
+          echo "Do NOT merge main branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT"
+
+          git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0
+
+          docker cp /home/circleci/project/. $id:/var/lib/jenkins/workspace
+
+          export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .ci/pytorch/build.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          # Push intermediate Docker image for next phase to use
+          if [ -z "${BUILD_ONLY}" ]; then
+            # Augment our output image name with bazel to avoid collisions
+            output_image=${DOCKER_IMAGE}:build-${DOCKER_TAG}-bazel-${CIRCLE_SHA1}
+            export COMMIT_DOCKER_IMAGE=$output_image
+            docker commit "$id" ${COMMIT_DOCKER_IMAGE}
+            time docker push ${COMMIT_DOCKER_IMAGE}
+          fi
+
+  # Bazel test job: pulls the `build-<tag>-bazel-<sha>` image for this commit,
+  # runs the test script inside a container started from it, and copies
+  # bazel-testlogs back out for store_test_results.
+  pytorch_linux_bazel_test:
+    <<: *pytorch_params
+    machine:
+      image: ubuntu-2004:202104-01
+    steps:
+    - checkout
+    - calculate_docker_image_tag
+    - setup_linux_system_environment
+    - setup_ci_environment
+    - run:
+        name: Test
+        no_output_timeout: "90m"
+        command: |
+          set -e
+          export COMMIT_DOCKER_IMAGE=${DOCKER_IMAGE}:build-${DOCKER_TAG}-bazel-${CIRCLE_SHA1}
+          echo "DOCKER_IMAGE: ${COMMIT_DOCKER_IMAGE}"
+
+          time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null
+
+          # GPUs are exposed to the container only when
+          # USE_CUDA_DOCKER_RUNTIME is non-empty.
+          gpu_args=""
+          if [ -n "${USE_CUDA_DOCKER_RUNTIME}" ]; then
+            gpu_args="--gpus all"
+          fi
+          # ${gpu_args} is left unquoted on purpose so it splits into two words
+          # (or disappears when empty).
+          export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined ${gpu_args} -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
+
+          # Copies bazel-testlogs out of the container; runs on error (via the
+          # trap) and again after a successful test run.
+          retrieve_test_reports() {
+            echo "retrieving test reports"
+            docker cp -L $id:/var/lib/jenkins/workspace/bazel-testlogs ./ || echo 'No test reports found!'
+          }
+          trap retrieve_test_reports ERR
+
+          # Multi-GPU environments run a dedicated test script.
+          case "${BUILD_ENVIRONMENT}" in
+            *multigpu*) test_script=.ci/pytorch/multigpu-test.sh ;;
+            *) test_script=.ci/pytorch/test.sh ;;
+          esac
+          export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && '"${test_script}"'") | docker exec -u jenkins -i "$id" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          retrieve_test_reports
+          docker stats --all --no-stream
+    - store_test_results:
+        path: bazel-testlogs
+
+  # Runs on an Ubuntu machine executor; its only work is installing `requests`
+  # and running .circleci/scripts/trigger_azure_pipeline.py.
+  pytorch_windows_test_multigpu:
+    machine:
+      image: ubuntu-2004:202104-01
+    steps:
+      - checkout
+      - run:
+          name: Test
+          command: |
+            set -o errexit
+            # presumably a dependency of the trigger script - verify
+            python3 -m pip install requests
+            python3 ./.circleci/scripts/trigger_azure_pipeline.py
+          no_output_timeout: "90m"
--- a/.circleci/verbatim-sources/job-specs/job-specs-promote.yml
+++ b/.circleci/verbatim-sources/job-specs/job-specs-promote.yml
@ -0,0 +1,18 @@
+
+  # Checks out the repository and runs scripts/release/promote/wheel_to_s3.sh
+  # on top of the shared `promote_common` settings.
+  promote_s3:
+    <<: *promote_common
+    steps:
+      - checkout
+      - run:
+          command: |
+            scripts/release/promote/wheel_to_s3.sh
+          name: "Running promote script"
+
+  # Checks out the repository and runs
+  # scripts/release/promote/conda_to_conda.sh on top of the shared
+  # `promote_common` settings.
+  promote_conda:
+    <<: *promote_common
+    steps:
+      - checkout
+      - run:
+          command: |
+            scripts/release/promote/conda_to_conda.sh
+          name: "Running promote script"
--- a/.circleci/verbatim-sources/job-specs/job-specs-setup.yml
+++ b/.circleci/verbatim-sources/job-specs/job-specs-setup.yml
@ -0,0 +1,29 @@
+
+  # Saves the HEAD commit message to .circleci/scripts/COMMIT_MSG and persists
+  # .circleci/scripts to the workspace for later jobs.
+  setup:
+    docker:
+      - image: circleci/python:3.7.3
+    steps:
+      - checkout
+      - run:
+          name: Save commit message
+          command: git log --format='%B' -n 1 HEAD > .circleci/scripts/COMMIT_MSG
+      # Note [Workspace for CircleCI scripts]
+      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+      # In the beginning, you wrote your CI scripts in a
+      # .circleci/config.yml file, and life was good.  Your CI
+      # configurations flourished and multiplied.
+      #
+      # Then one day, CircleCI cometh down high and say, "Your YAML file
+      # is too biggeth, it stresses our servers so."  And thus they
+      # asketh us to smite the scripts in the yml file.
+      #
+      # But you can't just put the scripts in the .circleci folder,
+      # because in some jobs, you don't ever actually checkout the
+      # source repository.  Where you gonna get the scripts from?
+      #
+      # Here's how you do it: you persist .circleci/scripts into a
+      # workspace, attach the workspace in your subjobs, and run all
+      # your scripts from there.
+      - persist_to_workspace:
+          root: .
+          # `paths` is documented as a list, so the single directory is
+          # written as a one-element sequence.
+          paths:
+            - .circleci/scripts
--- a/.circleci/verbatim-sources/nightly-binary-build-defaults.yml
+++ b/.circleci/verbatim-sources/nightly-binary-build-defaults.yml
@ -0,0 +1,51 @@
+
+##############################################################################
+# Binary build (nightly build) defaults
+# The binary builds use the docker executor b/c at time of writing the machine
+# executor is limited to only two cores and is painfully slow (4.5+ hours per
+# GPU build). But the docker executor cannot be run with --runtime=nvidia, and
+# so the binary test/upload jobs must run on a machine executor. The package
+# built in the build job is persisted to the workspace, which the test jobs
+# expect. The test jobs just run a few quick smoke tests (very similar to the
+# second-round-user-facing smoke tests above) and then upload the binaries to
+# their final locations. The upload part requires credentials that should only
+# be available to org-members.
+#
+# binary_checkout MUST be run before other commands here. This is because the
+# other commands are written in .circleci/scripts/*.sh , so the pytorch source
+# code must be downloaded on the machine before they can be run. We cannot
+# inline all the code into this file, since that would cause the yaml size to
+# explode past 4 MB (all the code in the command section is just copy-pasted to
+# everywhere in the .circleci/config.yml file where it appears).
+##############################################################################
+
+# Checks out the Pytorch and Builder repos (always both of them), and places
+# them in the right place depending on what executor we're running on. We curl
+# our .sh file from the interweb to avoid yaml size bloat. Note that many jobs
+# do not need both the pytorch and builder repos, so this is a little wasteful
+# (smoke tests and upload jobs do not need the pytorch repo).
+binary_checkout: &binary_checkout
+  # Reusable mapping (name / timeout / command), referenced as *binary_checkout.
+  command: ".circleci/scripts/binary_checkout.sh"
+  name: "Checkout pytorch/builder repo"
+  no_output_timeout: "30m"
+
+# Parses circleci arguments in a consistent way, essentially routing to the
+# correct pythonXgccXcudaXos build we want
+binary_populate_env: &binary_populate_env
+  # Reusable mapping (name / command), referenced as *binary_populate_env.
+  command: ".circleci/scripts/binary_populate_env.sh"
+  name: "Set up binary env variables"
+
+# Runs .circleci/scripts/binary_install_miniconda.sh with a one-hour
+# no-output timeout.
+binary_install_miniconda: &binary_install_miniconda
+  command: ".circleci/scripts/binary_install_miniconda.sh"
+  name: "Install miniconda"
+  no_output_timeout: "1h"
+
+# This section is used in the binary_test and smoke_test jobs. It expects
+# 'binary_populate_env' to have populated /home/circleci/project/env and it
+# expects another section to populate /home/circleci/project/ci_test_script.sh
+# with the code to run in the docker
+binary_run_in_docker: &binary_run_in_docker
+  # Only used on CircleCI Linux machine executors, which have to start the
+  # docker images themselves.
+  command: ".circleci/scripts/binary_run_in_docker.sh"
+  name: "Run in docker"
--- a/.circleci/verbatim-sources/workflows/workflows-nightly-uploads-header.yml
+++ b/.circleci/verbatim-sources/workflows/workflows-nightly-uploads-header.yml
@ -0,0 +1,8 @@
+      #- binary_linux_libtorch_3.6m_cpu_test:
+      #    requires:
+      #      - binary_linux_libtorch_3.6m_cpu_build
+      #- binary_linux_libtorch_3.6m_cu90_test:
+      #    requires:
+      #      - binary_linux_libtorch_3.6m_cu90_build
+
+      # Nightly uploads
--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .0.0
 .1.0