Revert "Require less alignment for attn bias (#114173 ) (#114837 )"

This reverts commit 59656491f3b1da809312942872cce010337504b0.
Fix NULL dereference in binary CPU ops (#115241 )
2025-10-30 03:34:56 +08:00 · 2023-12-12 08:41:07 -08:00 · 2023-12-06 01:20:06 -08:00 · 2023-12-05 14:50:58 -05:00 · 2023-12-01 10:58:57 -08:00 · 2023-11-30 08:11:08 -08:00
4350 changed files with 450472 additions and 291976 deletions
--- a/.ci/docker/README.md
+++ b/.ci/docker/README.md
@ -19,7 +19,6 @@ See `build.sh` for valid build environments (it's the giant switch).
 * `ubuntu` -- Dockerfile for Ubuntu image for CPU build and test jobs
 * `ubuntu-cuda` -- Dockerfile for Ubuntu image with CUDA support for nvidia-docker
 * `ubuntu-rocm` -- Dockerfile for Ubuntu image with ROCm support
-* `ubuntu-xpu` -- Dockerfile for Ubuntu image with XPU support

 ## Usage

--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -71,11 +71,6 @@ if [[ "$image" == *cuda* && "$UBUNTU_VERSION" != "22.04" ]]; then
  DOCKERFILE="${OS}-cuda/Dockerfile"
 elif [[ "$image" == *rocm* ]]; then
  DOCKERFILE="${OS}-rocm/Dockerfile"
-elif [[ "$image" == *xpu* ]]; then
-  DOCKERFILE="${OS}-xpu/Dockerfile"
-elif [[ "$image" == *cuda*linter* ]]; then
-  # Use a separate Dockerfile for linter to keep a small image size
-  DOCKERFILE="linter-cuda/Dockerfile"
 elif [[ "$image" == *linter* ]]; then
  # Use a separate Dockerfile for linter to keep a small image size
  DOCKERFILE="linter/Dockerfile"
@ -134,6 +129,35 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
+  pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc7)
+    CUDA_VERSION=11.8.0
+    CUDNN_VERSION=8
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=7
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    ;;
+    pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc7-inductor-benchmarks)
+    CUDA_VERSION=11.8.0
+    CUDNN_VERSION=8
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=7
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
  pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
    CUDA_VERSION=12.1.1
    CUDNN_VERSION=8
@ -157,13 +181,13 @@ case "$image" in
    CONDA_CMAKE=yes
    ONNX=yes
    ;;
-  pytorch-linux-focal-py3-clang9-android-ndk-r21e)
+  pytorch-linux-focal-py3-clang7-android-ndk-r19c)
    ANACONDA_PYTHON_VERSION=3.8
-    CLANG_VERSION=9
+    CLANG_VERSION=7
    LLVMDEV=yes
    PROTOBUF=yes
    ANDROID=yes
-    ANDROID_NDK_VERSION=r21e
+    ANDROID_NDK_VERSION=r19c
    GRADLE_VERSION=6.8.3
    NINJA_VERSION=1.9.0
    ;;
@ -204,7 +228,7 @@ case "$image" in
    PROTOBUF=yes
    DB=yes
    VISION=yes
-    ROCM_VERSION=5.6
+    ROCM_VERSION=5.4.2
    NINJA_VERSION=1.9.0
    CONDA_CMAKE=yes
    TRITON=yes
@ -215,20 +239,21 @@ case "$image" in
    PROTOBUF=yes
    DB=yes
    VISION=yes
-    ROCM_VERSION=5.7
+    ROCM_VERSION=5.6
    NINJA_VERSION=1.9.0
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-xpu-2024.0-py3)
+  pytorch-linux-focal-py3.8-gcc7)
    ANACONDA_PYTHON_VERSION=3.8
-    GCC_VERSION=11
+    GCC_VERSION=7
    PROTOBUF=yes
    DB=yes
    VISION=yes
-    BASEKIT_VERSION=2024.0.0-49522
-    NINJA_VERSION=1.9.0
+    KATEX=yes
    CONDA_CMAKE=yes
+    TRITON=yes
+    DOCS=yes
    ;;
    pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.8
@ -261,12 +286,6 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-py3-clang15-asan)
-    ANACONDA_PYTHON_VERSION=3.10
-    CLANG_VERSION=15
-    CONDA_CMAKE=yes
-    VISION=yes
-    ;;
  pytorch-linux-jammy-py3.8-gcc11)
    ANACONDA_PYTHON_VERSION=3.8
    GCC_VERSION=11
@ -278,12 +297,6 @@ case "$image" in
    TRITON=yes
    DOCS=yes
    ;;
-  pytorch-linux-jammy-py3-clang12-executorch)
-    ANACONDA_PYTHON_VERSION=3.10
-    CLANG_VERSION=12
-    CONDA_CMAKE=yes
-    EXECUTORCH=yes
-    ;;
  pytorch-linux-focal-linter)
    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
    # We will need to update mypy version eventually, but that's for another day. The task
@ -291,11 +304,6 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    CONDA_CMAKE=yes
    ;;
-  pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter)
-    ANACONDA_PYTHON_VERSION=3.9
-    CUDA_VERSION=11.8
-    CONDA_CMAKE=yes
-    ;;
  *)
    # Catch-all for builds that are not hardcoded.
    PROTOBUF=yes
@ -313,9 +321,6 @@ case "$image" in
      extract_version_from_image_name rocm ROCM_VERSION
      NINJA_VERSION=1.9.0
      TRITON=yes
-      # To ensure that any ROCm config will build using conda cmake
-      # and thus have LAPACK/MKL enabled
-      CONDA_CMAKE=yes
    fi
    if [[ "$image" == *centos7* ]]; then
      NINJA_VERSION=1.10.2
@ -349,11 +354,14 @@ if [[ "$image" == *cuda*  && ${OS} == "ubuntu" ]]; then
 fi

 # Build image
+# TODO: build-arg THRIFT is not turned on for any image, remove it once we confirm
+# it's no longer needed.
 docker build \
       --no-cache \
       --progress=plain \
       --build-arg "BUILD_ENVIRONMENT=${image}" \
       --build-arg "PROTOBUF=${PROTOBUF:-}" \
+       --build-arg "THRIFT=${THRIFT:-}" \
       --build-arg "LLVMDEV=${LLVMDEV:-}" \
       --build-arg "DB=${DB:-}" \
       --build-arg "VISION=${VISION:-}" \
@ -385,8 +393,6 @@ docker build \
       --build-arg "ONNX=${ONNX}" \
       --build-arg "DOCS=${DOCS}" \
       --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
-       --build-arg "EXECUTORCH=${EXECUTORCH}" \
-       --build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \
       -f $(dirname ${DOCKERFILE})/Dockerfile \
       -t "$tmp_tag" \
       "$@" \
--- a/.ci/docker/centos-rocm/Dockerfile
+++ b/.ci/docker/centos-rocm/Dockerfile
@ -98,18 +98,6 @@ COPY ./common/install_ninja.sh install_ninja.sh
 RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
 RUN rm install_ninja.sh

-ARG TRITON
-# Install triton, this needs to be done before sccache because the latter will
-# try to reach out to S3, which docker build runners don't have access
-ENV CMAKE_C_COMPILER cc
-ENV CMAKE_CXX_COMPILER c++
-COPY ./common/install_triton.sh install_triton.sh
-COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
-COPY triton_version.txt triton_version.txt
-RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
-RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
-
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +0,0 @@
-663882fe7dc518c04adf3d2ee5ccb7d99f41ade4
--- a/.ci/docker/ci_commit_pins/huggingface.txt
+++ b/.ci/docker/ci_commit_pins/huggingface.txt
@ -1 +1 @@
-6c26faa159b79a42d7fa46cb66e2d21523351987
+4.27.4
--- a/.ci/docker/ci_commit_pins/timm.txt
+++ b/.ci/docker/ci_commit_pins/timm.txt
@ -1 +1 @@
-730b907b4d45a4713cbc425cbf224c46089fd514
+b9d43c7dcac1fe05e851dd7be7187b108af593d2
--- a/.ci/docker/ci_commit_pins/triton-rocm.txt
+++ b/.ci/docker/ci_commit_pins/triton-rocm.txt
@ -1 +1 @@
-dafe1459823b9549417ed95e9720f1b594fab329
+34f8189eae57a23cc15b4b4f032fe25757e0db8e
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-e28a256d71f3cf2bcc7b69d6bda73a9b855e385e
+e6216047b8b0aef1fe8da6ca8667a3ad0a016411
--- a/.ci/docker/common/install_base.sh
+++ b/.ci/docker/common/install_base.sh
@ -9,7 +9,10 @@ install_ubuntu() {
  #   "$UBUNTU_VERSION" == "18.04"*
  # instead of
  #   "$UBUNTU_VERSION" == "18.04"
-  if [[ "$UBUNTU_VERSION" == "20.04"* ]]; then
+  if [[ "$UBUNTU_VERSION" == "18.04"* ]]; then
+    cmake3="cmake=3.10*"
+    maybe_libiomp_dev="libiomp-dev"
+  elif [[ "$UBUNTU_VERSION" == "20.04"* ]]; then
    cmake3="cmake=3.16*"
    maybe_libiomp_dev=""
  elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
@ -20,9 +23,7 @@ install_ubuntu() {
    maybe_libiomp_dev="libiomp-dev"
  fi

-  if [[ "$CLANG_VERSION" == 15 ]]; then
-    maybe_libomp_dev="libomp-15-dev"
-  elif [[ "$CLANG_VERSION" == 12 ]]; then
+  if [[ "$CLANG_VERSION" == 12 ]]; then
    maybe_libomp_dev="libomp-12-dev"
  elif [[ "$CLANG_VERSION" == 10 ]]; then
    maybe_libomp_dev="libomp-10-dev"
@ -61,7 +62,6 @@ install_ubuntu() {
    ${maybe_libiomp_dev} \
    libyaml-dev \
    libz-dev \
-    libjemalloc2 \
    libjpeg-dev \
    libasound2-dev \
    libsndfile-dev \
@ -75,7 +75,6 @@ install_ubuntu() {
    libtool \
    vim \
    unzip \
-    gpg-agent \
    gdb

  # Should resolve issues related to various apt package repository cert issues
--- a/.ci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@ -54,13 +54,23 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
  CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
  if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ]; then
    conda_install numpy=1.23.5 ${CONDA_COMMON_DEPS}
-  else
+  elif [ "$ANACONDA_PYTHON_VERSION" = "3.10" ]; then
    conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
+  elif [ "$ANACONDA_PYTHON_VERSION" = "3.9" ]; then
+    conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
+  elif [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then
+    conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
+  else
+    # Install `typing-extensions` for 3.7
+    conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS} typing-extensions
  fi

-  # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
-  # and libpython-static for torch deploy
-  conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}"
+  # This is only supported in 3.8 upward
+  if [ "$MINOR_PYTHON_VERSION" -gt "7" ]; then
+    # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
+    # and libpython-static for torch deploy
+    conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}"
+  fi

  # Use conda cmake in some cases. Conda cmake will be newer than our supported
  # min version (3.5 for xenial and 3.10 for bionic), so we only do it in those
@ -79,7 +89,13 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
  # Install some other packages, including those needed for Python test reporting
  pip_install -r /opt/conda/requirements-ci.txt

-  pip_install -U scikit-learn
+  # Update scikit-learn to a python-3.8 compatible version
+  if [[ $(python -c "import sys; print(int(sys.version_info >= (3, 8)))") == "1" ]]; then
+    pip_install -U scikit-learn
+  else
+    # Pinned scikit-learn due to https://github.com/scikit-learn/scikit-learn/issues/14485 (affects gcc 5.5 only)
+    pip_install scikit-learn==0.20.3
+  fi

  if [ -n "$DOCS" ]; then
    apt-get update
--- a/.ci/docker/common/install_cudnn.sh
+++ b/.ci/docker/common/install_cudnn.sh
@ -2,8 +2,8 @@

 if [[ ${CUDNN_VERSION} == 8 ]]; then
    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-    mkdir tmp_cudnn
-    pushd tmp_cudnn
+    mkdir tmp_cudnn && cd tmp_cudnn
+    CUDNN_NAME="cudnn-linux-x86_64-8.3.2.44_cuda11.5-archive"
    if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive"
        curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
@ -11,14 +11,17 @@ if [[ ${CUDNN_VERSION} == 8 ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
        curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz
    else
-        print "Unsupported CUDA version ${CUDA_VERSION}"
-        exit 1
+        curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.3.2/local_installers/11.5/${CUDNN_NAME}.tar.xz
    fi

    tar xf ${CUDNN_NAME}.tar.xz
+    cp -a ${CUDNN_NAME}/include/* /usr/include/
    cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
+    cp -a ${CUDNN_NAME}/include/* /usr/include/x86_64-linux-gnu/
+
    cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/
-    popd
+    cp -a ${CUDNN_NAME}/lib/* /usr/lib/x86_64-linux-gnu/
+    cd ..
    rm -rf tmp_cudnn
    ldconfig
 fi
--- a/.ci/docker/common/install_cusparselt.sh
+++ b/.ci/docker/common/install_cusparselt.sh
@ -1,21 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
-mkdir tmp_cusparselt && cd tmp_cusparselt
-
-if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
-    CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.5.2.1-archive"
-    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
-elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
-    CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
-    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
-fi
-
-tar xf ${CUSPARSELT_NAME}.tar.xz
-cp -a ${CUSPARSELT_NAME}/include/* /usr/local/cuda/include/
-cp -a ${CUSPARSELT_NAME}/lib/* /usr/local/cuda/lib64/
-cd ..
-rm -rf tmp_cusparselt
-ldconfig
--- a/.ci/docker/common/install_executorch.sh
+++ b/.ci/docker/common/install_executorch.sh
@ -1,62 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
-
-clone_executorch() {
-  EXECUTORCH_PINNED_COMMIT=$(get_pinned_commit executorch)
-
-  # Clone the Executorch
-  git clone https://github.com/pytorch/executorch.git
-
-  # and fetch the target commit
-  pushd executorch
-  git checkout "${EXECUTORCH_PINNED_COMMIT}"
-  git submodule update --init
-  popd
-
-  chown -R jenkins executorch
-}
-
-install_buck2() {
-  pushd executorch/.ci/docker
-
-  BUCK2_VERSION=$(cat ci_commit_pins/buck2.txt)
-  source common/install_buck.sh
-
-  popd
-}
-
-install_conda_dependencies() {
-  pushd executorch/.ci/docker
-  # Install conda dependencies like flatbuffer
-  conda_install --file conda-env-ci.txt
-  popd
-}
-
-install_pip_dependencies() {
-  pushd executorch/.ci/docker
-  # Install all Python dependencies
-  pip_install -r requirements-ci.txt
-  popd
-}
-
-setup_executorch() {
-  pushd executorch
-  source .ci/scripts/utils.sh
-
-  install_flatc_from_source
-  pip_install .
-  build_executorch_runner "cmake"
-
-  # Make sure that all the newly generate files are owned by Jenkins
-  chown -R jenkins .
-  popd
-}
-
-clone_executorch
-install_buck2
-install_conda_dependencies
-install_pip_dependencies
-setup_executorch
--- a/.ci/docker/common/install_inductor_benchmark_deps.sh
+++ b/.ci/docker/common/install_inductor_benchmark_deps.sh
@ -6,21 +6,19 @@ source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

 function install_huggingface() {
  local version
-  commit=$(get_pinned_commit huggingface)
+  version=$(get_pinned_commit huggingface)
  pip_install pandas==2.0.3
-  pip_install "git+https://github.com/huggingface/transformers@${commit}"
+  pip_install "transformers==${version}"
 }

 function install_timm() {
  local commit
  commit=$(get_pinned_commit timm)
  pip_install pandas==2.0.3
-  pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
-  # Clean up
-  conda_run pip uninstall -y cmake torch torchvision triton
+  pip_install "git+https://github.com/rwightman/pytorch-image-models@${commit}"
 }

 # Pango is needed for weasyprint which is needed for doctr
 conda_install pango
 install_huggingface
-install_timm
+# install_timm
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -10,13 +10,13 @@ retry () {

 # A bunch of custom pip dependencies for ONNX
 pip_install \
-  beartype==0.15.0 \
+  beartype==0.10.4 \
  filelock==3.9.0 \
  flatbuffers==2.0 \
  mock==5.0.1 \
  ninja==1.10.2 \
  networkx==2.0 \
-  numpy==1.24.2
+  numpy==1.22.4

 # ONNXRuntime should be installed before installing
 # onnx-weekly. Otherwise, onnx-weekly could be
@ -26,13 +26,13 @@ pip_install \
  pytest-cov==4.0.0 \
  pytest-subtests==0.10.0 \
  tabulate==0.9.0 \
-  transformers==4.32.1
+  transformers==4.31.0

 pip_install coloredlogs packaging
-retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.17.0.dev20231005006
+retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.16.0.dev20230908001

-pip_install -i https://test.pypi.org/simple/ onnx==1.15.0rc2
-pip_install onnxscript==0.1.0.dev20231128 --no-deps
+pip_install onnx==1.14.1
+pip_install onnxscript-preview==0.1.0.dev20230828 --no-deps

 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -5,10 +5,8 @@ set -ex
 # "install" hipMAGMA into /opt/rocm/magma by copying after build
 git clone https://bitbucket.org/icl/magma.git
 pushd magma
-
-# Version 2.7.2 + ROCm related updates
-git checkout 823531632140d0edcb7e77c3edc0e837421471c5
-
+# Fixes memory leaks of magma found while executing linalg UTs
+git checkout 28592a7170e4b3707ed92644bf4a689ed600c27f
 cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
 echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
 echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib' >> make.inc
--- a/.ci/docker/common/install_thrift.sh
+++ b/.ci/docker/common/install_thrift.sh
@ -0,0 +1,14 @@
+apt-get update
+apt-get install -y sudo wget libboost-dev libboost-test-dev libboost-program-options-dev libboost-filesystem-dev libboost-thread-dev libevent-dev automake libtool flex bison pkg-config g++ libssl-dev
+wget https://www-us.apache.org/dist/thrift/0.12.0/thrift-0.12.0.tar.gz
+tar -xvf thrift-0.12.0.tar.gz
+cd thrift-0.12.0
+for file in ./compiler/cpp/Makefile*; do
+  sed -i 's/\-Werror//' $file
+done
+./bootstrap.sh
+./configure --without-php --without-java --without-python --without-nodejs --without-go --without-ruby
+sudo make
+sudo make install
+cd ..
+rm thrift-0.12.0.tar.gz
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -23,10 +23,8 @@ fi
 # The logic here is copied from .ci/pytorch/common_utils.sh
 TRITON_PINNED_COMMIT=$(get_pinned_commit ${TRITON_TEXT_FILE})

-if [ -n "${UBUNTU_VERSION}" ];then
-    apt update
-    apt-get install -y gpg-agent
-fi
+apt update
+apt-get install -y gpg-agent

 if [ -n "${CONDA_CMAKE}" ]; then
  # Keep the current cmake and numpy version here, so we can reinstall them later
@ -38,12 +36,12 @@ if [ -z "${MAX_JOBS}" ]; then
    export MAX_JOBS=$(nproc)
 fi

-if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" == "7" ]]; then
+if [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" == "7" ]]; then
  # Triton needs at least gcc-9 to build
  apt-get install -y g++-9

  CXX=g++-9 pip_install "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python"
-elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
+elif [ -n "${CLANG_VERSION}" ]; then
  # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
  add-apt-repository -y ppa:ubuntu-toolchain-r/test
  apt-get install -y g++-9
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -1,115 +0,0 @@
-#!/bin/bash
-set -xe
-
-
-# Intel® software for general purpose GPU capabilities.
-# Refer to https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html
-
-# Intel® oneAPI Base Toolkit (version 2024.0.0) has been updated to include functional and security updates.
-# Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html
-
-# Users should update to the latest version as it becomes available
-
-function install_ubuntu() {
-    apt-get update -y
-    apt-get install -y gpg-agent wget
-
-    # Set up the repository. To do this, download the key to the system keyring
-    wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
-        | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
-    wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
-        | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
-
-    # Add the signed entry to APT sources and configure the APT client to use the Intel repository
-    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/production/2328 unified" \
-        | tee /etc/apt/sources.list.d/intel-gpu-jammy.list
-    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
-        | tee /etc/apt/sources.list.d/oneAPI.list
-
-    # Update the packages list and repository index
-    apt-get update
-
-    # The xpu-smi packages
-    apt-get install -y flex bison xpu-smi
-    # Compute and Media Runtimes
-    apt-get install -y \
-        intel-opencl-icd intel-level-zero-gpu level-zero \
-        intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
-        libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
-        libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
-        mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
-    # Development Packages
-    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
-    # Install Intel® oneAPI Base Toolkit
-    if [ -n "$BASEKIT_VERSION" ]; then
-        apt-get install intel-basekit=$BASEKIT_VERSION -y
-    else
-        apt-get install intel-basekit -y
-    fi
-
-    # Cleanup
-    apt-get autoclean && apt-get clean
-    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-}
-
-function install_centos() {
-    dnf install -y 'dnf-command(config-manager)'
-    dnf config-manager --add-repo \
-        https://repositories.intel.com/gpu/rhel/8.6/production/2328/unified/intel-gpu-8.6.repo
-    # To add the EPEL repository needed for DKMS
-    dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
-        # https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
-
-    # Create the YUM repository file in the /temp directory as a normal user
-    tee > /tmp/oneAPI.repo << EOF
-[oneAPI]
-name=Intel® oneAPI repository
-baseurl=https://yum.repos.intel.com/oneapi
-enabled=1
-gpgcheck=1
-repo_gpgcheck=1
-gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-EOF
-
-    # Move the newly created oneAPI.repo file to the YUM configuration directory /etc/yum.repos.d
-    mv /tmp/oneAPI.repo /etc/yum.repos.d
-
-    # The xpu-smi packages
-    dnf install -y flex bison xpu-smi
-    # Compute and Media Runtimes
-    dnf install -y \
-        intel-opencl intel-media intel-mediasdk libmfxgen1 libvpl2\
-        level-zero intel-level-zero-gpu mesa-dri-drivers mesa-vulkan-drivers \
-        mesa-vdpau-drivers libdrm mesa-libEGL mesa-libgbm mesa-libGL \
-        mesa-libxatracker libvpl-tools intel-metrics-discovery \
-        intel-metrics-library intel-igc-core intel-igc-cm \
-        libva libva-utils intel-gmmlib libmetee intel-gsc intel-ocloc hwinfo clinfo
-    # Development packages
-    dnf install -y --refresh \
-        intel-igc-opencl-devel level-zero-devel intel-gsc-devel libmetee-devel \
-        level-zero-devel
-    # Install Intel® oneAPI Base Toolkit
-    dnf install intel-basekit -y
-
-    # Cleanup
-    dnf clean all
-    rm -rf /var/cache/yum
-    rm -rf /var/lib/yum/yumdb
-    rm -rf /var/lib/yum/history
-}
-
-
-# The installation depends on the base OS
-ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
-case "$ID" in
-    ubuntu)
-        install_ubuntu
-    ;;
-    centos)
-        install_centos
-    ;;
-    *)
-        echo "Unable to determine OS..."
-        exit 1
-    ;;
-esac
--- a/.ci/docker/linter-cuda/Dockerfile
+++ b/.ci/docker/linter-cuda/Dockerfile
@ -1,44 +0,0 @@
-ARG UBUNTU_VERSION
-
-FROM ubuntu:${UBUNTU_VERSION}
-
-ARG UBUNTU_VERSION
-
-ENV DEBIAN_FRONTEND noninteractive
-
-# Install common dependencies (so that this step can be cached separately)
-COPY ./common/install_base.sh install_base.sh
-RUN bash ./install_base.sh && rm install_base.sh
-
-# Install missing libomp-dev
-RUN apt-get update && apt-get install -y --no-install-recommends libomp-dev && apt-get autoclean && apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-
-# Install user
-COPY ./common/install_user.sh install_user.sh
-RUN bash ./install_user.sh && rm install_user.sh
-
-# Install conda and other packages (e.g., numpy, pytest)
-ARG ANACONDA_PYTHON_VERSION
-ARG CONDA_CMAKE
-ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
-ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
-COPY requirements-ci.txt /opt/conda/requirements-ci.txt
-COPY ./common/install_conda.sh install_conda.sh
-COPY ./common/common_utils.sh common_utils.sh
-RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
-
-# Install cuda and cudnn
-ARG CUDA_VERSION
-RUN wget -q https://raw.githubusercontent.com/pytorch/builder/main/common/install_cuda.sh -O install_cuda.sh
-RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
-ENV DESIRED_CUDA ${CUDA_VERSION}
-ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
-
-# Note that Docker build forbids copying file outside the build context
-COPY ./common/install_linter.sh install_linter.sh
-COPY ./common/common_utils.sh common_utils.sh
-RUN bash ./install_linter.sh
-RUN rm install_linter.sh common_utils.sh
-
-USER jenkins
-CMD ["bash"]
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -75,10 +75,10 @@ librosa>=0.6.2 ; python_version < "3.11"
 #Pinned versions:
 #test that import:

-mypy==1.7.0
+mypy==1.4.1
 # Pin MyPy version because new errors are likely to appear with each release
 #Description: linter
-#Pinned versions: 1.7.0
+#Pinned versions: 1.4.1
 #test that import: test_typing.py, test_type_hints.py

 networkx==2.8.8
@ -124,22 +124,10 @@ opt-einsum==3.3
 #Pinned versions: 3.3
 #test that import: test_linalg.py

-optree==0.9.1
-#Description: A library for tree manipulation
-#Pinned versions: 0.9.1
-#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
-#test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
-#common_utils.py, test_eager_transforms.py, test_python_dispatch.py,
-#test_expanded_weights.py, test_decomp.py, test_overrides.py, test_masked.py,
-#test_ops.py, test_prims.py, test_subclass.py, test_functionalization.py,
-#test_schema_check.py, test_profiler_tree.py, test_meta.py, test_torchxla_num_output.py,
-#test_utils.py, test_proxy_tensor.py, test_memory_profiler.py, test_view_ops.py,
-#test_pointwise_ops.py, test_dtensor_ops.py, test_torchinductor.py, test_fx.py,
-#test_fake_tensor.py, test_mps.py
-
-pillow==10.0.1
+pillow==9.3.0 ; python_version <= "3.8"
+pillow==9.5.0 ; python_version > "3.8"
 #Description:  Python Imaging Library fork
-#Pinned versions: 10.0.1
+#Pinned versions:
 #test that import:

 protobuf==3.20.2
@ -292,14 +280,3 @@ tensorboard==2.13.0
 #Description: Also included in .ci/docker/requirements-docs.txt
 #Pinned versions:
 #test that import: test_tensorboard
-
-pywavelets==1.4.1
-#Description: This is a requirement of scikit-image, we need to pin
-# it here because 1.5.0 conflicts with numpy 1.21.2 used in CI
-#Pinned versions: 1.4.1
-#test that import:
-
-lxml==5.0.0.
-#Description: This is a requirement of unittest-xml-reporting
-
-# Python-3.9 binaries
--- a/.ci/docker/triton_version.txt
+++ b/.ci/docker/triton_version.txt
@ -1 +1 @@
-2.2.0
+2.1.0
--- a/.ci/docker/ubuntu-cuda/Dockerfile
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@ -79,6 +79,12 @@ ENV OPENSSL_ROOT_DIR /opt/openssl
 RUN bash ./install_openssl.sh
 ENV OPENSSL_DIR /opt/openssl

+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 ARG INDUCTOR_BENCHMARKS
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
@ -87,12 +93,6 @@ COPY ci_commit_pins/timm.txt timm.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

-# (optional) Install non-default CMake version
-ARG CMAKE_VERSION
-COPY ./common/install_cmake.sh install_cmake.sh
-RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
-RUN rm install_cmake.sh
-
 ARG TRITON
 # Install triton, this needs to be done before sccache because the latter will
 # try to reach out to S3, which docker build runners don't have access
@ -142,12 +142,6 @@ COPY ./common/install_cudnn.sh install_cudnn.sh
 RUN if [ "${CUDNN_VERSION}" -eq 8 ]; then bash install_cudnn.sh; fi
 RUN rm install_cudnn.sh

-# Install CUSPARSELT
-ARG CUDA_VERSION
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
-RUN bash install_cusparselt.sh
-RUN rm install_cusparselt.sh
-
 # Delete /usr/local/cuda-11.X/cuda-11.X symlinks
 RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
 RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@ -1,118 +0,0 @@
-ARG UBUNTU_VERSION
-
-FROM ubuntu:${UBUNTU_VERSION}
-
-ARG UBUNTU_VERSION
-
-ENV DEBIAN_FRONTEND noninteractive
-
-ARG CLANG_VERSION
-
-# Install common dependencies (so that this step can be cached separately)
-COPY ./common/install_base.sh install_base.sh
-RUN bash ./install_base.sh && rm install_base.sh
-
-# Install clang
-ARG LLVMDEV
-COPY ./common/install_clang.sh install_clang.sh
-RUN bash ./install_clang.sh && rm install_clang.sh
-
-# Install user
-COPY ./common/install_user.sh install_user.sh
-RUN bash ./install_user.sh && rm install_user.sh
-
-# Install katex
-ARG KATEX
-COPY ./common/install_docs_reqs.sh install_docs_reqs.sh
-RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
-
-# Install conda and other packages (e.g., numpy, pytest)
-ARG ANACONDA_PYTHON_VERSION
-ARG CONDA_CMAKE
-ARG DOCS
-ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
-ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
-ENV DOCS=$DOCS
-COPY requirements-ci.txt requirements-docs.txt /opt/conda/
-COPY ./common/install_conda.sh install_conda.sh
-COPY ./common/common_utils.sh common_utils.sh
-RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt
-
-# Install gcc
-ARG GCC_VERSION
-COPY ./common/install_gcc.sh install_gcc.sh
-RUN bash ./install_gcc.sh && rm install_gcc.sh
-
-# Install lcov for C++ code coverage
-COPY ./common/install_lcov.sh install_lcov.sh
-RUN  bash ./install_lcov.sh && rm install_lcov.sh
-
-COPY ./common/install_openssl.sh install_openssl.sh
-RUN bash ./install_openssl.sh
-ENV OPENSSL_ROOT_DIR /opt/openssl
-ENV OPENSSL_DIR /opt/openssl
-RUN rm install_openssl.sh
-
-ARG INDUCTOR_BENCHMARKS
-COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
-COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/huggingface.txt huggingface.txt
-COPY ci_commit_pins/timm.txt timm.txt
-RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
-
-ARG TRITON
-# Install triton, this needs to be done before sccache because the latter will
-# try to reach out to S3, which docker build runners don't have access
-COPY ./common/install_triton.sh install_triton.sh
-COPY ./common/common_utils.sh common_utils.sh
-# TODO: will add triton xpu commit
-COPY ci_commit_pins/triton.txt triton.txt
-RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
-RUN rm install_triton.sh common_utils.sh triton.txt
-
-# (optional) Install database packages like LMDB and LevelDB
-ARG DB
-COPY ./common/install_db.sh install_db.sh
-RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
-RUN rm install_db.sh
-ENV INSTALLED_DB ${DB}
-
-# (optional) Install vision packages like OpenCV and ffmpeg
-ARG VISION
-COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
-RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
-RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
-ENV INSTALLED_VISION ${VISION}
-
-# Install XPU Dependencies
-ARG BASEKIT_VERSION
-COPY ./common/install_xpu.sh install_xpu.sh
-RUN bash ./install_xpu.sh && rm install_xpu.sh
-
-# (optional) Install non-default CMake version
-ARG CMAKE_VERSION
-COPY ./common/install_cmake.sh install_cmake.sh
-RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
-RUN rm install_cmake.sh
-
-# (optional) Install non-default Ninja version
-ARG NINJA_VERSION
-COPY ./common/install_ninja.sh install_ninja.sh
-RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
-RUN rm install_ninja.sh
-
-# Install ccache/sccache (do this last, so we get priority in PATH)
-COPY ./common/install_cache.sh install_cache.sh
-ENV PATH /opt/cache/bin:$PATH
-RUN bash ./install_cache.sh && rm install_cache.sh
-
-# Include BUILD_ENVIRONMENT environment variable in image
-ARG BUILD_ENVIRONMENT
-ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
-
-# Install LLVM dev version (Defined in the pytorch/builder github repository)
-COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
-
-USER jenkins
-CMD ["bash"]
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -17,6 +17,13 @@ ARG LLVMDEV
 COPY ./common/install_clang.sh install_clang.sh
 RUN bash ./install_clang.sh && rm install_clang.sh

+# (optional) Install thrift.
+ARG THRIFT
+COPY ./common/install_thrift.sh install_thrift.sh
+RUN if [ -n "${THRIFT}" ]; then bash ./install_thrift.sh; fi
+RUN rm install_thrift.sh
+ENV INSTALLED_THRIFT ${THRIFT}
+
 # Install user
 COPY ./common/install_user.sh install_user.sh
 RUN bash ./install_user.sh && rm install_user.sh
@ -146,14 +153,6 @@ COPY ci_commit_pins/triton.txt triton.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton.txt

-ARG EXECUTORCH
-# Build and install executorch
-COPY ./common/install_executorch.sh install_executorch.sh
-COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/executorch.txt executorch.txt
-RUN if [ -n "${EXECUTORCH}" ]; then bash ./install_executorch.sh; fi
-RUN rm install_executorch.sh common_utils.sh executorch.txt
-
 ARG ONNX
 # Install ONNX dependencies
 COPY ./common/install_onnx.sh ./common/common_utils.sh ./
--- a/.ci/onnx/test.sh
+++ b/.ci/onnx/test.sh
@ -3,6 +3,11 @@
 # shellcheck source=./common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common.sh"

+# Use to retry ONNX test, only retry it twice
+retry () {
+    "$@" || (sleep 60 && "$@")
+}
+
 if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
  # TODO: This can be removed later once vision is also part of the Docker image
  pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
@ -11,5 +16,5 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
  # NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we
  # need to bring this to the standard PyTorch run_test eventually. The issue will be tracked in
  # https://github.com/pytorch/pytorch/issues/98626
-  "$ROOT_DIR/scripts/onnx/test.sh"
+  retry "$ROOT_DIR/scripts/onnx/test.sh"
 fi
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -28,8 +28,6 @@ echo "Environment variables:"
 env

 if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
-  # Use jemalloc during compilation to mitigate https://github.com/pytorch/pytorch/issues/116289
-  export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
  echo "NVCC version:"
  nvcc --version
 fi
@ -65,12 +63,6 @@ else
  export LLVM_DIR=/opt/llvm/lib/cmake/llvm
 fi

-if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then
-  # To build test_edge_op_registration
-  export BUILD_EXECUTORCH=ON
-  export USE_CUDA=0
-fi
-
 if ! which conda; then
  # In ROCm CIs, we are doing cross compilation on build machines with
  # intel cpu and later run tests on machines with amd cpu.
@ -153,12 +145,6 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
  python tools/amd_build/build_amd.py
 fi

-if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/compiler/latest/env/vars.sh
-  export USE_XPU=1
-fi
-
 # sccache will fail for CUDA builds if all cores are used for compiling
 # gcc 7 with sccache seems to have intermittent OOM issue if all cores are used
 if [ -z "$MAX_JOBS" ]; then
@ -173,14 +159,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* && -z "$TORCH_CUDA_ARCH_LIST" ]]; then
  exit 1
 fi

-# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
-# memory to build and will OOM
-if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ "$TORCH_CUDA_ARCH_LIST" == *"8.6"* || "$TORCH_CUDA_ARCH_LIST" == *"8.0"* ]]; then
-  echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
-  echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
-  export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
-fi
-
 if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
  export CC=clang
  export CXX=clang++
@ -190,6 +168,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
  export LDSHARED="clang --shared"
  export USE_CUDA=0
  export USE_ASAN=1
+  export USE_MKLDNN=0
  export UBSAN_FLAGS="-fno-sanitize-recover=all;-fno-sanitize=float-divide-by-zero;-fno-sanitize=float-cast-overflow"
  unset USE_LLVM
 fi
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -43,7 +43,7 @@ function assert_git_not_dirty() {
    # TODO: we should add an option to `build_amd.py` that reverts the repo to
    #       an unmodified state.
    if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *xla* ]] ; then
-        git_status=$(git status --porcelain | grep -v '?? third_party' || true)
+        git_status=$(git status --porcelain)
        if [[ $git_status ]]; then
            echo "Build left local git repository checkout dirty"
            echo "git status --porcelain:"
@ -171,9 +171,16 @@ function install_torchrec_and_fbgemm() {
  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
 }

+function install_numpy_pytorch_interop() {
+  local commit
+  commit=$(get_pinned_commit numpy_pytorch_interop)
+  # TODO: --no-use-pep517 will result in failure.
+  pip_install --user "git+https://github.com/Quansight-Labs/numpy_pytorch_interop.git@${commit}"
+}
+
 function clone_pytorch_xla() {
  if [[ ! -d ./xla ]]; then
-    git clone --recursive --quiet https://github.com/pytorch/xla.git
+    git clone --recursive -b r2.1 https://github.com/pytorch/xla.git
    pushd xla
    # pin the xla hash so that we don't get broken by changes to xla
    git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"
@ -205,6 +212,15 @@ function test_torch_deploy(){
 popd
 }

+function install_timm() {
+  local commit
+  commit=$(get_pinned_commit timm)
+  pip_install pandas
+  pip_install scipy
+  pip_install z3-solver
+  pip_install "git+https://github.com/rwightman/pytorch-image-models@${commit}"
+}
+
 function checkout_install_torchbench() {
  local commit
  commit=$(get_pinned_commit torchbench)
--- a/.ci/pytorch/macos-build.sh
+++ b/.ci/pytorch/macos-build.sh
@ -43,7 +43,7 @@ cross_compile_arm64() {
 compile_arm64() {
  # Compilation for arm64
  # TODO: Compile with OpenMP support (but this causes CI regressions as cross-compilation were done with OpenMP disabled)
-  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
+  USE_DISTRIBUTED=0 USE_OPENMP=0 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
 }

 compile_x86_64() {
--- a/.ci/pytorch/multigpu-test.sh
+++ b/.ci/pytorch/multigpu-test.sh
@ -36,12 +36,10 @@ time python test/run_test.py --verbose -i distributed/test_functional_api


 # DTensor tests
+time python test/run_test.py --verbose -i distributed/_tensor/test_device_mesh
 time python test/run_test.py --verbose -i distributed/_tensor/test_random_ops
 time python test/run_test.py --verbose -i distributed/_tensor/test_dtensor_compile

-# DeviceMesh test
-time python test/run_test.py --verbose -i distributed/test_device_mesh
-
 # DTensor/TP tests
 time python test/run_test.py --verbose -i distributed/tensor/parallel/test_ddp_2d_parallel
 time python test/run_test.py --verbose -i distributed/tensor/parallel/test_fsdp_2d_parallel
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -18,10 +18,6 @@ BUILD_DIR="build"
 BUILD_RENAMED_DIR="build_renamed"
 BUILD_BIN_DIR="$BUILD_DIR"/bin

-#Set Default values for these variables in case they are not set
-SHARD_NUMBER="${SHARD_NUMBER:=1}"
-NUM_TEST_SHARDS="${NUM_TEST_SHARDS:=1}"
-
 export VALGRIND=ON
 # export TORCH_INDUCTOR_INSTALL_GXX=ON
 if [[ "$BUILD_ENVIRONMENT" == *clang9* ]]; then
@ -84,11 +80,6 @@ if [[ "$BUILD_ENVIRONMENT" != *bazel* ]]; then
  CUSTOM_TEST_ARTIFACT_BUILD_DIR=$(realpath "${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-"build/custom_test_artifacts"}")
 fi

-# Reduce set of tests to include when running run_test.py
-if [[ -n $TESTS_TO_INCLUDE ]]; then
-  echo "Setting INCLUDE_CLAUSE"
-  INCLUDE_CLAUSE="--include $TESTS_TO_INCLUDE"
-fi

 # shellcheck source=./common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
@ -128,8 +119,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then
  # mainly used so that we're not spending extra cycles testing cpu
  # devices on expensive gpu machines
  export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"
-elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
-  export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
 fi

 if [[ "$TEST_CONFIG" == *crossref* ]]; then
@ -142,15 +131,6 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
  rocminfo | grep -E 'Name:.*\sgfx|Marketing'
 fi

-if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
-  # Source Intel oneAPI envrioment script to enable xpu runtime related libraries
-  # refer to https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/2024-0/use-the-setvars-and-oneapi-vars-scripts-with-linux.html
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/compiler/latest/env/vars.sh
-  # Check XPU status before testing
-  xpu-smi discovery
-fi
-
 if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
  # JIT C++ extensions require ninja.
  pip_install --user "ninja==1.10.2"
@ -168,7 +148,7 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
    export PYTORCH_TEST_WITH_ASAN=1
    export PYTORCH_TEST_WITH_UBSAN=1
    # TODO: Figure out how to avoid hard-coding these paths
-    export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-15/bin/llvm-symbolizer
+    export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-12/bin/llvm-symbolizer
    export TORCH_USE_RTLD_GLOBAL=1
    # NB: We load libtorch.so with RTLD_GLOBAL for UBSAN, unlike our
    # default behavior.
@ -202,7 +182,7 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
    # have, and it applies to child processes.

    # TODO: get rid of the hardcoded path
-    export LD_PRELOAD=/usr/lib/llvm-15/lib/clang/15.0.7/lib/linux/libclang_rt.asan-x86_64.so
+    export LD_PRELOAD=/usr/lib/llvm-12/lib/clang/12.0.1/lib/linux/libclang_rt.asan-x86_64.so
    # Disable valgrind for asan
    export VALGRIND=OFF
    # Increase stack size, because ASAN red zones use more stack
@ -248,16 +228,13 @@ test_python_shard() {
    exit 1
  fi

-  # Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly
-  # shellcheck disable=SC2086
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS" --verbose

  assert_git_not_dirty
 }

 test_python() {
-  # shellcheck disable=SC2086
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --verbose
  assert_git_not_dirty
 }

@ -274,7 +251,6 @@ test_dynamo_shard() {
    --exclude-jit-executor \
    --exclude-distributed-tests \
    --exclude \
-      test_ao_sparsity \
      test_autograd \
      test_jit \
      test_proxy_tensor \
@ -305,10 +281,6 @@ test_inductor_distributed() {
  # Smuggle a few multi-gpu tests here so that we don't have to request another large node
  echo "Testing multi_gpu tests in test_torchinductor"
  pytest test/inductor/test_torchinductor.py -k test_multi_gpu
-  pytest test/inductor/test_aot_inductor.py -k test_non_default_cuda_device
-  pytest test/inductor/test_aot_inductor.py -k test_replicate_on_devices
-  pytest test/distributed/_tensor/test_dtensor_compile.py
-  pytest test/distributed/tensor/parallel/test_fsdp_2d_parallel.py

  # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
  # with if required # gpus aren't available
@ -324,26 +296,21 @@ test_inductor() {

  # docker build uses bdist_wheel which does not work with test_aot_inductor
  # TODO: need a faster way to build
-  if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
-      BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
-      CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aot_inductor
-  fi
+  BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
+  CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aot_inductor
 }

 # "Global" flags for inductor benchmarking controlled by TEST_CONFIG
 # For example 'dynamic_aot_eager_torchbench' TEST_CONFIG means we run
 # the benchmark script with '--dynamic-shapes --backend aot_eager --device cuda'
-# The matrix of test options is specified in .github/workflows/inductor.yml,
-# .github/workflows/inductor-periodic.yml, and
-# .github/workflows/inductor-perf-test-nightly.yml
+# The matrix of test options is specified in .github/workflows/periodic.yml
+# and .github/workflows/inductor.yml
 DYNAMO_BENCHMARK_FLAGS=()

 if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--backend eager)
 elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--backend aot_eager)
-elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
-  DYNAMO_BENCHMARK_FLAGS+=(--export-aot-inductor)
 elif [[ "${TEST_CONFIG}" == *inductor* && "${TEST_CONFIG}" != *perf* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--inductor)
 fi
@ -352,7 +319,7 @@ if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--dynamic-shapes --dynamic-batch-only)
 fi

-if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
+if [[ "${TEST_CONFIG}" == *cpu_accuracy* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
 else
  DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
@ -407,8 +374,8 @@ test_perf_for_dashboard() {
            --output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_cuda_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]] && [[ "$mode" == "inference" ]]; then
-        TORCHINDUCTOR_CPP_WRAPPER=1 python "benchmarks/dynamo/$suite.py" \
-            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
+        python "benchmarks/dynamo/$suite.py" \
+            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs --cpp-wrapper "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_cpp_wrapper_${suite}_${dtype}_${mode}_cuda_${target}.csv"
      fi
      if [[ "$DASHBOARD_TAG" == *freezing_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
@ -416,11 +383,6 @@ test_perf_for_dashboard() {
            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
            --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_${suite}_${dtype}_${mode}_cuda_${target}.csv"
      fi
-      if [[ "$DASHBOARD_TAG" == *freeze_autotune_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
-        TORCHINDUCTOR_MAX_AUTOTUNE=1 python "benchmarks/dynamo/$suite.py" \
-            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
-            --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv"
-      fi
      if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then
        python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \
@ -471,12 +433,19 @@ test_single_dynamo_benchmark() {
      "${DYNAMO_BENCHMARK_FLAGS[@]}" \
      "$@" "${partition_flags[@]}" \
      --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
-    python benchmarks/dynamo/check_accuracy.py \
-      --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
-    python benchmarks/dynamo/check_graph_breaks.py \
-      --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
+
+    if [[ "${TEST_CONFIG}" == *inductor* ]] && [[ "${TEST_CONFIG}" != *cpu_accuracy* ]]; then
+      # other jobs (e.g. periodic, cpu-accuracy) may have different set of expected models.
+      python benchmarks/dynamo/check_accuracy.py \
+        --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
+        --expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
+      python benchmarks/dynamo/check_graph_breaks.py \
+        --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
+        --expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
+    else
+      python benchmarks/dynamo/check_csv.py \
+        -f "$TEST_REPORTS_DIR/${name}_${suite}.csv"
+    fi
  fi
 }

@ -494,10 +463,8 @@ test_dynamo_benchmark() {
  elif [[ "${TEST_CONFIG}" == *perf* ]]; then
    test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
  else
-    if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
+    if [[ "${TEST_CONFIG}" == *cpu_accuracy* ]]; then
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
-    elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
-      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
    else
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
      test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
@ -509,27 +476,12 @@ test_inductor_torchbench_smoketest_perf() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

-  # smoke test the cpp_wrapper mode
-  TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy --bfloat16 \
-    --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv"
-  python benchmarks/dynamo/check_accuracy.py \
-      --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
-
  python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
    --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
    --output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
-  # The threshold value needs to be actively maintained to make this check useful
-  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
-
-  python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \
-    --export-aot-inductor --only nanogpt --output "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv"
-  # The threshold value needs to be actively maintained to make this check useful
-  # The perf number of nanogpt seems not very stable, e.g.
-  # https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314,
-  # and thus we lower its threshold to reduce flakiness. If this continues to be a problem,
-  # we switch to use some other model.
-  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.9
+  # the reference speedup value is hardcoded in check_hf_bert_perf_csv.py
+  # this value needs to be actively maintained to make this check useful
+  python benchmarks/dynamo/check_hf_bert_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"

  # Check memory compression ratio for a few models
  for test in hf_Albert timm_vision_transformer; do
@ -653,7 +605,7 @@ test_libtorch_jit() {

  # Run jit and lazy tensor cpp tests together to finish them faster
  if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$TEST_CONFIG" != *nogpu* ]]; then
-    LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy
+    LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/nvfuser_tests cpp/test_lazy
  else
    # CUDA tests have already been skipped when CUDA is not available
    python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy -k "not CUDA"
@ -689,20 +641,6 @@ test_libtorch_api() {
  fi
 }

-test_xpu_bin(){
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-
-  for xpu_case in "${BUILD_BIN_DIR}"/*{xpu,sycl}*
-  do
-    if [[ "$xpu_case" != *"*"* ]]; then
-      case_name=$(basename "$xpu_case")
-      echo "Testing ${case_name} ..."
-      "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml
-    fi
-  done
-}
-
 test_aot_compilation() {
  echo "Testing Ahead of Time compilation"
  ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
@ -728,8 +666,7 @@ test_vulkan() {

 test_distributed() {
  echo "Testing distributed python tests"
-  # shellcheck disable=SC2086
-  time python test/run_test.py --distributed-tests --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" $INCLUDE_CLAUSE --verbose
+  time python test/run_test.py --distributed-tests --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
  assert_git_not_dirty

  if [[ ("$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm*) && "$SHARD_NUMBER" == 1 ]]; then
@ -1038,28 +975,9 @@ test_docs_test() {
 }

 test_executorch() {
-  pushd /executorch
-
-  echo "Install torchvision and torchaudio"
-  # TODO(huydhn): Switch this to the pinned commits on ExecuTorch once they are
-  # there.  These libraries need to be built here, and not part of the Docker
-  # image because they require the target version of torch to be installed first
-  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git"
-  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git"
-
-  echo "Run ExecuTorch regression tests for some models"
-  # NB: This is a sample model, more can be added here
-  export PYTHON_EXECUTABLE=python
-  # TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
-  # shellcheck disable=SC1091
-  source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''
-
-  popd
-
  # Test torchgen generated code for Executorch.
-  echo "Testing ExecuTorch op registration"
+  echo "Testing Executorch op registration"
  "$BUILD_BIN_DIR"/test_edge_op_registration
-
  assert_git_not_dirty
 }

@ -1074,8 +992,6 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then
  install_torchvision
  build_xla
  test_xla
-elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
-  test_executorch
 elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
  test_python_legacy_jit
 elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
@ -1098,10 +1014,11 @@ elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
  test_dynamo_benchmark huggingface "$id"
 elif [[ "${TEST_CONFIG}" == *timm* ]]; then
  install_torchvision
+  install_timm
  id=$((SHARD_NUMBER-1))
  test_dynamo_benchmark timm_models "$id"
 elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
-  if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
+  if [[ "${TEST_CONFIG}" == *cpu_accuracy* ]]; then
    install_torchaudio cpu
  else
    install_torchaudio cuda
@ -1112,13 +1029,13 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
  # https://github.com/opencv/opencv-python/issues/885
  pip_install opencv-python==4.8.0.74
  if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
-    checkout_install_torchbench hf_Bert hf_Albert nanogpt timm_vision_transformer
+    checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
    PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
  else
    checkout_install_torchbench
    # Do this after checkout_install_torchbench to ensure we clobber any
    # nightlies that torchbench may pull in
-    if [[ "${TEST_CONFIG}" != *cpu_inductor* ]]; then
+    if [[ "${TEST_CONFIG}" != *cpu_accuracy* ]]; then
      install_torchrec_and_fbgemm
    fi
    PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
@ -1128,21 +1045,21 @@ elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
  test_inductor
  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
+  test_without_numpy
  install_torchvision
+  install_numpy_pytorch_interop
  test_dynamo_shard 1
  test_aten
-elif [[ "${TEST_CONFIG}" == *dynamo* && $SHARD_NUMBER -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
+elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
  install_torchvision
-  test_dynamo_shard "${SHARD_NUMBER}"
+  install_numpy_pytorch_interop
+  test_dynamo_shard 2
 elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
  test_without_numpy
  install_torchvision
  test_python_shard 1
  test_aten
  test_libtorch 1
-  if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
-    test_xpu_bin
-  fi
 elif [[ "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
  install_torchvision
  test_python_shard 2
@ -1163,15 +1080,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
  test_libtorch
 elif [[ "${TEST_CONFIG}" = docs_test ]]; then
  test_docs_test
-elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
-  install_torchvision
-  test_python
-  test_aten
-elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
-  install_torchvision
-  test_python
-  test_aten
-  test_xpu_bin
 else
  install_torchvision
  install_monkeytype
@ -1184,4 +1092,5 @@ else
  test_custom_backend
  test_torch_function_benchmark
  test_benchmarks
+  test_executorch
 fi
--- a/.ci/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@ -127,7 +127,8 @@ python -c "import os, glob; os.system('python -mpip install --no-index --no-deps

    :: export test times so that potential sharded tests that'll branch off this build will use consistent data
    python tools/stats/export_test_times.py
-    robocopy /E ".additional_ci_files" "%PYTORCH_FINAL_PACKAGE_DIR%\.additional_ci_files"
+    copy /Y ".pytorch-test-times.json" "%PYTORCH_FINAL_PACKAGE_DIR%"
+    copy /Y ".pytorch-test-file-ratings.json" "%PYTORCH_FINAL_PACKAGE_DIR%"

    :: Also save build/.ninja_log as an artifact
    copy /Y "build\.ninja_log" "%PYTORCH_FINAL_PACKAGE_DIR%\"
--- a/.ci/pytorch/win-test-helpers/run_python_nn_smoketests.py
+++ b/.ci/pytorch/win-test-helpers/run_python_nn_smoketests.py
@ -2,7 +2,6 @@

 import os
 import subprocess
-import sys

 COMMON_TESTS = [
    (
@ -54,4 +53,4 @@ if __name__ == "__main__":
                print("Reruning with traceback enabled")
                print("Command:", command_string)
                subprocess.run(command_args, check=False)
-            sys.exit(e.returncode)
+            exit(e.returncode)
--- a/.ci/pytorch/win-test-helpers/test_custom_script_ops.bat
+++ b/.ci/pytorch/win-test-helpers/test_custom_script_ops.bat
@ -26,6 +26,11 @@ popd
 python test_custom_ops.py -v
 if ERRORLEVEL 1 exit /b 1

+:: TODO: fix and re-enable this test
+:: See https://github.com/pytorch/pytorch/issues/25155
+:: python test_custom_classes.py -v
+:: if ERRORLEVEL 1 exit /b 1
+
 python model.py --export-script-module="build/model.pt"
 if ERRORLEVEL 1 exit /b 1

--- a/.ci/pytorch/win-test-helpers/test_libtorch.bat
+++ b/.ci/pytorch/win-test-helpers/test_libtorch.bat
@ -1,3 +1,7 @@
+:: Skip LibTorch tests when building a GPU binary and testing on a CPU machine
+:: because LibTorch tests are not well designed for this use case.
+if "%USE_CUDA%" == "0" IF NOT "%CUDA_VERSION%" == "cpu" exit /b 0
+
 call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat
 if errorlevel 1 exit /b 1

@ -17,7 +21,7 @@ if not errorlevel 0 exit /b 1
 cd %TMP_DIR_WIN%\build\torch\test
 for /r "." %%a in (*.exe) do (
    call :libtorch_check "%%~na" "%%~fa"
-    if errorlevel 1 goto fail
+    if errorlevel 1 exit /b 1
 )

 goto :eof
@ -30,6 +34,18 @@ set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test
 :: Skip verify_api_visibility as it a compile level test
 if "%~1" == "verify_api_visibility" goto :eof

+:: See https://github.com/pytorch/pytorch/issues/25161
+if "%~1" == "c10_metaprogramming_test" goto :eof
+if "%~1" == "module_test" goto :eof
+:: See https://github.com/pytorch/pytorch/issues/25312
+if "%~1" == "converter_nomigraph_test" goto :eof
+:: See https://github.com/pytorch/pytorch/issues/35636
+if "%~1" == "generate_proposals_op_gpu_test" goto :eof
+:: See https://github.com/pytorch/pytorch/issues/35648
+if "%~1" == "reshape_op_gpu_test" goto :eof
+:: See https://github.com/pytorch/pytorch/issues/35651
+if "%~1" == "utility_ops_gpu_test" goto :eof
+
 echo Running "%~2"
 if "%~1" == "c10_intrusive_ptr_benchmark" (
  :: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp
@ -40,15 +56,11 @@ if "%~1" == "c10_intrusive_ptr_benchmark" (
 python test\run_test.py --cpp --verbose -i "cpp/%~1"
 if errorlevel 1 (
  echo %1 failed with exit code %errorlevel%
-  goto fail
+  exit /b 1
 )
 if not errorlevel 0 (
  echo %1 failed with exit code %errorlevel%
-  goto fail
+  exit /b 1
 )

-:eof
-exit /b 0
-
-:fail
-exit /b 1
+goto :eof
--- a/.ci/pytorch/win-test-helpers/test_python_jit_legacy.bat
+++ b/.ci/pytorch/win-test-helpers/test_python_jit_legacy.bat
@ -1,7 +1,8 @@
 call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat

 echo Copying over test times file
-robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"
+copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%"
+copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-file-ratings.json" "%PROJECT_DIR_WIN%"

 pushd test

--- a/.ci/pytorch/win-test-helpers/test_python_shard.bat
+++ b/.ci/pytorch/win-test-helpers/test_python_shard.bat
@ -22,7 +22,8 @@ if "%SHARD_NUMBER%" == "1" (
 )

 echo Copying over test times file
-robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"
+copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%"
+copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-file-ratings.json" "%PROJECT_DIR_WIN%"

 echo Run nn tests
 python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -38,7 +38,7 @@ fi
 python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0

 # Install Z3 optional dependency for Windows builds.
-python -m pip install z3-solver==4.12.2.0
+python -m pip install z3-solver

 run_tests() {
    # Run nvidia-smi if available
--- a/.circleci/cimodel/init.py
+++ b/.circleci/cimodel/init.py
--- a/torch/_inductor/codegen/cuda/init.py
+++ b/torch/_inductor/codegen/cuda/init.py
--- a/.circleci/cimodel/data/binary_build_data.py
+++ b/.circleci/cimodel/data/binary_build_data.py
@ -0,0 +1,198 @@
+"""
+This module models the tree of configuration variants
+for "smoketest" builds.
+
+Each subclass of ConfigNode represents a layer of the configuration hierarchy.
+These tree nodes encapsulate the logic for whether a branch of the hierarchy
+should be "pruned".
+"""
+
+from collections import OrderedDict
+
+import cimodel.data.dimensions as dimensions
+
+from cimodel.lib.conf_tree import ConfigNode
+
+
+LINKING_DIMENSIONS = [
+    "shared",
+    "static",
+]
+
+
+DEPS_INCLUSION_DIMENSIONS = [
+    "with-deps",
+    "without-deps",
+]
+
+
+def get_processor_arch_name(gpu_version):
+    return (
+        "cpu"
+        if not gpu_version
+        else (
+            "cu" + gpu_version.strip("cuda")
+            if gpu_version.startswith("cuda")
+            else gpu_version
+        )
+    )
+
+
+CONFIG_TREE_DATA = OrderedDict()
+
+# GCC config variants:
+#
+# All the nightlies (except libtorch with new gcc ABI) are built with devtoolset7,
+# which can only build with old gcc ABI. It is better than devtoolset3
+# because it understands avx512, which is needed for good fbgemm performance.
+#
+# Libtorch with new gcc ABI is built with gcc 5.4 on Ubuntu 16.04.
+LINUX_GCC_CONFIG_VARIANTS = OrderedDict(
+    manywheel=["devtoolset7"],
+    conda=["devtoolset7"],
+    libtorch=[
+        "devtoolset7",
+        "gcc5.4_cxx11-abi",
+    ],
+)
+
+WINDOWS_LIBTORCH_CONFIG_VARIANTS = [
+    "debug",
+    "release",
+]
+
+
+class TopLevelNode(ConfigNode):
+    def __init__(self, node_name, config_tree_data, smoke):
+        super().__init__(None, node_name)
+
+        self.config_tree_data = config_tree_data
+        self.props["smoke"] = smoke
+
+    def get_children(self):
+        return [
+            OSConfigNode(self, x, c, p) for (x, (c, p)) in self.config_tree_data.items()
+        ]
+
+
+class OSConfigNode(ConfigNode):
+    def __init__(self, parent, os_name, gpu_versions, py_tree):
+        super().__init__(parent, os_name)
+
+        self.py_tree = py_tree
+        self.props["os_name"] = os_name
+        self.props["gpu_versions"] = gpu_versions
+
+    def get_children(self):
+        return [PackageFormatConfigNode(self, k, v) for k, v in self.py_tree.items()]
+
+
+class PackageFormatConfigNode(ConfigNode):
+    def __init__(self, parent, package_format, python_versions):
+        super().__init__(parent, package_format)
+
+        self.props["python_versions"] = python_versions
+        self.props["package_format"] = package_format
+
+    def get_children(self):
+        if self.find_prop("os_name") == "linux":
+            return [
+                LinuxGccConfigNode(self, v)
+                for v in LINUX_GCC_CONFIG_VARIANTS[self.find_prop("package_format")]
+            ]
+        elif (
+            self.find_prop("os_name") == "windows"
+            and self.find_prop("package_format") == "libtorch"
+        ):
+            return [
+                WindowsLibtorchConfigNode(self, v)
+                for v in WINDOWS_LIBTORCH_CONFIG_VARIANTS
+            ]
+        else:
+            return [ArchConfigNode(self, v) for v in self.find_prop("gpu_versions")]
+
+
+class LinuxGccConfigNode(ConfigNode):
+    def __init__(self, parent, gcc_config_variant):
+        super().__init__(parent, "GCC_CONFIG_VARIANT=" + str(gcc_config_variant))
+
+        self.props["gcc_config_variant"] = gcc_config_variant
+
+    def get_children(self):
+        gpu_versions = self.find_prop("gpu_versions")
+
+        # XXX devtoolset7 on CUDA 9.0 is temporarily disabled
+        # see https://github.com/pytorch/pytorch/issues/20066
+        if self.find_prop("gcc_config_variant") == "devtoolset7":
+            gpu_versions = filter(lambda x: x != "cuda_90", gpu_versions)
+
+        # XXX disabling conda rocm build since docker images are not there
+        if self.find_prop("package_format") == "conda":
+            gpu_versions = filter(
+                lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions
+            )
+
+        # XXX libtorch rocm build  is temporarily disabled
+        if self.find_prop("package_format") == "libtorch":
+            gpu_versions = filter(
+                lambda x: x not in dimensions.ROCM_VERSION_LABELS, gpu_versions
+            )
+
+        return [ArchConfigNode(self, v) for v in gpu_versions]
+
+
+class WindowsLibtorchConfigNode(ConfigNode):
+    def __init__(self, parent, libtorch_config_variant):
+        super().__init__(
+            parent, "LIBTORCH_CONFIG_VARIANT=" + str(libtorch_config_variant)
+        )
+
+        self.props["libtorch_config_variant"] = libtorch_config_variant
+
+    def get_children(self):
+        return [ArchConfigNode(self, v) for v in self.find_prop("gpu_versions")]
+
+
+class ArchConfigNode(ConfigNode):
+    def __init__(self, parent, gpu):
+        super().__init__(parent, get_processor_arch_name(gpu))
+
+        self.props["gpu"] = gpu
+
+    def get_children(self):
+        return [PyVersionConfigNode(self, v) for v in self.find_prop("python_versions")]
+
+
+class PyVersionConfigNode(ConfigNode):
+    def __init__(self, parent, pyver):
+        super().__init__(parent, pyver)
+
+        self.props["pyver"] = pyver
+
+    def get_children(self):
+        package_format = self.find_prop("package_format")
+        os_name = self.find_prop("os_name")
+
+        has_libtorch_variants = package_format == "libtorch" and os_name == "linux"
+        linking_variants = LINKING_DIMENSIONS if has_libtorch_variants else []
+
+        return [LinkingVariantConfigNode(self, v) for v in linking_variants]
+
+
+class LinkingVariantConfigNode(ConfigNode):
+    def __init__(self, parent, linking_variant):
+        super().__init__(parent, linking_variant)
+
+    def get_children(self):
+        return [
+            DependencyInclusionConfigNode(self, v) for v in DEPS_INCLUSION_DIMENSIONS
+        ]
+
+
+class DependencyInclusionConfigNode(ConfigNode):
+    def __init__(self, parent, deps_variant):
+        super().__init__(parent, deps_variant)
+
+        self.props["libtorch_variant"] = "-".join(
+            [self.parent.get_label(), self.get_label()]
+        )
--- a/.circleci/cimodel/data/binary_build_definitions.py
+++ b/.circleci/cimodel/data/binary_build_definitions.py
@ -0,0 +1,275 @@
+from collections import OrderedDict
+
+import cimodel.data.binary_build_data as binary_build_data
+
+import cimodel.data.simple.util.branch_filters as branch_filters
+import cimodel.lib.conf_tree as conf_tree
+import cimodel.lib.miniutils as miniutils
+
+
+class Conf:
+    def __init__(
+        self,
+        os,
+        gpu_version,
+        pydistro,
+        parms,
+        smoke,
+        libtorch_variant,
+        gcc_config_variant,
+        libtorch_config_variant,
+    ):
+        self.os = os
+        self.gpu_version = gpu_version
+        self.pydistro = pydistro
+        self.parms = parms
+        self.smoke = smoke
+        self.libtorch_variant = libtorch_variant
+        self.gcc_config_variant = gcc_config_variant
+        self.libtorch_config_variant = libtorch_config_variant
+
+    def gen_build_env_parms(self):
+        elems = (
+            [self.pydistro]
+            + self.parms
+            + [binary_build_data.get_processor_arch_name(self.gpu_version)]
+        )
+        if self.gcc_config_variant is not None:
+            elems.append(str(self.gcc_config_variant))
+        if self.libtorch_config_variant is not None:
+            elems.append(str(self.libtorch_config_variant))
+        return elems
+
+    def gen_docker_image(self):
+        if self.gcc_config_variant == "gcc5.4_cxx11-abi":
+            if self.gpu_version is None:
+                return miniutils.quote("pytorch/libtorch-cxx11-builder:cpu")
+            else:
+                return miniutils.quote(
+                    f"pytorch/libtorch-cxx11-builder:{self.gpu_version}"
+                )
+        if self.pydistro == "conda":
+            if self.gpu_version is None:
+                return miniutils.quote("pytorch/conda-builder:cpu")
+            else:
+                return miniutils.quote(f"pytorch/conda-builder:{self.gpu_version}")
+
+        docker_word_substitution = {
+            "manywheel": "manylinux",
+            "libtorch": "manylinux",
+        }
+
+        docker_distro_prefix = miniutils.override(
+            self.pydistro, docker_word_substitution
+        )
+
+        # The cpu nightlies are built on the pytorch/manylinux-cuda102 docker image
+        # TODO cuda images should consolidate into tag-base images similar to rocm
+        alt_docker_suffix = (
+            "cuda102"
+            if not self.gpu_version
+            else (
+                "rocm:" + self.gpu_version.strip("rocm")
+                if self.gpu_version.startswith("rocm")
+                else self.gpu_version
+            )
+        )
+        docker_distro_suffix = (
+            alt_docker_suffix
+            if self.pydistro != "conda"
+            else ("cuda" if alt_docker_suffix.startswith("cuda") else "rocm")
+        )
+        return miniutils.quote(
+            "pytorch/" + docker_distro_prefix + "-" + docker_distro_suffix
+        )
+
+    def get_name_prefix(self):
+        return "smoke" if self.smoke else "binary"
+
+    def gen_build_name(self, build_or_test, nightly):
+        parts = [self.get_name_prefix(), self.os] + self.gen_build_env_parms()
+
+        if nightly:
+            parts.append("nightly")
+
+        if self.libtorch_variant:
+            parts.append(self.libtorch_variant)
+
+        if not self.smoke:
+            parts.append(build_or_test)
+
+        joined = "_".join(parts)
+        return joined.replace(".", "_")
+
+    def gen_workflow_job(self, phase, upload_phase_dependency=None, nightly=False):
+        job_def = OrderedDict()
+        job_def["name"] = self.gen_build_name(phase, nightly)
+        job_def["build_environment"] = miniutils.quote(
+            " ".join(self.gen_build_env_parms())
+        )
+        if self.smoke:
+            job_def["requires"] = [
+                "update_s3_htmls",
+            ]
+            job_def["filters"] = branch_filters.gen_filter_dict(
+                branches_list=["postnightly"],
+            )
+        else:
+            filter_branch = r"/.*/"
+            job_def["filters"] = branch_filters.gen_filter_dict(
+                branches_list=[filter_branch],
+                tags_list=[branch_filters.RC_PATTERN],
+            )
+        if self.libtorch_variant:
+            job_def["libtorch_variant"] = miniutils.quote(self.libtorch_variant)
+        if phase == "test":
+            if not self.smoke:
+                job_def["requires"] = [self.gen_build_name("build", nightly)]
+            if not (self.smoke and self.os == "macos") and self.os != "windows":
+                job_def["docker_image"] = self.gen_docker_image()
+
+            # fix this. only works on cuda not rocm
+            if self.os != "windows" and self.gpu_version:
+                job_def["use_cuda_docker_runtime"] = miniutils.quote("1")
+        else:
+            if self.os == "linux" and phase != "upload":
+                job_def["docker_image"] = self.gen_docker_image()
+
+        if phase == "test":
+            if self.gpu_version:
+                if self.os == "windows":
+                    job_def["executor"] = "windows-with-nvidia-gpu"
+                else:
+                    job_def["resource_class"] = "gpu.medium"
+
+        os_name = miniutils.override(self.os, {"macos": "mac"})
+        job_name = "_".join([self.get_name_prefix(), os_name, phase])
+        return {job_name: job_def}
+
+    def gen_upload_job(self, phase, requires_dependency):
+        """Generate binary_upload job for configuration
+
+          Output looks similar to:
+
+        - binary_upload:
+            name: binary_linux_manywheel_3_7m_cu113_devtoolset7_nightly_upload
+            context: org-member
+            requires: binary_linux_manywheel_3_7m_cu113_devtoolset7_nightly_test
+            filters:
+              branches:
+                only:
+                  - nightly
+              tags:
+                only: /v[0-9]+(\\.[0-9]+)*-rc[0-9]+/
+            package_type: manywheel
+            upload_subfolder: cu113
+        """
+        return {
+            "binary_upload": OrderedDict(
+                {
+                    "name": self.gen_build_name(phase, nightly=True),
+                    "context": "org-member",
+                    "requires": [
+                        self.gen_build_name(requires_dependency, nightly=True)
+                    ],
+                    "filters": branch_filters.gen_filter_dict(
+                        branches_list=["nightly"],
+                        tags_list=[branch_filters.RC_PATTERN],
+                    ),
+                    "package_type": self.pydistro,
+                    "upload_subfolder": binary_build_data.get_processor_arch_name(
+                        self.gpu_version,
+                    ),
+                }
+            )
+        }
+
+
+def get_root(smoke, name):
+    return binary_build_data.TopLevelNode(
+        name,
+        binary_build_data.CONFIG_TREE_DATA,
+        smoke,
+    )
+
+
+def gen_build_env_list(smoke):
+    root = get_root(smoke, "N/A")
+    config_list = conf_tree.dfs(root)
+
+    newlist = []
+    for c in config_list:
+        conf = Conf(
+            c.find_prop("os_name"),
+            c.find_prop("gpu"),
+            c.find_prop("package_format"),
+            [c.find_prop("pyver")],
+            c.find_prop("smoke")
+            and not (c.find_prop("os_name") == "macos_arm64"),  # don't test arm64
+            c.find_prop("libtorch_variant"),
+            c.find_prop("gcc_config_variant"),
+            c.find_prop("libtorch_config_variant"),
+        )
+        newlist.append(conf)
+
+    return newlist
+
+
+def predicate_exclude_macos(config):
+    return config.os == "linux" or config.os == "windows"
+
+
+def get_nightly_uploads():
+    configs = gen_build_env_list(False)
+    mylist = []
+    for conf in configs:
+        phase_dependency = "test" if predicate_exclude_macos(conf) else "build"
+        mylist.append(conf.gen_upload_job("upload", phase_dependency))
+
+    return mylist
+
+
+def get_post_upload_jobs():
+    return [
+        {
+            "update_s3_htmls": {
+                "name": "update_s3_htmls",
+                "context": "org-member",
+                "filters": branch_filters.gen_filter_dict(
+                    branches_list=["postnightly"],
+                ),
+            },
+        },
+    ]
+
+
+def get_nightly_tests():
+    configs = gen_build_env_list(False)
+    filtered_configs = filter(predicate_exclude_macos, configs)
+
+    tests = []
+    for conf_options in filtered_configs:
+        yaml_item = conf_options.gen_workflow_job("test", nightly=True)
+        tests.append(yaml_item)
+
+    return tests
+
+
+def get_jobs(toplevel_key, smoke):
+    jobs_list = []
+    configs = gen_build_env_list(smoke)
+    phase = "build" if toplevel_key == "binarybuilds" else "test"
+    for build_config in configs:
+        # don't test for macos_arm64 as it's cross compiled
+        if phase != "test" or build_config.os != "macos_arm64":
+            jobs_list.append(build_config.gen_workflow_job(phase, nightly=True))
+
+    return jobs_list
+
+
+def get_binary_build_jobs():
+    return get_jobs("binarybuilds", False)
+
+
+def get_binary_smoke_test_jobs():
+    return get_jobs("binarysmoketests", True)
--- a/.circleci/cimodel/data/dimensions.py
+++ b/.circleci/cimodel/data/dimensions.py
@ -0,0 +1,19 @@
+PHASES = ["build", "test"]
+
+CUDA_VERSIONS = [
+    "102",
+    "113",
+    "116",
+    "117",
+]
+
+ROCM_VERSIONS = [
+    "4.3.1",
+    "4.5.2",
+]
+
+ROCM_VERSION_LABELS = ["rocm" + v for v in ROCM_VERSIONS]
+
+GPU_VERSIONS = [None] + ["cuda" + v for v in CUDA_VERSIONS] + ROCM_VERSION_LABELS
+
+STANDARD_PYTHON_VERSIONS = ["3.7", "3.8", "3.9", "3.10"]
--- a/.circleci/cimodel/data/pytorch_build_data.py
+++ b/.circleci/cimodel/data/pytorch_build_data.py
@ -0,0 +1,296 @@
+from cimodel.lib.conf_tree import ConfigNode
+
+
+CONFIG_TREE_DATA = []
+
+
+def get_major_pyver(dotted_version):
+    parts = dotted_version.split(".")
+    return "py" + parts[0]
+
+
+class TreeConfigNode(ConfigNode):
+    def __init__(self, parent, node_name, subtree):
+        super().__init__(parent, self.modify_label(node_name))
+        self.subtree = subtree
+        self.init2(node_name)
+
+    def modify_label(self, label):
+        return label
+
+    def init2(self, node_name):
+        pass
+
+    def get_children(self):
+        return [self.child_constructor()(self, k, v) for (k, v) in self.subtree]
+
+
+class TopLevelNode(TreeConfigNode):
+    def __init__(self, node_name, subtree):
+        super().__init__(None, node_name, subtree)
+
+    # noinspection PyMethodMayBeStatic
+    def child_constructor(self):
+        return DistroConfigNode
+
+
+class DistroConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["distro_name"] = node_name
+
+    def child_constructor(self):
+        distro = self.find_prop("distro_name")
+
+        next_nodes = {
+            "xenial": XenialCompilerConfigNode,
+            "bionic": BionicCompilerConfigNode,
+        }
+        return next_nodes[distro]
+
+
+class PyVerConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["pyver"] = node_name
+        self.props["abbreviated_pyver"] = get_major_pyver(node_name)
+        if node_name == "3.9":
+            self.props["abbreviated_pyver"] = "py3.9"
+
+    # noinspection PyMethodMayBeStatic
+    def child_constructor(self):
+        return ExperimentalFeatureConfigNode
+
+
+class ExperimentalFeatureConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["experimental_feature"] = node_name
+
+    def child_constructor(self):
+        experimental_feature = self.find_prop("experimental_feature")
+
+        next_nodes = {
+            "asan": AsanConfigNode,
+            "xla": XlaConfigNode,
+            "mps": MPSConfigNode,
+            "vulkan": VulkanConfigNode,
+            "parallel_tbb": ParallelTBBConfigNode,
+            "crossref": CrossRefConfigNode,
+            "dynamo": DynamoConfigNode,
+            "parallel_native": ParallelNativeConfigNode,
+            "onnx": ONNXConfigNode,
+            "libtorch": LibTorchConfigNode,
+            "important": ImportantConfigNode,
+            "build_only": BuildOnlyConfigNode,
+            "shard_test": ShardTestConfigNode,
+            "cuda_gcc_override": CudaGccOverrideConfigNode,
+            "pure_torch": PureTorchConfigNode,
+            "slow_gradcheck": SlowGradcheckConfigNode,
+        }
+        return next_nodes[experimental_feature]
+
+
+class SlowGradcheckConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["is_slow_gradcheck"] = True
+
+    def child_constructor(self):
+        return ExperimentalFeatureConfigNode
+
+
+class PureTorchConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "PURE_TORCH=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_pure_torch"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class XlaConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "XLA=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_xla"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class MPSConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "MPS=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_mps"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class AsanConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "Asan=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_asan"] = node_name
+
+    def child_constructor(self):
+        return ExperimentalFeatureConfigNode
+
+
+class ONNXConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "Onnx=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_onnx"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class VulkanConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "Vulkan=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_vulkan"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class ParallelTBBConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "PARALLELTBB=" + str(label)
+
+    def init2(self, node_name):
+        self.props["parallel_backend"] = "paralleltbb"
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class CrossRefConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["is_crossref"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class DynamoConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["is_dynamo"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class ParallelNativeConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "PARALLELNATIVE=" + str(label)
+
+    def init2(self, node_name):
+        self.props["parallel_backend"] = "parallelnative"
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class LibTorchConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "BUILD_TEST_LIBTORCH=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_libtorch"] = node_name
+
+    def child_constructor(self):
+        return ExperimentalFeatureConfigNode
+
+
+class CudaGccOverrideConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["cuda_gcc_override"] = node_name
+
+    def child_constructor(self):
+        return ExperimentalFeatureConfigNode
+
+
+class BuildOnlyConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["build_only"] = node_name
+
+    def child_constructor(self):
+        return ExperimentalFeatureConfigNode
+
+
+class ShardTestConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["shard_test"] = node_name
+
+    def child_constructor(self):
+        return ImportantConfigNode
+
+
+class ImportantConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return "IMPORTANT=" + str(label)
+
+    def init2(self, node_name):
+        self.props["is_important"] = node_name
+
+    def get_children(self):
+        return []
+
+
+class XenialCompilerConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return label or "<unspecified>"
+
+    def init2(self, node_name):
+        self.props["compiler_name"] = node_name
+
+    # noinspection PyMethodMayBeStatic
+    def child_constructor(self):
+        return (
+            XenialCompilerVersionConfigNode
+            if self.props["compiler_name"]
+            else PyVerConfigNode
+        )
+
+
+class BionicCompilerConfigNode(TreeConfigNode):
+    def modify_label(self, label):
+        return label or "<unspecified>"
+
+    def init2(self, node_name):
+        self.props["compiler_name"] = node_name
+
+    # noinspection PyMethodMayBeStatic
+    def child_constructor(self):
+        return (
+            BionicCompilerVersionConfigNode
+            if self.props["compiler_name"]
+            else PyVerConfigNode
+        )
+
+
+class XenialCompilerVersionConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["compiler_version"] = node_name
+
+    # noinspection PyMethodMayBeStatic
+    def child_constructor(self):
+        return PyVerConfigNode
+
+
+class BionicCompilerVersionConfigNode(TreeConfigNode):
+    def init2(self, node_name):
+        self.props["compiler_version"] = node_name
+
+    # noinspection PyMethodMayBeStatic
+    def child_constructor(self):
+        return PyVerConfigNode
--- a/.circleci/cimodel/data/pytorch_build_definitions.py
+++ b/.circleci/cimodel/data/pytorch_build_definitions.py
@ -0,0 +1,382 @@
+from collections import OrderedDict
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+import cimodel.data.dimensions as dimensions
+import cimodel.lib.conf_tree as conf_tree
+import cimodel.lib.miniutils as miniutils
+from cimodel.data.pytorch_build_data import CONFIG_TREE_DATA, TopLevelNode
+from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN
+from cimodel.data.simple.util.docker_constants import gen_docker_image
+
+
+@dataclass
+class Conf:
+    distro: str
+    parms: List[str]
+    parms_list_ignored_for_docker_image: Optional[List[str]] = None
+    pyver: Optional[str] = None
+    cuda_version: Optional[str] = None
+    rocm_version: Optional[str] = None
+    # TODO expand this to cover all the USE_* that we want to test for
+    #  tesnrorrt, leveldb, lmdb, redis, opencv, mkldnn, ideep, etc.
+    # (from https://github.com/pytorch/pytorch/pull/17323#discussion_r259453608)
+    is_xla: bool = False
+    is_vulkan: bool = False
+    is_pure_torch: bool = False
+    restrict_phases: Optional[List[str]] = None
+    gpu_resource: Optional[str] = None
+    dependent_tests: List = field(default_factory=list)
+    parent_build: Optional["Conf"] = None
+    is_libtorch: bool = False
+    is_important: bool = False
+    parallel_backend: Optional[str] = None
+    build_only: bool = False
+
+    @staticmethod
+    def is_test_phase(phase):
+        return "test" in phase
+
+    # TODO: Eliminate the special casing for docker paths
+    # In the short term, we *will* need to support special casing as docker images are merged for caffe2 and pytorch
+    def get_parms(self, for_docker):
+        leading = []
+        # We just don't run non-important jobs on pull requests;
+        # previously we also named them in a way to make it obvious
+        # if self.is_important and not for_docker:
+        #    leading.append("AAA")
+        leading.append("pytorch")
+        if self.is_xla and not for_docker:
+            leading.append("xla")
+        if self.is_vulkan and not for_docker:
+            leading.append("vulkan")
+        if self.is_libtorch and not for_docker:
+            leading.append("libtorch")
+        if self.is_pure_torch and not for_docker:
+            leading.append("pure_torch")
+        if self.parallel_backend is not None and not for_docker:
+            leading.append(self.parallel_backend)
+
+        cuda_parms = []
+        if self.cuda_version:
+            cudnn = "cudnn8" if self.cuda_version.startswith("11.") else "cudnn7"
+            cuda_parms.extend(["cuda" + self.cuda_version, cudnn])
+        if self.rocm_version:
+            cuda_parms.extend([f"rocm{self.rocm_version}"])
+        result = leading + ["linux", self.distro] + cuda_parms + self.parms
+        if not for_docker and self.parms_list_ignored_for_docker_image is not None:
+            result = result + self.parms_list_ignored_for_docker_image
+        return result
+
+    def gen_docker_image_path(self):
+        parms_source = self.parent_build or self
+        base_build_env_name = "-".join(parms_source.get_parms(True))
+        image_name, _ = gen_docker_image(base_build_env_name)
+        return miniutils.quote(image_name)
+
+    def gen_docker_image_requires(self):
+        parms_source = self.parent_build or self
+        base_build_env_name = "-".join(parms_source.get_parms(True))
+        _, requires = gen_docker_image(base_build_env_name)
+        return miniutils.quote(requires)
+
+    def get_build_job_name_pieces(self, build_or_test):
+        return self.get_parms(False) + [build_or_test]
+
+    def gen_build_name(self, build_or_test):
+        return (
+            ("_".join(map(str, self.get_build_job_name_pieces(build_or_test))))
+            .replace(".", "_")
+            .replace("-", "_")
+        )
+
+    def get_dependents(self):
+        return self.dependent_tests or []
+
+    def gen_workflow_params(self, phase):
+        parameters = OrderedDict()
+        build_job_name_pieces = self.get_build_job_name_pieces(phase)
+
+        build_env_name = "-".join(map(str, build_job_name_pieces))
+        parameters["build_environment"] = miniutils.quote(build_env_name)
+        parameters["docker_image"] = self.gen_docker_image_path()
+        if Conf.is_test_phase(phase) and self.gpu_resource:
+            parameters["use_cuda_docker_runtime"] = miniutils.quote("1")
+        if Conf.is_test_phase(phase):
+            resource_class = "large"
+            if self.gpu_resource:
+                resource_class = "gpu." + self.gpu_resource
+            if self.rocm_version is not None:
+                resource_class = "pytorch/amd-gpu"
+            parameters["resource_class"] = resource_class
+        if phase == "build" and self.rocm_version is not None:
+            parameters["resource_class"] = "xlarge"
+        if hasattr(self, "filters"):
+            parameters["filters"] = self.filters
+        if self.build_only:
+            parameters["build_only"] = miniutils.quote(str(int(True)))
+        return parameters
+
+    def gen_workflow_job(self, phase):
+        job_def = OrderedDict()
+        job_def["name"] = self.gen_build_name(phase)
+
+        if Conf.is_test_phase(phase):
+            # TODO When merging the caffe2 and pytorch jobs, it might be convenient for a while to make a
+            #  caffe2 test job dependent on a pytorch build job. This way we could quickly dedup the repeated
+            #  build of pytorch in the caffe2 build job, and just run the caffe2 tests off of a completed
+            #  pytorch build job (from https://github.com/pytorch/pytorch/pull/17323#discussion_r259452641)
+
+            dependency_build = self.parent_build or self
+            job_def["requires"] = [dependency_build.gen_build_name("build")]
+            job_name = "pytorch_linux_test"
+        else:
+            job_name = "pytorch_linux_build"
+            job_def["requires"] = [self.gen_docker_image_requires()]
+
+        if not self.is_important:
+            job_def["filters"] = gen_filter_dict()
+        job_def.update(self.gen_workflow_params(phase))
+
+        return {job_name: job_def}
+
+
+# TODO This is a hack to special case some configs just for the workflow list
+class HiddenConf:
+    def __init__(self, name, parent_build=None, filters=None):
+        self.name = name
+        self.parent_build = parent_build
+        self.filters = filters
+
+    def gen_workflow_job(self, phase):
+        return {
+            self.gen_build_name(phase): {
+                "requires": [self.parent_build.gen_build_name("build")],
+                "filters": self.filters,
+            }
+        }
+
+    def gen_build_name(self, _):
+        return self.name
+
+
+class DocPushConf:
+    def __init__(self, name, parent_build=None, branch="master"):
+        self.name = name
+        self.parent_build = parent_build
+        self.branch = branch
+
+    def gen_workflow_job(self, phase):
+        return {
+            "pytorch_doc_push": {
+                "name": self.name,
+                "branch": self.branch,
+                "requires": [self.parent_build],
+                "context": "org-member",
+                "filters": gen_filter_dict(
+                    branches_list=["nightly"], tags_list=RC_PATTERN
+                ),
+            }
+        }
+
+
+def gen_docs_configs(xenial_parent_config):
+    configs = []
+
+    configs.append(
+        HiddenConf(
+            "pytorch_python_doc_build",
+            parent_build=xenial_parent_config,
+            filters=gen_filter_dict(
+                branches_list=["master", "main", "nightly"], tags_list=RC_PATTERN
+            ),
+        )
+    )
+    configs.append(
+        DocPushConf(
+            "pytorch_python_doc_push",
+            parent_build="pytorch_python_doc_build",
+            branch="site",
+        )
+    )
+
+    configs.append(
+        HiddenConf(
+            "pytorch_cpp_doc_build",
+            parent_build=xenial_parent_config,
+            filters=gen_filter_dict(
+                branches_list=["master", "main", "nightly"], tags_list=RC_PATTERN
+            ),
+        )
+    )
+    configs.append(
+        DocPushConf(
+            "pytorch_cpp_doc_push",
+            parent_build="pytorch_cpp_doc_build",
+            branch="master",
+        )
+    )
+    return configs
+
+
+def get_root():
+    return TopLevelNode("PyTorch Builds", CONFIG_TREE_DATA)
+
+
+def gen_tree():
+    root = get_root()
+    configs_list = conf_tree.dfs(root)
+    return configs_list
+
+
+def instantiate_configs(only_slow_gradcheck):
+    config_list = []
+
+    root = get_root()
+    found_configs = conf_tree.dfs(root)
+    for fc in found_configs:
+        restrict_phases = None
+        distro_name = fc.find_prop("distro_name")
+        compiler_name = fc.find_prop("compiler_name")
+        compiler_version = fc.find_prop("compiler_version")
+        is_xla = fc.find_prop("is_xla") or False
+        is_asan = fc.find_prop("is_asan") or False
+        is_crossref = fc.find_prop("is_crossref") or False
+        is_dynamo = fc.find_prop("is_dynamo") or False
+        is_onnx = fc.find_prop("is_onnx") or False
+        is_pure_torch = fc.find_prop("is_pure_torch") or False
+        is_vulkan = fc.find_prop("is_vulkan") or False
+        is_slow_gradcheck = fc.find_prop("is_slow_gradcheck") or False
+        parms_list_ignored_for_docker_image = []
+
+        if only_slow_gradcheck ^ is_slow_gradcheck:
+            continue
+
+        python_version = None
+        if compiler_name == "cuda" or compiler_name == "android":
+            python_version = fc.find_prop("pyver")
+            parms_list = [fc.find_prop("abbreviated_pyver")]
+        else:
+            parms_list = ["py" + fc.find_prop("pyver")]
+
+        cuda_version = None
+        rocm_version = None
+        if compiler_name == "cuda":
+            cuda_version = fc.find_prop("compiler_version")
+
+        elif compiler_name == "rocm":
+            rocm_version = fc.find_prop("compiler_version")
+            restrict_phases = ["build", "test1", "test2", "caffe2_test"]
+
+        elif compiler_name == "android":
+            android_ndk_version = fc.find_prop("compiler_version")
+            # TODO: do we need clang to compile host binaries like protoc?
+            parms_list.append("clang5")
+            parms_list.append("android-ndk-" + android_ndk_version)
+            android_abi = fc.find_prop("android_abi")
+            parms_list_ignored_for_docker_image.append(android_abi)
+            restrict_phases = ["build"]
+
+        elif compiler_name:
+            gcc_version = compiler_name + (fc.find_prop("compiler_version") or "")
+            parms_list.append(gcc_version)
+
+        if is_asan:
+            parms_list.append("asan")
+            python_version = fc.find_prop("pyver")
+            parms_list[0] = fc.find_prop("abbreviated_pyver")
+
+        if is_crossref:
+            parms_list_ignored_for_docker_image.append("crossref")
+
+        if is_dynamo:
+            parms_list_ignored_for_docker_image.append("dynamo")
+
+        if is_onnx:
+            parms_list.append("onnx")
+            python_version = fc.find_prop("pyver")
+            parms_list[0] = fc.find_prop("abbreviated_pyver")
+            restrict_phases = ["build", "ort_test1", "ort_test2"]
+
+        if cuda_version:
+            cuda_gcc_version = fc.find_prop("cuda_gcc_override") or "gcc7"
+            parms_list.append(cuda_gcc_version)
+
+        is_libtorch = fc.find_prop("is_libtorch") or False
+        is_important = fc.find_prop("is_important") or False
+        parallel_backend = fc.find_prop("parallel_backend") or None
+        build_only = fc.find_prop("build_only") or False
+        shard_test = fc.find_prop("shard_test") or False
+        # TODO: fix pure_torch python test packaging issue.
+        if shard_test:
+            restrict_phases = ["build"] if restrict_phases is None else restrict_phases
+            restrict_phases.extend(["test1", "test2"])
+        if build_only or is_pure_torch:
+            restrict_phases = ["build"]
+
+        if is_slow_gradcheck:
+            parms_list_ignored_for_docker_image.append("old")
+            parms_list_ignored_for_docker_image.append("gradcheck")
+
+        gpu_resource = None
+        if cuda_version and cuda_version != "10":
+            gpu_resource = "medium"
+
+        c = Conf(
+            distro_name,
+            parms_list,
+            parms_list_ignored_for_docker_image,
+            python_version,
+            cuda_version,
+            rocm_version,
+            is_xla,
+            is_vulkan,
+            is_pure_torch,
+            restrict_phases,
+            gpu_resource,
+            is_libtorch=is_libtorch,
+            is_important=is_important,
+            parallel_backend=parallel_backend,
+            build_only=build_only,
+        )
+
+        # run docs builds on "pytorch-linux-xenial-py3.7-gcc5.4". Docs builds
+        # should run on a CPU-only build that runs on all PRs.
+        # XXX should this be updated to a more modern build?
+        if (
+            distro_name == "xenial"
+            and fc.find_prop("pyver") == "3.7"
+            and cuda_version is None
+            and parallel_backend is None
+            and not is_vulkan
+            and not is_pure_torch
+            and compiler_name == "gcc"
+            and fc.find_prop("compiler_version") == "5.4"
+        ):
+            c.filters = gen_filter_dict(branches_list=r"/.*/", tags_list=RC_PATTERN)
+            c.dependent_tests = gen_docs_configs(c)
+
+        config_list.append(c)
+
+    return config_list
+
+
+def get_workflow_jobs(only_slow_gradcheck=False):
+    config_list = instantiate_configs(only_slow_gradcheck)
+
+    x = []
+    for conf_options in config_list:
+        phases = conf_options.restrict_phases or dimensions.PHASES
+
+        for phase in phases:
+            # TODO why does this not have a test?
+            if Conf.is_test_phase(phase) and conf_options.cuda_version == "10":
+                continue
+
+            x.append(conf_options.gen_workflow_job(phase))
+
+        # TODO convert to recursion
+        for conf in conf_options.get_dependents():
+            x.append(conf.gen_workflow_job("test"))
+
+    return x
--- a/torch/_inductor/codegen/cuda/cutlass_lib_extensions/init.py
+++ b/torch/_inductor/codegen/cuda/cutlass_lib_extensions/init.py
--- a/.circleci/cimodel/data/simple/anaconda_prune_defintions.py
+++ b/.circleci/cimodel/data/simple/anaconda_prune_defintions.py
@ -0,0 +1,28 @@
+from collections import OrderedDict
+
+from cimodel.data.simple.util.branch_filters import gen_filter_dict
+from cimodel.lib.miniutils import quote
+
+
+CHANNELS_TO_PRUNE = ["pytorch-nightly", "pytorch-test"]
+PACKAGES_TO_PRUNE = "pytorch torchvision torchaudio torchtext ignite torchcsprng"
+
+
+def gen_workflow_job(channel: str):
+    return OrderedDict(
+        {
+            "anaconda_prune": OrderedDict(
+                {
+                    "name": f"anaconda-prune-{channel}",
+                    "context": quote("org-member"),
+                    "packages": quote(PACKAGES_TO_PRUNE),
+                    "channel": channel,
+                    "filters": gen_filter_dict(branches_list=["postnightly"]),
+                }
+            )
+        }
+    )
+
+
+def get_workflow_jobs():
+    return [gen_workflow_job(channel) for channel in CHANNELS_TO_PRUNE]
--- a/.circleci/cimodel/data/simple/docker_definitions.py
+++ b/.circleci/cimodel/data/simple/docker_definitions.py
@ -0,0 +1,39 @@
+from collections import OrderedDict
+
+from cimodel.data.simple.util.branch_filters import gen_filter_dict, RC_PATTERN
+
+from cimodel.lib.miniutils import quote
+
+
+# NOTE: All hardcoded docker image builds have been migrated to GHA
+IMAGE_NAMES = []
+
+# This entry should be an element from the list above
+# This should contain the image matching the "slow_gradcheck" entry in
+# pytorch_build_data.py
+SLOW_GRADCHECK_IMAGE_NAME = "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
+
+
+def get_workflow_jobs(images=IMAGE_NAMES, only_slow_gradcheck=False):
+    """Generates a list of docker image build definitions"""
+    ret = []
+    for image_name in images:
+        if image_name.startswith("docker-"):
+            image_name = image_name.lstrip("docker-")
+        if only_slow_gradcheck and image_name is not SLOW_GRADCHECK_IMAGE_NAME:
+            continue
+
+        parameters = OrderedDict(
+            {
+                "name": quote(f"docker-{image_name}"),
+                "image_name": quote(image_name),
+            }
+        )
+        if image_name == "pytorch-linux-xenial-py3.7-gcc5.4":
+            # pushing documentation on tags requires CircleCI to also
+            # build all the dependencies on tags, including this docker image
+            parameters["filters"] = gen_filter_dict(
+                branches_list=r"/.*/", tags_list=RC_PATTERN
+            )
+        ret.append(OrderedDict({"docker_build_job": parameters}))
+    return ret
--- a/.circleci/cimodel/data/simple/ios_definitions.py
+++ b/.circleci/cimodel/data/simple/ios_definitions.py
@ -0,0 +1,100 @@
+import cimodel.lib.miniutils as miniutils
+from cimodel.data.simple.util.branch_filters import gen_filter_dict_exclude
+from cimodel.data.simple.util.versions import MultiPartVersion
+
+XCODE_VERSION = MultiPartVersion([12, 5, 1])
+
+
+class ArchVariant:
+    def __init__(self, name, custom_build_name=""):
+        self.name = name
+        self.custom_build_name = custom_build_name
+
+    def render(self):
+        extra_parts = (
+            [self.custom_build_name] if len(self.custom_build_name) > 0 else []
+        )
+        return "-".join([self.name] + extra_parts).replace("_", "-")
+
+
+def get_platform(arch_variant_name):
+    return "SIMULATOR" if arch_variant_name == "x86_64" else "OS"
+
+
+class IOSJob:
+    def __init__(
+        self, xcode_version, arch_variant, is_org_member_context=True, extra_props=None
+    ):
+        self.xcode_version = xcode_version
+        self.arch_variant = arch_variant
+        self.is_org_member_context = is_org_member_context
+        self.extra_props = extra_props
+
+    def gen_name_parts(self):
+        version_parts = self.xcode_version.render_dots_or_parts("-")
+        build_variant_suffix = self.arch_variant.render()
+        return (
+            [
+                "ios",
+            ]
+            + version_parts
+            + [
+                build_variant_suffix,
+            ]
+        )
+
+    def gen_job_name(self):
+        return "-".join(self.gen_name_parts())
+
+    def gen_tree(self):
+        platform_name = get_platform(self.arch_variant.name)
+        props_dict = {
+            "name": self.gen_job_name(),
+            "build_environment": self.gen_job_name(),
+            "ios_arch": self.arch_variant.name,
+            "ios_platform": platform_name,
+        }
+
+        if self.is_org_member_context:
+            props_dict["context"] = "org-member"
+
+        if self.extra_props:
+            props_dict.update(self.extra_props)
+
+        props_dict["filters"] = gen_filter_dict_exclude()
+
+        return [{"pytorch_ios_build": props_dict}]
+
+
+WORKFLOW_DATA = [
+    IOSJob(
+        XCODE_VERSION,
+        ArchVariant("x86_64"),
+        is_org_member_context=False,
+        extra_props={"lite_interpreter": miniutils.quote(str(int(True)))},
+    ),
+    # IOSJob(XCODE_VERSION, ArchVariant("arm64"), extra_props={
+    #     "lite_interpreter": miniutils.quote(str(int(True)))}),
+    # IOSJob(XCODE_VERSION, ArchVariant("arm64", "metal"), extra_props={
+    #     "use_metal": miniutils.quote(str(int(True))),
+    #     "lite_interpreter": miniutils.quote(str(int(True)))}),
+    # IOSJob(XCODE_VERSION, ArchVariant("arm64", "custom-ops"), extra_props={
+    #     "op_list": "mobilenetv2.yaml",
+    #     "lite_interpreter": miniutils.quote(str(int(True)))}),
+    IOSJob(
+        XCODE_VERSION,
+        ArchVariant("x86_64", "coreml"),
+        is_org_member_context=False,
+        extra_props={
+            "use_coreml": miniutils.quote(str(int(True))),
+            "lite_interpreter": miniutils.quote(str(int(True))),
+        },
+    ),
+    # IOSJob(XCODE_VERSION, ArchVariant("arm64", "coreml"), extra_props={
+    #     "use_coreml": miniutils.quote(str(int(True))),
+    #     "lite_interpreter": miniutils.quote(str(int(True)))}),
+]
+
+
+def get_workflow_jobs():
+    return [item.gen_tree() for item in WORKFLOW_DATA]
--- a/.circleci/cimodel/data/simple/macos_definitions.py
+++ b/.circleci/cimodel/data/simple/macos_definitions.py
@ -0,0 +1,54 @@
+class MacOsJob:
+    def __init__(self, os_version, is_build=False, is_test=False, extra_props=tuple()):
+        # extra_props is tuple type, because mutable data structures for argument defaults
+        # is not recommended.
+        self.os_version = os_version
+        self.is_build = is_build
+        self.is_test = is_test
+        self.extra_props = dict(extra_props)
+
+    def gen_tree(self):
+        non_phase_parts = ["pytorch", "macos", self.os_version, "py3"]
+
+        extra_name_list = [name for name, exist in self.extra_props.items() if exist]
+        full_job_name_list = (
+            non_phase_parts
+            + extra_name_list
+            + [
+                "build" if self.is_build else None,
+                "test" if self.is_test else None,
+            ]
+        )
+
+        full_job_name = "_".join(list(filter(None, full_job_name_list)))
+
+        test_build_dependency = "_".join(non_phase_parts + ["build"])
+        extra_dependencies = [test_build_dependency] if self.is_test else []
+        job_dependencies = extra_dependencies
+
+        # Yes we name the job after itself, it needs a non-empty value in here
+        # for the YAML output to work.
+        props_dict = {"requires": job_dependencies, "name": full_job_name}
+
+        return [{full_job_name: props_dict}]
+
+
+WORKFLOW_DATA = [
+    MacOsJob("10_15", is_build=True),
+    MacOsJob("10_13", is_build=True),
+    MacOsJob(
+        "10_13",
+        is_build=False,
+        is_test=True,
+    ),
+    MacOsJob(
+        "10_13",
+        is_build=True,
+        is_test=True,
+        extra_props=tuple({"lite_interpreter": True}.items()),
+    ),
+]
+
+
+def get_workflow_jobs():
+    return [item.gen_tree() for item in WORKFLOW_DATA]
--- a/.circleci/cimodel/data/simple/mobile_definitions.py
+++ b/.circleci/cimodel/data/simple/mobile_definitions.py
@ -0,0 +1,51 @@
+"""
+PyTorch Mobile PR builds (use linux host toolchain + mobile build options)
+"""
+
+import cimodel.data.simple.util.branch_filters
+import cimodel.lib.miniutils as miniutils
+
+
+class MobileJob:
+    def __init__(
+        self, docker_image, docker_requires, variant_parts, is_master_only=False
+    ):
+        self.docker_image = docker_image
+        self.docker_requires = docker_requires
+        self.variant_parts = variant_parts
+        self.is_master_only = is_master_only
+
+    def gen_tree(self):
+        non_phase_parts = [
+            "pytorch",
+            "linux",
+            "xenial",
+            "py3",
+            "clang5",
+            "mobile",
+        ] + self.variant_parts
+
+        full_job_name = "_".join(non_phase_parts)
+        build_env_name = "-".join(non_phase_parts)
+
+        props_dict = {
+            "build_environment": build_env_name,
+            "build_only": miniutils.quote(str(int(True))),
+            "docker_image": self.docker_image,
+            "requires": self.docker_requires,
+            "name": full_job_name,
+        }
+
+        if self.is_master_only:
+            props_dict[
+                "filters"
+            ] = cimodel.data.simple.util.branch_filters.gen_filter_dict()
+
+        return [{"pytorch_linux_build": props_dict}]
+
+
+WORKFLOW_DATA = []
+
+
+def get_workflow_jobs():
+    return [item.gen_tree() for item in WORKFLOW_DATA]
--- a/.circleci/cimodel/data/simple/nightly_ios.py
+++ b/.circleci/cimodel/data/simple/nightly_ios.py
@ -0,0 +1,96 @@
+import cimodel.data.simple.ios_definitions as ios_definitions
+import cimodel.lib.miniutils as miniutils
+
+
+class IOSNightlyJob:
+    def __init__(self, variant, is_full_jit=False, is_upload=False):
+        self.variant = variant
+        self.is_full_jit = is_full_jit
+        self.is_upload = is_upload
+
+    def get_phase_name(self):
+        return "upload" if self.is_upload else "build"
+
+    def get_common_name_pieces(self, sep):
+        extra_name_suffix = [self.get_phase_name()] if self.is_upload else []
+
+        extra_name = ["full_jit"] if self.is_full_jit else []
+
+        common_name_pieces = (
+            [
+                "ios",
+            ]
+            + extra_name
+            + []
+            + ios_definitions.XCODE_VERSION.render_dots_or_parts(sep)
+            + [
+                "nightly",
+                self.variant,
+                "build",
+            ]
+            + extra_name_suffix
+        )
+
+        return common_name_pieces
+
+    def gen_job_name(self):
+        return "_".join(["pytorch"] + self.get_common_name_pieces(None))
+
+    def gen_tree(self):
+        build_configs = BUILD_CONFIGS_FULL_JIT if self.is_full_jit else BUILD_CONFIGS
+        extra_requires = (
+            [x.gen_job_name() for x in build_configs] if self.is_upload else []
+        )
+
+        props_dict = {
+            "build_environment": "-".join(
+                ["libtorch"] + self.get_common_name_pieces(".")
+            ),
+            "requires": extra_requires,
+            "context": "org-member",
+            "filters": {"branches": {"only": "nightly"}},
+        }
+
+        if not self.is_upload:
+            props_dict["ios_arch"] = self.variant
+            props_dict["ios_platform"] = ios_definitions.get_platform(self.variant)
+            props_dict["name"] = self.gen_job_name()
+            props_dict["use_metal"] = miniutils.quote(str(int(True)))
+            props_dict["use_coreml"] = miniutils.quote(str(int(True)))
+
+        if self.is_full_jit:
+            props_dict["lite_interpreter"] = miniutils.quote(str(int(False)))
+
+        template_name = "_".join(
+            [
+                "binary",
+                "ios",
+                self.get_phase_name(),
+            ]
+        )
+
+        return [{template_name: props_dict}]
+
+
+BUILD_CONFIGS = [
+    IOSNightlyJob("x86_64"),
+    IOSNightlyJob("arm64"),
+]
+
+BUILD_CONFIGS_FULL_JIT = [
+    IOSNightlyJob("x86_64", is_full_jit=True),
+    IOSNightlyJob("arm64", is_full_jit=True),
+]
+
+WORKFLOW_DATA = (
+    BUILD_CONFIGS
+    + BUILD_CONFIGS_FULL_JIT
+    + [
+        IOSNightlyJob("binary", is_full_jit=False, is_upload=True),
+        IOSNightlyJob("binary", is_full_jit=True, is_upload=True),
+    ]
+)
+
+
+def get_workflow_jobs():
+    return [item.gen_tree() for item in WORKFLOW_DATA]
--- a/torch/_inductor/fx_passes/serialized_patterns/init.py
+++ b/torch/_inductor/fx_passes/serialized_patterns/init.py
--- a/.circleci/cimodel/data/simple/util/branch_filters.py
+++ b/.circleci/cimodel/data/simple/util/branch_filters.py
@ -0,0 +1,36 @@
+NON_PR_BRANCH_LIST = [
+    "main",
+    "master",
+    r"/ci-all\/.*/",
+    r"/release\/.*/",
+]
+
+PR_BRANCH_LIST = [
+    r"/gh\/.*\/head/",
+    r"/pull\/.*/",
+]
+
+RC_PATTERN = r"/v[0-9]+(\.[0-9]+)*-rc[0-9]+/"
+
+MAC_IOS_EXCLUSION_LIST = ["nightly", "postnightly"]
+
+
+def gen_filter_dict(branches_list=NON_PR_BRANCH_LIST, tags_list=None):
+    """Generates a filter dictionary for use with CircleCI's job filter"""
+    filter_dict = {
+        "branches": {
+            "only": branches_list,
+        },
+    }
+
+    if tags_list is not None:
+        filter_dict["tags"] = {"only": tags_list}
+    return filter_dict
+
+
+def gen_filter_dict_exclude(branches_list=MAC_IOS_EXCLUSION_LIST):
+    return {
+        "branches": {
+            "ignore": branches_list,
+        },
+    }
--- a/.circleci/cimodel/data/simple/util/docker_constants.py
+++ b/.circleci/cimodel/data/simple/util/docker_constants.py
@ -0,0 +1,35 @@
+AWS_DOCKER_HOST = "308535385114.dkr.ecr.us-east-1.amazonaws.com"
+
+
+def gen_docker_image(container_type):
+    return (
+        "/".join([AWS_DOCKER_HOST, "pytorch", container_type]),
+        f"docker-{container_type}",
+    )
+
+
+def gen_docker_image_requires(image_name):
+    return [f"docker-{image_name}"]
+
+
+DOCKER_IMAGE_BASIC, DOCKER_REQUIREMENT_BASE = gen_docker_image(
+    "pytorch-linux-xenial-py3.7-gcc5.4"
+)
+
+DOCKER_IMAGE_CUDA_10_2, DOCKER_REQUIREMENT_CUDA_10_2 = gen_docker_image(
+    "pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7"
+)
+
+DOCKER_IMAGE_GCC7, DOCKER_REQUIREMENT_GCC7 = gen_docker_image(
+    "pytorch-linux-xenial-py3.7-gcc7"
+)
+
+
+def gen_mobile_docker(specifier):
+    container_type = "pytorch-linux-xenial-py3-clang5-" + specifier
+    return gen_docker_image(container_type)
+
+
+DOCKER_IMAGE_ASAN, DOCKER_REQUIREMENT_ASAN = gen_mobile_docker("asan")
+
+DOCKER_IMAGE_NDK, DOCKER_REQUIREMENT_NDK = gen_mobile_docker("android-ndk-r19c")
--- a/.circleci/cimodel/data/simple/util/versions.py
+++ b/.circleci/cimodel/data/simple/util/versions.py
@ -0,0 +1,36 @@
+from typing import Optional
+
+
+class MultiPartVersion:
+    def __init__(self, parts, prefix=""):
+        self.parts = parts
+        self.prefix = prefix
+
+    def prefixed_parts(self):
+        """
+        Prepends the first element of the version list
+        with the prefix string.
+        """
+        if self.parts:
+            return [self.prefix + str(self.parts[0])] + [
+                str(part) for part in self.parts[1:]
+            ]
+        else:
+            return [self.prefix]
+
+    def render_dots_or_parts(self, sep: Optional[str] = None):
+        if sep is None:
+            return self.prefixed_parts()
+        else:
+            return [sep.join(self.prefixed_parts())]
+
+
+class CudaVersion(MultiPartVersion):
+    def __init__(self, major, minor):
+        self.major = major
+        self.minor = minor
+
+        super().__init__([self.major, self.minor], "cuda")
+
+    def __str__(self):
+        return f"{self.major}.{self.minor}"
--- a/.circleci/cimodel/lib/init.py
+++ b/.circleci/cimodel/lib/init.py
--- a/.circleci/cimodel/lib/conf_tree.py
+++ b/.circleci/cimodel/lib/conf_tree.py
@ -0,0 +1,111 @@
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+
+def X(val):
+    """
+    Compact way to write a leaf node
+    """
+    return val, []
+
+
+def XImportant(name):
+    """Compact way to write an important (run on PRs) leaf node"""
+    return (name, [("important", [X(True)])])
+
+
+@dataclass
+class Ver:
+    """
+    Represents a product with a version number
+    """
+
+    name: str
+    version: str = ""
+
+    def __str__(self):
+        return self.name + self.version
+
+
+@dataclass
+class ConfigNode:
+    parent: Optional["ConfigNode"]
+    node_name: str
+    props: Dict[str, str] = field(default_factory=dict)
+
+    def get_label(self):
+        return self.node_name
+
+    # noinspection PyMethodMayBeStatic
+    def get_children(self):
+        return []
+
+    def get_parents(self):
+        return (
+            (self.parent.get_parents() + [self.parent.get_label()])
+            if self.parent
+            else []
+        )
+
+    def get_depth(self):
+        return len(self.get_parents())
+
+    def get_node_key(self):
+        return "%".join(self.get_parents() + [self.get_label()])
+
+    def find_prop(self, propname, searched=None):
+        """
+        Checks if its own dictionary has
+        the property, otherwise asks parent node.
+        """
+
+        if searched is None:
+            searched = []
+
+        searched.append(self.node_name)
+
+        if propname in self.props:
+            return self.props[propname]
+        elif self.parent:
+            return self.parent.find_prop(propname, searched)
+        else:
+            # raise Exception('Property "%s" does not exist anywhere in the tree! Searched: %s' % (propname, searched))
+            return None
+
+
+def dfs_recurse(
+    node,
+    leaf_callback=lambda x: None,
+    discovery_callback=lambda x, y, z: None,
+    child_callback=lambda x, y: None,
+    sibling_index=0,
+    sibling_count=1,
+):
+    discovery_callback(node, sibling_index, sibling_count)
+
+    node_children = node.get_children()
+    if node_children:
+        for i, child in enumerate(node_children):
+            child_callback(node, child)
+
+            dfs_recurse(
+                child,
+                leaf_callback,
+                discovery_callback,
+                child_callback,
+                i,
+                len(node_children),
+            )
+    else:
+        leaf_callback(node)
+
+
+def dfs(toplevel_config_node):
+    config_list = []
+
+    def leaf_callback(node):
+        config_list.append(node)
+
+    dfs_recurse(toplevel_config_node, leaf_callback)
+
+    return config_list
--- a/.circleci/cimodel/lib/miniutils.py
+++ b/.circleci/cimodel/lib/miniutils.py
@ -0,0 +1,10 @@
+def quote(s):
+    return sandwich('"', s)
+
+
+def sandwich(bread, jam):
+    return bread + jam + bread
+
+
+def override(word, substitutions):
+    return substitutions.get(word, word)
--- a/.circleci/cimodel/lib/miniyaml.py
+++ b/.circleci/cimodel/lib/miniyaml.py
@ -0,0 +1,51 @@
+from collections import OrderedDict
+
+import cimodel.lib.miniutils as miniutils
+
+
+LIST_MARKER = "- "
+INDENTATION_WIDTH = 2
+
+
+def is_dict(data):
+    return type(data) in [dict, OrderedDict]
+
+
+def is_collection(data):
+    return is_dict(data) or type(data) is list
+
+
+def render(fh, data, depth, is_list_member=False):
+    """
+    PyYaml does not allow precise control over the quoting
+    behavior, especially for merge references.
+    Therefore, we use this custom YAML renderer.
+    """
+
+    indentation = " " * INDENTATION_WIDTH * depth
+
+    if is_dict(data):
+        tuples = list(data.items())
+        if type(data) is not OrderedDict:
+            tuples.sort()
+
+        for i, (k, v) in enumerate(tuples):
+            if not v:
+                continue
+            # If this dict is itself a list member, the first key gets prefixed with a list marker
+            list_marker_prefix = LIST_MARKER if is_list_member and not i else ""
+
+            trailing_whitespace = "\n" if is_collection(v) else " "
+            fh.write(indentation + list_marker_prefix + k + ":" + trailing_whitespace)
+
+            render(fh, v, depth + 1 + int(is_list_member))
+
+    elif type(data) is list:
+        for v in data:
+            render(fh, v, depth, True)
+
+    else:
+        # use empty quotes to denote an empty string value instead of blank space
+        modified_data = miniutils.quote(data) if data == "" else data
+        list_member_prefix = indentation + LIST_MARKER if is_list_member else ""
+        fh.write(list_member_prefix + str(modified_data) + "\n")
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
--- a/.circleci/ensure-consistency.py
+++ b/.circleci/ensure-consistency.py
@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+
+import os
+import subprocess
+import sys
+import tempfile
+
+import generate_config_yml
+
+
+CHECKED_IN_FILE = "config.yml"
+REGENERATION_SCRIPT = "regenerate.sh"
+
+PARENT_DIR = os.path.basename(os.path.dirname(os.path.abspath(__file__)))
+README_PATH = os.path.join(PARENT_DIR, "README.md")
+
+ERROR_MESSAGE_TEMPLATE = """
+The checked-in CircleCI "%s" file does not match what was generated by the scripts.
+Please re-run the "%s" script in the "%s" directory and commit the result. See "%s" for more information.
+"""
+
+
+def check_consistency():
+    _, temp_filename = tempfile.mkstemp("-generated-config.yml")
+
+    with open(temp_filename, "w") as fh:
+        generate_config_yml.stitch_sources(fh)
+
+    try:
+        subprocess.check_call(["cmp", temp_filename, CHECKED_IN_FILE])
+    except subprocess.CalledProcessError:
+        sys.exit(
+            ERROR_MESSAGE_TEMPLATE
+            % (CHECKED_IN_FILE, REGENERATION_SCRIPT, PARENT_DIR, README_PATH)
+        )
+    finally:
+        os.remove(temp_filename)
+
+
+if __name__ == "__main__":
+    check_consistency()
--- a/.circleci/generate_config_yml.py
+++ b/.circleci/generate_config_yml.py
@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+
+"""
+This script is the source of truth for config.yml.
+Please see README.md in this directory for details.
+"""
+
+import os
+import shutil
+import sys
+from collections import namedtuple
+
+import cimodel.data.simple.anaconda_prune_defintions
+
+import cimodel.data.simple.docker_definitions
+import cimodel.data.simple.mobile_definitions
+import cimodel.data.simple.nightly_ios
+import cimodel.lib.miniutils as miniutils
+import cimodel.lib.miniyaml as miniyaml
+
+
+class File:
+    """
+    Verbatim copy the contents of a file into config.yml
+    """
+
+    def __init__(self, filename):
+        self.filename = filename
+
+    def write(self, output_filehandle):
+        with open(os.path.join("verbatim-sources", self.filename)) as fh:
+            shutil.copyfileobj(fh, output_filehandle)
+
+
+class FunctionGen(namedtuple("FunctionGen", "function depth")):
+    __slots__ = ()
+
+
+class Treegen(FunctionGen):
+    """
+    Insert the content of a YAML tree into config.yml
+    """
+
+    def write(self, output_filehandle):
+        miniyaml.render(output_filehandle, self.function(), self.depth)
+
+
+class Listgen(FunctionGen):
+    """
+    Insert the content of a YAML list into config.yml
+    """
+
+    def write(self, output_filehandle):
+        miniyaml.render(output_filehandle, self.function(), self.depth)
+
+
+def horizontal_rule():
+    return "".join("#" * 78)
+
+
+class Header:
+    def __init__(self, title, summary=None):
+        self.title = title
+        self.summary_lines = summary or []
+
+    def write(self, output_filehandle):
+        text_lines = [self.title] + self.summary_lines
+        comment_lines = ["# " + x for x in text_lines]
+        lines = miniutils.sandwich([horizontal_rule()], comment_lines)
+
+        for line in filter(None, lines):
+            output_filehandle.write(line + "\n")
+
+
+def _for_all_items(items, functor) -> None:
+    if isinstance(items, list):
+        for item in items:
+            _for_all_items(item, functor)
+    if isinstance(items, dict) and len(items) == 1:
+        item_type, item = next(iter(items.items()))
+        functor(item_type, item)
+
+
+def filter_master_only_jobs(items):
+    def _is_main_or_master_item(item):
+        filters = item.get("filters", None)
+        branches = filters.get("branches", None) if filters is not None else None
+        branches_only = branches.get("only", None) if branches is not None else None
+        return (
+            ("main" in branches_only or "master" in branches_only)
+            if branches_only is not None
+            else False
+        )
+
+    master_deps = set()
+
+    def _save_requires_if_master(item_type, item):
+        requires = item.get("requires", None)
+        item_name = item.get("name", None)
+        if not isinstance(requires, list):
+            return
+        if _is_main_or_master_item(item) or item_name in master_deps:
+            master_deps.update([n.strip('"') for n in requires])
+
+    def _do_filtering(items):
+        if isinstance(items, list):
+            rc = [_do_filtering(item) for item in items]
+            return [item for item in rc if len(item if item is not None else []) > 0]
+        assert isinstance(items, dict) and len(items) == 1
+        item_type, item = next(iter(items.items()))
+        item_name = item.get("name", None)
+        item_name = item_name.strip('"') if item_name is not None else None
+        if not _is_main_or_master_item(item) and item_name not in master_deps:
+            return None
+        if "filters" in item:
+            item = item.copy()
+            item.pop("filters")
+        return {item_type: item}
+
+    # Scan of dependencies twice to pick up nested required jobs
+    # I.e. jobs depending on jobs that main-only job depend on
+    _for_all_items(items, _save_requires_if_master)
+    _for_all_items(items, _save_requires_if_master)
+    return _do_filtering(items)
+
+
+def generate_required_docker_images(items):
+    required_docker_images = set()
+
+    def _requires_docker_image(item_type, item):
+        requires = item.get("requires", None)
+        if not isinstance(requires, list):
+            return
+        for requirement in requires:
+            requirement = requirement.replace('"', "")
+            if requirement.startswith("docker-"):
+                required_docker_images.add(requirement)
+
+    _for_all_items(items, _requires_docker_image)
+    return required_docker_images
+
+
+def gen_build_workflows_tree():
+    build_workflows_functions = [
+        cimodel.data.simple.mobile_definitions.get_workflow_jobs,
+        cimodel.data.simple.nightly_ios.get_workflow_jobs,
+        cimodel.data.simple.anaconda_prune_defintions.get_workflow_jobs,
+    ]
+    build_jobs = [f() for f in build_workflows_functions]
+    build_jobs.extend(
+        cimodel.data.simple.docker_definitions.get_workflow_jobs(
+            # sort for consistency
+            sorted(generate_required_docker_images(build_jobs))
+        )
+    )
+    master_build_jobs = filter_master_only_jobs(build_jobs)
+
+    rc = {
+        "workflows": {
+            "build": {
+                "when": r"<< pipeline.parameters.run_build >>",
+                "jobs": build_jobs,
+            },
+        }
+    }
+    if len(master_build_jobs) > 0:
+        rc["workflows"]["master_build"] = {
+            "when": r"<< pipeline.parameters.run_master_build >>",
+            "jobs": master_build_jobs,
+        }
+    return rc
+
+
+# Order of this list matters to the generated config.yml.
+YAML_SOURCES = [
+    File("header-section.yml"),
+    File("commands.yml"),
+    File("nightly-binary-build-defaults.yml"),
+    Header("Build parameters"),
+    File("build-parameters/pytorch-build-params.yml"),
+    File("build-parameters/binary-build-params.yml"),
+    Header("Job specs"),
+    File("job-specs/binary-job-specs.yml"),
+    File("job-specs/job-specs-custom.yml"),
+    File("job-specs/binary_update_htmls.yml"),
+    File("job-specs/binary-build-tests.yml"),
+    File("job-specs/docker_jobs.yml"),
+    Header("Workflows"),
+    Treegen(gen_build_workflows_tree, 0),
+]
+
+
+def stitch_sources(output_filehandle):
+    for f in YAML_SOURCES:
+        f.write(output_filehandle)
+
+
+if __name__ == "__main__":
+    stitch_sources(sys.stdout)
--- a/.circleci/regenerate.ps1
+++ b/.circleci/regenerate.ps1
@ -0,0 +1,5 @@
+cd $PSScriptRoot;
+$NewFile = New-TemporaryFile;
+python generate_config_yml.py > $NewFile.name
+(Get-Content $NewFile.name -Raw).TrimEnd().Replace("`r`n","`n") | Set-Content config.yml -Force
+Remove-Item $NewFile.name
--- a/.circleci/regenerate.sh
+++ b/.circleci/regenerate.sh
@ -0,0 +1,17 @@
+#!/bin/bash -e
+
+# Allows this script to be invoked from any directory:
+cd "$(dirname "$0")"
+
+UNCOMMIT_CHANGE=$(git status -s | grep " config.yml" | wc -l | xargs)
+if [[ $UNCOMMIT_CHANGE != 0 ]]; then
+    OLD_FILE=$(mktemp)
+    cp config.yml "$OLD_FILE"
+    echo "Uncommitted change detected in .circleci/config.yml"
+    echo "It has been backed up to $OLD_FILE"
+fi
+
+NEW_FILE=$(mktemp)
+./generate_config_yml.py > "$NEW_FILE"
+cp "$NEW_FILE" config.yml
+echo "New config generated in .circleci/config.yml"
--- a/.circleci/scripts/binary_checkout.sh
+++ b/.circleci/scripts/binary_checkout.sh
@ -62,7 +62,7 @@ git --no-pager log --max-count 1
 popd

 # Clone the Builder main repo
-retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT"
+retry git clone -q https://github.com/pytorch/builder.git -b release/2.1 "$BUILDER_ROOT"
 pushd "$BUILDER_ROOT"
 echo "Using builder from "
 git --no-pager log --max-count 1
--- a/.circleci/scripts/binary_ios_upload.sh
+++ b/.circleci/scripts/binary_ios_upload.sh
@ -33,7 +33,7 @@ fi
 cp ${PROJ_ROOT}/LICENSE ${ZIP_DIR}/
 # zip the library
 export DATE="$(date -u +%Y%m%d)"
-export IOS_NIGHTLY_BUILD_VERSION="2.2.0.${DATE}"
+export IOS_NIGHTLY_BUILD_VERSION="2.1.0.${DATE}"
 if [ "${BUILD_LITE_INTERPRETER}" == "1" ]; then
    # libtorch_lite_ios_nightly_1.11.0.20210810.zip
    ZIPFILE="libtorch_lite_ios_nightly_${IOS_NIGHTLY_BUILD_VERSION}.zip"
--- a/.circleci/scripts/binary_linux_test.sh
+++ b/.circleci/scripts/binary_linux_test.sh
@ -54,7 +54,7 @@ fi



-# Move debug wheels out of the package dir so they don't get installed
+# Move debug wheels out of the the package dir so they don't get installed
 mkdir -p /tmp/debug_final_pkgs
 mv /final_pkgs/debug-*.zip /tmp/debug_final_pkgs || echo "no debug packages to move"

@ -66,12 +66,6 @@ mv /final_pkgs/debug-*.zip /tmp/debug_final_pkgs || echo "no debug packages to m
 #   conda build scripts themselves. These should really be consolidated
 # Pick only one package of multiple available (which happens as result of workflow re-runs)
 pkg="/final_pkgs/\$(ls -1 /final_pkgs|sort|tail -1)"
-if [[ "\$PYTORCH_BUILD_VERSION" == *dev* ]]; then
-    CHANNEL="nightly"
-else
-    CHANNEL="test"
-fi
-
 if [[ "$PACKAGE_TYPE" == conda ]]; then
  (
    # For some reason conda likes to re-activate the conda environment when attempting this install
@ -89,14 +83,25 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
    if [[ "$DESIRED_CUDA" == 'cpu' ]]; then
      retry conda install -c pytorch -y cpuonly
    else
+
      cu_ver="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4}"
      CUDA_PACKAGE="pytorch-cuda"
-      retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c "pytorch-\${CHANNEL}" "pytorch-cuda=\${cu_ver}"
+      PYTORCH_CHANNEL="pytorch"
+      if [[ "\${TORCH_CONDA_BUILD_FOLDER}" == "pytorch-nightly" ]]; then
+              PYTORCH_CHANNEL="pytorch-nightly"
+      fi
+      retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch-test "pytorch-cuda=\${cu_ver}"
    fi
    conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
  )
 elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
-  pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
+  if [[ "$(uname -m)" == aarch64 ]]; then
+    # Using "extra-index-url" until all needed aarch64 dependencies are
+    # added to "https://download.pytorch.org/whl/nightly/"
+    pip install "\$pkg" --extra-index-url "https://download.pytorch.org/whl/test/${DESIRED_CUDA}"
+  else
+    pip install "\$pkg" --index-url "https://download.pytorch.org/whl/test/${DESIRED_CUDA}"
+  fi
  retry pip install -q numpy protobuf typing-extensions
 fi
 if [[ "$PACKAGE_TYPE" == libtorch ]]; then
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@ -58,7 +58,8 @@ fi
 PIP_UPLOAD_FOLDER='nightly/'
 # We put this here so that OVERRIDE_PACKAGE_VERSION below can read from it
 export DATE="$(date -u +%Y%m%d)"
-BASE_BUILD_VERSION="$(cat ${PYTORCH_ROOT}/version.txt|cut -da -f1).dev${DATE}"
+#TODO: We should be pulling semver version from the base version.txt
+BASE_BUILD_VERSION="2.1.0.dev$DATE"
 # Change BASE_BUILD_VERSION to git tag when on a git tag
 # Use 'git -C' to make doubly sure we're in the correct directory for checking
 # the git tag
@ -76,8 +77,15 @@ else
  export PYTORCH_BUILD_VERSION="${BASE_BUILD_VERSION}+$DESIRED_CUDA"
 fi

+# The build with with-pypi-cudnn suffix is only applicabe to
+# pypi small wheel Linux x86 build
+if [[ -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]] && [[ "$(uname)" == 'Linux' && "$(uname -m)" == "x86_64" ]]; then
+  export PYTORCH_BUILD_VERSION="${PYTORCH_BUILD_VERSION}-with-pypi-cudnn"
+fi
+
 export PYTORCH_BUILD_NUMBER=1

+
 JAVA_HOME=
 BUILD_JNI=OFF
 if [[ "$PACKAGE_TYPE" == libtorch ]]; then
@ -149,8 +157,8 @@ EOL

 # nproc doesn't exist on darwin
 if [[ "$(uname)" != Darwin ]]; then
-  # This was lowered from 18 to 12 to avoid OOMs when compiling FlashAttentionV2
-  MEMORY_LIMIT_MAX_JOBS=12
+  # Because most Circle executors only have 20 CPUs, using more causes OOMs w/ Ninja and nvcc parallelization
+  MEMORY_LIMIT_MAX_JOBS=18
  NUM_CPUS=$(( $(nproc) - 2 ))

  # Defaults here for **binary** linux builds so they can be changed in one place
--- a/.circleci/scripts/binary_upload.sh
+++ b/.circleci/scripts/binary_upload.sh
@ -16,6 +16,11 @@ UPLOAD_BUCKET="s3://pytorch"
 BACKUP_BUCKET="s3://pytorch-backup"
 BUILD_NAME=${BUILD_NAME:-}

+# this is temporary change to upload pypi-cudnn builds to separate folder
+if [[ ${BUILD_NAME} == *with-pypi-cudnn* ]]; then
+  UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_cudnn"
+fi
+
 DRY_RUN=${DRY_RUN:-enabled}
 # Don't actually do work unless explicit
 ANACONDA="true anaconda"
--- a/.circleci/verbatim-sources/build-parameters/binary-build-params.yml
+++ b/.circleci/verbatim-sources/build-parameters/binary-build-params.yml
@ -0,0 +1,65 @@
+binary_linux_build_params: &binary_linux_build_params
+  parameters:
+    build_environment:
+      type: string
+      default: ""
+    docker_image:
+      type: string
+      default: ""
+    libtorch_variant:
+      type: string
+      default: ""
+    resource_class:
+      type: string
+      default: "2xlarge+"
+  environment:
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+    LIBTORCH_VARIANT: << parameters.libtorch_variant >>
+    ANACONDA_USER: pytorch
+  resource_class: << parameters.resource_class >>
+  docker:
+    - image: << parameters.docker_image >>
+
+binary_linux_test_upload_params: &binary_linux_test_upload_params
+  parameters:
+    build_environment:
+      type: string
+      default: ""
+    docker_image:
+      type: string
+      default: ""
+    libtorch_variant:
+      type: string
+      default: ""
+    resource_class:
+      type: string
+      default: "medium"
+    use_cuda_docker_runtime:
+      type: string
+      default: ""
+  environment:
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+    DOCKER_IMAGE: << parameters.docker_image >>
+    USE_CUDA_DOCKER_RUNTIME: << parameters.use_cuda_docker_runtime >>
+    LIBTORCH_VARIANT: << parameters.libtorch_variant >>
+  resource_class: << parameters.resource_class >>
+
+binary_mac_params: &binary_mac_params
+  parameters:
+    build_environment:
+      type: string
+      default: ""
+  environment:
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+
+binary_windows_params: &binary_windows_params
+  parameters:
+    build_environment:
+      type: string
+      default: ""
+    executor:
+      type: string
+      default: "windows-xlarge-cpu-with-nvidia-cuda"
+  environment:
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+    JOB_EXECUTOR: <<parameters.executor>>
--- a/.circleci/verbatim-sources/build-parameters/pytorch-build-params.yml
+++ b/.circleci/verbatim-sources/build-parameters/pytorch-build-params.yml
@ -0,0 +1,105 @@
+pytorch_params: &pytorch_params
+  parameters:
+    build_environment:
+      type: string
+      default: ""
+    docker_image:
+      type: string
+      default: ""
+    resource_class:
+      type: string
+      default: "large"
+    use_cuda_docker_runtime:
+      type: string
+      default: ""
+    build_only:
+      type: string
+      default: ""
+    ci_master:
+      type: string
+      default: ""
+  environment:
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+    DOCKER_IMAGE: << parameters.docker_image >>
+    USE_CUDA_DOCKER_RUNTIME: << parameters.use_cuda_docker_runtime >>
+    BUILD_ONLY: << parameters.build_only >>
+    CI_MASTER: << pipeline.parameters.run_master_build >>
+  resource_class: << parameters.resource_class >>
+
+pytorch_ios_params: &pytorch_ios_params
+  parameters:
+    build_environment:
+      type: string
+      default: ""
+    ios_arch:
+      type: string
+      default: ""
+    ios_platform:
+      type: string
+      default: ""
+    op_list:
+      type: string
+      default: ""
+    use_metal:
+      type: string
+      default: "0"
+    lite_interpreter:
+      type: string
+      default: "1"
+    use_coreml:
+      type: string
+      default: "0"
+  environment:
+    BUILD_ENVIRONMENT: << parameters.build_environment >>
+    IOS_ARCH: << parameters.ios_arch >>
+    IOS_PLATFORM: << parameters.ios_platform >>
+    SELECTED_OP_LIST: << parameters.op_list >>
+    USE_PYTORCH_METAL: << parameters.use_metal >>
+    BUILD_LITE_INTERPRETER: << parameters.lite_interpreter >>
+    USE_COREML_DELEGATE: << parameters.use_coreml >>
+
+pytorch_windows_params: &pytorch_windows_params
+  parameters:
+    executor:
+      type: string
+      default: "windows-xlarge-cpu-with-nvidia-cuda"
+    build_environment:
+      type: string
+      default: ""
+    test_name:
+      type: string
+      default: ""
+    cuda_version:
+      type: string
+      default: "10.1"
+    python_version:
+      type: string
+      default: "3.8"
+    vs_version:
+      type: string
+      default: "16.8.6"
+    vc_version:
+      type: string
+      default: "14.16"
+    vc_year:
+      type: string
+      default: "2019"
+    vc_product:
+      type: string
+      default: "BuildTools"
+    use_cuda:
+      type: string
+      default: ""
+  environment:
+    BUILD_ENVIRONMENT: <<parameters.build_environment>>
+    SCCACHE_BUCKET: "ossci-compiler-cache"
+    CUDA_VERSION: <<parameters.cuda_version>>
+    PYTHON_VERSION: <<parameters.python_version>>
+    VS_VERSION: <<parameters.vs_version>>
+    VC_VERSION: <<parameters.vc_version>>
+    VC_YEAR: <<parameters.vc_year>>
+    VC_PRODUCT: <<parameters.vc_product>>
+    USE_CUDA: <<parameters.use_cuda>>
+    TORCH_CUDA_ARCH_LIST: "5.2 7.5"
+    JOB_BASE_NAME: <<parameters.test_name>>
+    JOB_EXECUTOR: <<parameters.executor>>
--- a/.circleci/verbatim-sources/commands.yml
+++ b/.circleci/verbatim-sources/commands.yml
@ -0,0 +1,134 @@
+commands:
+
+  calculate_docker_image_tag:
+    description: "Calculates the docker image tag"
+    steps:
+      - run:
+          name: "Calculate docker image hash"
+          command: |
+            DOCKER_TAG=$(git rev-parse HEAD:.ci/docker)
+            echo "DOCKER_TAG=${DOCKER_TAG}" >> "${BASH_ENV}"
+
+  designate_upload_channel:
+    description: "inserts the correct upload channel into ${BASH_ENV}"
+    steps:
+      - run:
+          name: adding UPLOAD_CHANNEL to BASH_ENV
+          command: |
+            our_upload_channel=nightly
+            # On tags upload to test instead
+            if [[ -n "${CIRCLE_TAG}" ]]; then
+              our_upload_channel=test
+            fi
+            echo "export UPLOAD_CHANNEL=${our_upload_channel}" >> ${BASH_ENV}
+
+  # This system setup script is meant to run before the CI-related scripts, e.g.,
+  # installing Git client, checking out code, setting up CI env, and
+  # building/testing.
+  setup_linux_system_environment:
+    steps:
+      - run:
+          name: Set Up System Environment
+          no_output_timeout: "1h"
+          command: .circleci/scripts/setup_linux_system_environment.sh
+
+  setup_ci_environment:
+    steps:
+      - run:
+          name: Set Up CI Environment After attach_workspace
+          no_output_timeout: "1h"
+          command: .circleci/scripts/setup_ci_environment.sh
+
+  brew_update:
+    description: "Update Homebrew and install base formulae"
+    steps:
+      - run:
+          name: Update Homebrew
+          no_output_timeout: "10m"
+          command: |
+            set -ex
+
+            # Update repositories manually.
+            # Running `brew update` produces a comparison between the
+            # current checkout and the updated checkout, which takes a
+            # very long time because the existing checkout is 2y old.
+            for path in $(find /usr/local/Homebrew -type d -name .git)
+            do
+            cd $path/..
+            git fetch --depth=1 origin
+            git reset --hard origin/master
+            done
+
+            export HOMEBREW_NO_AUTO_UPDATE=1
+
+            # Install expect and moreutils so that we can call `unbuffer` and `ts`.
+            # moreutils installs a `parallel` executable by default, which conflicts
+            # with the executable from the GNU `parallel`, so we must unlink GNU
+            # `parallel` first, and relink it afterwards.
+            brew unlink parallel
+            brew install moreutils
+            brew link parallel --overwrite
+            brew install expect
+
+  brew_install:
+    description: "Install Homebrew formulae"
+    parameters:
+      formulae:
+        type: string
+        default: ""
+    steps:
+      - run:
+          name: Install << parameters.formulae >>
+          no_output_timeout: "10m"
+          command: |
+            set -ex
+            export HOMEBREW_NO_AUTO_UPDATE=1
+            brew install << parameters.formulae >>
+
+  run_brew_for_macos_build:
+    steps:
+      - brew_update
+      - brew_install:
+          formulae: libomp
+
+  run_brew_for_ios_build:
+    steps:
+      - brew_update
+      - brew_install:
+          formulae: libtool
+
+  optional_merge_target_branch:
+    steps:
+      - run:
+          name: (Optional) Merge target branch
+          no_output_timeout: "10m"
+          command: |
+            if [[ -n "$CIRCLE_PULL_REQUEST" && "$CIRCLE_BRANCH" != "nightly" ]]; then
+              PR_NUM=$(basename $CIRCLE_PULL_REQUEST)
+              CIRCLE_PR_BASE_BRANCH=$(curl -s https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/$PR_NUM | jq -r '.base.ref')
+              if [[ "${BUILD_ENVIRONMENT}" == *"xla"* || "${BUILD_ENVIRONMENT}" == *"gcc5"* ]] ; then
+                set -x
+                git config --global user.email "circleci.ossci@gmail.com"
+                git config --global user.name "CircleCI"
+                git config remote.origin.url https://github.com/pytorch/pytorch.git
+                git config --add remote.origin.fetch +refs/heads/master:refs/remotes/origin/master
+                git fetch --tags --progress https://github.com/pytorch/pytorch.git +refs/heads/master:refs/remotes/origin/master --depth=100 --quiet
+                # PRs generated from ghstack has format CIRCLE_PR_BASE_BRANCH=gh/xxx/1234/base
+                if [[ "${CIRCLE_PR_BASE_BRANCH}" == "gh/"* ]]; then
+                  CIRCLE_PR_BASE_BRANCH=master
+                fi
+                export GIT_MERGE_TARGET=`git log -n 1 --pretty=format:"%H" origin/$CIRCLE_PR_BASE_BRANCH`
+                echo "GIT_MERGE_TARGET: " ${GIT_MERGE_TARGET}
+                export GIT_COMMIT=${CIRCLE_SHA1}
+                echo "GIT_COMMIT: " ${GIT_COMMIT}
+                git checkout -f ${GIT_COMMIT}
+                git reset --hard ${GIT_COMMIT}
+                git merge --allow-unrelated-histories --no-edit --no-ff ${GIT_MERGE_TARGET}
+                echo "Merged $CIRCLE_PR_BASE_BRANCH branch before building in environment $BUILD_ENVIRONMENT"
+                set +x
+              else
+                echo "No need to merge with $CIRCLE_PR_BASE_BRANCH, skipping..."
+              fi
+            else
+              echo "This is not a pull request, skipping..."
+            fi
--- a/.circleci/verbatim-sources/header-section.yml
+++ b/.circleci/verbatim-sources/header-section.yml
@ -0,0 +1,41 @@
+# WARNING: DO NOT EDIT THIS FILE DIRECTLY!!!
+# See the README.md in this directory.
+
+# IMPORTANT: To update Docker image version, please follow
+# the instructions at
+# https://github.com/pytorch/pytorch/wiki/Docker-image-build-on-CircleCI
+
+version: 2.1
+
+parameters:
+  run_binary_tests:
+    type: boolean
+    default: false
+  run_build:
+    type: boolean
+    default: true
+  run_master_build:
+    type: boolean
+    default: false
+  run_slow_gradcheck_build:
+    type: boolean
+    default: false
+
+executors:
+  windows-with-nvidia-gpu:
+    machine:
+      resource_class: windows.gpu.nvidia.medium
+      image: windows-server-2019-nvidia:previous
+      shell: bash.exe
+
+  windows-xlarge-cpu-with-nvidia-cuda:
+    machine:
+      resource_class: windows.xlarge
+      image: windows-server-2019-vs2019:stable
+      shell: bash.exe
+
+  windows-medium-cpu-with-nvidia-cuda:
+    machine:
+      resource_class: windows.medium
+      image: windows-server-2019-vs2019:stable
+      shell: bash.exe
--- a/.circleci/verbatim-sources/job-specs/binary-build-tests.yml
+++ b/.circleci/verbatim-sources/job-specs/binary-build-tests.yml
@ -0,0 +1,14 @@
+
+# There is currently no testing for libtorch TODO
+#  binary_linux_libtorch_3.6m_cpu_test:
+#    environment:
+#      BUILD_ENVIRONMENT: "libtorch 3.6m cpu"
+#    resource_class: gpu.nvidia.small
+#    <<: *binary_linux_test
+#
+#  binary_linux_libtorch_3.6m_cu90_test:
+#    environment:
+#      BUILD_ENVIRONMENT: "libtorch 3.6m cu90"
+#    resource_class: gpu.nvidia.small
+#    <<: *binary_linux_test
+#
--- a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml
+++ b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml
@ -0,0 +1,73 @@
+jobs:
+  binary_ios_build:
+    <<: *pytorch_ios_params
+    macos:
+      xcode: "12.5.1"
+    steps:
+    - attach_workspace:
+        at: ~/workspace
+    - checkout
+    - run_brew_for_ios_build
+    - run:
+        name: Build
+        no_output_timeout: "1h"
+        command: |
+          script="/Users/distiller/project/.circleci/scripts/binary_ios_build.sh"
+          cat "$script"
+          source "$script"
+    - run:
+        name: Test
+        no_output_timeout: "30m"
+        command: |
+          script="/Users/distiller/project/.circleci/scripts/binary_ios_test.sh"
+          cat "$script"
+          source "$script"
+    - persist_to_workspace:
+        root: /Users/distiller/workspace/
+        paths: ios
+
+  binary_ios_upload:
+    <<: *pytorch_ios_params
+    macos:
+      xcode: "12.5.1"
+    steps:
+    - attach_workspace:
+        at: ~/workspace
+    - checkout
+    - run_brew_for_ios_build
+    - run:
+        name: Upload
+        no_output_timeout: "1h"
+        command: |
+          script="/Users/distiller/project/.circleci/scripts/binary_ios_upload.sh"
+          cat "$script"
+          source "$script"
+
+  anaconda_prune:
+    parameters:
+      packages:
+        type: string
+        description: "What packages are we pruning? (quoted, space-separated string. eg. 'pytorch', 'torchvision torchaudio', etc.)"
+        default: "pytorch"
+      channel:
+        type: string
+        description: "What channel are we pruning? (eq. pytorch-nightly)"
+        default: "pytorch-nightly"
+    docker:
+      - image: continuumio/miniconda3
+    environment:
+      - PACKAGES: "<< parameters.packages >>"
+      - CHANNEL: "<< parameters.channel >>"
+    steps:
+      - checkout
+      - run:
+          name: Install dependencies
+          no_output_timeout: "1h"
+          command: |
+            conda install -yq anaconda-client
+      - run:
+          name: Prune packages
+          no_output_timeout: "1h"
+          command: |
+              ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN}" \
+              scripts/release/anaconda-prune/run.sh
--- a/.circleci/verbatim-sources/job-specs/binary_update_htmls.yml
+++ b/.circleci/verbatim-sources/job-specs/binary_update_htmls.yml
@ -0,0 +1,53 @@
+
+  # update_s3_htmls job
+  # These jobs create html files for every cpu/cu## folder in s3. The html
+  # files just store the names of all the files in that folder (which are
+  # binary files (.whl files)). This is to allow pip installs of the latest
+  # version in a folder without having to know the latest date. Pip has a flag
+  # -f that you can pass an html file listing a bunch of packages, and pip will
+  # then install the one with the most recent version.
+  update_s3_htmls: &update_s3_htmls
+    machine:
+      image: ubuntu-2004:202104-01
+    resource_class: medium
+    steps:
+    - checkout
+    - setup_linux_system_environment
+    - run:
+        <<: *binary_checkout
+    # N.B. we do not run binary_populate_env. The only variable we need is
+    # PIP_UPLOAD_FOLDER (which is 'nightly/' for the nightlies and '' for
+    # releases, and sometimes other things for special cases). Instead we
+    # expect PIP_UPLOAD_FOLDER to be passed directly in the env. This is
+    # because, unlike all the other binary jobs, these jobs only get run once,
+    # in a separate workflow. They are not a step in other binary jobs like
+    # build, test, upload.
+    #
+    # You could attach this to every job, or include it in the upload step if
+    # you wanted. You would need to add binary_populate_env in this case to
+    # make sure it has the same upload folder as the job it's attached to. This
+    # function is idempotent, so it won't hurt anything; it's just a little
+    # unnescessary"
+    - run:
+        name: define PIP_UPLOAD_FOLDER
+        command: |
+          our_upload_folder=nightly/
+          # On tags upload to test instead
+          if [[ -n "${CIRCLE_TAG}" ]]; then
+            our_upload_folder=test/
+          fi
+          echo "export PIP_UPLOAD_FOLDER=${our_upload_folder}" >> ${BASH_ENV}
+    - run:
+        name: Update s3 htmls
+        no_output_timeout: "1h"
+        command: |
+          set +x
+          echo "declare -x \"AWS_ACCESS_KEY_ID=${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}\"" >> /home/circleci/project/env
+          echo "declare -x \"AWS_SECRET_ACCESS_KEY=${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}\"" >> /home/circleci/project/env
+          source /home/circleci/project/env
+          set -eux -o pipefail
+          retry () {
+              $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+          }
+          retry pip install awscli==1.6
+          "/home/circleci/project/builder/cron/update_s3_htmls.sh"
--- a/.circleci/verbatim-sources/job-specs/docker_jobs.yml
+++ b/.circleci/verbatim-sources/job-specs/docker_jobs.yml
@ -0,0 +1,56 @@
+  docker_build_job:
+      parameters:
+        image_name:
+          type: string
+          default: ""
+      machine:
+        image: ubuntu-2004:202104-01
+      resource_class: large
+      environment:
+        IMAGE_NAME: << parameters.image_name >>
+        # Enable 'docker manifest'
+        DOCKER_CLI_EXPERIMENTAL: "enabled"
+        DOCKER_BUILDKIT: 1
+      steps:
+        - checkout
+        - calculate_docker_image_tag
+        - run:
+            name: Check if image should be built
+            command: |
+              set +x
+              export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_DOCKER_BUILDER_V1}
+              export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_DOCKER_BUILDER_V1}
+              export AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\")
+              export AWS_REGION=us-east-1
+              aws ecr get-login-password --region $AWS_REGION|docker login --username AWS \
+                       --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com
+              set -x
+              # Check if image already exists, if it does then skip building it
+              if docker manifest inspect "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${IMAGE_NAME}:${DOCKER_TAG}"; then
+                circleci-agent step halt
+                # circleci-agent step halt doesn't actually halt the step so we need to
+                # explicitly exit the step here ourselves before it causes too much trouble
+                exit 0
+              fi
+              # Covers the case where a previous tag doesn't exist for the tree
+              # this is only really applicable on trees that don't have `.ci/docker` at its merge base, i.e. nightly
+              if ! git rev-parse "$(git merge-base HEAD << pipeline.git.base_revision >>):.ci/docker"; then
+                echo "Directory '.ci/docker' not found in tree << pipeline.git.base_revision >>, you should probably rebase onto a more recent commit"
+                exit 1
+              fi
+              PREVIOUS_DOCKER_TAG=$(git rev-parse "$(git merge-base HEAD << pipeline.git.base_revision >>):ci/docker")
+              # If no image exists but the hash is the same as the previous hash then we should error out here
+              if [[ "${PREVIOUS_DOCKER_TAG}" = "${DOCKER_TAG}" ]]; then
+                echo "ERROR: Something has gone wrong and the previous image isn't available for the merge-base of your branch"
+                echo "       contact the PyTorch team to restore the original images"
+                exit 1
+              fi
+        - run:
+            name: build_docker_image_<< parameters.image_name >>
+            no_output_timeout: "1h"
+            command: |
+              set +x
+              export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_DOCKER_BUILDER_V1}
+              export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_DOCKER_BUILDER_V1}
+              set -x
+              cd .ci/docker && ./build_docker.sh
--- a/.circleci/verbatim-sources/job-specs/job-specs-custom.yml
+++ b/.circleci/verbatim-sources/job-specs/job-specs-custom.yml
@ -0,0 +1,747 @@
+  pytorch_doc_push:
+    resource_class: medium
+    machine:
+      image: ubuntu-2004:202104-01
+    parameters:
+      branch:
+        type: string
+        default: "main"
+    steps:
+    - attach_workspace:
+        at: /tmp/workspace
+    - run:
+        name: Generate netrc
+        command: |
+          # set credentials for https pushing
+          cat > ~/.netrc \<<DONE
+            machine github.com
+            login pytorchbot
+            password ${GITHUB_PYTORCHBOT_TOKEN}
+          DONE
+    - run:
+        name: Docs push
+        command: |
+          pushd /tmp/workspace
+          git push -u origin "<< parameters.branch >>"
+
+  pytorch_macos_10_15_py3_build:
+    environment:
+      BUILD_ENVIRONMENT: pytorch-macos-10.15-py3-arm64-build
+    macos:
+      xcode: "12.3.0"
+    steps:
+      - checkout
+      - run_brew_for_macos_build
+      - run:
+          name: Build
+          no_output_timeout: "1h"
+          command: |
+            set -e
+            export CROSS_COMPILE_ARM64=1
+            export JOB_BASE_NAME=$CIRCLE_JOB
+
+            # Install sccache
+            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo chmod +x /usr/local/bin/sccache
+            export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
+
+            # This IAM user allows write access to S3 bucket for sccache
+            set +x
+            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4}
+            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}
+            set -x
+
+            chmod a+x .ci/pytorch/macos-build.sh
+            unbuffer .ci/pytorch/macos-build.sh 2>&1 | ts
+
+      - persist_to_workspace:
+          root: /Users/distiller/workspace/
+          paths:
+            - miniconda3
+      - store_artifacts:
+          path: /Users/distiller/project/dist
+
+  pytorch_macos_10_13_py3_build:
+    environment:
+      BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-build
+    macos:
+      xcode: "12.0"
+    steps:
+      - checkout
+      - run_brew_for_macos_build
+      - run:
+          name: Build
+          no_output_timeout: "1h"
+          command: |
+            set -e
+            export JOB_BASE_NAME=$CIRCLE_JOB
+
+            # Install sccache
+            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo chmod +x /usr/local/bin/sccache
+            export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2
+
+            # This IAM user allows write access to S3 bucket for sccache
+            set +x
+            export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4}
+            export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}
+            set -x
+
+            chmod a+x .ci/pytorch/macos-build.sh
+            unbuffer .ci/pytorch/macos-build.sh 2>&1 | ts
+
+      - persist_to_workspace:
+          root: /Users/distiller/workspace/
+          paths:
+            - miniconda3
+
+  mac_build:
+    parameters:
+      build-environment:
+        type: string
+        description: Top-level label for what's being built/tested.
+      xcode-version:
+        type: string
+        default: "13.3.1"
+        description: What xcode version to build with.
+      build-generates-artifacts:
+        type: boolean
+        default: true
+        description: if the build generates build artifacts
+      python-version:
+        type: string
+        default: "3.8"
+    macos:
+      xcode: << parameters.xcode-version >>
+    resource_class: medium
+    environment:
+      BUILD_ENVIRONMENT: << parameters.build-environment >>
+      AWS_REGION: us-east-1
+    steps:
+
+      - checkout
+      - run_brew_for_macos_build
+
+      - run:
+          name: Install sccache
+          command: |
+            sudo curl --retry 3 https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo chmod +x /usr/local/bin/sccache
+            echo "export SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${BASH_ENV}"
+            echo "export SCCACHE_S3_KEY_PREFIX=${GITHUB_WORKFLOW}" >> "${BASH_ENV}"
+
+            set +x
+            echo "export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4}" >> "${BASH_ENV}"
+            echo "export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4}" >> "${BASH_ENV}"
+            set -x
+
+      - run:
+          name: Get workflow job id
+          command: |
+            echo "export OUR_GITHUB_JOB_ID=${CIRCLE_WORKFLOW_JOB_ID}" >> "${BASH_ENV}"
+
+      - run:
+          name: Build
+          command: |
+            set -x
+
+            git submodule sync
+            git submodule update --init --recursive --depth 1 --jobs 0
+
+            export PATH="/usr/local/bin:$PATH"
+            export WORKSPACE_DIR="${HOME}/workspace"
+            mkdir -p "${WORKSPACE_DIR}"
+            MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-py38_4.12.0-MacOSX-x86_64.sh"
+            if [  << parameters.python-version >> == 3.9.12 ]; then
+              MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-MacOSX-x86_64.sh"
+            fi
+
+            # If a local installation of conda doesn't exist, we download and install conda
+            if [ ! -d "${WORKSPACE_DIR}/miniconda3" ]; then
+              mkdir -p "${WORKSPACE_DIR}"
+              curl --retry 3 ${MINICONDA_URL} -o "${WORKSPACE_DIR}"/miniconda3.sh
+              bash "${WORKSPACE_DIR}"/miniconda3.sh -b -p "${WORKSPACE_DIR}"/miniconda3
+            fi
+            export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
+            # shellcheck disable=SC1091
+            source "${WORKSPACE_DIR}"/miniconda3/bin/activate
+
+            brew link --force libomp
+
+            echo "export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}" >> "${BASH_ENV}"
+            .ci/pytorch/macos-build.sh
+
+      - when:
+          condition: << parameters.build-generates-artifacts >>
+          steps:
+            - run:
+                name: Archive artifacts into zip
+                command: |
+                  zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .pytorch-test-times.json .pytorch-test-file-ratings.json
+                  cp artifacts.zip /Users/distiller/workspace
+
+      - persist_to_workspace:
+          root: /Users/distiller/workspace/
+          paths:
+            - miniconda3
+            - artifacts.zip
+
+      - store_artifacts:
+          path: /Users/distiller/project/artifacts.zip
+
+  mac_test:
+    parameters:
+      build-environment:
+        type: string
+      shard-number:
+        type: string
+      num-test-shards:
+        type: string
+      xcode-version:
+        type: string
+      test-config:
+        type: string
+        default: 'default'
+
+    macos:
+      xcode: << parameters.xcode-version >>
+    environment:
+      GIT_DEFAULT_BRANCH: 'master'
+      BUILD_ENVIRONMENT: << parameters.build-environment >>
+      TEST_CONFIG: << parameters.test-config >>
+      SHARD_NUMBER: << parameters.shard-number >>
+      NUM_TEST_SHARDS: << parameters.num-test-shards >>
+      PYTORCH_RETRY_TEST_CASES: 1
+      PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
+    steps:
+      - checkout
+      - attach_workspace:
+          at: ~/workspace
+      - run_brew_for_macos_build
+      - run:
+          name: Test
+          no_output_timeout: "2h"
+          command: |
+            set -x
+
+            git submodule sync --recursive
+            git submodule update --init --recursive
+
+            mv ~/workspace/artifacts.zip .
+            unzip artifacts.zip
+
+            export IN_CI=1
+
+            COMMIT_MESSAGES=$(git cherry -v "origin/${GIT_DEFAULT_BRANCH:-master}")
+
+            export PATH="/usr/local/bin:$PATH"
+            export WORKSPACE_DIR="${HOME}/workspace"
+            mkdir -p "${WORKSPACE_DIR}"
+
+            export PATH="${WORKSPACE_DIR}/miniconda3/bin:$PATH"
+            source "${WORKSPACE_DIR}"/miniconda3/bin/activate
+
+            # sanitize the input commit message and PR body here:
+
+            # trim all new lines from commit messages to avoid issues with batch environment
+            # variable copying. see https://github.com/pytorch/pytorch/pull/80043#issuecomment-1167796028
+            COMMIT_MESSAGES="${COMMIT_MESSAGES//[$'\n\r']}"
+
+            # then trim all special characters like single and double quotes to avoid unescaped inputs to
+            # wreak havoc internally
+            export COMMIT_MESSAGES="${COMMIT_MESSAGES//[\'\"]}"
+
+            python3 -mpip install dist/*.whl
+            .ci/pytorch/macos-test.sh
+      - run:
+          name: Copy files for uploading test stats
+          command: |
+            # copy into a parent folder test-reports because we can't use CIRCLEI_BUILD_NUM in path when persisting to workspace
+            mkdir -p test-reports/test-reports_${CIRCLE_BUILD_NUM}/test/test-reports
+            cp -r test/test-reports test-reports/test-reports_${CIRCLE_BUILD_NUM}/test/test-reports
+      - store_test_results:
+          path: test/test-reports
+      - persist_to_workspace:
+          root: /Users/distiller/project/
+          paths:
+            - test-reports
+
+  upload_test_stats:
+    machine: # executor type
+      image: ubuntu-2004:202010-01 # # recommended linux image - includes Ubuntu 20.04, docker 19.03.13, docker-compose 1.27.4
+    steps:
+      - checkout
+      - attach_workspace:
+          at: ~/workspace
+      - run:
+          name: upload
+          command: |
+            set -ex
+            if [ -z ${AWS_ACCESS_KEY_FOR_OSSCI_ARTIFACT_UPLOAD} ]; then
+              echo "No credentials found, cannot upload test stats (are you on a fork?)"
+              exit 0
+            fi
+            cp -r ~/workspace/test-reports/* ~/project
+            pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
+            export AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_FOR_OSSCI_ARTIFACT_UPLOAD}
+            export AWS_SECRET_ACCESS_KEY=${AWS_SECRET_KEY_FOR_OSSCI_ARTIFACT_UPLOAD}
+            # i dont know how to get the run attempt number for reruns so default to 1
+            python3 -m tools.stats.upload_test_stats --workflow-run-id "${CIRCLE_WORKFLOW_JOB_ID}" --workflow-run-attempt 1 --head-branch << pipeline.git.branch >> --circleci
+  pytorch_macos_10_13_py3_test:
+    environment:
+      BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test
+    macos:
+      xcode: "12.0"
+    steps:
+      - checkout
+      - attach_workspace:
+          at: ~/workspace
+      - run_brew_for_macos_build
+      - run:
+          name: Test
+          no_output_timeout: "1h"
+          command: |
+            set -e
+            export JOB_BASE_NAME=$CIRCLE_JOB
+
+            chmod a+x .ci/pytorch/macos-test.sh
+            unbuffer .ci/pytorch/macos-test.sh 2>&1 | ts
+      - store_test_results:
+          path: test/test-reports
+
+  pytorch_macos_10_13_py3_lite_interpreter_build_test:
+    environment:
+      BUILD_ENVIRONMENT: pytorch-macos-10.13-py3-test
+    macos:
+      xcode: "12.0"
+    steps:
+      - checkout
+      - attach_workspace:
+          at: ~/workspace
+      - run_brew_for_macos_build
+      - run:
+          name: Test
+          no_output_timeout: "1h"
+          command: |
+            set -e
+            export BUILD_LITE_INTERPRETER=1
+            export JOB_BASE_NAME=$CIRCLE_JOB
+            chmod a+x ${HOME}/project/.ci/pytorch/macos-lite-interpreter-build-test.sh
+            unbuffer ${HOME}/project/.ci/pytorch/macos-lite-interpreter-build-test.sh 2>&1 | ts
+      - store_test_results:
+          path: test/test-reports
+
+  pytorch_android_gradle_build:
+    environment:
+      BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c"
+      PYTHON_VERSION: "3.7"
+    resource_class: large
+    machine:
+      image: ubuntu-2004:202104-01
+    steps:
+    - checkout
+    - calculate_docker_image_tag
+    - setup_linux_system_environment
+    - setup_ci_environment
+    - run:
+        name: pytorch android gradle build
+        no_output_timeout: "1h"
+        command: |
+          set -eux
+          docker_image_commit=${DOCKER_IMAGE}:build-${DOCKER_TAG}-${CIRCLE_SHA1}
+
+          docker_image_libtorch_android_x86_32=${docker_image_commit}-android-x86_32
+          docker_image_libtorch_android_x86_64=${docker_image_commit}-android-x86_64
+          docker_image_libtorch_android_arm_v7a=${docker_image_commit}-android-arm-v7a
+          docker_image_libtorch_android_arm_v8a=${docker_image_commit}-android-arm-v8a
+
+          echo "docker_image_commit: "${docker_image_commit}
+          echo "docker_image_libtorch_android_x86_32: "${docker_image_libtorch_android_x86_32}
+          echo "docker_image_libtorch_android_x86_64: "${docker_image_libtorch_android_x86_64}
+          echo "docker_image_libtorch_android_arm_v7a: "${docker_image_libtorch_android_arm_v7a}
+          echo "docker_image_libtorch_android_arm_v8a: "${docker_image_libtorch_android_arm_v8a}
+
+          # x86_32
+          time docker pull ${docker_image_libtorch_android_x86_32} >/dev/null
+          export id_x86_32=$(docker run --env-file "${BASH_ENV}" -e GRADLE_OFFLINE=1 --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${docker_image_libtorch_android_x86_32})
+
+          export COMMAND='((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "$id_x86_32" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          # arm-v7a
+          time docker pull ${docker_image_libtorch_android_arm_v7a} >/dev/null
+          export id_arm_v7a=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${docker_image_libtorch_android_arm_v7a})
+
+          export COMMAND='((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "$id_arm_v7a" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          mkdir -p ~/workspace/build_android_install_arm_v7a
+          docker cp $id_arm_v7a:/var/lib/jenkins/workspace/build_android/install ~/workspace/build_android_install_arm_v7a
+
+          # x86_64
+          time docker pull ${docker_image_libtorch_android_x86_64} >/dev/null
+          export id_x86_64=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${docker_image_libtorch_android_x86_64})
+
+          export COMMAND='((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "$id_x86_64" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          mkdir -p ~/workspace/build_android_install_x86_64
+          docker cp $id_x86_64:/var/lib/jenkins/workspace/build_android/install ~/workspace/build_android_install_x86_64
+
+          # arm-v8a
+          time docker pull ${docker_image_libtorch_android_arm_v8a} >/dev/null
+          export id_arm_v8a=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${docker_image_libtorch_android_arm_v8a})
+
+          export COMMAND='((echo "sudo chown -R jenkins workspace") | docker exec -u jenkins -i "$id_arm_v8a" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          mkdir -p ~/workspace/build_android_install_arm_v8a
+          docker cp $id_arm_v8a:/var/lib/jenkins/workspace/build_android/install ~/workspace/build_android_install_arm_v8a
+
+          docker cp ~/workspace/build_android_install_arm_v7a $id_x86_32:/var/lib/jenkins/workspace/build_android_install_arm_v7a
+          docker cp ~/workspace/build_android_install_x86_64 $id_x86_32:/var/lib/jenkins/workspace/build_android_install_x86_64
+          docker cp ~/workspace/build_android_install_arm_v8a $id_x86_32:/var/lib/jenkins/workspace/build_android_install_arm_v8a
+
+          # run gradle buildRelease
+          export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec -u jenkins -i "$id_x86_32" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          mkdir -p ~/workspace/build_android_artifacts
+          docker cp $id_x86_32:/var/lib/jenkins/workspace/android/artifacts.tgz ~/workspace/build_android_artifacts/
+
+          output_image=$docker_image_libtorch_android_x86_32-gradle
+          docker commit "$id_x86_32" ${output_image}
+          time docker push ${output_image}
+    - store_artifacts:
+        path: ~/workspace/build_android_artifacts/artifacts.tgz
+        destination: artifacts.tgz
+
+  pytorch_android_publish_snapshot:
+    environment:
+      BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-publish-snapshot
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c"
+      PYTHON_VERSION: "3.7"
+    resource_class: large
+    machine:
+      image: ubuntu-2004:202104-01
+    steps:
+    - checkout
+    - calculate_docker_image_tag
+    - setup_linux_system_environment
+    - setup_ci_environment
+    - run:
+        name: pytorch android gradle build
+        no_output_timeout: "1h"
+        command: |
+          set -eux
+          docker_image_commit=${DOCKER_IMAGE}:build-${DOCKER_TAG}-${CIRCLE_SHA1}
+
+          docker_image_libtorch_android_x86_32_gradle=${docker_image_commit}-android-x86_32-gradle
+
+          echo "docker_image_commit: "${docker_image_commit}
+          echo "docker_image_libtorch_android_x86_32_gradle: "${docker_image_libtorch_android_x86_32_gradle}
+
+          # x86_32
+          time docker pull ${docker_image_libtorch_android_x86_32_gradle} >/dev/null
+          export id_x86_32=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${docker_image_libtorch_android_x86_32_gradle})
+
+          export COMMAND='((echo "sudo chown -R jenkins workspace" && echo "export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" && echo "export SONATYPE_NEXUS_USERNAME=${SONATYPE_NEXUS_USERNAME}" && echo "export SONATYPE_NEXUS_PASSWORD=${SONATYPE_NEXUS_PASSWORD}" && echo "export ANDROID_SIGN_KEY=${ANDROID_SIGN_KEY}" && echo "export ANDROID_SIGN_PASS=${ANDROID_SIGN_PASS}" && echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/publish_android_snapshot.sh") | docker exec -u jenkins -i "$id_x86_32" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          output_image=${docker_image_libtorch_android_x86_32_gradle}-publish-snapshot
+          docker commit "$id_x86_32" ${output_image}
+          time docker push ${output_image}
+
+  pytorch_android_gradle_build-x86_32:
+    environment:
+      BUILD_ENVIRONMENT: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-only-x86_32
+      DOCKER_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c"
+      PYTHON_VERSION: "3.7"
+    resource_class: large
+    machine:
+      image: ubuntu-2004:202104-01
+    steps:
+    - checkout
+    - calculate_docker_image_tag
+    - setup_linux_system_environment
+    - checkout
+    - setup_ci_environment
+    - run:
+        name: pytorch android gradle build only x86_32 (for PR)
+        no_output_timeout: "1h"
+        command: |
+          set -e
+          docker_image_libtorch_android_x86_32=${DOCKER_IMAGE}:build-${DOCKER_TAG}-${CIRCLE_SHA1}-android-x86_32
+          echo "docker_image_libtorch_android_x86_32: "${docker_image_libtorch_android_x86_32}
+
+          # x86
+          time docker pull ${docker_image_libtorch_android_x86_32} >/dev/null
+          export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${docker_image_libtorch_android_x86_32})
+
+          export COMMAND='((echo "export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" && echo "export GRADLE_OFFLINE=1" && echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          mkdir -p ~/workspace/build_android_x86_32_artifacts
+          docker cp $id:/var/lib/jenkins/workspace/android/artifacts.tgz ~/workspace/build_android_x86_32_artifacts/
+
+          output_image=${docker_image_libtorch_android_x86_32}-gradle
+          docker commit "$id" ${output_image}
+          time docker push ${output_image}
+    - store_artifacts:
+        path: ~/workspace/build_android_x86_32_artifacts/artifacts.tgz
+        destination: artifacts.tgz
+
+  pytorch_ios_build:
+    <<: *pytorch_ios_params
+    macos:
+      xcode: "12.5.1"
+    steps:
+      - run:
+          name: checkout with retry
+          command: |
+            checkout() {
+              set -ex
+              # Workaround old docker images with incorrect $HOME
+              # check https://github.com/docker/docker/issues/2968 for details
+              if [ "${HOME}" = "/" ]
+                then
+                export HOME=$(getent passwd $(id -un) | cut -d: -f6)
+              fi
+
+              mkdir -p ~/.ssh
+
+              echo 'github.com ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEAq2A7hRGmdnm9tUDbO9IDSwBK6TbQa+PXYPCPy6rbTrTtw7PHkccKrpp0yVhp5HdEIcKr6pLlVDBfOLX9QUsyCOV0wzfjIJNlGEYsdlLJizHhbn2mUjvSAHQqZETYP81eFzLQNnPHt4EVVUh7VfDESU84KezmD5QlWpXLmvU31/yMf+Se8xhHTvKSCZIFImWwoG6mbUoWf9nzpIoaSjB+weqqUUmpaaasXVal72J+UX2B+2RPW3RcT0eOzQgqlJL3RKrTJvdsjE3JEAvGq3lGHSZXy28G3skua2SmVi/w4yCE6gbODqnTWlg7+wC604ydGXA8VJiS5ap43JXiUFFAaQ==
+              ' >> ~/.ssh/known_hosts
+
+              # use git+ssh instead of https
+              git config --global url."ssh://git@github.com".insteadOf "https://github.com" || true
+              git config --global gc.auto 0 || true
+
+              echo 'Cloning git repository'
+              mkdir -p '/Users/distiller/project'
+              cd '/Users/distiller/project'
+              git clone "$CIRCLE_REPOSITORY_URL" .
+              echo 'Checking out branch'
+              git checkout --force -B "$CIRCLE_BRANCH" "$CIRCLE_SHA1"
+              git --no-pager log --no-color -n 1 --format='HEAD is now at %h %s'
+            }
+
+            retry () {
+              $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+            }
+            retry checkout
+      - run_brew_for_ios_build
+      - run:
+          name: Setup Fastlane
+          no_output_timeout: "1h"
+          command: |
+            set -e
+            PROJ_ROOT=/Users/distiller/project
+            cd ${PROJ_ROOT}/ios/TestApp
+            # install fastlane
+            sudo gem install bundler && bundle install
+      - run:
+          name: Build
+          no_output_timeout: "1h"
+          command: |
+            set -e
+            WORKSPACE=/Users/distiller/workspace
+            PROJ_ROOT=/Users/distiller/project
+            export TCLLIBPATH="/usr/local/lib"
+
+            # Install conda
+            curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-MacOSX-x86_64.sh
+            chmod +x ~/conda.sh
+            /bin/bash ~/conda.sh -b -p ~/anaconda
+            export PATH="~/anaconda/bin:${PATH}"
+            source ~/anaconda/bin/activate
+
+            # Install dependencies
+            retry () {
+                $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+            }
+
+            retry conda install numpy ninja pyyaml mkl mkl-include setuptools cmake requests typing-extensions --yes
+
+            # sync submodules
+            cd ${PROJ_ROOT}
+            git submodule sync
+            git submodule update --init --recursive --depth 1 --jobs 0
+
+            # export
+            export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
+
+            # run build script
+            chmod a+x ${PROJ_ROOT}/scripts/build_ios.sh
+            echo "IOS_ARCH: ${IOS_ARCH}"
+            echo "IOS_PLATFORM: ${IOS_PLATFORM}"
+            echo "USE_PYTORCH_METAL": "${USE_METAL}"
+            echo "BUILD_LITE_INTERPRETER": "${BUILD_LITE_INTERPRETER}"
+            echo "USE_COREML_DELEGATE": "${USE_COREML_DELEGATE}"
+
+            #check the custom build flag
+            echo "SELECTED_OP_LIST: ${SELECTED_OP_LIST}"
+            if [ -n "${SELECTED_OP_LIST}" ]; then
+                export SELECTED_OP_LIST="${PROJ_ROOT}/ios/TestApp/custom_build/${SELECTED_OP_LIST}"
+            fi
+            export IOS_ARCH=${IOS_ARCH}
+            export IOS_PLATFORM=${IOS_PLATFORM}
+            export USE_COREML_DELEGATE=${USE_COREML_DELEGATE}
+            if [ ${IOS_PLATFORM} != "SIMULATOR" ]; then
+              export USE_PYTORCH_METAL=${USE_METAL}
+            fi
+            unbuffer ${PROJ_ROOT}/scripts/build_ios.sh 2>&1 | ts
+      - run:
+          name: Run Build Test
+          no_output_timeout: "30m"
+          command: |
+            set -e
+            PROJ_ROOT=/Users/distiller/project
+            # run the ruby build script
+            if ! [ -x "$(command -v xcodebuild)" ]; then
+              echo 'Error: xcodebuild is not installed.'
+              exit 1
+            fi
+            ruby ${PROJ_ROOT}/scripts/xcode_build.rb -i ${PROJ_ROOT}/build_ios/install -x ${PROJ_ROOT}/ios/TestApp/TestApp.xcodeproj -p ${IOS_PLATFORM}
+            if ! [ "$?" -eq "0" ]; then
+              echo 'xcodebuild failed!'
+              exit 1
+            fi
+      - run:
+          name: Run Simulator Tests
+          no_output_timeout: "2h"
+          command: |
+            set -e
+            if [ ${IOS_PLATFORM} != "SIMULATOR" ]; then
+              echo "not SIMULATOR build, skip it."
+              exit 0
+            fi
+            WORKSPACE=/Users/distiller/workspace
+            PROJ_ROOT=/Users/distiller/project
+            source ~/anaconda/bin/activate
+            # use the pytorch nightly build to generate models
+            pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
+            # generate models for differnet backends
+            cd ${PROJ_ROOT}/ios/TestApp/benchmark
+            mkdir -p ../models
+            if [ ${USE_COREML_DELEGATE} == 1 ]; then
+              pip install coremltools==5.0b5 protobuf==3.20.1
+              python coreml_backend.py
+            else
+              cd "${PROJ_ROOT}"
+              python test/mobile/model_test/gen_test_model.py ios-test
+            fi
+            cd "${PROJ_ROOT}/ios/TestApp/benchmark"
+            if [ ${BUILD_LITE_INTERPRETER} == 1 ]; then
+              echo "Setting up the TestApp for LiteInterpreter"
+              ruby setup.rb --lite 1
+            else
+              echo "Setting up the TestApp for Full JIT"
+              ruby setup.rb
+            fi
+            cd "${PROJ_ROOT}/ios/TestApp"
+            # instruments -s -devices
+            if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then
+              if [ "${USE_COREML_DELEGATE}" == 1 ]; then
+                fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML
+              else
+                fastlane scan --only_testing TestAppTests/TestAppTests/testLiteInterpreter
+              fi
+            else
+              fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT
+            fi
+  pytorch_linux_bazel_build:
+    <<: *pytorch_params
+    machine:
+      image: ubuntu-2004:202104-01
+    steps:
+    - checkout
+    - calculate_docker_image_tag
+    - setup_linux_system_environment
+    - setup_ci_environment
+    - run:
+        name: Bazel Build
+        no_output_timeout: "1h"
+        command: |
+          set -e
+          # Pull Docker image and run build
+          echo "DOCKER_IMAGE: "${DOCKER_IMAGE}:${DOCKER_TAG}
+          time docker pull ${DOCKER_IMAGE}:${DOCKER_TAG} >/dev/null
+          export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}:${DOCKER_TAG})
+
+          echo "Do NOT merge main branch into $CIRCLE_BRANCH in environment $BUILD_ENVIRONMENT"
+
+          git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0
+
+          docker cp /home/circleci/project/. $id:/var/lib/jenkins/workspace
+
+          export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .ci/pytorch/build.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          # Push intermediate Docker image for next phase to use
+          if [ -z "${BUILD_ONLY}" ]; then
+            # Augment our output image name with bazel to avoid collisions
+            output_image=${DOCKER_IMAGE}:build-${DOCKER_TAG}-bazel-${CIRCLE_SHA1}
+            export COMMIT_DOCKER_IMAGE=$output_image
+            docker commit "$id" ${COMMIT_DOCKER_IMAGE}
+            time docker push ${COMMIT_DOCKER_IMAGE}
+          fi
+
+  pytorch_linux_bazel_test:
+    <<: *pytorch_params
+    machine:
+      image: ubuntu-2004:202104-01
+    steps:
+    - checkout
+    - calculate_docker_image_tag
+    - setup_linux_system_environment
+    - setup_ci_environment
+    - run:
+        name: Test
+        no_output_timeout: "90m"
+        command: |
+          set -e
+          output_image=${DOCKER_IMAGE}:build-${DOCKER_TAG}-bazel-${CIRCLE_SHA1}
+          export COMMIT_DOCKER_IMAGE=$output_image
+          echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE}
+
+          time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null
+
+          if [ -n "${USE_CUDA_DOCKER_RUNTIME}" ]; then
+            export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --gpus all -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
+          else
+            export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE})
+          fi
+
+          retrieve_test_reports() {
+            echo "retrieving test reports"
+            docker cp -L $id:/var/lib/jenkins/workspace/bazel-testlogs ./ || echo 'No test reports found!'
+          }
+          trap "retrieve_test_reports" ERR
+
+          if [[ ${BUILD_ENVIRONMENT} == *"multigpu"* ]]; then
+            export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .ci/pytorch/multigpu-test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+          else
+            export COMMAND='((echo "sudo chown -R jenkins workspace && cd workspace && .ci/pytorch/test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+          fi
+          echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+
+          retrieve_test_reports
+          docker stats --all --no-stream
+    - store_test_results:
+        path: bazel-testlogs
+
+  pytorch_windows_test_multigpu:
+    machine:
+      image: ubuntu-2004:202104-01
+    steps:
+      - checkout
+      - run:
+          name: Test
+          no_output_timeout: "90m"
+          command: |
+            set -e
+            python3 -m pip install requests
+            python3 ./.circleci/scripts/trigger_azure_pipeline.py
--- a/.circleci/verbatim-sources/job-specs/job-specs-promote.yml
+++ b/.circleci/verbatim-sources/job-specs/job-specs-promote.yml
@ -0,0 +1,18 @@
+
+  promote_s3:
+    <<: *promote_common
+    steps:
+      - checkout
+      - run:
+          name: Running promote script
+          command: |
+            scripts/release/promote/wheel_to_s3.sh
+
+  promote_conda:
+    <<: *promote_common
+    steps:
+      - checkout
+      - run:
+          name: Running promote script
+          command: |
+            scripts/release/promote/conda_to_conda.sh
--- a/.circleci/verbatim-sources/job-specs/job-specs-setup.yml
+++ b/.circleci/verbatim-sources/job-specs/job-specs-setup.yml
@ -0,0 +1,29 @@
+
+  setup:
+    docker:
+      - image: circleci/python:3.7.3
+    steps:
+      - checkout
+      - run:
+          name: Save commit message
+          command: git log --format='%B' -n 1 HEAD > .circleci/scripts/COMMIT_MSG
+      # Note [Workspace for CircleCI scripts]
+      # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+      # In the beginning, you wrote your CI scripts in a
+      # .circleci/config.yml file, and life was good.  Your CI
+      # configurations flourished and multiplied.
+      #
+      # Then one day, CircleCI cometh down high and say, "Your YAML file
+      # is too biggeth, it stresses our servers so."  And thus they
+      # asketh us to smite the scripts in the yml file.
+      #
+      # But you can't just put the scripts in the .circleci folder,
+      # because in some jobs, you don't ever actually checkout the
+      # source repository.  Where you gonna get the scripts from?
+      #
+      # Here's how you do it: you persist .circleci/scripts into a
+      # workspace, attach the workspace in your subjobs, and run all
+      # your scripts from there.
+      - persist_to_workspace:
+          root: .
+          paths: .circleci/scripts
--- a/.circleci/verbatim-sources/nightly-binary-build-defaults.yml
+++ b/.circleci/verbatim-sources/nightly-binary-build-defaults.yml
@ -0,0 +1,51 @@
+
+##############################################################################
+# Binary build (nightlies nightly build) defaults
+# The binary builds use the docker executor b/c at time of writing the machine
+# executor is limited to only two cores and is painfully slow (4.5+ hours per
+# GPU build). But the docker executor cannot be run with --runtime=nvidia, and
+# so the binary test/upload jobs must run on a machine executor. The package
+# built in the build job is persisted to the workspace, which the test jobs
+# expect. The test jobs just run a few quick smoke tests (very similar to the
+# second-round-user-facing smoke tests above) and then upload the binaries to
+# their final locations. The upload part requires credentials that should only
+# be available to org-members.
+#
+# binary_checkout MUST be run before other commands here. This is because the
+# other commands are written in .circleci/scripts/*.sh , so the pytorch source
+# code must be downloaded on the machine before they can be run. We cannot
+# inline all the code into this file, since that would cause the yaml size to
+# explode past 4 MB (all the code in the command section is just copy-pasted to
+# everywhere in the .circleci/config.yml file where it appears).
+##############################################################################
+
+# Checks out the Pytorch and Builder repos (always both of them), and places
+# them in the right place depending on what executor we're running on. We curl
+# our .sh file from the interweb to avoid yaml size bloat. Note that many jobs
+# do not need both the pytorch and builder repos, so this is a little wasteful
+# (smoke tests and upload jobs do not need the pytorch repo).
+binary_checkout: &binary_checkout
+  name: Checkout pytorch/builder repo
+  no_output_timeout: "30m"
+  command: .circleci/scripts/binary_checkout.sh
+
+# Parses circleci arguments in a consistent way, essentially routing to the
+# correct pythonXgccXcudaXos build we want
+binary_populate_env: &binary_populate_env
+  name: Set up binary env variables
+  command: .circleci/scripts/binary_populate_env.sh
+
+binary_install_miniconda: &binary_install_miniconda
+  name: Install miniconda
+  no_output_timeout: "1h"
+  command: .circleci/scripts/binary_install_miniconda.sh
+
+# This section is used in the binary_test and smoke_test jobs. It expects
+# 'binary_populate_env' to have populated /home/circleci/project/env and it
+# expects another section to populate /home/circleci/project/ci_test_script.sh
+# with the code to run in the docker
+binary_run_in_docker: &binary_run_in_docker
+  name: Run in docker
+  # This step only runs on circleci linux machine executors that themselves
+  # need to start docker images
+  command: .circleci/scripts/binary_run_in_docker.sh
--- a/.circleci/verbatim-sources/workflows/workflows-nightly-uploads-header.yml
+++ b/.circleci/verbatim-sources/workflows/workflows-nightly-uploads-header.yml
@ -0,0 +1,8 @@
+      #- binary_linux_libtorch_3.6m_cpu_test:
+      #    requires:
+      #      - binary_linux_libtorch_3.6m_cpu_build
+      #- binary_linux_libtorch_3.6m_cu90_test:
+      #    requires:
+      #      - binary_linux_libtorch_3.6m_cu90_build
+
+      # Nightly uploads
--- a/.clang-tidy
+++ b/.clang-tidy
@ -1,8 +1,5 @@
 ---
 # NOTE there must be no spaces before the '-', so put the comma last.
-# The check bugprone-unchecked-optional-access is also turned off atm
-# because it causes clang-tidy to hang randomly. The tracking issue
-# can be found at https://github.com/llvm/llvm-project/issues/69369.
 InheritParentConfig: true
 Checks: '
 bugprone-*,
@ -12,7 +9,6 @@ bugprone-*,
 -bugprone-lambda-function-name,
 -bugprone-reserved-identifier,
 -bugprone-swapped-arguments,
-bugprone-unchecked-optional-access,
 clang-diagnostic-missing-prototypes,
 cppcoreguidelines-*,
 -cppcoreguidelines-avoid-do-while,
@ -34,13 +30,8 @@ cppcoreguidelines-*,
 -facebook-hte-RelativeInclude,
 hicpp-exception-baseclass,
 hicpp-avoid-goto,
-misc-*,
-misc-const-correctness,
-misc-use-anonymous-namespace,
-misc-unused-parameters,
-misc-no-recursion,
-misc-non-private-member-variables-in-classes,
-misc-confusable-identifiers,
+misc-unused-alias-decls,
+misc-unused-using-decls,
 modernize-*,
 -modernize-concat-nested-namespaces,
 -modernize-macro-to-enum,
@ -52,15 +43,8 @@ modernize-*,
 -modernize-use-nodiscard,
 performance-*,
 readability-container-size-empty,
-readability-delete-null-pointer,
-readability-duplicate-include
-readability-misplaced-array-index,
-readability-redundant-function-ptr-dereference,
-readability-redundant-smartptr-get,
-readability-simplify-subscript-expr,
-readability-string-compare,
 '
-HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
+HeaderFilterRegex: '^(c10/(?!test)|torch/csrc/(?!deploy/interpreter/cpython)).*$'
 AnalyzeTemporaryDtors: false
 WarningsAsErrors: '*'
 ...
--- a/.devcontainer/README.md
+++ b/.devcontainer/README.md
@ -1,72 +0,0 @@
-# Step by step guide on using PyTorch's DevContainer
-
-Using PyTorch's DevContainer environment involves a series of steps that will help you set up a development environment that is isolated and replicable. Below, we'll guide you through each step to make this process as smooth as possible:
-
-## Step 1: Install VSCode
-
-1. Navigate to the [Visual Studio Code website](https://code.visualstudio.com/).
-2. Download the appropriate installer for your operating system (Windows, Linux, or macOS).
-3. Run the installer and follow the on-screen instructions to install VSCode on your system.
-4. After installation, launch VSCode.
-
-## Step 2: Install DevContainer Extension
-
-1. In VSCode, go to the Extensions view by clicking on the Extensions icon in the Activity Bar on the side of the window.
-2. Search for "Dev Containers" in the Extensions view search bar.
-3. Find the "Dev Containers" extension in the search results and click on the install button to install it.
-
-You can also go to the extension's [homepage](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) and [documentation page](https://code.visualstudio.com/docs/devcontainers/containers) to find more details.
-
-## Step 3: Install Docker and Add Current Login User to Docker Group
-
-1. Follow the [official guide](https://docs.docker.com/get-docker/) to install Docker. Don't forget the [post installation steps](https://docs.docker.com/engine/install/linux-postinstall/).
-
-If you are using [Visual Studio Code Remote - SSH](https://code.visualstudio.com/docs/remote/ssh), then you only need to install Docker in the remote host, not your local computer. And the following steps should be run in the remote host.
-
-## Step 4 (Optional): Install NVIDIA Container Toolkit for GPU Usage
-
-1. If you intend to use GPU resources, first ensure you have NVIDIA drivers installed on your system. Check if `nvidia-smi` works to verify your GPU setup.
-2. Follow the [official guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#docker) to install the NVIDIA Container Toolkit.
-3. After installation, verify that the toolkit is installed correctly by running:
-   ```
-   docker run --rm --runtime=nvidia --gpus all nvidia/cuda:11.6.2-base-ubuntu20.04 nvidia-smi
-   ```
-
-## Step 5: Clone PyTorch
-
-1. Open a terminal or command prompt.
-2. Use the following command to clone the PyTorch repository:
-   ```
-   git clone https://github.com/pytorch/pytorch
-   ```
-3. Navigate to the cloned directory:
-   ```
-   cd pytorch
-   ```
-
-## Step 6: Open in DevContainer
-
-1. In VSCode, use the Command Palette (`Ctrl+Shift+P` or `Cmd+Shift+P` on macOS) to run the "Remote-Containers: Open Folder in Container..." command.
-2. You will be prompted with two options: CPU dev container or CUDA dev container. Choose the one you want to run.
-
-## Step 7: Wait for Building the Environment
-
-1. After opening the folder in a DevContainer, VSCode will start building the container. This process can take some time as it involves downloading necessary images and setting up the environment.
-2. You can monitor the progress in the VSCode terminal.
-3. Once the build process completes, you'll have a fully configured PyTorch development environment in a container.
-4. The next time you open the same dev container, it will be much faster, as it does not require building the image again.
-
-You are now all set to start developing with PyTorch in a DevContainer environment. This setup ensures you have a consistent and isolated development environment for your PyTorch projects.
-
-## Step 8: Build PyTorch
-
-To build pytorch from source, simply run:
-   ```
-   python setup.py develop
-   ```
-
-The process involves compiling thousands of files, and would take a long time. Fortunately, the compiled objects can be useful for your next build. When you modify some files, you only need to compile the changed files the next time.
-
-Note that only contents in the `pytorch` directory are saved to disk. This directory is mounted to the docker image, while other contents in the docker image are all temporary, and will be lost if docker restarts the image or the server reboots.
-
-For an in-depth understanding of Dev Container and its caveats, please refer to [the full documentation](https://code.visualstudio.com/docs/devcontainers/containers).
--- a/.devcontainer/scripts/install-dev-tools.sh
+++ b/.devcontainer/scripts/install-dev-tools.sh
@ -9,5 +9,3 @@ make setup_lint

 # Add CMAKE_PREFIX_PATH to bashrc
 echo 'export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}' >> ~/.bashrc
-# Add linker path so that cuda-related libraries can be found
-echo 'export LDFLAGS="-L${CONDA_PREFIX}/lib/ $LDFLAGS"' >> ~/.bashrc
--- a/.flake8
+++ b/.flake8
@ -2,35 +2,29 @@
 # NOTE: **Mirror any changes** to this file the [tool.ruff] config in pyproject.toml
 # before we can fully move to use ruff
 enable-extensions = G
-select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2
+select = B,C,E,F,G,P,SIM1,T4,W,B9
 max-line-length = 120
 # C408 ignored because we like the dict keyword argument syntax
 # E501 is not flexible enough, we're using B950 instead
 ignore =
-    E203,E305,E402,E501,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,
+    E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
+    # fix these lints in the future
+    E275,
    # shebang has extra meaning in fbcode lints, so I think it's not worth trying
    # to line this up with executable bit
    EXE001,
    # these ignores are from flake8-bugbear; please fix!
-    B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907
+    B007,B008,B017,B019,B020,B023,B024,B026,B028,B903,B904,B905,B906,B907
    # these ignores are from flake8-comprehensions; please fix!
    C407,
    # these ignores are from flake8-logging-format; please fix!
-    G100,G101,G200
+    G100,G101,G200,G201,G202
    # these ignores are from flake8-simplify. please fix or ignore with commented reason
    SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
    # flake8-simplify code styles
    SIM102,SIM103,SIM106,SIM112,
-    # TorchFix codes that don't make sense for PyTorch itself:
-    # removed and deprecated PyTorch functions.
-    TOR001,TOR101,
-    # TODO(kit1980): fix all TOR102 issues
-    # `torch.load` without `weights_only` parameter is unsafe
-    TOR102,
 per-file-ignores =
    __init__.py: F401
-    test/**: F821
-    test/**/__init__.py: F401,F821
    torch/utils/cpp_extension.py: B950
    torchgen/api/types/__init__.py: F401,F403
    torchgen/executorch/api/types/__init__.py: F401,F403
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@ -38,5 +38,3 @@ f70844bec783bfce43c950ccf180dc494e86f2bf
 e6ec0efaf87703c5f889cfc20b29be455885d58d
 # 2023-07-31 [optim][BE] split test file into logical parts: SWA, LR, optim
 a53cda1ddc15336dc1ff0ce1eff2a49cdc5f882e
-# 2024-01-02 clangformat: fused adam #116583
-9dc68d1aa9e554d09344a10fff69f7b50b2d23a0
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -3,12 +3,11 @@ self-hosted-runner:
    - linux.20_04.4x
    - linux.20_04.16x
    - linux.large
-    - linux.large.arc
    - linux.2xlarge
    - linux.4xlarge
    - linux.12xlarge
    - linux.24xlarge
-    - linux.arm64.2xlarge
+    - linux.t4g.2xlarge
    - linux.4xlarge.nvidia.gpu
    - linux.8xlarge.nvidia.gpu
    - linux.16xlarge.nvidia.gpu
@ -24,5 +23,3 @@ self-hosted-runner:
    - macos-12-xl
    - macos-12
    - macos12.3-m1
-    - macos-latest-xlarge
-    - macos-13-xlarge
--- a/.github/actions/filter-test-configs/action.yml
+++ b/.github/actions/filter-test-configs/action.yml
@ -13,10 +13,6 @@ inputs:
    required: true
    type: string
    description: JSON description of what test configs to run.
-  job-name:
-    type: string
-    required: false
-    default: ""

 outputs:
  test-matrix:
@ -46,8 +42,7 @@ runs:
        retry_wait_seconds: 30
        command: |
          set -eux
-          # PyYAML 6.0 doesn't work with MacOS x86 anymore
-          python3 -m pip install requests==2.26.0 pyyaml==6.0.1
+          python3 -m pip install requests==2.26.0 pyyaml==6.0

    - name: Parse ref
      id: parse-ref
@ -61,7 +56,6 @@ runs:

    - name: Get the job name
      id: get-job-name
-      if: inputs.job-name == ''
      continue-on-error: true
      shell: bash
      run: |
@ -97,7 +91,7 @@ runs:
      shell: bash
      env:
        GITHUB_TOKEN: ${{ inputs.github-token }}
-        JOB_NAME: ${{ inputs.job-name == '' && steps.get-job-name.outputs.job-name || inputs.job-name }}
+        JOB_NAME: ${{ steps.get-job-name.outputs.job-name }}
        PR_NUMBER: ${{ github.event.pull_request.number }}
        TAG: ${{ steps.parse-ref.outputs.tag }}
        EVENT_NAME: ${{ github.event_name }}
--- a/.github/actions/get-workflow-job-id/action.yml
+++ b/.github/actions/get-workflow-job-id/action.yml
@ -11,20 +11,18 @@ outputs:
  job-id:
    description: The retrieved workflow job id
    value: ${{ steps.get-job-id.outputs.job-id }}
-  job-name:
-    description: The retrieved workflow job name
-    value: ${{ steps.get-job-id.outputs.job-name }}

 runs:
  using: composite
  steps:
-    - name: Get job id and name or fail
+    - name: Get jobid or fail
      # timeout-minutes is unsupported for composite workflows, see https://github.com/actions/runner/issues/1979
      # timeout-minutes: 10
      shell: bash
      id: get-job-id
      run: |
        set -eux
-        python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}"
+        GHA_WORKFLOW_JOB_ID=$(python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}")
+        echo "job-id=${GHA_WORKFLOW_JOB_ID}" >> "${GITHUB_OUTPUT}"
      env:
        GITHUB_TOKEN: ${{ inputs.github-token }}
--- a/.github/actions/pytest-cache-upload/action.yml
+++ b/.github/actions/pytest-cache-upload/action.yml
@ -10,13 +10,6 @@ inputs:
    description: Shard number for the current job
    required: false
    default: "0"
-  sha:
-    description: SHA for the commit
-    required: true
-  test_config:
-    description: Name of the test config
-    required: false
-    default: "default"
  job_identifier:
    description: Text that uniquely identifies a given job type within a workflow. All shards of a job should share the same job identifier.
    required: true
@ -40,8 +33,6 @@ runs:
      env:
        CACHE_DIR: ${{ inputs.cache_dir }}
        JOB_IDENTIFIER: ${{ inputs.job_identifier }}
-        SHA: ${{ inputs.sha }}
-        TEST_CONFIG: ${{ inputs.test_config }}
        SHARD: ${{ inputs.shard }}
        REPO: ${{ github.repository }}
      run: |
@ -50,8 +41,6 @@ runs:
          --cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \
          --pr_identifier $GITHUB_REF \
          --job_identifier $JOB_IDENTIFIER \
-          --sha $SHA \
-          --test_config $TEST_CONFIG \
          --shard $SHARD \
          --repo $REPO \
          --temp_dir $RUNNER_TEMP \
--- a/.github/actions/setup-xpu/action.yml
+++ b/.github/actions/setup-xpu/action.yml
@ -1,67 +0,0 @@
-name: Setup XPU host
-
-description: Set up XPU host for CI
-
-runs:
-  using: composite
-  steps:
-    - name: Clean all stopped docker containers
-      if: always()
-      shell: bash
-      run: |
-        # Prune all stopped containers.
-        # If other runner is pruning on this node, will skip.
-        nprune=$(ps -ef | grep -c "docker container prune")
-        if [[ $nprune -eq 1 ]]; then
-          docker container prune -f
-        fi
-
-    - name: Runner health check system info
-      if: always()
-      shell: bash
-      run: |
-        cat /etc/os-release || true
-        cat /etc/apt/sources.list.d/oneAPI.list || true
-        cat /etc/apt/sources.list.d/intel-gpu-jammy.list || true
-        whoami
-
-    - name: Runner health check xpu-smi
-      if: always()
-      shell: bash
-      run: |
-        xpu-smi discovery
-
-    - name: Runner health check GPU count
-      if: always()
-      shell: bash
-      run: |
-        ngpu=$(xpu-smi discovery | grep -c -E 'Device Name')
-        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
-        if [[ $ngpu -eq 0 ]]; then
-          echo "Error: Failed to detect any GPUs on the runner"
-          echo "$msg"
-          exit 1
-        fi
-
-    - name: Runner diskspace health check
-      uses: ./.github/actions/diskspace-cleanup
-      if: always()
-
-    - name: Runner health check disconnect on failure
-      if: ${{ failure() }}
-      shell: bash
-      run: |
-        killall runsvc.sh
-
-    - name: Preserve github env variables for use in docker
-      shell: bash
-      run: |
-        env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
-        env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
-
-    - name: XPU set GPU_FLAG
-      shell: bash
-      run: |
-        # Add render group for container creation.
-        render_gid=`cat /etc/group | grep render | cut -d: -f3`
-        echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"
--- a/.github/actions/teardown-xpu/action.yml
+++ b/.github/actions/teardown-xpu/action.yml
@ -1,20 +0,0 @@
-name: Teardown XPU host
-
-description: Tear down XPU host for CI
-
-runs:
-  using: composite
-  steps:
-    - name: Teardown XPU
-      if: always()
-      shell: bash
-      run: |
-        # Prune all stopped containers.
-        # If other runner is pruning on this node, will skip.
-        nprune=$(ps -ef | grep -c "docker container prune")
-        if [[ $nprune -eq 1 ]]; then
-          docker container prune -f
-        fi
-    - name: Runner diskspace health check
-      uses: ./.github/actions/diskspace-cleanup
-      if: always()
--- a/.github/actions/upload-test-artifacts/action.yml
+++ b/.github/actions/upload-test-artifacts/action.yml
@ -43,14 +43,14 @@ runs:
        FILE_SUFFIX: ${{ inputs.file-suffix }}
      run: |
        # Remove any previous test reports if they exist
-        rm -f logs-*.zip
+        rm -f usage-log-*.zip
        # this workflow is also run in bazel build test, but we dont generate usage reports for it
        # so check to see if the file exists first
        if [ -f 'usage_log.txt' ]; then
-            zip "logs-${FILE_SUFFIX}.zip" 'usage_log.txt'
+            zip "usage-log-${FILE_SUFFIX}.zip" 'usage_log.txt'
        fi
        if ls test/**/*.log 1> /dev/null 2>&1; then
-            zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log'
+            zip -r "usage-log-${FILE_SUFFIX}.zip" test -i '*.log'
        fi

    # Windows zip
@ -80,7 +80,7 @@ runs:
        FILE_SUFFIX: ${{ inputs.file-suffix }}
      run: |
        # -ir => recursive include all files in pattern
-        7z a "logs-$Env:FILE_SUFFIX.zip" 'usage_log.txt' -ir'!test\*.log'
+        7z a "usage-log-$Env:FILE_SUFFIX.zip" 'usage_log.txt' -ir'!test\*.log'

    # S3 upload
    - name: Store Test Downloaded JSONs on S3
@ -112,7 +112,7 @@ runs:
          ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
        retention-days: 14
        if-no-files-found: ignore
-        path: logs-*.zip
+        path: usage-log-*.zip

    # GHA upload
    - name: Store Test Downloaded JSONs on Github
@ -146,7 +146,7 @@ runs:
      continue-on-error: true
      with:
        # Add the run attempt, see [Artifact run attempt]
-        name: logs-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip
+        name: usage-log-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip
        retention-days: 14
        if-no-files-found: ignore
        path: |
--- a/.github/auto_request_review.yml
+++ b/.github/auto_request_review.yml
@ -12,6 +12,7 @@ reviewers:
    symbolic-shapes:
      - symbolic-shapes
      - antoniojkim
+      - wconstab
      - SherlockNoMad
    Chillee:
      - ezyang
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-e3efbc2d9094685dd2d4ae143853941f82f167af
+a8f4e97bd5356a7a77510cdf6a3a62e25a5dc602
--- a/.github/ci_commit_pins/fbgemm.txt
+++ b/.github/ci_commit_pins/fbgemm.txt
@ -1 +1 @@
-de731af65b4f04696e85c729e3282450b51b95fd
+1b2746f642cc2c99fe9d1a0c34359c0de45341c2
--- a/.github/ci_commit_pins/numpy_pytorch_interop.txt
+++ b/.github/ci_commit_pins/numpy_pytorch_interop.txt
@ -0,0 +1 @@
+0c4e82511d349358d2c8c492dd833334e742f27f
--- a/.github/ci_commit_pins/timm.txt
+++ b/.github/ci_commit_pins/timm.txt
@ -0,0 +1 @@
+b9d43c7dcac1fe05e851dd7be7187b108af593d2
--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .2.0
 .1.0