Compare commits


130 Commits

Author SHA1 Message Date
a8e7c98cb9 Revert "Require less alignment for attn bias (#114173) (#114837)"
This reverts commit 59656491f3b1da809312942872cce010337504b0.
2023-12-12 08:41:07 -08:00
448700d18e Fix NULL dereference in binary CPU ops (#115241)
* Fix NULL dereference in binary CPU ops (#115183)

Targeted fix for https://github.com/pytorch/pytorch/issues/113037

A more fundamental fix, in which those functions are not called at all for
empty tensors, is coming later
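
A hedged repro sketch of the failure mode described above; the exact op and shapes from issue #113037 may differ:
```python
import torch

# Empty operands reaching a binary CPU kernel; this class of call could
# previously dereference a NULL data pointer (op/shapes are illustrative).
a = torch.empty((0,))
b = torch.empty((0,))
print(torch.add(a, b))  # now returns an empty tensor instead of crashing
```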

Pull Request resolved: https://github.com/pytorch/pytorch/pull/115183
Approved by: https://github.com/drisspg, https://github.com/atalman, https://github.com/huydhn

* Fix build after conflict resolution

* Also include https://github.com/pytorch/pytorch/pull/113262 to pass the test

---------

Co-authored-by: Nikita Shulga <nshulga@meta.com>
2023-12-06 01:20:06 -08:00
59656491f3 Require less alignment for attn bias (#114173) (#114837)
Improved Fix for Attention Mask Alignment Issue (#112577)

This PR addresses Issue #112577 by refining the previously implemented fix, which was found to be incorrect and caused unneeded memory regressions. The update simplifies the approach to handling the alignment of the attention mask for mem-eff attention.

Alignment Check and Padding: Initially, the alignment of the attention mask is checked. If misalignment is detected, padding is applied, followed by slicing. During this process, a warning is raised to alert users.

Should this be warn_once?

We only call expand once, on the aligned mask.
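
A minimal sketch of the pad-then-slice idea, assuming an illustrative alignment constant (the real kernel requirement may differ):
```python
import torch
import torch.nn.functional as F

def align_attn_bias(attn_bias: torch.Tensor, align: int = 8) -> torch.Tensor:
    # Pad the last dim up to a multiple of `align`, then slice back: the view
    # keeps the original shape while its storage rows become aligned.
    last = attn_bias.size(-1)
    if last % align != 0:
        pad = align - last % align
        attn_bias = F.pad(attn_bias, (0, pad))[..., :last]
    return attn_bias
```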

Reference
https://github.com/facebookresearch/xformers/blob/main/xformers/ops/fmha/cutlass.py#L115

@albanD, @mruberry, @jbschlosser, @walterddr, and @mikaylagawarecki.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/114173
Approved by: https://github.com/danthe3rd
2023-12-05 14:50:58 -05:00
41210eaedc [MPS] Fix out-of-bounds fill to sliced tensor (#114958)
This fixes a regression introduced by https://github.com/pytorch/pytorch/pull/81951 that caused out-of-bounds access when a sliced tensor is filled with zeros

Remove bogus `TORCH_INTERNAL_ASSERT(length >= offset)` as [NSMakeRange](https://developer.apple.com/documentation/foundation/1417188-nsmakerange?language=objc) arguments are location and length rather than start and end offsets.

In `fill_mps_tensor_`:
- Pass `value` argument to `MPSStream::fill`
- Pass `self.nbytes()` rather than `self.storage().nbytes()` as the length of the buffer to fill, as the latter always results in an out-of-bounds write when the offset within the storage is non-zero

Add regression test
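
A hedged repro sketch of the sliced-fill case (assumes an MPS-capable machine):
```python
import torch

if torch.backends.mps.is_available():
    x = torch.ones(5, device="mps")
    x[2:].fill_(0)  # fill through a view with a nonzero storage offset
    print(x)        # expected tensor([1., 1., 0., 0., 0.], device='mps:0')
```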

Fixes https://github.com/pytorch/pytorch/issues/114692

Cherry pick of https://github.com/pytorch/pytorch/pull/114838 into release/2.1 branch

Co-authored-by: Nikita Shulga <nshulga@meta.com>
2023-12-01 10:58:57 -08:00
3183bcd417 Fix mkldnn_matmul error on AArch64 (#114851)
Fixes https://github.com/pytorch/pytorch/issues/110149

Cherry pick https://github.com/pytorch/pytorch/pull/110150. This is a bug fix against 2.1 release
2023-11-30 08:11:08 -08:00
b5a89bbc5f Fix broadcasting cosine_similarity (#114795)
* Fix broadcasting cosine_similarity (#109363)

Fixes https://github.com/pytorch/pytorch/issues/109333
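
A hedged sketch of the broadcasting behavior the fix restores:
```python
import torch
import torch.nn.functional as F

a = torch.randn(2, 1, 4)
b = torch.randn(1, 3, 4)
print(F.cosine_similarity(a, b, dim=-1).shape)  # torch.Size([2, 3])
```
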
Pull Request resolved: https://github.com/pytorch/pytorch/pull/109363
Approved by: https://github.com/peterbell10

* The PR incidentally fixes the test by switching from sizes to sym_sizes

test_make_fx_symbolic_exhaustive_masked_scatter_cpu_float32

---------

Co-authored-by: lezcano <lezcano-93@hotmail.com>
2023-11-30 00:23:40 -08:00
3f662b6255 Package pybind11/eigen/ (#113055) (#114756)
Which was added in the pybind11 2.11 release, see https://github.com/pybind/pybind11/tree/v2.11.0/include/pybind11/eigen

Fixes https://github.com/pytorch/pytorch/issues/112841

Cherry-pick of  https://github.com/pytorch/pytorch/pull/113055 into release/2.1 branch

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2023-11-29 07:04:29 -08:00
614af50378 [release only] Pin disabled-test-condensed and slow-tests json (#114514)
* [release only] Pin disabled-test-condensed json

* pin slow tests json
2023-11-27 13:30:27 -05:00
b3b22d7390 [BE] Handle errors in set_num_threads (#114420)
and `set_num_interop_threads`

Before this change, calling `torch.set_num_threads(2**65)` resulted in a segmentation fault; afterwards it becomes a good old runtime error:
```
% python -c "import torch;torch.set_num_threads(2**65)"
Traceback (most recent call last):
  File "<string>", line 1, in <module>
RuntimeError: Overflow when unpacking long
```

Similar to https://github.com/pytorch/pytorch/pull/60073

Cherry pick of https://github.com/pytorch/pytorch/pull/113684 into release/2.1

(cherry picked from commit 78f3937ee84e71475942598f4b51dce7c8a70783)
2023-11-23 14:04:26 -05:00
7405d70c30 [MPS] Fix crashes during Conv backward pass (#114419)
By adding weights tensor to the MPSGraph cache key.
Add regression test to validate that collision no longer happens

Fixes https://github.com/pytorch/pytorch/issues/112998

Cherry pick of https://github.com/pytorch/pytorch/pull/113398 into release/2.1

(cherry picked from commit 265d6aac0b71b917d6e36c5dd65c22f61644b715)
2023-11-23 14:02:46 -05:00
d62c757533 [Caffe2] Handle cpuinfo_initialize() failure (#114418)
It can fail on ARM platform if `/sys` folder is not accessible.
In that case, call `std::thread::hardware_concurrency()`, which is
aligned with the thread-pool initialization logic of `c10::TaskThreadPoolBase::defaultNumThreads()`

Further addresses issue raised in https://github.com/pytorch/pytorch/issues/113568
This is a cherry-pick of https://github.com/pytorch/pytorch/pull/114011 into release/2.1 branch

(cherry picked from commit 310e3060b7e4d0c76149aadad4519c7abed8c2a7)
2023-11-23 14:01:16 -05:00
7833889a44 Fix chrome trace entry format (#113763) (#114416)
Fix regression introduced by https://github.com/pytorch/pytorch/pull/107519

`'"args": {{}}}}, '` was part of a format string, where curly braces are doubled so that they print a single time, but the ruff change left the format string as-is
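
For illustration, doubled braces in a `str.format` template emit literal braces, so the template must keep all four trailing braces:
```python
# '{{' -> '{' and '}}' -> '}', so formatting yields the literal '"args": {}}, '
template = '"args": {{}}}}, '
print(template.format())  # "args": {}},
```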

Fixes https://github.com/pytorch/pytorch/issues/113756

Pull Request resolved: https://github.com/pytorch/pytorch/pull/113763
Approved by: https://github.com/Skylion007, https://github.com/aaronenyeshi

(cherry picked from commit e100ff42fd087d7a1696cb52c216507d45b8fb85)
2023-11-23 13:57:43 -05:00
4c55dc5035 remove _shard_tensor() call (#111687)
Co-authored-by: Andrey Talman <atalman@fb.com>
2023-11-08 07:49:29 -05:00
f58669bc5f c10::DriverAPI Try opening libcuda.so.1 (#113096)
As `libcuda.so` is only installed in dev environments (i.e., when the CUDA Toolkit is installed), while `libcuda.so.1` is part of the NVIDIA driver.
Also, this keeps it aligned with a5cb8f75a7/aten/src/ATen/cuda/detail/LazyNVRTC.cpp (L16)
Better errors in `c10::DriverAPI` on `dlopen`/`dlsym` failures
    
Cherry-pick of  following PR into release/2.1 branch
- Better errors in `c10::DriverAPI` on `dl` failure (#112995)
- `c10::DriverAPI` Try opening libcuda.so.1 (#112996)

(cherry picked from commit 3be0e1cd587ece8fa54a3a4da8ae68225b9cbb9b)
(cherry picked from commit d0a80f8af19625cbd0b3eb74a1970ac5b7c5439a)
2023-11-07 11:47:24 -08:00
33106b706e [DCP] Add test for planner option for load_sharded_optimizer_state_dict (#112930)
Add test for a user submitted PR: https://github.com/pytorch/pytorch/pull/112259
Cherry-pick of https://github.com/pytorch/pytorch/pull/112891 into `release/2.1` branch
2023-11-07 11:38:50 -08:00
4b4c012a60 Enable planner to be used for loading sharded optimizer state dict (#112520)
Cherry-pick [#112259](https://github.com/pytorch/pytorch/pull/112259)

Requested by MosaicML

Comments from users:
> without this, we can't do training resumption because the model gets loaded without the optimizer

---------------------------------------------------------------------------------------------------------------------
This creates a more consistent interface for saving and loading sharded state dicts. A planner is able to be specified when saving a sharded optimizer state dict, but there is currently no planner support for loading one. This change does not affect the default behavior of the function.
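
A hedged usage sketch, assuming the `torch.distributed.checkpoint` API of this release; `model_state_dict` and the checkpoint path are placeholders:
```python
import torch.distributed.checkpoint as dcp
from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict

# Assumes `model_state_dict` was already loaded from a sharded checkpoint.
optim_state = load_sharded_optimizer_state_dict(
    model_state_dict=model_state_dict,
    optimizer_key="optim",
    storage_reader=dcp.FileSystemReader("ckpt/"),
    planner=dcp.DefaultLoadPlanner(),  # the newly supported planner argument
)
```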

Co-authored-by: Brian <23239305+b-chu@users.noreply.github.com>
2023-11-07 11:35:20 -08:00
47ac50248a [DCP][test] Make dim_0 size of params scale with world_size in torch/distributed/checkpoint/test_fsdp_optim_state.py (#112825) (#112894)
Make dim_0 size of params scale with world_size so it can be used to test the impact on performance when scaling up. More context of performance improvement is added in: https://github.com/pytorch/pytorch/pull/111687

For this cherry-pick pair, we remove the `_shard_tensor()` call in `load_sharded_optimizer_state_dict()` in optimizer.py, which is reported to scale poorly with the number of GPUs. The reason is that `_shard_tensor()` calls into `dist.all_gather_object()`, which is extremely expensive in communication when world_size becomes large.

main: https://github.com/pytorch/pytorch/pull/111096
cherry-pick: https://github.com/pytorch/pytorch/pull/111687

Pull Request resolved: https://github.com/pytorch/pytorch/pull/112825
Approved by: https://github.com/fegin
2023-11-06 16:14:14 -05:00
dc96ecb8ac Fix mem eff bias bug (#112673) (#112796)
This fixes #112577
Pull Request resolved: https://github.com/pytorch/pytorch/pull/112673
Approved by: https://github.com/cpuhrsch
2023-11-03 16:28:25 -07:00
18a2ed1db1 Mirror of Xformers Fix (#112267) (#112795)
# Summary
See https://github.com/fairinternal/xformers/pull/850 for more details
Pull Request resolved: https://github.com/pytorch/pytorch/pull/112267
Approved by: https://github.com/cpuhrsch
2023-11-03 16:18:35 -07:00
b2e1277247 Fix the meta func for mem_eff_backward (#110893) (#112792)
Fixes #110832

Pull Request resolved: https://github.com/pytorch/pytorch/pull/110893
Approved by: https://github.com/eellison
2023-11-03 16:17:04 -07:00
b249946c40 [Release-only] Pin Docker images to 2.1 for release (#112665)
* [Release-only] Pin Docker images for release (v2)

This is to include https://github.com/pytorch/builder/pull/1575

* Use -2.1 tag

* Pin some more

* Update workflow

* Pin everything to 2.1
2023-11-03 00:36:57 -07:00
ee79fc8a35 Revert "Fix bug: not creating empty tensor with correct sizes and device. (#106734)" (#112170) (#112790)
This reverts commit 528a2c0aa97d152b8004254040076b8ae605bf9f.

The PR is wrong, see #110941.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/112170
Approved by: https://github.com/albanD

Co-authored-by: rzou <zou3519@gmail.com>
2023-11-02 17:29:01 -07:00
084343ee12 Fix buffer overflow in torch.sort (#112784)
By updating fbgemm submodule to `pytorch/release/2.1` branch, that
contains following two cherry-picks:
- 30f09a2646 (formatting)
-  70c6e83c29 (actual fix for the regression)

Add regression test for it (though it can probably be limited to just CPU, as the reproducer only works if num_threads is 1)
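
A hedged repro sketch based on the note above; shapes and dtype are illustrative, not the exact reproducer:
```python
import torch

torch.set_num_threads(1)  # the reported overflow only reproduced single-threaded
x = torch.randint(0, 2**15, (200_000,), dtype=torch.int16)
values, indices = torch.sort(x)
```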

Fixes https://github.com/pytorch/pytorch/issues/111189

Cherry-pick of  https://github.com/pytorch/pytorch/pull/111672 into release/2.1 branch, but with more targeted fbgemm updated

(cherry picked from commit 03da0694b7f414f8124d6a1e377e1a7484e0cfb6)
2023-11-02 17:05:31 -07:00
8a178f153e Fixed a memory leak in PyTorchFileReader. Added a test to prevent regressions. (#111814)
Fixes https://github.com/pytorch/pytorch/issues/111330.

This PR prevents PyTorchFileReader from leaking memory when initialized with an already opened file handle instead of a file name.

Cherry-pick of https://github.com/pytorch/pytorch/pull/111703 into release/2.1
2023-11-02 14:04:47 -07:00
2353915d69 Prevent OOB access in foreach_list variants (#112756)
By checking that list sizes are the same before computing forward gradients.

Before the change
```cpp
::std::vector<at::Tensor> _foreach_add_List(c10::DispatchKeySet ks, at::TensorList self, at::TensorList other, const at::Scalar & alpha) {
  auto self_ = unpack(self, "self", 0);
  auto other_ = unpack(other, "other", 1);
  [[maybe_unused]] auto _any_requires_grad = compute_requires_grad( self, other );

  std::vector<bool> _any_has_forward_grad_result(self.size());
  for (const auto& i : c10::irange(self.size())) {
    _any_has_forward_grad_result[i] = isFwGradDefined(self[i]) || isFwGradDefined(other[i]);
  }
  ...
```
after the change:
```cpp
::std::vector<at::Tensor> _foreach_add_List(c10::DispatchKeySet ks, at::TensorList self, at::TensorList other, const at::Scalar & alpha) {
    auto self_ = unpack(self, "self", 0);
    auto other_ = unpack(other, "other", 1);
    [[maybe_unused]] auto _any_requires_grad = compute_requires_grad( self, other );

    TORCH_CHECK(
        self.size() == other.size(),
          "Tensor lists must have the same number of tensors, got ",
        self.size(),
          " and ",
        other.size());
    std::vector<bool> _any_has_forward_grad_result(self.size());
    for (const auto& i : c10::irange(self.size())) {
      _any_has_forward_grad_result[i] = isFwGradDefined(self[i]) || isFwGradDefined(other[i]);
    }

```
Add regression test
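
A hedged repro of the kind of mismatched-length call the new `TORCH_CHECK` rejects:
```python
import torch

xs = [torch.rand(3) for _ in range(3)]
ys = [torch.rand(3) for _ in range(2)]  # one tensor short
torch._foreach_add(xs, ys)  # now raises a clear error instead of reading OOB
```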

Fixes https://github.com/pytorch/pytorch/issues/112305
Cherry-pick of  https://github.com/pytorch/pytorch/pull/112349 into `release/2.1` branch
Approved by: https://github.com/Chillee

(cherry picked from commit 80de49653a0d483eebf74c3aad1d4314a329aaee)
2023-11-02 12:49:00 -07:00
c1bc460377 [MPS] Fix mps to cpu copy with storage offset (#112432)
Fix https://github.com/pytorch/pytorch/issues/108978
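
A hedged repro sketch (assumes an MPS-capable machine):
```python
import torch

if torch.backends.mps.is_available():
    x = torch.arange(6, device="mps")
    print(x[3:].cpu())  # expected tensor([3, 4, 5]); the offset was lost before
```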

Cherry-pick of https://github.com/pytorch/pytorch/pull/109557 into release/2.1 branch 


(cherry picked from commit 00871189972e81a5fde230bc08137be14c59f178)

Co-authored-by: Li-Huai (Allan) Lin <qqaatw@gmail.com>
2023-10-30 13:46:50 -07:00
2dc37f4f70 [MPS] Skip virtualized devices (#111576) (#112265)
* check in (#111875)

check in impl

address comments, skip test on rocm

unused

* [MPS] Skip virtualized devices (#111576)

Skip devices that do not support `MTLGPUFamilyMac2`, for example something called an "Apple Paravirtual device", which started to appear in GitHub CI, from https://github.com/malfet/deleteme/actions/runs/6577012044/job/17867739464#step:3:18
```
Found device Apple Paravirtual device isLowPower false supports Metal false
```

As the first attempt to allocate memory on such a device will fail with:
```
RuntimeError: MPS backend out of memory (MPS allocated: 0 bytes, other allocations: 0 bytes, max allowed: 1.70 GB). Tried to allocate 0 bytes on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).
```

Fixes https://github.com/pytorch/pytorch/issues/111449

Pull Request resolved: https://github.com/pytorch/pytorch/pull/111576
Approved by: https://github.com/atalman, https://github.com/clee2000, https://github.com/huydhn

* Revert "check in (#111875)"

This reverts commit 2f502cc97fd9dd407dea9e1332724b18c2eb447f.

---------

Co-authored-by: eqy <eddiey@nvidia.com>
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2023-10-27 17:14:10 -07:00
f82d6e41a4 Update NCCL to 2.18.6 for upstream bugfix (#111677) 2023-10-27 13:24:02 -07:00
eqy
c79d2936d0 [NCCL][CUDA][CUDA Graphs] Flush enqueued work before starting a graph capture 2
2.1 release cherry-pick of #110665
2023-10-27 10:53:20 -07:00
ab5ea22c1d Revert "Do not materialize entire randperm in RandomSampler (#103339)" (#112187)
This reverts commit d80174e2db679365f8b58ff8583bdc4af5a8b74c.

Reverted https://github.com/pytorch/pytorch/pull/103339 on behalf of https://github.com/kit1980 due to Cause issues on MPS, and also fails without numpy ([comment](https://github.com/pytorch/pytorch/pull/103339#issuecomment-1781705172))

Co-authored-by: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
2023-10-26 15:03:42 -07:00
5274580eb0 Ignore beartype if its version is 0.16.0 (#111861)
ghstack-source-id: 234c0d4424891f8836b84cba634512d4e2add51a
Pull Request resolved: https://github.com/pytorch/pytorch/pull/111859
2023-10-26 13:25:46 -04:00
cd5859373c Fix #110680 (requires_grad typo in decomp) (#110687) (#111955)
Fixes https://github.com/pytorch/pytorch/issues/110680
Pull Request resolved: https://github.com/pytorch/pytorch/pull/110687
Approved by: https://github.com/voznesenskym, https://github.com/lezcano
ghstack dependencies: #110501, #110504, #110591, #110668
2023-10-26 10:13:40 -07:00
7cc6081f87 [release/2.1.1][dynamo] Fix circular import with einops (#111835)
* [release/2.1.1][dynamo] Add specialized variable tracker for sys.modules

Original PR: #110990

`sys.modules` is currently treated as a constant dictionary and any reference to
it will result in guards on the full contents of `sys.modules`. This instead
adds a specialized variable tracker which tries to guard only on the modules
referenced by the code. e.g.

```
sys.modules["operator"].add(x, x)
```

will generate the guard
```
___dict_contains('operator', G['sys'].modules)
```

It does this with special support for `__contains__` `__getitem__` and `.get`
which are probably the most commonly used with `sys.modules`. For anything else
we just fall back to building the dict tracker as normal.

While accessing `sys.modules` may seem unusual, it actually comes up when
inlining the `warnings.catch_warnings` context manager which internally accesses
`sys.modules["warnings"]`.

* [release/2.1.1][dynamo] Register einops functions lazily (#110575)

Original PR #110575
Fixes #110549

This fixes a circular import between dynamo and einops. We work around the issue
by registering an initialization callback that is called the first time an object
from einops is seen in dynamo.

This guarantees that dynamo will only import `einops` after it is already fully
initialized and has already been called in a function being traced.
2023-10-26 10:03:39 -07:00
af1590cdf4 Verify flatbuffer module fields are initialized (#112165)
Fixes #109793

Add validation on flatbuffer module field to prevent segfault

Cherry pick of  https://github.com/pytorch/pytorch/pull/109794 into release/2.1
Approved by: https://github.com/malfet

Co-authored-by: Daniil Kutz <kutz@ispras.ru>
2023-10-26 09:58:27 -07:00
3f59221062 Fix docker release build for release 2.1.1 (#112040) 2023-10-25 17:37:58 -04:00
ab5b9192ce [release only] Pin Docker images for release. (#111971)
* [release only] Pin all docker build and test images for the release

* lint
2023-10-25 15:49:11 -04:00
736ebd3313 Fix regression in torch.equal behavior for NaNs (#111699) (#111996)
`torch.equal(x, x)` should return False if `x` is a tensor of floats, one of which is NaN.
This renders some of the optimizations proposed in https://github.com/pytorch/pytorch/pull/100024 invalid, though as a result `torch.equal` becomes much slower for identical floating-point tensors.
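
The required semantics, for reference:
```python
import torch

x = torch.tensor([1.0, float("nan")])
print(torch.equal(x, x))  # False: NaN compares unequal, even to itself
```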

Add regression test that calls torch.equal for tensor containing NaN

Fixes https://github.com/pytorch/pytorch/issues/111251

Pull Request resolved: https://github.com/pytorch/pytorch/pull/111699
Approved by: https://github.com/Skylion007, https://github.com/albanD

(cherry picked from commit 7709382b5010fbcc15bb0ae26240ae06aa4e973d)
2023-10-25 15:33:29 -04:00
6ba919da27 Add continue-on-error if ssh step is failing (#111916) (#112026)
This is a debugging step and should not cause the whole workflow to fail. Hence, add continue-on-error, which prevents a job from failing when a step fails; set to true to allow a job to pass when this step fails.
Failure:
https://github.com/pytorch/pytorch/actions/runs/6627941257/job/18003997514?pr=111821

Example:
```
Run seemethere/add-github-ssh-key@v1
  with:
    GITHUB_TOKEN: ***
    activate-with-label: true
    label: with-ssh
    remove-existing-keys: true
  env:
    ALPINE_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine
    ANACONDA_USER: pytorch
    AWS_DEFAULT_REGION: us-east-1
    BUILD_ENVIRONMENT: windows-binary-conda
    GITHUB_TOKEN: ***
    PR_NUMBER:
    SHA1: e561cd9d253d840834d8bbef4ec98ad868ba01e4
    SKIP_ALL_TESTS: 1
    PYTORCH_ROOT: C:\actions-runner\_work\pytorch\pytorch/pytorch
    BUILDER_ROOT: C:\actions-runner\_work\pytorch\pytorch/builder
    PACKAGE_TYPE: conda
    DESIRED_CUDA: cu118
    GPU_ARCH_VERSION: 11.8
    GPU_ARCH_TYPE: cuda
    DESIRED_PYTHON: 3.9
ciflow reference detected, attempting to extract PR number
Error: The request could not be processed because too many files changed
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/111916
Approved by: https://github.com/malfet
2023-10-25 09:09:48 -07:00
cc54a5072e fix TEST_ROCM definition to disable test_jit_cudnn_extension on rocm (#110385) (#111942)
Define TEST_ROCM before modifying TEST_CUDA. Otherwise TEST_ROCM will always be False and will not disable test_jit_cudnn_extension for ROCm.
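
A minimal sketch of the ordering issue, with illustrative definitions (the real test-suite flags are computed differently):
```python
import torch

# TEST_ROCM must be derived before TEST_CUDA is narrowed, otherwise it is
# always False and the ROCm skip never fires.
TEST_CUDA = torch.cuda.is_available()
TEST_ROCM = TEST_CUDA and torch.version.hip is not None  # compute this first
TEST_CUDA = TEST_CUDA and torch.version.hip is None      # then redefine
```
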
Fixes https://github.com/pytorch/pytorch/issues/107182

Pull Request resolved: https://github.com/pytorch/pytorch/pull/110385
Approved by: https://github.com/jithunnair-amd, https://github.com/kit1980

Co-authored-by: Dmitry Nikolaev <dmitry.nikolaev@amd.com>
2023-10-24 13:20:14 -07:00
eqy
3788d86e3e [CUDA][cuFFT] Initialize CUDA context for cuFFT before execute is called (#111877)
update, add test

Do not run on ROCM

Update test/test_spectral_ops.py

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>

Update test/test_spectral_ops.py

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>

Update test/test_spectral_ops.py

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>

Update test_spectral_ops.py

Update SpectralOps.cpp
2023-10-24 11:20:54 -07:00
1f0450eed2 add sharded tensor test with empty shard (#111679) 2023-10-23 16:35:44 -04:00
9570baa150 [ONNX] Fix aten::new_zeros due to TorchScript behavior change on Pytorch 2.1 Fix #110935 (#110956) (#111694)
Fixes #110597

Summary:

* Generic code: `torch._C.Value.node().mustBeNone()` is encapsulated into the high-level API `JitScalarType.from_value`; `_is_none` was also extended to allow either `None` or `torch._C.Value.node().mustBeNone()`, so users don't manually call into the TorchScript API when implementing operators
* Specific to `new_zeros` (and `*_like` and `new_*` ops): when checking `dtype`, we must always use `_is_none`, per the fix proposed by #110935
Pull Request resolved: https://github.com/pytorch/pytorch/pull/110956
Approved by: https://github.com/justinchuby, https://github.com/BowenBao
2023-10-23 16:11:50 -04:00
b3b274ddcb Fix create source distribution step for release (#111697) (#111801)
This fixes the following failure in the release branch:
```
cp: cannot create directory '/tmp/pytorch-release/2.1': No such file or directory
```
Link: https://github.com/pytorch/pytorch/actions/runs/6591657669/job/17910724990

cp reports that error when the parent directory (pytorch-release in this case) does not exist.
This works in main since ``PT_RELEASE_NAME: pytorch-main``; however, for release it is ``PT_RELEASE_NAME: pytorch-release/2.1``

Test:
```
export tag_or_branch=release/2.1
tag_or_branch="${tag_or_branch//\//_}"
echo $tag_or_branch
release_2.1
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/111697
Approved by: https://github.com/huydhn, https://github.com/osalpekar
2023-10-23 10:39:33 -04:00
5bcfb1b9b4 [release only] Pin disabled and unstable jobs. keep CI green (#111675) 2023-10-20 14:59:55 -04:00
c496f9a40b Revert "Update fully_sharded_data_parallel to fix typing (#110545) (#111036)" (#111683)
This reverts commit ed87177528c01ed2c836e31a0ad7153e1f83c3a0.
2023-10-20 14:53:48 -04:00
39a66a66fe Fix concurrency limits for Create Release (#111597)
Also, don't run it on tags, but run on release branch and on `release` event.
Tweak linter to accept different concurrency limits for `create_release.yml`

Fixes https://github.com/pytorch/pytorch/issues/110569 as all the invocations of workflow in the past were cancelled by concurrently limit due to the tag push and release happening at roughly the same time, see https://github.com/pytorch/pytorch/actions/workflows/create_release.yml?query=event%3Arelease

Cherry-pick of https://github.com/pytorch/pytorch/pull/110759 into release/2.1 branch

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2023-10-19 13:54:29 -07:00
ed87177528 Update fully_sharded_data_parallel to fix typing (#110545) (#111036)
Fixes typing so that linter does not complain when using CustomPolicy.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/110545
Approved by: https://github.com/awgu, https://github.com/Skylion007

Co-authored-by: Mihir Patel <mihir.v.patel7@gmail.com>
2023-10-19 15:48:54 -04:00
c07240e5e4 Add pypi required metadata to all wheels except linux (#111578)
* Add pypi required metadata to all wheels except linux (#111042)

Will fix https://github.com/pytorch/pytorch/issues/100974 after publishing.
Poetry install requires all wheels on PyPI to have the same metadata; hence Linux dependencies are included in all non-Linux wheels.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/111042
Approved by: https://github.com/malfet

* Regenerate workflows
2023-10-19 14:40:28 -04:00
bb96803a35 Improved the docs for torch.std, torch.var, torch.std_mean, torch.var_mean and torch.cov (#109326) (#110969)
Fixes #109186.

This PR updates the docs for
- `torch.var`
- `torch.var_mean`
- `torch.std`
- `torch.std_mean`
- `torch.cov`

to reflect the actual implementation behavior when `correction >= N`. The math for `torch.cov` should probably be double checked before merging.
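
An example of the documented edge case:
```python
import torch

# With the default correction=1 and a single element (N=1), the divisor
# N - correction is zero, so the result is NaN rather than an error.
print(torch.var(torch.tensor([3.0])))                # tensor(nan)
print(torch.std(torch.tensor([3.0]), correction=0))  # tensor(0.)
```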

Pull Request resolved: https://github.com/pytorch/pytorch/pull/109326
Approved by: https://github.com/albanD
2023-10-19 10:34:59 -04:00
3002bf71e6 Update chunk_sharding_spec.py (#108915) (#111151)
Fixes #108869

Implements the first solution proposed in the issue.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108915
Approved by: https://github.com/wanchaol, https://github.com/wz337

Co-authored-by: Brian <23239305+b-chu@users.noreply.github.com>
2023-10-19 10:28:53 -04:00
e7892b2e02 [FSDP] continue if param not exist in sharded load (#109116) (#111149)
If I add a param and then wrap with FSDP + load state dict, don't hard error
here when strict=False.

Differential Revision: [D49170812](https://our.internmc.facebook.com/intern/diff/D49170812/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/109116
Approved by: https://github.com/fegin

Co-authored-by: Rohan Varma <rvarm1@fb.com>
2023-10-19 10:28:30 -04:00
909fcf9b21 Try to use linux.arm64.2xlarge runners (#107672) (#111039)
Try to use linux.arm64.2xlarge runners.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/107672
Approved by: https://github.com/atalman

Co-authored-by: DanilBaibak <baibak@meta.com>
2023-10-12 13:39:42 -04:00
0bc598a604 Fix Android publish step with lite interpreter (#111071) (#111083)
This file needs to be added to the list like others.  The publish command `BUILD_LITE_INTERPRETER=1 android/gradlew -p android publish` finishes successfully with this and files are available on Nexus:

![Screenshot 2023-10-11 at 11 56 53](https://github.com/pytorch/pytorch/assets/475357/849d4aa7-79f6-47fa-a471-d452d7c1bdf6)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/111071
Approved by: https://github.com/atalman
2023-10-11 16:33:15 -07:00
dd7fb44d20 [Release-only] Set Android version to 2.1.0 (#111009) 2023-10-11 09:54:22 -07:00
6026c29db0 Add a workflow to release Android binaries (#110976) (#110655)
This adds 2 jobs to build PyTorch Android with and without lite interpreter:

* Keep the list of currently supported ABIs: armeabi-v7a, arm64-v8a, x86, x86_64
* Passed all the tests on the emulator
* Ran the test app on an emulator and on my Android phone (`arm64-v8a`) without any issue
![Screenshot_20231010-114453](https://github.com/pytorch/pytorch/assets/475357/57e12188-1675-44d2-a259-9f9577578590)
* Run on AWS https://us-west-2.console.aws.amazon.com/devicefarm/home#/mobile/projects/b531574a-fb82-40ae-b687-8f0b81341ae0/runs/5fce6818-628a-4099-9aab-23e91a212076
Pull Request resolved: https://github.com/pytorch/pytorch/pull/110976
Approved by: https://github.com/atalman
2023-10-11 09:53:31 -07:00
0f9ac00ac6 [Release-only] Pin test-infra checkout branch (#111041)
Lint jobs have passed
2023-10-11 09:51:07 -07:00
209f2fa8ff Move Docker official builds to Cuda 12.1.1 (#110703) (#110705)
Since our PyPI released CUDA version is 12.1.1, moving the Docker builds to 12.1.1. Related to: https://github.com/pytorch/pytorch/issues/110643
Pull Request resolved: https://github.com/pytorch/pytorch/pull/110703
Approved by: https://github.com/DanilBaibak
2023-10-06 10:27:01 -04:00
fa1db4310d [release only] Docker build for release - trigger manually from pytorch channel (#110566) 2023-10-04 18:54:58 -04:00
e6702486f6 [release only] Docker build for release - trigger manually from pytorch channel (#110556) 2023-10-04 17:47:39 -04:00
e68aa76642 [release only] Docker build for release - trigger manually from pytorch channel (#110553) 2023-10-04 16:52:14 -04:00
88cde0c37c [release only] Docker build for release - trigger manually from pytorch channel (#110547)
* [release only] Docker build for release - trigger manually from pytorch channel

* remove_typo
2023-10-04 16:33:45 -04:00
e4c42a93bc [release only] Docker build for release - trigger manually from pytorch channel (#110309)
* Use release channel for docker release

* fix input
2023-10-04 15:33:54 -04:00
7bcf7da3a2 Add tensorboard to pip requirements (#109349) (#109823)
https://github.com/pytorch/pytorch/pull/108351/files is failing on mac and windows because we don't have the dependency.
It is available on Linux because it is included in .ci/docker/requirements-docs.txt

Adding skips to make it green.

Here are some outputs for future debugging
https://github.com/pytorch/pytorch/actions/runs/6192933622/job/16813841625
https://ossci-raw-job-status.s3.amazonaws.com/log/16813841625
```

2023-09-15T02:09:43.2397460Z =================================== FAILURES ===================================
2023-09-15T02:09:43.2397650Z ______________________ TestTensorBoardSummary.test_audio _______________________
2023-09-15T02:09:43.2397830Z Traceback (most recent call last):
2023-09-15T02:09:43.2398090Z   File "/Users/ec2-user/runner/_work/pytorch/pytorch/test/test_tensorboard.py", line 417, in test_audio
2023-09-15T02:09:43.2398390Z     self.assertTrue(compare_proto(summary.audio('dummy', tensor_N(shape=(42,))), self))
2023-09-15T02:09:43.2398720Z   File "/Users/ec2-user/runner/_work/_temp/conda_environment_6192933622/lib/python3.9/unittest/case.py", line 688, in assertTrue
2023-09-15T02:09:43.2399100Z ##[endgroup]
2023-09-15T02:09:43.2399240Z     raise self.failureException(msg)
2023-09-15T02:09:43.2399400Z AssertionError: False is not true
2023-09-15T02:09:43.2399490Z
2023-09-15T02:09:43.2399590Z To execute this test, run the following from the base repo dir:
2023-09-15T02:09:43.2399820Z      python test/test_tensorboard.py -k test_audio
2023-09-15T02:09:43.2399930Z
```

https://github.com/pytorch/pytorch/actions/runs/6192933622/job/16814065258
https://ossci-raw-job-status.s3.amazonaws.com/log/16814065258
```

2023-09-15T02:38:44.6284979Z ================================== FAILURES ===================================
2023-09-15T02:38:44.6285295Z ______________________ TestTensorBoardNumpy.test_scalar _______________________
2023-09-15T02:38:44.6285556Z Traceback (most recent call last):
2023-09-15T02:38:44.6285915Z   File "C:\actions-runner\_work\pytorch\pytorch\test\test_tensorboard.py", line 794, in test_scalar
2023-09-15T02:38:44.6286325Z     res = make_np(np.float128(1.00008 + 9))
2023-09-15T02:38:44.6286705Z   File "C:\Jenkins\Miniconda3\lib\site-packages\numpy\__init__.py", line 315, in __getattr__
2023-09-15T02:38:44.6287700Z     raise AttributeError("module {!r} has no attribute "
2023-09-15T02:38:44.6288060Z AttributeError: module 'numpy' has no attribute 'float128'
2023-09-15T02:38:44.6288241Z
2023-09-15T02:38:44.6288390Z To execute this test, run the following from the base repo dir:
2023-09-15T02:38:44.6288679Z      python test\test_tensorboard.py -k test_scalar
2023-09-15T02:38:44.6288846Z
```

https://github.com/pytorch/pytorch/actions/runs/6193449301/job/16815113985
https://ossci-raw-job-status.s3.amazonaws.com/log/16815113985
```
2023-09-15T03:25:53.7797550Z =================================== FAILURES ===================================
2023-09-15T03:25:53.7797790Z __________________ TestTensorBoardSummary.test_histogram_auto __________________
2023-09-15T03:25:53.7798000Z Traceback (most recent call last):
2023-09-15T03:25:53.7798310Z   File "/Users/ec2-user/runner/_work/pytorch/pytorch/test/test_tensorboard.py", line 426, in test_histogram_auto
2023-09-15T03:25:53.7798690Z     self.assertTrue(compare_proto(summary.histogram('dummy', tensor_N(shape=(1024,)), bins='auto', max_bins=5), self))
2023-09-15T03:25:53.7799090Z   File "/Users/ec2-user/runner/_work/_temp/conda_environment_6193449301/lib/python3.9/unittest/case.py", line 688, in assertTrue
2023-09-15T03:25:53.7799430Z     raise self.failureException(msg)
2023-09-15T03:25:53.7799610Z AssertionError: False is not true
2023-09-15T03:25:53.7799720Z
2023-09-15T03:25:53.7799840Z To execute this test, run the following from the base repo dir:
2023-09-15T03:25:53.7800170Z      python test/test_tensorboard.py -k test_histogram_auto
2023-09-15T03:25:53.7800310Z
2023-09-15T03:25:53.7800430Z This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
2023-09-15T03:25:53.7800870Z - generated xml file: /Users/ec2-user/runner/_work/pytorch/pytorch/test/test-reports/python-pytest/test_tensorboard/test_tensorboard-aef95b5e2d69c061.xml -
2023-09-15T03:25:53.7801200Z =========================== short test summary info ============================
```

https://github.com/pytorch/pytorch/actions/runs/6193576371/job/16815396352
https://ossci-raw-job-status.s3.amazonaws.com/log/16815396352
```
2023-09-15T03:47:02.9430070Z _________________ TestTensorBoardSummary.test_histogram_doane __________________
2023-09-15T03:47:02.9430250Z Traceback (most recent call last):
2023-09-15T03:47:02.9430520Z   File "/Users/ec2-user/runner/_work/pytorch/pytorch/test/test_tensorboard.py", line 433, in test_histogram_doane
2023-09-15T03:47:02.9430850Z     self.assertTrue(compare_proto(summary.histogram('dummy', tensor_N(shape=(1024,)), bins='doane', max_bins=5), self))
2023-09-15T03:47:02.9431180Z   File "/Users/ec2-user/runner/_work/_temp/conda_environment_6193576371/lib/python3.9/unittest/case.py", line 688, in assertTrue
2023-09-15T03:47:02.9431390Z     raise self.failureException(msg)
2023-09-15T03:47:02.9431550Z AssertionError: False is not true
2023-09-15T03:47:02.9431640Z
2023-09-15T03:47:02.9431730Z To execute this test, run the following from the base repo dir:
2023-09-15T03:47:02.9432000Z      python test/test_tensorboard.py -k test_histogram_doane
2023-09-15T03:47:02.9432120Z
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/109349
Approved by: https://github.com/huydhn

(cherry picked from commit 1cc0921eb62089392595264610cfa863d42fe9bc)

Co-authored-by: Catherine Lee <csl@fb.com>
2023-09-21 16:25:09 -06:00
1841d54370 [CI] Add torch.compile works without numpy test (#109624) (#109818)
Fixes https://github.com/pytorch/pytorch/issues/109387

Pull Request resolved: https://github.com/pytorch/pytorch/pull/109624
Approved by: https://github.com/albanD

Co-authored-by: Nikita Shulga <nshulga@meta.com>
2023-09-21 15:27:52 -06:00
fca42334be Fix the parameter error in test_device_mesh.py (#108758) (#109826)
Fix the parameter error in test_device_mesh.py

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108758
Approved by: https://github.com/awgu

(cherry picked from commit 03bf745e1d6a050a9e322d41c2b7e75db8cdbedc)

Co-authored-by: humingxue <humingxue1@huawei.com>
2023-09-21 15:13:09 -06:00
539a971161 [Release-2.1]Add finfo properties for float8 dtypes (#109808)
Add float8 finfo checks to `test_type_info.py`
Fixes https://github.com/pytorch/pytorch/issues/109737
Cherry-pick of https://github.com/pytorch/pytorch/pull/109744 into release/2.1 branch
Approved by: https://github.com/drisspg

(cherry picked from commit cddd0db241a3b8df930284fd29523da9d28b1f2c)
2023-09-21 11:51:09 -07:00
9287a0cf59 [Release/2.1][JIT] Fix typed enum handling in 3.11 (#109807)
In Python-3.11+ typed enums (such as `enum.IntEnum`) retain `__new__`,`__str__` and so on method of the base class via `__init__subclass__()` method (see https://docs.python.org/3/whatsnew/3.11.html#enum ), i.e. following code
```python
import sys
import inspect
from enum import Enum

class IntColor(int, Enum):
    RED = 1
    GREEN = 2

class Color(Enum):
    RED = 1
    GREEN = 2

def get_methods(cls):
    def predicate(m):
        if not inspect.isfunction(m) and not inspect.ismethod(m):
            return False
        return m.__name__ in cls.__dict__
    return inspect.getmembers(cls, predicate=predicate)

if __name__ == "__main__":
    print(sys.version)
    print(f"IntColor methods {get_methods(IntColor)}")
    print(f"Color methods {get_methods(Color)}")
```

Returns an empty list for both cases on older Python, but on Python-3.11+ it returns a list containing enum constructors and other methods:
```shell
% conda run -n py310 python bar.py
3.10.12 | packaged by conda-forge | (main, Jun 23 2023, 22:41:52) [Clang 15.0.7 ]
IntColor methods []
Color methods []
% conda run -n py311 python bar.py
3.11.0 | packaged by conda-forge | (main, Oct 25 2022, 06:21:25) [Clang 14.0.4 ]
IntColor methods [('__format__', <function Enum.__format__ at 0x105006ac0>), ('__new__', <function Enum.__new__ at 0x105006660>), ('__repr__', <function Enum.__repr__ at 0x1050068e0>)]
Color methods []
```

This change allows typed enums to be scriptable on 3.11, by explicitly marking several `enum.Enum` methods to be dropped by jit script, and adds a test that typed enums are jit-scriptable.

Fixes https://github.com/pytorch/pytorch/issues/108933

Cherry-pick of https://github.com/pytorch/pytorch/pull/109717 into release/2.1 branch.
Approved by: https://github.com/atalman, https://github.com/davidberard98

(cherry picked from commit 55685d57c004f250118fcccc4e99ae883e037e2d)
2023-09-21 11:49:54 -07:00
c464075d5d [release only] Docker build - Setup release specific variables (#109809) 2023-09-21 12:24:45 -06:00
1b4161c686 [Release/2.1] [Docs] Fix compiler.list_backends invocation (#109800)
s/torch.compile.list_backends/torch.compiler.list_backends/

Fixes https://github.com/pytorch/pytorch/issues/109451

Cherry-pick of  https://github.com/pytorch/pytorch/pull/109568 into release/2.1 branch
Approved by: https://github.com/msaroufim, https://github.com/svekars

(cherry picked from commit af867c2d140c2ab071447219738e59df4ac927b9)
2023-09-21 10:54:37 -07:00
28220534de [Release/2.1] [Docs] Fix typo in torch.unflatten (#109801)
Fixes https://github.com/pytorch/pytorch/issues/109559

Cherry-pick of https://github.com/pytorch/pytorch/pull/109588 into release/2.1 branch
Approved by: https://github.com/lezcano

(cherry picked from commit 2f53bca0fc84d1829cc571d76c58ea411f4fc288)
2023-09-21 10:48:05 -07:00
da9639c752 Remove torchtext from Build Official Docker images (#109799) (#109803)
Fixes nightly official Docker image build.
Failures: https://hud.pytorch.org/hud/pytorch/pytorch/nightly/1?per_page=50&name_filter=Build%20Official

Remove `torchtext` installation from `Dockerfile` for arm64. This fixes the arm64 build of the PyTorch Docker image.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/109799
Approved by: https://github.com/seemethere
2023-09-21 11:31:22 -06:00
e534243ec2 Add docs for torch.compile(numpy) (#109789)
ghstack-source-id: 3e29b38d0bc574ab5f35eee34ebb37fa6238de7e
Pull Request resolved: https://github.com/pytorch/pytorch/pull/109710
2023-09-21 10:31:36 -06:00
01fa8c140a Update dynamic shapes documentation (#109787)
Signed-off-by: Edward Z. Yang <ezyang@meta.com>

ghstack-source-id: 6da57e6a83233b9404734279df3883aeeb23feb7
Pull Request resolved: https://github.com/pytorch/pytorch/pull/109764
2023-09-21 09:16:04 -06:00
5aae979614 [release-2.1] Make numpy dependency optional for torch.compile (#109608)
Cherry-pick of 4ee179c952, a9bf1031d4, and fb58a72d96 into release/2.1 branch

Test plan: `python3 -c "import torch;torch.compile(lambda x:print(x))('Hello World')"`

Fixes #109387 to the release branch

* Fix `ConstantVariable` init method if NumPy is missing

By adding `np is not None` check before `isinstance(value, np.number)`

Partially addresses https://github.com/pytorch/pytorch/issues/109387

* [BE] Do not use `numpy` in `torch._inductor.codegen.cpp` (#109324)

`s/numpy.iinfo(numpy.int32)/torch.iinfo(torch.int32)/` as those two are interchangeable

Partially addresses https://github.com/pytorch/pytorch/issues/109387

Pull Request resolved: https://github.com/pytorch/pytorch/pull/109324
Approved by: https://github.com/albanD

* Use `torch.cumsum` instead of numpy one (#109400)

`s/list(numpy.cumsum(foo))/torch.cumsum(torch.tensor(foo), 0).tolist()/`
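
For illustration, the replacement is a drop-in:
```python
import torch

foo = [1, 2, 3]
# numpy-free equivalent of list(numpy.cumsum(foo))
print(torch.cumsum(torch.tensor(foo), 0).tolist())  # [1, 3, 6]
```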

Test plan: ` python3 ../test/inductor/test_split_cat_fx_passes.py -v`

Partially addresses https://github.com/pytorch/pytorch/issues/109387

Pull Request resolved: https://github.com/pytorch/pytorch/pull/109400
Approved by: https://github.com/ezyang

---------

Co-authored-by: Nikita Shulga <nshulga@meta.com>
2023-09-19 10:59:18 -07:00
ced78cc2a7 [fx][split] Copy node metadata for placeholders (#107981) (#109297)
- Follow-up to #107248 which copies metadata for placeholder nodes in the top-level FX graph
- Currently, top-level placeholders do not have their metadata copied over, causing loss of `TensorMetadata` in some `torch.compile` backends

Fixes https://github.com/pytorch/TensorRT/issues/2258
Pull Request resolved: https://github.com/pytorch/pytorch/pull/107981
Approved by: https://github.com/angelayi

Co-authored-by: gs-olive <113141689+gs-olive@users.noreply.github.com>
2023-09-14 14:03:48 -04:00
d8db5808ce Fix CUDA-12 wheel loading on AmazonLinux (#109291)
Or any other distro that has different purelib and platlib paths. The regression was introduced when the small-wheel base dependency was migrated from CUDA-11 to CUDA-12.

Not sure why, but the minor version of the package is no longer shipped with the following CUDA-12 packages:
 - nvidia_cuda_nvrtc_cu12-12.1.105
 - nvidia-cuda-cupti-cu12-12.1.105
 - nvidia-cuda-cupti-cu12-12.1.105

But those were present in the CUDA-11 release, e.g.:
``` shell
bash-5.2# curl -OL 922c5996aa/nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl; unzip -t nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl |grep \.so
    testing: nvidia/cuda_nvrtc/lib/libnvrtc-builtins.so.11.7   OK
    testing: nvidia/cuda_nvrtc/lib/libnvrtc.so.11.2   OK
bash-5.2# curl -OL c64c03f49d/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl; unzip -t nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl|grep \.so
    testing: nvidia/cuda_nvrtc/lib/libnvrtc-builtins.so.12.1   OK
    testing: nvidia/cuda_nvrtc/lib/libnvrtc.so.12   OK
```

Fixes https://github.com/pytorch/pytorch/issues/109221

This is a cherry-pick of  https://github.com/pytorch/pytorch/pull/109244 into release/2.1 branch
2023-09-14 07:16:17 -07:00
889811ab5b [ONNX] bump submodule to onnx==1.14.1 (#108895) (#109114)
Bump the pip and submodule ONNX dependencies to official stable 1.14.1; there were no code changes between 1.14.1rc2 and 1.14.1.

Also bump ORT to run tests against ort-nightly==1.16.0.dev20230908001.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108895
Approved by: https://github.com/justinchuby, https://github.com/thiagocrepaldi

Co-authored-by: Aaron Bockover <abock@microsoft.com>
2023-09-12 12:09:59 -04:00
1191449343 Prerequisite of ATen/native/utils header for C++ extension (#109013) (#109106)
# Motivation
Without this PR, if we would like to include a header file like ```#include <ATen/native/ForeachUtils.h>``` in our C++ extension, it will raise an error ```/home/xxx/torch/include/ATen/native/ForeachUtils.h:7:10: fatal error: 'ATen/native/utils/ParamsHash.h' file not found```. We should fix it.

# Solution
Add the ATen/native/utils header file in the build.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/109013
Approved by: https://github.com/ezyang

Co-authored-by: Yu, Guangye <guangye.yu@intel.com>
2023-09-12 11:37:29 -04:00
6d9fad8474 [ONNX] Bump onnx submodule to 1.14.1; ONNX Runtime 1.16 (#106984) (#109045)
Bump dependencies:

- ort-nightly 1.16.0.dev20230824005
- onnx 1.14.1rc2
- onnxscript 0.1.0.dev20230825
Pull Request resolved: https://github.com/pytorch/pytorch/pull/106984
Approved by: https://github.com/BowenBao, https://github.com/thiagocrepaldi

Co-authored-by: Aaron Bockover <abock@microsoft.com>
2023-09-12 07:38:07 -04:00
ed62318bea [export] Fix export arg type declaration (#109060) (#109064)
Summary: It's an arbitrary-length tuple of anything. Tuple[Any] means a 1-element tuple.

Test Plan: ci

Differential Revision: D49161625

Pull Request resolved: https://github.com/pytorch/pytorch/pull/109060
Approved by: https://github.com/angelayi

Co-authored-by: Jacob Szwejbka <jakeszwe@fb.com>
2023-09-11 17:57:36 -07:00
ee67c4dd6a Refactor ios-build-test workflow to support binary release (#108322) (#109069)
This refactors the logic from CircleCI iOS [build](https://github.com/pytorch/pytorch/blob/main/.circleci/config.yml#L1323-L1344) and [upload](https://github.com/pytorch/pytorch/blob/main/.circleci/config.yml#L1369-L1377) jobs to GHA.

* Nightly artifacts will be available again on `ossci-ios-build` S3 bucket, for example `libtorch_lite_ios_nightly_2.1.0.20230517.zip`.  The last one there was s3://ossci-ios-build/libtorch_lite_ios_nightly_2.1.0.20230517.zip from May 17th
  * [LibTorch-Lite-Nightly](https://github.com/CocoaPods/Specs/blob/master/Specs/c/3/1/LibTorch-Lite-Nightly/1.14.0.20221109/LibTorch-Lite-Nightly.podspec.json) on cocoapods
* Release artifacts will be on `ossci-ios` S3 bucket, for example `s3://ossci-ios/libtorch_lite_ios_1.13.0.zip` from Nov 3rd 2022
  * [LibTorch-Lite](https://github.com/CocoaPods/Specs/blob/master/Specs/c/c/3/LibTorch-Lite/1.13.0.1/LibTorch-Lite.podspec.json) on cocoapods
  * [LibTorch](https://github.com/CocoaPods/Specs/blob/master/Specs/1/3/c/LibTorch/1.13.0.1/LibTorch.podspec.json) on cocoapods

I will clean up Circle CI code in another PR.

### Testing

Generate new release artifacts for testing from the main branch. Simulator testing has all passed.

* With lite interpreter https://github.com/pytorch/pytorch/actions/runs/6093860118
  * https://ossci-ios.s3.amazonaws.com/libtorch_lite_ios_2.1.0.zip
  * https://ossci-ios.s3.amazonaws.com/LibTorch-Lite-2.1.0.podspec

* LibTorch binary can be built without lite interpreter https://github.com/pytorch/pytorch/actions/runs/6103616035 and uses TorchScript, but it has been long dead from my understanding.  The binary can still be built and tested though.
  * https://ossci-ios.s3.amazonaws.com/libtorch_ios_2.1.0.zip
  * https://ossci-ios.s3.amazonaws.com/LibTorch-2.1.0.podspec

### Next step for release

* Once the PR is committed, I plan to use the workflow dispatch to build the binaries manually on the `release/2.1` branch. Once they look good, we can publish them on CocoaPods.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108322
Approved by: https://github.com/atalman
2023-09-11 17:10:22 -07:00
5529b81631 Add torch_lazy_enable_device_data_cache to disable lazy device data cache (#109051)
* Add logic to enable and disable the lazy device tensor cache without modifying it

* Remove as yet unused compilation cache enable/disable global

* Lint fixes
2023-09-11 18:25:05 -04:00
7e23b4907d [quant][pt2] Fix and rename move_model_to_eval (#108891) (#109027)
Summary:
This commit fixes two silent correctness problems with
the current implementation of `move_model_to_eval`:

(1) Previously the user had to manually call `eliminate_dead_code`
before calling `move_model_to_eval`, otherwise the dropout pattern
won't actually get eliminated. This is because subgraph rewriter
complains the match is not self-contained, and so silently does
not do the replacement.

(2) We wish to error when the user calls `model.train()` or
`model.eval()` on an exported model. This error is raised
correctly immediately after export today, but no longer raised
after the user calls prepare or convert.

We fix (1) by moving the `eliminate_dead_code` call into
`move_model_to_eval`, and fix (2) by ensuring the respective
errors are thrown after prepare and convert as well.

Additionally, this commit renames `move_model_to_eval` to
`move_exported_model_to_eval` to be more explicit.

bypass-github-export-checks

Test Plan:
python test/test_quantization.py TestQuantizePT2E.test_disallow_eval_train
python test/test_quantization.py TestQuantizePT2E.test_move_exported_model_to_eval

Imported from OSS

Differential Revision: D49097293

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108891
Approved by: https://github.com/jerryzh168
2023-09-11 18:14:49 -04:00
71c9d5c3a6 Refactor torch.onnx documentation (#109026)
* Refactor torch.onnx documentation (#108379)

* Distinguish both TorchScript-based exporter (`torch.onnx.export`) and the TorchDynamo-based exporter (`torch.onnx.dynamo_export`) exporters
* Merge ONNX diagnostics page with the exporter page
* Add initial version of a quick overview on the new exporter
* Updates `torch.compiler.html` with the right page for the ONNX Runtime backend for `torch.compile`
* Renamed doc files to clearly identify files belonging to the legacy and newer onnx exporters

Fixes #108274

https://docs-preview.pytorch.org/pytorch/pytorch/108379/index.html
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108379
Approved by: https://github.com/justinchuby, https://github.com/wschin, https://github.com/malfet

* Follow-up #108379 (#108905)

Fixes #108379

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108905
Approved by: https://github.com/abock
2023-09-11 14:27:26 -07:00
91e414957b fix documentation typo (#109054) 2023-09-11 17:04:48 -04:00
ce3ed7f293 [docs] Properly link register_post_accumulate_grad_hook docs (#108157) (#109047)
it shows up now

![image](https://github.com/pytorch/pytorch/assets/31798555/0aa86839-b9c5-4b4b-b1b1-aa1c0c0abbab)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108157
Approved by: https://github.com/soulitzer, https://github.com/albanD
2023-09-11 17:03:39 -04:00
bd372d460b [ONNX] Add initial support for FP8 ONNX export (#107962) (#108939)
This PR resurrects @tcherckez-nvidia's #106379 with changes to resolve conflicts against newer `main` and defines our own constants for the new ONNX types to [avoid breaking Meta's internal usage of an old ONNX](https://github.com/pytorch/pytorch/pull/106379#issuecomment-1675189340).

- `::torch::onnx::TensorProto_DataType_FLOAT8E4M3FN=17`
- `::torch::onnx::TensorProto_DataType_FLOAT8E5M2=19`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/107962
Approved by: https://github.com/justinchuby, https://github.com/titaiwangms

Co-authored-by: Aaron Bockover <abock@microsoft.com>
2023-09-11 14:58:24 -04:00
12b8c26f35 [export] torch.export landing page (#108783) (#108962)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108783
Approved by: https://github.com/avikchaudhuri, https://github.com/gmagogsfm
2023-09-11 10:13:24 -04:00
7397cf324c Don't fastpath conj copy when conj/neg bit mismatch (#108881) (#108961)
Fixes https://github.com/pytorch/pytorch/issues/106051

Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108881
Approved by: https://github.com/soulitzer
2023-09-11 10:06:42 -04:00
fa8259db8d Revert and reland fix clang-tidy warnings in torch/csrc (#108825)
* Revert "[1/N] fix clang-tidy warnings in torch/csrc (#107648)"

This reverts commit 49eeca00d1e76dd0158758f2c29da6b1d06bf54a.

Reverted https://github.com/pytorch/pytorch/pull/107648 on behalf of https://github.com/osalpekar due to This causes breakages due to underspecified type ([comment](https://github.com/pytorch/pytorch/pull/107648#issuecomment-1696372588))

* [Reland] [1/N] fix clang-tidy warnings in torch/csrc (#108114)

Reland of PR #107648 with auto replaced with Py_ssize_t in eval_frame.c. This PR applies fixes to some found issues by clang-tidy in torch/csrc.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108114
Approved by: https://github.com/Skylion007

---------

Co-authored-by: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
Co-authored-by: cyy <cyyever@outlook.com>
2023-09-08 17:55:48 -04:00
d83c8287ea Use contiguous() to handle noncontiguous outputs during elementwise decomposition (#108140) (#108555)
Fixes https://github.com/pytorch/pytorch/issues/108218

Use contiguous() API to handle noncontiguous outputs during elementwise decomp

With this change, the op decomposes properly (testcase from the bug):
```
graph():
    %arg0_1 : [#users=3] = placeholder[target=arg0_1]
    %abs_1 : [#users=1] = call_function[target=torch.ops.aten.abs.default](args = (%arg0_1,), kwargs = {})
    %floor : [#users=1] = call_function[target=torch.ops.aten.floor.default](args = (%abs_1,), kwargs = {})
    %sign : [#users=1] = call_function[target=torch.ops.aten.sign.default](args = (%arg0_1,), kwargs = {})
    %mul : [#users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%floor, %sign), kwargs = {})
    %sub : [#users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%arg0_1, %mul), kwargs = {})
    return (sub,)
```
Output:
```
tensor([[ 0.2871,  0.7189,  0.7297],
        [ 0.8782, -0.4899,  0.7055]], device='hpu:0')
```
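
For reference, the testcase graph above computes `torch.frac`; a hedged equivalence check on CPU:
```python
import torch

x = torch.randn(2, 3)
decomposed = x - torch.floor(torch.abs(x)) * torch.sign(x)
assert torch.allclose(decomposed, torch.frac(x))
```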

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108140
Approved by: https://github.com/ezyang
2023-09-07 13:44:04 -04:00
ba19c52e31 Fix multi output layout error in indexing dtype calculation (#108085) (#108693)
Differential Revision: [D48757829](https://our.internmc.facebook.com/intern/diff/D48757829)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108085
Approved by: https://github.com/yanboliang, https://github.com/davidberard98, https://github.com/jansel, https://github.com/peterbell10
2023-09-07 13:29:06 -04:00
c5c9536aa7 move IPEX backend to training/inference category (#108737) 2023-09-07 13:24:20 -04:00
6b7a777661 [dtensor] fix two more requires_grad callsite (#108358) (#108738)
redistribute returns a new DTensor, and those returned DTensors should
follow the input DTensor's requires_grad instead of the input local
tensor's requires_grad
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108358
Approved by: https://github.com/fduwjj
2023-09-07 13:10:15 -04:00
ebd3224303 add torch_api (#108617) 2023-09-07 13:08:29 -04:00
6e4ae13657 Release only change, test against test channel (#108688) 2023-09-06 17:56:41 -04:00
265e46e193 Revert "docs: Match open bracket with close bracket in unsqueeze (#95215)" (#108680)
This reverts commit 9d04d376d81be2f01e5ea6b68943390346f2494c.

Reverted https://github.com/pytorch/pytorch/pull/95215 on behalf of https://github.com/kit1980 due to Incorrect assumptions ([comment](https://github.com/pytorch/pytorch/pull/95215#issuecomment-1708852420))

Co-authored-by: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>
2023-09-06 17:55:59 -04:00
da7290dfbd [ONNX] Show sarif_report_path (#108398) (#108679)
`sarif_report_path` was not formatted correctly in the error message

@BowenBao

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108398
Approved by: https://github.com/thiagocrepaldi
2023-09-06 17:53:54 -04:00
828992cf13 Inductor cpp wrapper: fix codegen of positional args with default value (#108652)
* Inductor cpp wrapper: fix codegen of positional args with default value (#108552)

Fixes https://github.com/pytorch/pytorch/issues/108323.
Cpp wrapper had a functional regression on `llama` and `tnt_s_patch16_224` due to the recent support of scaled dot product flash attention in inductor.

The schema of this OP is as follows:
```
- func: _scaled_dot_product_flash_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, int max_q, int max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
```

For `llama` and `tnt_s_patch16_224`, the OP is called as shown below, where the three positional args with default values are not passed (`float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False`).
```python
y = torch.ops.aten._scaled_dot_product_flash_attention.default(x0, x1, x2, scale = 0.125)
```
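
Given the schema above, that call is equivalent to spelling out the three omitted positional defaults (an illustration, not code from the PR):
```python
# dropout_p=0.0, is_causal=False, return_debug_mask=False per the schema
y = torch.ops.aten._scaled_dot_product_flash_attention.default(
    x0, x1, x2, 0.0, False, False, scale=0.125
)
```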

This PR fixes the cpp wrapper support for this case.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108552
Approved by: https://github.com/jgong5, https://github.com/desertfire, https://github.com/jansel

* ut: update function name on release branch
2023-09-06 13:41:25 -04:00
48246f3dfb Add check for out of range pointer. (#107510) (#108649)
### Summary

Hi! We've been fuzzing pytorch with [sydr-fuzz](https://github.com/ispras/oss-sydr-fuzz) and found an error where an arbitrary address is accessed while parsing the flatbuffer format with the `torch::load` function.

pytorch version: 18bcf62bbcf7ffd47e3bcf2596f72aa07a07d65f (the last commit at the moment of reporting the issue)

### Details
The vulnerability appears while loading arbitrary user input with the `torch::load` function. To trigger the error, the input must correspond to `FlatbufferFileFormat`, so the flatbuffer-parsing path in the `import_ir_module` function must be executed.

First, an error can occur in `GetMutableRoot` in `module.h`, where we add to the input data buffer pointer a value obtained by dereferencing that pointer (the data fully depends on the user input and can be arbitrary), so the resulting `flatbuffer_module` address can be corrupted.

Moreover, we can reach an arbitrary address later at `flatbuffer_loader.cpp:305`, when we obtain the `ival` pointer with the `Get` method.
There, in the `IndirectHelper::Read` function, we add to the pointer an offset obtained by dereferencing it, so the address can be corrupted again.

The corrupted `ival` pointer is dereferenced in `table.h` in the flatbuffers project, where it is used to obtain another address, which is dereferenced again in `table.h`. The resulting corrupted address is written to the `func` pointer at `flatbuffer_loader.cpp:274`, which is then used in `parseFunction`, where a write to the address occurs.

To fix the problem, we can compute the end of the memory area in the `parse_and_initialize_mobile_module` function like this:
```cpp
auto* end = static_cast<char*>(data) + size;
```
We then pass it to all the callees and insert the corresponding checks.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/107510
Approved by: https://github.com/albanD

Co-authored-by: Eli Kobrin <kobrineli@ispras.ru>
2023-09-06 13:21:04 -04:00
7d6971dcee [dtensor] fix new_empty_strided op (#107835) (#108600)
This PR fixes the new_empty_strided op to become replicate from sharding
when necessary; this is a quick fix to resolve https://github.com/pytorch/pytorch/issues/107661

We'll need to think more about the behavior of this op when it comes to
sharding. One possibility is to follow the input sharding, but given that the
output shape of this op might not be the same as the input's, it's hard to
say we should follow the input sharding. Further improvement is needed once
we figure out the op syntax.
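
For context, a minimal eager example of the op in question (plain tensors, no DTensor involved):
```python
import torch

x = torch.randn(4, 4)
# new_empty_strided allocates a new uninitialized tensor with the given
# shape and strides; the output shape need not match the input's, which
# is why simply inheriting the input sharding is problematic.
y = x.new_empty_strided((2, 8), (8, 1))
print(y.shape, y.stride())  # torch.Size([2, 8]) (8, 1)
```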
Pull Request resolved: https://github.com/pytorch/pytorch/pull/107835
Approved by: https://github.com/fduwjj
2023-09-06 09:28:20 -04:00
5417e23ba8 torch.compile-functorch interaction: update docs (#108130) (#108628)
Doc Preview: https://docs-preview.pytorch.org/pytorch/pytorch/108130/torch.compiler_faq.html#torch-func-works-with-torch-compile-for-grad-and-vmap-transforms

Will also cherry-pick this for release branch.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108130
Approved by: https://github.com/zou3519
2023-09-06 08:25:19 -04:00
7a9101951d Improve docs for torch.unique dim argument (#108292) (#108596)
Fixes #103142
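
A quick illustration of the `dim` semantics the docs now clarify (each slice along `dim` is treated as a single element):
```python
import torch

x = torch.tensor([[1, 2], [3, 4], [1, 2]])
# With dim=0, entire rows are compared as single elements.
print(torch.unique(x, dim=0))
# tensor([[1, 2],
#         [3, 4]])
```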

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108292
Approved by: https://github.com/albanD

Co-authored-by: Kurt Mohler <kmohler@quansight.com>
2023-09-05 17:47:48 -04:00
03e7f0b99d [Inductor] Add fused_attention pattern matcher with additional clone (#108141) (#108327)
A previous PR https://github.com/pytorch/pytorch/pull/106274 decomposes `aten.dropout` and creates a `clone()` in `eval()` mode or when `p=0`. This makes many SDPA-related models fail to match the fused_attention pattern matchers.

This PR adds new fused_attention pattern matchers with an additional clone to re-enable the SDPA op matching.
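
As a hedged sketch, this is the kind of decomposed attention the matchers target; with `p=0` (or in eval mode) the dropout lowers to a `clone()`, which previously broke the match:
```python
import math
import torch
import torch.nn.functional as F

def attention(q, k, v, p=0.0):
    # Decomposed SDPA; the dropout below becomes a clone() when p=0.
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))
    attn = F.dropout(F.softmax(scores, dim=-1), p=p)
    return attn @ v
```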

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108141
Approved by: https://github.com/jgong5, https://github.com/eellison
2023-09-05 17:09:39 -04:00
c0e7239f43 Pin pandas version for inductor Docker image (#108355) (#108593)
Building Docker images on trunk is failing at the moment https://github.com/pytorch/pytorch/actions/runs/6033657019/job/16370683676 with the following error:

```
+ conda_reinstall numpy=1.24.4
+ as_jenkins conda install -q -n py_3.10 -y --force-reinstall numpy=1.24.4
+ sudo -E -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env PATH=/opt/conda/envs/py_3.10/bin:/opt/conda/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 conda install -q -n py_3.10 -y --force-reinstall numpy=1.24.4
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... unsuccessful initial attempt using frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... unsuccessful initial attempt using frozen solve. Retrying with flexible solve.

PackagesNotFoundError: The following packages are not available from current channels:

  - numpy=1.24.4

Current channels:

  - https://repo.anaconda.com/pkgs/main/linux-64
  - https://repo.anaconda.com/pkgs/main/noarch
  - https://repo.anaconda.com/pkgs/r/linux-64
  - https://repo.anaconda.com/pkgs/r/noarch
```

This was pulled in by pandas 2.1.0, released yesterday: https://pypi.org/project/pandas/2.1.0
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108355
Approved by: https://github.com/kit1980, https://github.com/atalman, https://github.com/malfet
2023-09-05 17:05:54 -04:00
04c1e07fd7 [quant] Move dropout replacement to move_model_to_eval (#108184) (#108255)
Summary: This commit adds a public-facing
`torch.ao.quantization.move_model_to_eval` util function
for QAT users. Instead of calling model.eval() on an exported
model (which doesn't work, see
https://github.com/pytorch/pytorch/issues/103681), the user
would call this new util function instead. This ensures special
ops such as dropout and batchnorm (not supported yet) will have
the right behavior when the graph is later used for inference.

Note: Support for an equivalent `move_model_to_train` will be
added in the future. This is difficult to do for dropout
currently because the eval pattern of dropout is simply a clone
op, which we cannot just match and replace with a dropout op.
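
A hedged usage sketch based on the summary above (the exact import path and return convention are assumptions):
```python
import torch
from torch.ao.quantization import move_model_to_eval  # name per this commit

# exported_model: a model captured for PT2 export-based quantization.
# Instead of exported_model.eval(), which doesn't work on exported graphs:
exported_model = move_model_to_eval(exported_model)
```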

Test Plan:
python test/test_quantization.py TestQuantizePT2E.test_move_model_to_eval

Reviewers: jerryzh168, kimishpatel

Subscribers: jerryzh168, kimishpatel, supriyar

Differential Revision: [D48814735](https://our.internmc.facebook.com/intern/diff/D48814735)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108184
Approved by: https://github.com/jerryzh168
2023-09-05 13:42:37 -07:00
cb4362ba5f Error when someone calls train/eval on pre_autograd graph (#108143) (#108258)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108143
Approved by: https://github.com/andrewor14

Co-authored-by: Tugsbayasgalan Manlaibaatar <tmanlaibaatar@fb.com>
2023-09-05 13:41:16 -07:00
bddd30ca7a [inductor] Fix inputs with existing offsets (#108259)
Cherry pick of #108168
2023-09-05 16:24:48 -04:00
9cc99906e9 When byteorder record is missing load as little endian by default (#108523)
* When the byteorder record is missing, load as little endian by default

Fixes #101688

* Add test for warning

Also change warning type from DeprecationWarning
to UserWarning to make it visible by default.
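
A hedged sketch of what a caller now observes when loading such a legacy file (the filename is hypothetical):
```python
import warnings
import torch

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    obj = torch.load("old_checkpoint_without_byteorder.pt")  # hypothetical file
# Per this change: data is read as little endian and a UserWarning is emitted.
assert any(issubclass(w.category, UserWarning) for w in caught)
```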
2023-09-05 16:06:22 -04:00
a49fca4dd4 inductor change needed to update triton pin (#108129)
ghstack-source-id: 5d421f734d5d7d9428b5fed54388cc95e559cd95
Pull Request resolved: https://github.com/pytorch/pytorch/pull/107722
2023-09-05 14:40:23 -04:00
83964c761e [inductor] Add aten.multinomial to disallowed cudagraphs ops (#108122)
Cherry pick of #108105
2023-09-05 14:37:58 -04:00
085bd1da62 [dynamo] Fix setattr nn.Module with new attribute (#108121)
Cherry pick of #108098
2023-09-05 14:36:40 -04:00
90452f41e3 [dynamo] Graph break on pack_padded_sequence (#108120)
Release branch cherrypick of #108096
2023-09-05 14:34:47 -04:00
35c3d5a080 [inductor] Fix constant_to_device issue with ir.Constant (#108119)
Cherry pick of #108087
2023-09-05 14:33:32 -04:00
d07ac50e26 Only add triton dependency to CUDA and ROCm binaries if it hasn't been set as an installation requirement yet (#108424) (#108471)
The dependency was added twice before in CUDA and ROCm binaries, once as an installation dependency from builder and again as an extra dependency for dynamo, for example:

```
Requires-Python: >=3.8.0
Description-Content-Type: text/markdown
License-File: LICENSE
License-File: NOTICE
Requires-Dist: filelock
Requires-Dist: typing-extensions
Requires-Dist: sympy
Requires-Dist: networkx
Requires-Dist: jinja2
Requires-Dist: fsspec
Requires-Dist: pytorch-triton (==2.1.0+e6216047b8)
Provides-Extra: dynamo
Requires-Dist: pytorch-triton (==2.1.0+e6216047b8) ; extra == 'dynamo'
Requires-Dist: jinja2 ; extra == 'dynamo'
Provides-Extra: opt-einsum
Requires-Dist: opt-einsum (>=3.3) ; extra == 'opt-einsum'
```

In the previous release, we needed to remove this part from `setup.py` to build release binaries https://github.com/pytorch/pytorch/pull/96010.  With this, that step isn't needed anymore because the dependency will come from builder.

### Testing

Using the draft https://github.com/pytorch/pytorch/pull/108374 for testing and manually inspect the wheels artifact at https://github.com/pytorch/pytorch/actions/runs/6045878399 (don't want to go through all `ciflow/binaries` again)

* torch-2.1.0.dev20230901+cu121-cp39-cp39-linux_x86_64
```
Requires-Python: >=3.8.0
Description-Content-Type: text/markdown
Requires-Dist: filelock
Requires-Dist: typing-extensions
Requires-Dist: sympy
Requires-Dist: networkx
Requires-Dist: jinja2
Requires-Dist: fsspec
Requires-Dist: pytorch-triton (==2.1.0+e6216047b8) <-- This will be 2.1.0 on the release branch after https://github.com/pytorch/builder/pull/1515
Provides-Extra: dynamo
Requires-Dist: jinja2 ; extra == 'dynamo'
Provides-Extra: opt-einsum
Requires-Dist: opt-einsum (>=3.3) ; extra == 'opt-einsum'
```

* torch-2.1.0.dev20230901+cu121.with.pypi.cudnn-cp39-cp39-linux_x86_64
```
Requires-Python: >=3.8.0
Description-Content-Type: text/markdown
Requires-Dist: filelock
Requires-Dist: typing-extensions
Requires-Dist: sympy
Requires-Dist: networkx
Requires-Dist: jinja2
Requires-Dist: fsspec
Requires-Dist: pytorch-triton (==2.1.0+e6216047b8)
Requires-Dist: nvidia-cuda-nvrtc-cu12 (==12.1.105) ; platform_system == "Linux" and platform_machine == "x86_64"
Requires-Dist: nvidia-cuda-runtime-cu12 (==12.1.105) ; platform_system == "Linux" and platform_machine == "x86_64"
Requires-Dist: nvidia-cuda-cupti-cu12 (==12.1.105) ; platform_system == "Linux" and platform_machine == "x86_64"
Requires-Dist: nvidia-cudnn-cu12 (==8.9.2.26) ; platform_system == "Linux" and platform_machine == "x86_64"
Requires-Dist: nvidia-cublas-cu12 (==12.1.3.1) ; platform_system == "Linux" and platform_machine == "x86_64"
Requires-Dist: nvidia-cufft-cu12 (==11.0.2.54) ; platform_system == "Linux" and platform_machine == "x86_64"
Requires-Dist: nvidia-curand-cu12 (==10.3.2.106) ; platform_system == "Linux" and platform_machine == "x86_64"
Requires-Dist: nvidia-cusolver-cu12 (==11.4.5.107) ; platform_system == "Linux" and platform_machine == "x86_64"
Requires-Dist: nvidia-cusparse-cu12 (==12.1.0.106) ; platform_system == "Linux" and platform_machine == "x86_64"
Requires-Dist: nvidia-nccl-cu12 (==2.18.1) ; platform_system == "Linux" and platform_machine == "x86_64"
Requires-Dist: nvidia-nvtx-cu12 (==12.1.105) ; platform_system == "Linux" and platform_machine == "x86_64"
Requires-Dist: triton (==2.1.0) ; platform_system == "Linux" and platform_machine == "x86_64" <--This is 2.1.0 because it already has https://github.com/pytorch/pytorch/pull/108423, but the package doesn't exist yet atm
Provides-Extra: dynamo
Requires-Dist: jinja2 ; extra == 'dynamo'
Provides-Extra: opt-einsum
Requires-Dist: opt-einsum (>=3.3) ; extra == 'opt-einsum'
```

* torch-2.1.0.dev20230901+rocm5.6-cp38-cp38-linux_x86_64
```
Requires-Python: >=3.8.0
Description-Content-Type: text/markdown
Requires-Dist: filelock
Requires-Dist: typing-extensions
Requires-Dist: sympy
Requires-Dist: networkx
Requires-Dist: jinja2
Requires-Dist: fsspec
Requires-Dist: pytorch-triton-rocm (==2.1.0+34f8189eae) <-- This will be 2.1.0 on the release branch after https://github.com/pytorch/builder/pull/1515
Provides-Extra: dynamo
Requires-Dist: jinja2 ; extra == 'dynamo'
Provides-Extra: opt-einsum
Requires-Dist: opt-einsum (>=3.3) ; extra == 'opt-einsum'
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108424
Approved by: https://github.com/atalman
2023-09-05 09:22:31 -04:00
8a3b017769 Add triton dependency to PyPI PyTorch package (#108423) 2023-09-01 16:51:10 -04:00
a82894b0d3 Added info for each artifact option, added a help option to TORCH_LOGS, and changed the error message (#107758) (#108365)
New message when an invalid option is provided
<img width="1551" alt="image" src="https://github.com/pytorch/pytorch/assets/6355099/8b61534a-ee55-431e-94fe-2ffa25b7fd5c">

TORCH_LOGS="help"
<img width="1558" alt="image" src="https://github.com/pytorch/pytorch/assets/6355099/72e8939c-92fa-4141-8114-79db71451d42">

TORCH_LOGS="+help"
<img width="1551" alt="image" src="https://github.com/pytorch/pytorch/assets/6355099/2cdc94ac-505a-478c-aa58-0175526075d2">

Pull Request resolved: https://github.com/pytorch/pytorch/pull/107758
Approved by: https://github.com/ezyang, https://github.com/mlazos
ghstack dependencies: #106192
2023-09-01 16:11:59 -04:00
050fc31538 [MPS] Fix .item() for multi-dim scalar (#107913) (#108410)
By refactoring `_local_scalar_dense_mps` to use `_empty_like` to allocate the CPU tensor.
Also, print a more reasonable error message when the dst dim is less than the src dim in `mps_copy_`.

This fixes regression introduced by https://github.com/pytorch/pytorch/pull/105617 and adds regression test.
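
A minimal repro sketch of the fixed behavior (requires an MPS device):
```python
import torch

# A "multi-dim scalar": a single-element tensor with ndim > 0.
x = torch.tensor([[42.0]], device="mps")
print(x.item())  # 42.0
```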

### <samp>🤖 Generated by Copilot at abd06e6</samp>

> _Sing, O Muse, of the valiant deeds of the PyTorch developers_
> _Who strive to improve the performance and usability of tensors_
> _And who, with skill and wisdom, fixed a bug in the MPS backend_
> _That caused confusion and dismay to many a user of `item()`_

Fixes https://github.com/pytorch/pytorch/issues/107867

Pull Request resolved: https://github.com/pytorch/pytorch/pull/107913
Approved by: https://github.com/albanD

Co-authored-by: Nikita Shulga <nikita.shulga@gmail.com>
2023-09-01 11:58:26 -04:00
b3cb05b396 Update to RNN documentation (issue #106085) (#106222) (#108385)
Addresses [issue #106085](https://github.com/pytorch/pytorch/issues/106085).

In `torch/nn/modules/rnn.py`:
- Adds documentation string to RNNBase class.
- Adds parameters to `__init__` methods for the RNN, LSTM, and GRU classes (see the usage sketch after this list).
- Adds type annotations to `__init__` methods for RNN, LSTM, and GRU.

In `torch/ao/nn/quantized/dynamic/modules/rnn.py`:
- Adds type specifications to `_FLOAT_MODULE` attributes in the RNNBase, RNN, LSTM, and GRU classes.
> This resolves a `mypy` assignment error `Incompatible types in assignment (expression has type "Type[LSTM]", base class "RNNBase" defined the type as "Type[RNNBase]")` that seemed to be a result of fully specified type annotations in `torch/nn/modules/rnn.py`.
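
A usage sketch of the now-documented constructor parameters, shown for `LSTM`:
```python
import torch

lstm = torch.nn.LSTM(
    input_size=10,
    hidden_size=20,
    num_layers=2,
    bias=True,
    batch_first=True,
    dropout=0.0,
    bidirectional=False,
)
out, (h, c) = lstm(torch.randn(3, 5, 10))  # (batch, seq, feature) with batch_first=True
```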
Pull Request resolved: https://github.com/pytorch/pytorch/pull/106222
Approved by: https://github.com/mikaylagawarecki
2023-09-01 11:46:16 -04:00
fec68a2799 Add channels_last3d support for mkldnn conv and mkldnn deconv (#95271) (#108216)
### Motivation

- Add channels_last3d support for mkldnn conv and mkldnn deconv.
- Use `ideep::convolution_transpose_forward::compute_v3` instead of `ideep::convolution_transpose_forward::compute`. `compute_v3` uses `is_channels_last` to notify ideep whether to use the channels-last layout, aligning with PyTorch's memory format check.

### Testing
1 socket (28 cores):

- memory format: torch.contiguous_format

module | shape | forward / ms | backward / ms
-- | -- | -- | --
conv3d | input size: (32, 32, 10, 100, 100), weight size: (32, 32, 3, 3, 3) | 64.56885 | 150.1796
conv3d | input size: (32, 16, 10, 200, 200), weight size: (16, 16, 3, 3, 3) | 100.6754 | 231.8883
conv3d | input size: (16, 4, 5, 300, 300), weight size: (4, 4, 3, 3, 3) | 19.31751 | 68.31131

module | shape | forward / ms | backward / ms
-- | -- | -- | --
ConvTranspose3d | input size: (32, 32, 10, 100, 100), weight size: (32, 32, 3, 3, 3) | 122.7646 | 207.5125
ConvTranspose3d | input size: (32, 16, 10, 200, 200), weight size: (16, 16, 3, 3, 3) | 202.4542 | 368.5492
ConvTranspose3d | input size: (16, 4, 5, 300, 300), weight size: (4, 4, 3, 3, 3) | 122.959 | 84.62577

- memory format: torch.channels_last_3d

module | shape | forward / ms | backward / ms
-- | -- | -- | --
conv3d | input size: (32, 32, 10, 100, 100), weight size: (32, 32, 3, 3, 3) | 40.06993 | 114.317
conv3d | input size: (32, 16, 10, 200, 200), weight size: (16, 16, 3, 3, 3) | 49.08249 | 133.4079
conv3d | input size: (16, 4, 5, 300, 300), weight size: (4, 4, 3, 3, 3) | 5.873911 | 17.58647

module | shape | forward / ms | backward / ms
-- | -- | -- | --
ConvTranspose3d | input size: (32, 32, 10, 100, 100), weight size: (32, 32, 3, 3, 3) | 88.4246 | 208.2269
ConvTranspose3d | input size: (32, 16, 10, 200, 200), weight size: (16, 16, 3, 3, 3) | 140.0725 | 270.4172
ConvTranspose3d | input size: (16, 4, 5, 300, 300), weight size: (4, 4, 3, 3, 3) | 23.0223 | 37.16972
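
For reference, a minimal sketch of running a conv in the channels-last-3d memory format benchmarked above:
```python
import torch

# Channels-last-3d layout for 5-D tensors (N, C, D, H, W).
x = torch.randn(16, 4, 5, 300, 300).to(memory_format=torch.channels_last_3d)
conv = torch.nn.Conv3d(4, 4, kernel_size=3).to(memory_format=torch.channels_last_3d)
y = conv(x)
# Typically True when the backend propagates the memory format.
print(y.is_contiguous(memory_format=torch.channels_last_3d))
```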

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95271
Approved by: https://github.com/jgong5, https://github.com/cpuhrsch
2023-09-01 11:44:48 -04:00
f139dda1cc [functorch] make torch.compile support opt-in (#108134) 2023-09-01 10:38:41 -04:00
5252dfb762 Fix triton upload channel detection (#108291) (#108311)
This should be nightly for nightly builds and test for release candidates.  There are 2 bugs:

* The shell needs to be set to `bash` explicitly; otherwise, GHA uses `sh`, which doesn't recognize `[[`, as shown in https://github.com/pytorch/pytorch/actions/runs/6030476858/job/16362717792#step:6:10
* `${GITHUB_REF_NAME}` is unquoted.  This is basically https://www.shellcheck.net/wiki/SC2248, but it wasn't captured by actionlint, and shellcheck doesn't work with workflow YAML files.  I will think about how to add a lint rule for this later.

### Testing

https://github.com/pytorch/pytorch/actions/runs/6031330411 to confirm that setting the channel is performed correctly.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108291
Approved by: https://github.com/osalpekar, https://github.com/atalman
2023-09-01 09:41:27 -04:00
da1ccca830 Remove commit hash when building triton wheel and conda in release mode (#108203) (#108251)
This is the follow-up of https://github.com/pytorch/pytorch/pull/108187 to set the correct release version without commit hash for triton wheel and conda binaries when building them in release mode.

### Testing

* With commit hash (nightly): https://github.com/pytorch/pytorch/actions/runs/6019021716
* Without commit hash https://github.com/pytorch/pytorch/actions/runs/6019378616 (by adding `--release` into the PR)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108203
Approved by: https://github.com/atalman
2023-08-30 14:27:06 -04:00
c9cbdaf24f [ROCm] Update ROCm pin to fix triton wheel lib issue (#108229)
main PR already merged: https://github.com/pytorch/pytorch/pull/108137
2023-08-30 09:39:59 -04:00
f187e42a54 Fix various issues on build-triton-wheel workflow (#108187) (#108200)
There are more issues than I expected at the beginning:

* Triton was uploaded on `main` instead of `nightly` and the release branch
* The environment `conda-aws-upload` wasn't used correctly in both the wheel and conda uploads
* Conda update wasn't run in a separate ephemeral runner
* Duplicated upload logic; we should have just used `bash .circleci/scripts/binary_upload.sh` instead
* Handle `CONDA_PYTORCHBOT_TOKEN` and `CONDA_PYTORCHBOT_TOKEN_TEST` tokens in a similar way as https://github.com/pytorch/test-infra/pull/4530

Part of https://github.com/pytorch/pytorch/issues/108154
2023-08-30 09:37:49 -04:00
9175987fcc Fix the use of inputs.build_environment in #107868 (#108075) (#108177)
It should be `${{ inputs.build_environment }}`, although I wonder why we don't just clean up the artifacts directory for all builds instead of just `aarch64`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108075
Approved by: https://github.com/atalman, https://github.com/seemethere
2023-08-30 09:36:31 -04:00
d8e6594fb8 skip dynamic shape test for test_conv_bn_fuse (#108113) (#108139)
For the test_conv_bn_fuse dynamic case, we always fuse bn with convolution, and there is only an external convolution call, not loops, so it will fail when we do a dynamic loop vars check. This PR skips this case.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/108113
Approved by: https://github.com/huydhn
2023-08-30 09:35:08 -04:00
f82c027774 Fix LayerNorm(bias=False) error (#108078)
ghstack-source-id: 613c4f3608b1a375013fc9da64545c1084025650
Pull Request resolved: https://github.com/pytorch/pytorch/pull/108060
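
A minimal sketch of the construction this fixes:
```python
import torch

ln = torch.nn.LayerNorm(64, bias=False)  # construct LayerNorm without a bias term
y = ln(torch.randn(8, 64))
assert ln.bias is None
```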
2023-08-30 09:30:22 -04:00
6d20b39d3f [CI] Release only changes use anaconda token for test env (#108064) 2023-08-28 12:41:57 -04:00
17f400404f [CI] Release only changes for 2.1 release (#108053)
* [CI] Release only changes for 2.1 release

* include circle script

* release only changes for test-infra

* More test-infra related
2023-08-28 11:55:58 -04:00
3662 changed files with 410954 additions and 222862 deletions

View File

@ -71,9 +71,6 @@ if [[ "$image" == *cuda* && "$UBUNTU_VERSION" != "22.04" ]]; then
DOCKERFILE="${OS}-cuda/Dockerfile"
elif [[ "$image" == *rocm* ]]; then
DOCKERFILE="${OS}-rocm/Dockerfile"
elif [[ "$image" == *cuda*linter* ]]; then
# Use a separate Dockerfile for linter to keep a small image size
DOCKERFILE="linter-cuda/Dockerfile"
elif [[ "$image" == *linter* ]]; then
# Use a separate Dockerfile for linter to keep a small image size
DOCKERFILE="linter/Dockerfile"
@ -132,6 +129,35 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc7)
CUDA_VERSION=11.8.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=7
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc7-inductor-benchmarks)
CUDA_VERSION=11.8.0
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=7
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=8
@ -155,13 +181,13 @@ case "$image" in
CONDA_CMAKE=yes
ONNX=yes
;;
pytorch-linux-focal-py3-clang9-android-ndk-r21e)
pytorch-linux-focal-py3-clang7-android-ndk-r19c)
ANACONDA_PYTHON_VERSION=3.8
CLANG_VERSION=9
CLANG_VERSION=7
LLVMDEV=yes
PROTOBUF=yes
ANDROID=yes
ANDROID_NDK_VERSION=r21e
ANDROID_NDK_VERSION=r19c
GRADLE_VERSION=6.8.3
NINJA_VERSION=1.9.0
;;
@ -202,7 +228,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=5.6
ROCM_VERSION=5.4.2
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@ -213,11 +239,22 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=5.7
ROCM_VERSION=5.6
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-py3.8-gcc7)
ANACONDA_PYTHON_VERSION=3.8
GCC_VERSION=7
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
CONDA_CMAKE=yes
TRITON=yes
DOCS=yes
;;
pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.8
GCC_VERSION=11
@ -249,12 +286,6 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-jammy-py3-clang15-asan)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=15
CONDA_CMAKE=yes
VISION=yes
;;
pytorch-linux-jammy-py3.8-gcc11)
ANACONDA_PYTHON_VERSION=3.8
GCC_VERSION=11
@ -266,12 +297,6 @@ case "$image" in
TRITON=yes
DOCS=yes
;;
pytorch-linux-jammy-py3-clang12-executorch)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=12
CONDA_CMAKE=yes
EXECUTORCH=yes
;;
pytorch-linux-focal-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
@ -279,11 +304,6 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
CONDA_CMAKE=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=11.8
CONDA_CMAKE=yes
;;
*)
# Catch-all for builds that are not hardcoded.
PROTOBUF=yes
@ -301,9 +321,6 @@ case "$image" in
extract_version_from_image_name rocm ROCM_VERSION
NINJA_VERSION=1.9.0
TRITON=yes
# To ensure that any ROCm config will build using conda cmake
# and thus have LAPACK/MKL enabled
CONDA_CMAKE=yes
fi
if [[ "$image" == *centos7* ]]; then
NINJA_VERSION=1.10.2
@ -337,11 +354,14 @@ if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
fi
# Build image
# TODO: build-arg THRIFT is not turned on for any image, remove it once we confirm
# it's no longer needed.
docker build \
--no-cache \
--progress=plain \
--build-arg "BUILD_ENVIRONMENT=${image}" \
--build-arg "PROTOBUF=${PROTOBUF:-}" \
--build-arg "THRIFT=${THRIFT:-}" \
--build-arg "LLVMDEV=${LLVMDEV:-}" \
--build-arg "DB=${DB:-}" \
--build-arg "VISION=${VISION:-}" \
@ -373,7 +393,6 @@ docker build \
--build-arg "ONNX=${ONNX}" \
--build-arg "DOCS=${DOCS}" \
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \

View File

@ -98,18 +98,6 @@ COPY ./common/install_ninja.sh install_ninja.sh
RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
RUN rm install_ninja.sh
ARG TRITON
# Install triton, this needs to be done before sccache because the latter will
# try to reach out to S3, which docker build runners don't have access
ENV CMAKE_C_COMPILER cc
ENV CMAKE_CXX_COMPILER c++
COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH

View File

@ -1 +0,0 @@
b2f5dfe80704404298467347b8ee3ac229efed47

View File

@ -1 +1 @@
6c26faa159b79a42d7fa46cb66e2d21523351987
4.27.4

View File

@ -1 +1 @@
730b907b4d45a4713cbc425cbf224c46089fd514
b9d43c7dcac1fe05e851dd7be7187b108af593d2

View File

@ -1 +1 @@
dafe1459823b9549417ed95e9720f1b594fab329
34f8189eae57a23cc15b4b4f032fe25757e0db8e

View File

@ -1 +1 @@
bcad9dabe15021c53b6a88296e9d7a210044f108
e6216047b8b0aef1fe8da6ca8667a3ad0a016411

View File

@ -9,7 +9,10 @@ install_ubuntu() {
# "$UBUNTU_VERSION" == "18.04"*
# instead of
# "$UBUNTU_VERSION" == "18.04"
if [[ "$UBUNTU_VERSION" == "20.04"* ]]; then
if [[ "$UBUNTU_VERSION" == "18.04"* ]]; then
cmake3="cmake=3.10*"
maybe_libiomp_dev="libiomp-dev"
elif [[ "$UBUNTU_VERSION" == "20.04"* ]]; then
cmake3="cmake=3.16*"
maybe_libiomp_dev=""
elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
@ -20,9 +23,7 @@ install_ubuntu() {
maybe_libiomp_dev="libiomp-dev"
fi
if [[ "$CLANG_VERSION" == 15 ]]; then
maybe_libomp_dev="libomp-15-dev"
elif [[ "$CLANG_VERSION" == 12 ]]; then
if [[ "$CLANG_VERSION" == 12 ]]; then
maybe_libomp_dev="libomp-12-dev"
elif [[ "$CLANG_VERSION" == 10 ]]; then
maybe_libomp_dev="libomp-10-dev"
@ -61,7 +62,6 @@ install_ubuntu() {
${maybe_libiomp_dev} \
libyaml-dev \
libz-dev \
libjemalloc2 \
libjpeg-dev \
libasound2-dev \
libsndfile-dev \

View File

@ -54,13 +54,23 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ]; then
conda_install numpy=1.23.5 ${CONDA_COMMON_DEPS}
else
elif [ "$ANACONDA_PYTHON_VERSION" = "3.10" ]; then
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
elif [ "$ANACONDA_PYTHON_VERSION" = "3.9" ]; then
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
elif [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
else
# Install `typing-extensions` for 3.7
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS} typing-extensions
fi
# Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
# and libpython-static for torch deploy
conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}"
# This is only supported in 3.8 upward
if [ "$MINOR_PYTHON_VERSION" -gt "7" ]; then
# Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
# and libpython-static for torch deploy
conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}"
fi
# Use conda cmake in some cases. Conda cmake will be newer than our supported
# min version (3.5 for xenial and 3.10 for bionic), so we only do it in those
@ -79,7 +89,13 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
# Install some other packages, including those needed for Python test reporting
pip_install -r /opt/conda/requirements-ci.txt
pip_install -U scikit-learn
# Update scikit-learn to a python-3.8 compatible version
if [[ $(python -c "import sys; print(int(sys.version_info >= (3, 8)))") == "1" ]]; then
pip_install -U scikit-learn
else
# Pinned scikit-learn due to https://github.com/scikit-learn/scikit-learn/issues/14485 (affects gcc 5.5 only)
pip_install scikit-learn==0.20.3
fi
if [ -n "$DOCS" ]; then
apt-get update

View File

@ -1,62 +0,0 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
clone_executorch() {
EXECUTORCH_PINNED_COMMIT=$(get_pinned_commit executorch)
# Clone the Executorch
git clone https://github.com/pytorch/executorch.git
# and fetch the target commit
pushd executorch
git checkout "${EXECUTORCH_PINNED_COMMIT}"
git submodule update --init
popd
chown -R jenkins executorch
}
install_buck2() {
pushd executorch/.ci/docker
BUCK2_VERSION=$(cat ci_commit_pins/buck2.txt)
source common/install_buck.sh
popd
}
install_conda_dependencies() {
pushd executorch/.ci/docker
# Install conda dependencies like flatbuffer
conda_install --file conda-env-ci.txt
popd
}
install_pip_dependencies() {
pushd executorch/.ci/docker
# Install all Python dependencies
pip_install -r requirements-ci.txt
popd
}
setup_executorch() {
pushd executorch
source .ci/scripts/utils.sh
install_flatc_from_source
pip_install .
build_executorch_runner "cmake"
# Make sure that all the newly generate files are owned by Jenkins
chown -R jenkins .
popd
}
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
setup_executorch

View File

@ -6,21 +6,19 @@ source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
function install_huggingface() {
local version
commit=$(get_pinned_commit huggingface)
version=$(get_pinned_commit huggingface)
pip_install pandas==2.0.3
pip_install "git+https://github.com/huggingface/transformers@${commit}"
pip_install "transformers==${version}"
}
function install_timm() {
local commit
commit=$(get_pinned_commit timm)
pip_install pandas==2.0.3
pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
# Clean up
conda_run pip uninstall -y cmake torch torchvision triton
pip_install "git+https://github.com/rwightman/pytorch-image-models@${commit}"
}
# Pango is needed for weasyprint which is needed for doctr
conda_install pango
install_huggingface
install_timm
# install_timm

View File

@ -10,13 +10,13 @@ retry () {
# A bunch of custom pip dependencies for ONNX
pip_install \
beartype==0.15.0 \
beartype==0.10.4 \
filelock==3.9.0 \
flatbuffers==2.0 \
mock==5.0.1 \
ninja==1.10.2 \
networkx==2.0 \
numpy==1.24.2
numpy==1.22.4
# ONNXRuntime should be installed before installing
# onnx-weekly. Otherwise, onnx-weekly could be
@ -26,13 +26,13 @@ pip_install \
pytest-cov==4.0.0 \
pytest-subtests==0.10.0 \
tabulate==0.9.0 \
transformers==4.32.1
transformers==4.31.0
pip_install coloredlogs packaging
retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.17.0.dev20231005006
retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.16.0.dev20230908001
pip_install -i https://test.pypi.org/simple/ onnx==1.15.0rc2
pip_install onnxscript==0.1.0.dev20231128 --no-deps
pip_install onnx==1.14.1
pip_install onnxscript-preview==0.1.0.dev20230828 --no-deps
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/

View File

@ -5,10 +5,8 @@ set -ex
# "install" hipMAGMA into /opt/rocm/magma by copying after build
git clone https://bitbucket.org/icl/magma.git
pushd magma
# Version 2.7.2 + ROCm related updates
git checkout 823531632140d0edcb7e77c3edc0e837421471c5
# Fixes memory leaks of magma found while executing linalg UTs
git checkout 28592a7170e4b3707ed92644bf4a689ed600c27f
cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib' >> make.inc

View File

@ -0,0 +1,14 @@
apt-get update
apt-get install -y sudo wget libboost-dev libboost-test-dev libboost-program-options-dev libboost-filesystem-dev libboost-thread-dev libevent-dev automake libtool flex bison pkg-config g++ libssl-dev
wget https://www-us.apache.org/dist/thrift/0.12.0/thrift-0.12.0.tar.gz
tar -xvf thrift-0.12.0.tar.gz
cd thrift-0.12.0
for file in ./compiler/cpp/Makefile*; do
sed -i 's/\-Werror//' $file
done
./bootstrap.sh
./configure --without-php --without-java --without-python --without-nodejs --without-go --without-ruby
sudo make
sudo make install
cd ..
rm thrift-0.12.0.tar.gz

View File

@ -23,10 +23,8 @@ fi
# The logic here is copied from .ci/pytorch/common_utils.sh
TRITON_PINNED_COMMIT=$(get_pinned_commit ${TRITON_TEXT_FILE})
if [ -n "${UBUNTU_VERSION}" ];then
apt update
apt-get install -y gpg-agent
fi
apt update
apt-get install -y gpg-agent
if [ -n "${CONDA_CMAKE}" ]; then
# Keep the current cmake and numpy version here, so we can reinstall them later
@ -38,12 +36,12 @@ if [ -z "${MAX_JOBS}" ]; then
export MAX_JOBS=$(nproc)
fi
if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" == "7" ]]; then
if [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" == "7" ]]; then
# Triton needs at least gcc-9 to build
apt-get install -y g++-9
CXX=g++-9 pip_install "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python"
elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
elif [ -n "${CLANG_VERSION}" ]; then
# Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get install -y g++-9

View File

@ -1,44 +0,0 @@
ARG UBUNTU_VERSION
FROM ubuntu:${UBUNTU_VERSION}
ARG UBUNTU_VERSION
ENV DEBIAN_FRONTEND noninteractive
# Install common dependencies (so that this step can be cached separately)
COPY ./common/install_base.sh install_base.sh
RUN bash ./install_base.sh && rm install_base.sh
# Install missing libomp-dev
RUN apt-get update && apt-get install -y --no-install-recommends libomp-dev && apt-get autoclean && apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Install user
COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh
# Install conda and other packages (e.g., numpy, pytest)
ARG ANACONDA_PYTHON_VERSION
ARG CONDA_CMAKE
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
COPY requirements-ci.txt /opt/conda/requirements-ci.txt
COPY ./common/install_conda.sh install_conda.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
# Install cuda and cudnn
ARG CUDA_VERSION
RUN wget -q https://raw.githubusercontent.com/pytorch/builder/main/common/install_cuda.sh -O install_cuda.sh
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
ENV DESIRED_CUDA ${CUDA_VERSION}
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
# Note that Docker build forbids copying file outside the build context
COPY ./common/install_linter.sh install_linter.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_linter.sh
RUN rm install_linter.sh common_utils.sh
USER jenkins
CMD ["bash"]

View File

@ -75,10 +75,10 @@ librosa>=0.6.2 ; python_version < "3.11"
#Pinned versions:
#test that import:
mypy==1.7.0
mypy==1.4.1
# Pin MyPy version because new errors are likely to appear with each release
#Description: linter
#Pinned versions: 1.7.0
#Pinned versions: 1.4.1
#test that import: test_typing.py, test_type_hints.py
networkx==2.8.8
@ -124,22 +124,10 @@ opt-einsum==3.3
#Pinned versions: 3.3
#test that import: test_linalg.py
optree==0.9.1
#Description: A library for tree manipulation
#Pinned versions: 0.9.1
#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
#test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
#common_utils.py, test_eager_transforms.py, test_python_dispatch.py,
#test_expanded_weights.py, test_decomp.py, test_overrides.py, test_masked.py,
#test_ops.py, test_prims.py, test_subclass.py, test_functionalization.py,
#test_schema_check.py, test_profiler_tree.py, test_meta.py, test_torchxla_num_output.py,
#test_utils.py, test_proxy_tensor.py, test_memory_profiler.py, test_view_ops.py,
#test_pointwise_ops.py, test_dtensor_ops.py, test_torchinductor.py, test_fx.py,
#test_fake_tensor.py, test_mps.py
pillow==10.0.1
pillow==9.3.0 ; python_version <= "3.8"
pillow==9.5.0 ; python_version > "3.8"
#Description: Python Imaging Library fork
#Pinned versions: 10.0.1
#Pinned versions:
#test that import:
protobuf==3.20.2
@ -292,14 +280,3 @@ tensorboard==2.13.0
#Description: Also included in .ci/docker/requirements-docs.txt
#Pinned versions:
#test that import: test_tensorboard
pywavelets==1.4.1
#Description: This is a requirement of scikit-image, we need to pin
# it here because 1.5.0 conflicts with numpy 1.21.2 used in CI
#Pinned versions: 1.4.1
#test that import:
lxml==4.9.4
#Description: This is a requirement of unittest-xml-reporting
# have to pin to 4.9.4 because 5.0.0 release on Dec 29th missing
# Python-3.9 binaries

View File

@ -1 +1 @@
2.2.0
2.1.0

View File

@ -79,6 +79,12 @@ ENV OPENSSL_ROOT_DIR /opt/openssl
RUN bash ./install_openssl.sh
ENV OPENSSL_DIR /opt/openssl
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh
RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
@ -87,12 +93,6 @@ COPY ci_commit_pins/timm.txt timm.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh
RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
RUN rm install_cmake.sh
ARG TRITON
# Install triton, this needs to be done before sccache because the latter will
# try to reach out to S3, which docker build runners don't have access

View File

@ -17,6 +17,13 @@ ARG LLVMDEV
COPY ./common/install_clang.sh install_clang.sh
RUN bash ./install_clang.sh && rm install_clang.sh
# (optional) Install thrift.
ARG THRIFT
COPY ./common/install_thrift.sh install_thrift.sh
RUN if [ -n "${THRIFT}" ]; then bash ./install_thrift.sh; fi
RUN rm install_thrift.sh
ENV INSTALLED_THRIFT ${THRIFT}
# Install user
COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh
@ -146,14 +153,6 @@ COPY ci_commit_pins/triton.txt triton.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton.txt
ARG EXECUTORCH
# Build and install executorch
COPY ./common/install_executorch.sh install_executorch.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/executorch.txt executorch.txt
RUN if [ -n "${EXECUTORCH}" ]; then bash ./install_executorch.sh; fi
RUN rm install_executorch.sh common_utils.sh executorch.txt
ARG ONNX
# Install ONNX dependencies
COPY ./common/install_onnx.sh ./common/common_utils.sh ./

View File

@ -3,6 +3,11 @@
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# Use to retry ONNX test, only retry it twice
retry () {
"$@" || (sleep 60 && "$@")
}
if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# TODO: This can be removed later once vision is also part of the Docker image
pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
@ -11,5 +16,5 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we
# need to bring this to the standard PyTorch run_test eventually. The issue will be tracked in
# https://github.com/pytorch/pytorch/issues/98626
"$ROOT_DIR/scripts/onnx/test.sh"
retry "$ROOT_DIR/scripts/onnx/test.sh"
fi

View File

@ -28,8 +28,6 @@ echo "Environment variables:"
env
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
# Use jemalloc during compilation to mitigate https://github.com/pytorch/pytorch/issues/116289
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
echo "NVCC version:"
nvcc --version
fi
@ -65,12 +63,6 @@ else
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
fi
if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then
# To build test_edge_op_registration
export BUILD_EXECUTORCH=ON
export USE_CUDA=0
fi
if ! which conda; then
# In ROCm CIs, we are doing cross compilation on build machines with
# intel cpu and later run tests on machines with amd cpu.
@ -167,14 +159,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* && -z "$TORCH_CUDA_ARCH_LIST" ]]; then
exit 1
fi
# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
# memory to build and will OOM
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ "$TORCH_CUDA_ARCH_LIST" == *"8.6"* || "$TORCH_CUDA_ARCH_LIST" == *"8.0"* ]]; then
echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
fi
if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
export CC=clang
export CXX=clang++
@ -184,6 +168,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
export LDSHARED="clang --shared"
export USE_CUDA=0
export USE_ASAN=1
export USE_MKLDNN=0
export UBSAN_FLAGS="-fno-sanitize-recover=all;-fno-sanitize=float-divide-by-zero;-fno-sanitize=float-cast-overflow"
unset USE_LLVM
fi

View File

@ -43,7 +43,7 @@ function assert_git_not_dirty() {
# TODO: we should add an option to `build_amd.py` that reverts the repo to
# an unmodified state.
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]] && [[ "$BUILD_ENVIRONMENT" != *xla* ]] ; then
git_status=$(git status --porcelain | grep -v '?? third_party' || true)
git_status=$(git status --porcelain)
if [[ $git_status ]]; then
echo "Build left local git repository checkout dirty"
echo "git status --porcelain:"
@ -171,9 +171,16 @@ function install_torchrec_and_fbgemm() {
pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
}
function install_numpy_pytorch_interop() {
local commit
commit=$(get_pinned_commit numpy_pytorch_interop)
# TODO: --no-use-pep517 will result in failure.
pip_install --user "git+https://github.com/Quansight-Labs/numpy_pytorch_interop.git@${commit}"
}
function clone_pytorch_xla() {
if [[ ! -d ./xla ]]; then
git clone --recursive -b r2.2 https://github.com/pytorch/xla.git
git clone --recursive -b r2.1 https://github.com/pytorch/xla.git
pushd xla
# pin the xla hash so that we don't get broken by changes to xla
git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"
@ -205,6 +212,15 @@ function test_torch_deploy(){
popd
}
function install_timm() {
local commit
commit=$(get_pinned_commit timm)
pip_install pandas
pip_install scipy
pip_install z3-solver
pip_install "git+https://github.com/rwightman/pytorch-image-models@${commit}"
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)

View File

@ -43,7 +43,7 @@ cross_compile_arm64() {
compile_arm64() {
# Compilation for arm64
# TODO: Compile with OpenMP support (but this causes CI regressions as cross-compilation were done with OpenMP disabled)
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
USE_DISTRIBUTED=0 USE_OPENMP=0 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
}
compile_x86_64() {

View File

@ -36,12 +36,10 @@ time python test/run_test.py --verbose -i distributed/test_functional_api
# DTensor tests
time python test/run_test.py --verbose -i distributed/_tensor/test_device_mesh
time python test/run_test.py --verbose -i distributed/_tensor/test_random_ops
time python test/run_test.py --verbose -i distributed/_tensor/test_dtensor_compile
# DeviceMesh test
time python test/run_test.py --verbose -i distributed/test_device_mesh
# DTensor/TP tests
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_ddp_2d_parallel
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_fsdp_2d_parallel

View File

@ -80,11 +80,6 @@ if [[ "$BUILD_ENVIRONMENT" != *bazel* ]]; then
CUSTOM_TEST_ARTIFACT_BUILD_DIR=$(realpath "${CUSTOM_TEST_ARTIFACT_BUILD_DIR:-"build/custom_test_artifacts"}")
fi
# Reduce set of tests to include when running run_test.py
if [[ -n $TESTS_TO_INCLUDE ]]; then
echo "Setting INCLUDE_CLAUSE"
INCLUDE_CLAUSE="--include $TESTS_TO_INCLUDE"
fi
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
@ -153,7 +148,7 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
export PYTORCH_TEST_WITH_ASAN=1
export PYTORCH_TEST_WITH_UBSAN=1
# TODO: Figure out how to avoid hard-coding these paths
export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-15/bin/llvm-symbolizer
export ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-12/bin/llvm-symbolizer
export TORCH_USE_RTLD_GLOBAL=1
# NB: We load libtorch.so with RTLD_GLOBAL for UBSAN, unlike our
# default behavior.
@ -187,7 +182,7 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
# have, and it applies to child processes.
# TODO: get rid of the hardcoded path
export LD_PRELOAD=/usr/lib/llvm-15/lib/clang/15.0.7/lib/linux/libclang_rt.asan-x86_64.so
export LD_PRELOAD=/usr/lib/llvm-12/lib/clang/12.0.1/lib/linux/libclang_rt.asan-x86_64.so
# Disable valgrind for asan
export VALGRIND=OFF
# Increase stack size, because ASAN red zones use more stack
@ -233,16 +228,13 @@ test_python_shard() {
exit 1
fi
# Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly
# shellcheck disable=SC2086
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "$1" "$NUM_TEST_SHARDS" --verbose
assert_git_not_dirty
}
test_python() {
# shellcheck disable=SC2086
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests --verbose
assert_git_not_dirty
}
@ -289,10 +281,6 @@ test_inductor_distributed() {
# Smuggle a few multi-gpu tests here so that we don't have to request another large node
echo "Testing multi_gpu tests in test_torchinductor"
pytest test/inductor/test_torchinductor.py -k test_multi_gpu
pytest test/inductor/test_aot_inductor.py -k test_non_default_cuda_device
pytest test/inductor/test_aot_inductor.py -k test_replicate_on_devices
pytest test/distributed/_tensor/test_dtensor_compile.py
pytest test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
# this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
# with if required # gpus aren't available
@ -315,17 +303,14 @@ test_inductor() {
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
# For example 'dynamic_aot_eager_torchbench' TEST_CONFIG means we run
# the benchmark script with '--dynamic-shapes --backend aot_eager --device cuda'
# The matrix of test options is specified in .github/workflows/inductor.yml,
# .github/workflows/inductor-periodic.yml, and
# .github/workflows/inductor-perf-test-nightly.yml
# The matrix of test options is specified in .github/workflows/periodic.yml
# and .github/workflows/inductor.yml
DYNAMO_BENCHMARK_FLAGS=()
if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--backend eager)
elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--backend aot_eager)
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--export-aot-inductor)
elif [[ "${TEST_CONFIG}" == *inductor* && "${TEST_CONFIG}" != *perf* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--inductor)
fi
@ -334,7 +319,7 @@ if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--dynamic-shapes --dynamic-batch-only)
fi
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_accuracy* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
else
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
@ -398,11 +383,6 @@ test_perf_for_dashboard() {
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_${suite}_${dtype}_${mode}_cuda_${target}.csv"
fi
if [[ "$DASHBOARD_TAG" == *freeze_autotune_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
TORCHINDUCTOR_MAX_AUTOTUNE=1 python "benchmarks/dynamo/$suite.py" \
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv"
fi
if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then
python "benchmarks/dynamo/$suite.py" \
"${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \
@ -453,12 +433,19 @@ test_single_dynamo_benchmark() {
"${DYNAMO_BENCHMARK_FLAGS[@]}" \
"$@" "${partition_flags[@]}" \
--output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
python benchmarks/dynamo/check_graph_breaks.py \
--actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
if [[ "${TEST_CONFIG}" == *inductor* ]] && [[ "${TEST_CONFIG}" != *cpu_accuracy* ]]; then
# other jobs (e.g. periodic, cpu-accuracy) may have different set of expected models.
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
python benchmarks/dynamo/check_graph_breaks.py \
--actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
else
python benchmarks/dynamo/check_csv.py \
-f "$TEST_REPORTS_DIR/${name}_${suite}.csv"
fi
fi
}
@ -476,10 +463,8 @@ test_dynamo_benchmark() {
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_accuracy* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
@ -494,13 +479,9 @@ test_inductor_torchbench_smoketest_perf() {
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
--output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
# The threshold value needs to be actively maintained to make this check useful
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \
--export-aot-inductor --only nanogpt --output "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv"
# The threshold value needs to be actively maintained to make this check useful
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 5.2
# the reference speedup value is hardcoded in check_hf_bert_perf_csv.py
# this value needs to be actively maintained to make this check useful
python benchmarks/dynamo/check_hf_bert_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
# Check memory compression ratio for a few models
for test in hf_Albert timm_vision_transformer; do
@ -624,7 +605,7 @@ test_libtorch_jit() {
# Run jit and lazy tensor cpp tests together to finish them faster
if [[ "$BUILD_ENVIRONMENT" == *cuda* && "$TEST_CONFIG" != *nogpu* ]]; then
LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy
LTC_TS_CUDA=1 python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/nvfuser_tests cpp/test_lazy
else
# CUDA tests have already been skipped when CUDA is not available
python test/run_test.py --cpp --verbose -i cpp/test_jit cpp/test_lazy -k "not CUDA"
@ -685,8 +666,7 @@ test_vulkan() {
test_distributed() {
echo "Testing distributed python tests"
# shellcheck disable=SC2086
time python test/run_test.py --distributed-tests --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" $INCLUDE_CLAUSE --verbose
time python test/run_test.py --distributed-tests --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
assert_git_not_dirty
if [[ ("$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm*) && "$SHARD_NUMBER" == 1 ]]; then
@ -995,28 +975,9 @@ test_docs_test() {
}
test_executorch() {
pushd /executorch
echo "Install torchvision and torchaudio"
# TODO(huydhn): Switch this to the pinned commits on ExecuTorch once they are
# there. These libraries need to be built here, and not part of the Docker
# image because they require the target version of torch to be installed first
pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git"
pip_install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git"
echo "Run ExecuTorch regression tests for some models"
# NB: This is a sample model, more can be added here
export PYTHON_EXECUTABLE=python
# TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
# shellcheck disable=SC1091
source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''
popd
# Test torchgen generated code for Executorch.
echo "Testing ExecuTorch op registration"
echo "Testing Executorch op registration"
"$BUILD_BIN_DIR"/test_edge_op_registration
assert_git_not_dirty
}
@ -1031,8 +992,6 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then
install_torchvision
build_xla
test_xla
elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
test_executorch
elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
test_python_legacy_jit
elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
@ -1055,10 +1014,11 @@ elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
test_dynamo_benchmark huggingface "$id"
elif [[ "${TEST_CONFIG}" == *timm* ]]; then
install_torchvision
install_timm
id=$((SHARD_NUMBER-1))
test_dynamo_benchmark timm_models "$id"
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_accuracy* ]]; then
install_torchaudio cpu
else
install_torchaudio cuda
@ -1075,7 +1035,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu_inductor* ]]; then
if [[ "${TEST_CONFIG}" != *cpu_accuracy* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
@ -1087,10 +1047,12 @@ elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
test_without_numpy
install_torchvision
install_numpy_pytorch_interop
test_dynamo_shard 1
test_aten
elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
install_numpy_pytorch_interop
test_dynamo_shard 2
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
test_without_numpy
@ -1118,10 +1080,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
test_libtorch
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
test_docs_test
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
install_torchvision
test_python
test_aten
else
install_torchvision
install_monkeytype
@ -1134,4 +1092,5 @@ else
test_custom_backend
test_torch_function_benchmark
test_benchmarks
test_executorch
fi


@ -127,7 +127,8 @@ python -c "import os, glob; os.system('python -mpip install --no-index --no-deps
:: export test times so that potential sharded tests that'll branch off this build will use consistent data
python tools/stats/export_test_times.py
robocopy /E ".additional_ci_files" "%PYTORCH_FINAL_PACKAGE_DIR%\.additional_ci_files"
copy /Y ".pytorch-test-times.json" "%PYTORCH_FINAL_PACKAGE_DIR%"
copy /Y ".pytorch-test-file-ratings.json" "%PYTORCH_FINAL_PACKAGE_DIR%"
:: Also save build/.ninja_log as an artifact
copy /Y "build\.ninja_log" "%PYTORCH_FINAL_PACKAGE_DIR%\"


@ -2,7 +2,6 @@
import os
import subprocess
import sys
COMMON_TESTS = [
(
@ -54,4 +53,4 @@ if __name__ == "__main__":
print("Rerunning with traceback enabled")
print("Command:", command_string)
subprocess.run(command_args, check=False)
sys.exit(e.returncode)
exit(e.returncode)


@ -26,6 +26,11 @@ popd
python test_custom_ops.py -v
if ERRORLEVEL 1 exit /b 1
:: TODO: fix and re-enable this test
:: See https://github.com/pytorch/pytorch/issues/25155
:: python test_custom_classes.py -v
:: if ERRORLEVEL 1 exit /b 1
python model.py --export-script-module="build/model.pt"
if ERRORLEVEL 1 exit /b 1


@ -1,3 +1,7 @@
:: Skip LibTorch tests when building a GPU binary and testing on a CPU machine
:: because LibTorch tests are not well designed for this use case.
if "%USE_CUDA%" == "0" IF NOT "%CUDA_VERSION%" == "cpu" exit /b 0
call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat
if errorlevel 1 exit /b 1
@ -17,7 +21,7 @@ if not errorlevel 0 exit /b 1
cd %TMP_DIR_WIN%\build\torch\test
for /r "." %%a in (*.exe) do (
call :libtorch_check "%%~na" "%%~fa"
if errorlevel 1 goto fail
if errorlevel 1 exit /b 1
)
goto :eof
@ -30,6 +34,18 @@ set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test
:: Skip verify_api_visibility as it a compile level test
if "%~1" == "verify_api_visibility" goto :eof
:: See https://github.com/pytorch/pytorch/issues/25161
if "%~1" == "c10_metaprogramming_test" goto :eof
if "%~1" == "module_test" goto :eof
:: See https://github.com/pytorch/pytorch/issues/25312
if "%~1" == "converter_nomigraph_test" goto :eof
:: See https://github.com/pytorch/pytorch/issues/35636
if "%~1" == "generate_proposals_op_gpu_test" goto :eof
:: See https://github.com/pytorch/pytorch/issues/35648
if "%~1" == "reshape_op_gpu_test" goto :eof
:: See https://github.com/pytorch/pytorch/issues/35651
if "%~1" == "utility_ops_gpu_test" goto :eof
echo Running "%~2"
if "%~1" == "c10_intrusive_ptr_benchmark" (
:: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp
@ -40,15 +56,11 @@ if "%~1" == "c10_intrusive_ptr_benchmark" (
python test\run_test.py --cpp --verbose -i "cpp/%~1"
if errorlevel 1 (
echo %1 failed with exit code %errorlevel%
goto fail
exit /b 1
)
if not errorlevel 0 (
echo %1 failed with exit code %errorlevel%
goto fail
exit /b 1
)
:eof
exit /b 0
:fail
exit /b 1
goto :eof


@ -1,7 +1,8 @@
call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat
echo Copying over test times file
robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"
copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%"
copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-file-ratings.json" "%PROJECT_DIR_WIN%"
pushd test


@ -22,7 +22,8 @@ if "%SHARD_NUMBER%" == "1" (
)
echo Copying over test times file
robocopy /E "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.additional_ci_files" "%PROJECT_DIR_WIN%\.additional_ci_files"
copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%PROJECT_DIR_WIN%"
copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-file-ratings.json" "%PROJECT_DIR_WIN%"
echo Run nn tests
python run_test.py --exclude-jit-executor --exclude-distributed-tests --shard "%SHARD_NUMBER%" "%NUM_TEST_SHARDS%" --verbose


@ -38,7 +38,7 @@ fi
python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0
# Install Z3 optional dependency for Windows builds.
python -m pip install z3-solver==4.12.2.0
python -m pip install z3-solver
run_tests() {
# Run nvidia-smi if available


@ -0,0 +1,28 @@
from collections import OrderedDict
from cimodel.data.simple.util.branch_filters import gen_filter_dict
from cimodel.lib.miniutils import quote
CHANNELS_TO_PRUNE = ["pytorch-nightly", "pytorch-test"]
PACKAGES_TO_PRUNE = "pytorch torchvision torchaudio torchtext ignite torchcsprng"
def gen_workflow_job(channel: str):
return OrderedDict(
{
"anaconda_prune": OrderedDict(
{
"name": f"anaconda-prune-{channel}",
"context": quote("org-member"),
"packages": quote(PACKAGES_TO_PRUNE),
"channel": channel,
"filters": gen_filter_dict(branches_list=["postnightly"]),
}
)
}
)
def get_workflow_jobs():
return [gen_workflow_job(channel) for channel in CHANNELS_TO_PRUNE]
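This generator yields the anaconda-prune entries visible in the generated .circleci/config.yml further down this diff. Roughly, each element has the following shape; the exact output of quote() and gen_filter_dict() is an assumption inferred from the generated YAML, not shown here:
```
# Approximate shape of gen_workflow_job("pytorch-nightly"); quote() and
# gen_filter_dict() outputs are assumptions based on the generated YAML below.
{
    "anaconda_prune": {
        "name": "anaconda-prune-pytorch-nightly",
        "context": "org-member",
        "packages": "pytorch torchvision torchaudio torchtext ignite torchcsprng",
        "channel": "pytorch-nightly",
        "filters": {"branches": {"only": ["postnightly"]}},
    }
}
```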


@ -32,4 +32,4 @@ def gen_mobile_docker(specifier):
DOCKER_IMAGE_ASAN, DOCKER_REQUIREMENT_ASAN = gen_mobile_docker("asan")
DOCKER_IMAGE_NDK, DOCKER_REQUIREMENT_NDK = gen_mobile_docker("android-ndk-r21e")
DOCKER_IMAGE_NDK, DOCKER_REQUIREMENT_NDK = gen_mobile_docker("android-ndk-r19c")

.circleci/config.yml (generated)

@ -444,6 +444,35 @@ jobs:
script="/Users/distiller/project/.circleci/scripts/binary_ios_upload.sh"
cat "$script"
source "$script"
anaconda_prune:
parameters:
packages:
type: string
description: "What packages are we pruning? (quoted, space-separated string. eg. 'pytorch', 'torchvision torchaudio', etc.)"
default: "pytorch"
channel:
type: string
description: "What channel are we pruning? (e.g. pytorch-nightly)"
default: "pytorch-nightly"
docker:
- image: continuumio/miniconda3
environment:
- PACKAGES: "<< parameters.packages >>"
- CHANNEL: "<< parameters.channel >>"
steps:
- checkout
- run:
name: Install dependencies
no_output_timeout: "1h"
command: |
conda install -yq anaconda-client
- run:
name: Prune packages
no_output_timeout: "1h"
command: |
ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN}" \
scripts/release/anaconda-prune/run.sh
pytorch_doc_push:
resource_class: medium
machine:
@ -623,7 +652,7 @@ jobs:
- run:
name: Archive artifacts into zip
command: |
zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .additional_ci_files
zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .pytorch-test-times.json .pytorch-test-file-ratings.json
cp artifacts.zip /Users/distiller/workspace
- persist_to_workspace:
@ -1385,4 +1414,22 @@ workflows:
requires:
- pytorch_ios_full_jit_12_5_1_nightly_x86_64_build
- pytorch_ios_full_jit_12_5_1_nightly_arm64_build
- anaconda_prune:
name: anaconda-prune-pytorch-nightly
context: "org-member"
packages: "pytorch torchvision torchaudio torchtext ignite torchcsprng"
channel: pytorch-nightly
filters:
branches:
only:
- postnightly
- anaconda_prune:
name: anaconda-prune-pytorch-test
context: "org-member"
packages: "pytorch torchvision torchaudio torchtext ignite torchcsprng"
channel: pytorch-test
filters:
branches:
only:
- postnightly
when: << pipeline.parameters.run_build >>


@ -10,6 +10,8 @@ import shutil
import sys
from collections import namedtuple
import cimodel.data.simple.anaconda_prune_defintions
import cimodel.data.simple.docker_definitions
import cimodel.data.simple.mobile_definitions
import cimodel.data.simple.nightly_ios
@ -142,6 +144,7 @@ def gen_build_workflows_tree():
build_workflows_functions = [
cimodel.data.simple.mobile_definitions.get_workflow_jobs,
cimodel.data.simple.nightly_ios.get_workflow_jobs,
cimodel.data.simple.anaconda_prune_defintions.get_workflow_jobs,
]
build_jobs = [f() for f in build_workflows_functions]
build_jobs.extend(


@ -62,7 +62,7 @@ git --no-pager log --max-count 1
popd
# Clone the Builder main repo
retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT"
retry git clone -q https://github.com/pytorch/builder.git -b release/2.1 "$BUILDER_ROOT"
pushd "$BUILDER_ROOT"
echo "Using builder from "
git --no-pager log --max-count 1


@ -33,7 +33,7 @@ fi
cp ${PROJ_ROOT}/LICENSE ${ZIP_DIR}/
# zip the library
export DATE="$(date -u +%Y%m%d)"
export IOS_NIGHTLY_BUILD_VERSION="2.2.0.${DATE}"
export IOS_NIGHTLY_BUILD_VERSION="2.1.0.${DATE}"
if [ "${BUILD_LITE_INTERPRETER}" == "1" ]; then
# libtorch_lite_ios_nightly_1.11.0.20210810.zip
ZIPFILE="libtorch_lite_ios_nightly_${IOS_NIGHTLY_BUILD_VERSION}.zip"


@ -54,7 +54,7 @@ fi
# Move debug wheels out of the package dir so they don't get installed
# Move debug wheels out of the the package dir so they don't get installed
mkdir -p /tmp/debug_final_pkgs
mv /final_pkgs/debug-*.zip /tmp/debug_final_pkgs || echo "no debug packages to move"
@ -66,12 +66,6 @@ mv /final_pkgs/debug-*.zip /tmp/debug_final_pkgs || echo "no debug packages to m
# conda build scripts themselves. These should really be consolidated
# Pick only one package of multiple available (which happens as result of workflow re-runs)
pkg="/final_pkgs/\$(ls -1 /final_pkgs|sort|tail -1)"
if [[ "\$PYTORCH_BUILD_VERSION" == *dev* ]]; then
CHANNEL="nightly"
else
CHANNEL="test"
fi
if [[ "$PACKAGE_TYPE" == conda ]]; then
(
# For some reason conda likes to re-activate the conda environment when attempting this install
@ -89,14 +83,25 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
if [[ "$DESIRED_CUDA" == 'cpu' ]]; then
retry conda install -c pytorch -y cpuonly
else
cu_ver="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4}"
CUDA_PACKAGE="pytorch-cuda"
retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c "pytorch-\${CHANNEL}" "pytorch-cuda=\${cu_ver}"
PYTORCH_CHANNEL="pytorch"
if [[ "\${TORCH_CONDA_BUILD_FOLDER}" == "pytorch-nightly" ]]; then
PYTORCH_CHANNEL="pytorch-nightly"
fi
retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch-test "pytorch-cuda=\${cu_ver}"
fi
conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
)
elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
if [[ "$(uname -m)" == aarch64 ]]; then
# Using "extra-index-url" until all needed aarch64 dependencies are
# added to "https://download.pytorch.org/whl/nightly/"
pip install "\$pkg" --extra-index-url "https://download.pytorch.org/whl/test/${DESIRED_CUDA}"
else
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/test/${DESIRED_CUDA}"
fi
retry pip install -q numpy protobuf typing-extensions
fi
if [[ "$PACKAGE_TYPE" == libtorch ]]; then


@ -59,7 +59,7 @@ PIP_UPLOAD_FOLDER='nightly/'
# We put this here so that OVERRIDE_PACKAGE_VERSION below can read from it
export DATE="$(date -u +%Y%m%d)"
#TODO: We should be pulling semver version from the base version.txt
BASE_BUILD_VERSION="2.2.0.dev$DATE"
BASE_BUILD_VERSION="2.1.0.dev$DATE"
# Change BASE_BUILD_VERSION to git tag when on a git tag
# Use 'git -C' to make doubly sure we're in the correct directory for checking
# the git tag
@ -77,8 +77,15 @@ else
export PYTORCH_BUILD_VERSION="${BASE_BUILD_VERSION}+$DESIRED_CUDA"
fi
# The build with the with-pypi-cudnn suffix is only applicable to the
# PyPI small wheel Linux x86 build
if [[ -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]] && [[ "$(uname)" == 'Linux' && "$(uname -m)" == "x86_64" ]]; then
export PYTORCH_BUILD_VERSION="${PYTORCH_BUILD_VERSION}-with-pypi-cudnn"
fi
export PYTORCH_BUILD_NUMBER=1
JAVA_HOME=
BUILD_JNI=OFF
if [[ "$PACKAGE_TYPE" == libtorch ]]; then
@ -150,8 +157,8 @@ EOL
# nproc doesn't exist on darwin
if [[ "$(uname)" != Darwin ]]; then
# This was lowered from 18 to 12 to avoid OOMs when compiling FlashAttentionV2
MEMORY_LIMIT_MAX_JOBS=12
# Because most Circle executors only have 20 CPUs, using more causes OOMs w/ Ninja and nvcc parallelization
MEMORY_LIMIT_MAX_JOBS=18
NUM_CPUS=$(( $(nproc) - 2 ))
# Defaults here for **binary** linux builds so they can be changed in one place
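For readers tracing the version handling in this script, a hedged Python sketch of the strings it produces follows; the date and CUDA values are hypothetical, and the shell above remains the authoritative logic.
```
# Hedged sketch (Python) of the nightly version strings assembled above.
from datetime import datetime, timezone

def nightly_build_version(base: str = "2.1.0", desired_cuda: str = "cu121",
                          pypi_cudnn: bool = False) -> str:
    date = datetime.now(timezone.utc).strftime("%Y%m%d")  # matches date -u +%Y%m%d
    version = f"{base}.dev{date}+{desired_cuda}"
    if pypi_cudnn:  # only the Linux x86_64 small wheel gets this suffix
        version += "-with-pypi-cudnn"
    return version

print(nightly_build_version())                 # e.g. 2.1.0.dev20231004+cu121
print(nightly_build_version(pypi_cudnn=True))  # e.g. 2.1.0.dev20231004+cu121-with-pypi-cudnn
```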


@ -16,6 +16,11 @@ UPLOAD_BUCKET="s3://pytorch"
BACKUP_BUCKET="s3://pytorch-backup"
BUILD_NAME=${BUILD_NAME:-}
# this is temporary change to upload pypi-cudnn builds to separate folder
if [[ ${BUILD_NAME} == *with-pypi-cudnn* ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_cudnn"
fi
DRY_RUN=${DRY_RUN:-enabled}
# Don't actually do work unless explicit
ANACONDA="true anaconda"


@ -42,3 +42,32 @@ jobs:
script="/Users/distiller/project/.circleci/scripts/binary_ios_upload.sh"
cat "$script"
source "$script"
anaconda_prune:
parameters:
packages:
type: string
description: "What packages are we pruning? (quoted, space-separated string. eg. 'pytorch', 'torchvision torchaudio', etc.)"
default: "pytorch"
channel:
type: string
description: "What channel are we pruning? (e.g. pytorch-nightly)"
default: "pytorch-nightly"
docker:
- image: continuumio/miniconda3
environment:
- PACKAGES: "<< parameters.packages >>"
- CHANNEL: "<< parameters.channel >>"
steps:
- checkout
- run:
name: Install dependencies
no_output_timeout: "1h"
command: |
conda install -yq anaconda-client
- run:
name: Prune packages
no_output_timeout: "1h"
command: |
ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN}" \
scripts/release/anaconda-prune/run.sh


@ -177,7 +177,7 @@
- run:
name: Archive artifacts into zip
command: |
zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .additional_ci_files
zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .pytorch-test-times.json .pytorch-test-file-ratings.json
cp artifacts.zip /Users/distiller/workspace
- persist_to_workspace:


@ -1,8 +1,5 @@
---
# NOTE there must be no spaces before the '-', so put the comma last.
# The check bugprone-unchecked-optional-access is also turned off atm
# because it causes clang-tidy to hang randomly. The tracking issue
# can be found at https://github.com/llvm/llvm-project/issues/69369.
InheritParentConfig: true
Checks: '
bugprone-*,
@ -12,7 +9,6 @@ bugprone-*,
-bugprone-lambda-function-name,
-bugprone-reserved-identifier,
-bugprone-swapped-arguments,
-bugprone-unchecked-optional-access,
clang-diagnostic-missing-prototypes,
cppcoreguidelines-*,
-cppcoreguidelines-avoid-do-while,
@ -34,13 +30,8 @@ cppcoreguidelines-*,
-facebook-hte-RelativeInclude,
hicpp-exception-baseclass,
hicpp-avoid-goto,
misc-*,
-misc-const-correctness,
-misc-use-anonymous-namespace,
-misc-unused-parameters,
-misc-no-recursion,
-misc-non-private-member-variables-in-classes,
-misc-confusable-identifiers,
misc-unused-alias-decls,
misc-unused-using-decls,
modernize-*,
-modernize-concat-nested-namespaces,
-modernize-macro-to-enum,
@ -53,7 +44,7 @@ modernize-*,
performance-*,
readability-container-size-empty,
'
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
HeaderFilterRegex: '^(c10/(?!test)|torch/csrc/(?!deploy/interpreter/cpython)).*$'
AnalyzeTemporaryDtors: false
WarningsAsErrors: '*'
...


@ -1,72 +0,0 @@
# Step by step guide on using PyTorch's DevContainer
Using PyTorch's DevContainer environment involves a series of steps that will help you set up a development environment that is isolated and replicable. Below, we'll guide you through each step to make this process as smooth as possible:
## Step 1: Install VSCode
1. Navigate to the [Visual Studio Code website](https://code.visualstudio.com/).
2. Download the appropriate installer for your operating system (Windows, Linux, or macOS).
3. Run the installer and follow the on-screen instructions to install VSCode on your system.
4. After installation, launch VSCode.
## Step 2: Install DevContainer Extension
1. In VSCode, go to the Extensions view by clicking on the Extensions icon in the Activity Bar on the side of the window.
2. Search for "Dev Containers" in the Extensions view search bar.
3. Find the "Dev Containers" extension in the search results and click on the install button to install it.
You can also go to the extension's [homepage](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) and [documentation page](https://code.visualstudio.com/docs/devcontainers/containers) to find more details.
## Step 3: Install Docker and Add Current Login User to Docker Group
1. Follow the [official guide](https://docs.docker.com/get-docker/) to install Docker. Don't forget the [post installation steps](https://docs.docker.com/engine/install/linux-postinstall/).
If you are using [Visual Studio Code Remote - SSH](https://code.visualstudio.com/docs/remote/ssh), you only need to install Docker on the remote host, not on your local computer, and the following steps should be run on the remote host.
## Step 4 (Optional): Install NVIDIA Container Toolkit for GPU Usage
1. If you intend to use GPU resources, first ensure you have NVIDIA drivers installed on your system. Check if `nvidia-smi` works to verify your GPU setup.
2. Follow the [official guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#docker) to install the NVIDIA Container Toolkit.
3. After installation, verify that the toolkit is installed correctly by running:
```
docker run --rm --runtime=nvidia --gpus all nvidia/cuda:11.6.2-base-ubuntu20.04 nvidia-smi
```
## Step 5: Clone PyTorch
1. Open a terminal or command prompt.
2. Use the following command to clone the PyTorch repository:
```
git clone https://github.com/pytorch/pytorch
```
3. Navigate to the cloned directory:
```
cd pytorch
```
## Step 6: Open in DevContainer
1. In VSCode, use the Command Palette (`Ctrl+Shift+P` or `Cmd+Shift+P` on macOS) to run the "Remote-Containers: Open Folder in Container..." command.
2. You will be prompted with two options: CPU dev container or CUDA dev container. Choose the one you want to run.
## Step 7: Wait for Building the Environment
1. After opening the folder in a DevContainer, VSCode will start building the container. This process can take some time as it involves downloading necessary images and setting up the environment.
2. You can monitor the progress in the VSCode terminal.
3. Once the build process completes, you'll have a fully configured PyTorch development environment in a container.
4. The next time you open the same dev container, it will be much faster, as it does not require building the image again.
You are now all set to start developing with PyTorch in a DevContainer environment. This setup ensures you have a consistent and isolated development environment for your PyTorch projects.
## Step 8: Build PyTorch
To build PyTorch from source, run:
```
python setup.py develop
```
The process compiles thousands of files and can take a long time. Fortunately, the compiled objects are reused: after you modify some files, only the changed ones need to be recompiled on the next build.
Note that only the contents of the `pytorch` directory are saved to disk. This directory is mounted into the container, while everything else inside the container is temporary and will be lost if Docker recreates the container or the server reboots.
For an in-depth understanding of Dev Container and its caveats, please refer to [the full documentation](https://code.visualstudio.com/docs/devcontainers/containers).


@ -9,5 +9,3 @@ make setup_lint
# Add CMAKE_PREFIX_PATH to bashrc
echo 'export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}' >> ~/.bashrc
# Add linker path so that cuda-related libraries can be found
echo 'export LDFLAGS="-L${CONDA_PREFIX}/lib/ $LDFLAGS"' >> ~/.bashrc

.flake8

@ -2,7 +2,7 @@
# NOTE: **Mirror any changes** to this file the [tool.ruff] config in pyproject.toml
# before we can fully move to use ruff
enable-extensions = G
select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2
select = B,C,E,F,G,P,SIM1,T4,W,B9
max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
@ -14,21 +14,15 @@ ignore =
# to line this up with executable bit
EXE001,
# these ignores are from flake8-bugbear; please fix!
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907
B007,B008,B017,B019,B020,B023,B024,B026,B028,B903,B904,B905,B906,B907
# these ignores are from flake8-comprehensions; please fix!
C407,
# these ignores are from flake8-logging-format; please fix!
G100,G101,G200
G100,G101,G200,G201,G202
# these ignores are from flake8-simplify. please fix or ignore with commented reason
SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
# flake8-simplify code styles
SIM102,SIM103,SIM106,SIM112,
# TorchFix codes that don't make sense for PyTorch itself:
# removed and deprecated PyTorch functions.
TOR001,TOR101,
# TODO(kit1980): fix all TOR102 issues
# `torch.load` without `weights_only` parameter is unsafe
TOR102,
per-file-ignores =
__init__.py: F401
torch/utils/cpp_extension.py: B950


@ -7,7 +7,7 @@ self-hosted-runner:
- linux.4xlarge
- linux.12xlarge
- linux.24xlarge
- linux.arm64.2xlarge
- linux.t4g.2xlarge
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu
@ -23,5 +23,3 @@ self-hosted-runner:
- macos-12-xl
- macos-12
- macos12.3-m1
- macos-latest-xlarge
- macos-13-xlarge


@ -13,10 +13,6 @@ inputs:
required: true
type: string
description: JSON description of what test configs to run.
job-name:
type: string
required: false
default: ""
outputs:
test-matrix:
@ -46,8 +42,7 @@ runs:
retry_wait_seconds: 30
command: |
set -eux
# PyYAML 6.0 doesn't work with MacOS x86 anymore
python3 -m pip install requests==2.26.0 pyyaml==6.0.1
python3 -m pip install requests==2.26.0 pyyaml==6.0
- name: Parse ref
id: parse-ref
@ -61,7 +56,6 @@ runs:
- name: Get the job name
id: get-job-name
if: inputs.job-name == ''
continue-on-error: true
shell: bash
run: |
@ -97,7 +91,7 @@ runs:
shell: bash
env:
GITHUB_TOKEN: ${{ inputs.github-token }}
JOB_NAME: ${{ inputs.job-name == '' && steps.get-job-name.outputs.job-name || inputs.job-name }}
JOB_NAME: ${{ steps.get-job-name.outputs.job-name }}
PR_NUMBER: ${{ github.event.pull_request.number }}
TAG: ${{ steps.parse-ref.outputs.tag }}
EVENT_NAME: ${{ github.event_name }}


@ -11,20 +11,18 @@ outputs:
job-id:
description: The retrieved workflow job id
value: ${{ steps.get-job-id.outputs.job-id }}
job-name:
description: The retrieved workflow job name
value: ${{ steps.get-job-id.outputs.job-name }}
runs:
using: composite
steps:
- name: Get job id and name or fail
- name: Get jobid or fail
# timeout-minutes is unsupported for composite workflows, see https://github.com/actions/runner/issues/1979
# timeout-minutes: 10
shell: bash
id: get-job-id
run: |
set -eux
python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}"
GHA_WORKFLOW_JOB_ID=$(python3 .github/scripts/get_workflow_job_id.py "${GITHUB_RUN_ID}" "${RUNNER_NAME}")
echo "job-id=${GHA_WORKFLOW_JOB_ID}" >> "${GITHUB_OUTPUT}"
env:
GITHUB_TOKEN: ${{ inputs.github-token }}


@ -10,13 +10,6 @@ inputs:
description: Shard number for the current job
required: false
default: "0"
sha:
description: SHA for the commit
required: true
test_config:
description: Name of the test config
required: false
default: "default"
job_identifier:
description: Text that uniquely identifies a given job type within a workflow. All shards of a job should share the same job identifier.
required: true
@ -40,8 +33,6 @@ runs:
env:
CACHE_DIR: ${{ inputs.cache_dir }}
JOB_IDENTIFIER: ${{ inputs.job_identifier }}
SHA: ${{ inputs.sha }}
TEST_CONFIG: ${{ inputs.test_config }}
SHARD: ${{ inputs.shard }}
REPO: ${{ github.repository }}
run: |
@ -50,8 +41,6 @@ runs:
--cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \
--pr_identifier $GITHUB_REF \
--job_identifier $JOB_IDENTIFIER \
--sha $SHA \
--test_config $TEST_CONFIG \
--shard $SHARD \
--repo $REPO \
--temp_dir $RUNNER_TEMP \


@ -43,14 +43,14 @@ runs:
FILE_SUFFIX: ${{ inputs.file-suffix }}
run: |
# Remove any previous test reports if they exist
rm -f logs-*.zip
rm -f usage-log-*.zip
# this workflow is also run in bazel build test, but we don't generate usage reports for it
# so check to see if the file exists first
if [ -f 'usage_log.txt' ]; then
zip "logs-${FILE_SUFFIX}.zip" 'usage_log.txt'
zip "usage-log-${FILE_SUFFIX}.zip" 'usage_log.txt'
fi
if ls test/**/*.log 1> /dev/null 2>&1; then
zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log'
zip -r "usage-log-${FILE_SUFFIX}.zip" test -i '*.log'
fi
# Windows zip
@ -80,7 +80,7 @@ runs:
FILE_SUFFIX: ${{ inputs.file-suffix }}
run: |
# -ir => recursive include all files in pattern
7z a "logs-$Env:FILE_SUFFIX.zip" 'usage_log.txt' -ir'!test\*.log'
7z a "usage-log-$Env:FILE_SUFFIX.zip" 'usage_log.txt' -ir'!test\*.log'
# S3 upload
- name: Store Test Downloaded JSONs on S3
@ -112,7 +112,7 @@ runs:
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 14
if-no-files-found: ignore
path: logs-*.zip
path: usage-log-*.zip
# GHA upload
- name: Store Test Downloaded JSONs on Github
@ -146,7 +146,7 @@ runs:
continue-on-error: true
with:
# Add the run attempt, see [Artifact run attempt]
name: logs-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip
name: usage-log-runattempt${{ github.run_attempt }}-${{ inputs.file-suffix }}.zip
retention-days: 14
if-no-files-found: ignore
path: |


@ -1 +1 @@
6518fa9b2c74e84d7eb1fc6e3eb51e43213f0c05
a8f4e97bd5356a7a77510cdf6a3a62e25a5dc602


@ -1 +1 @@
de731af65b4f04696e85c729e3282450b51b95fd
1b2746f642cc2c99fe9d1a0c34359c0de45341c2


@ -0,0 +1 @@
0c4e82511d349358d2c8c492dd833334e742f27f

.github/ci_commit_pins/timm.txt

@ -0,0 +1 @@
b9d43c7dcac1fe05e851dd7be7187b108af593d2


@ -1 +1 @@
99944a2fb8624947f9c0e2edc898ff42a16124da
9371b9e13c826f3930e54346b4d619cb59182f68


@ -1 +1 @@
c1e2095c3a16fbe7db25b9e2f206025488c2c203
47cd5ea8e21d7596a24907710411d6b4a43f628d


@ -1 +1 @@
r2.2
r2.1

.github/labeler.yml

@ -15,7 +15,6 @@
"ciflow/inductor":
- torch/_decomp/**
- torch/_dynamo/**
- torch/_export/**
- torch/_inductor/**
- benchmarks/dynamo/**
- torch/_subclasses/fake_tensor.py
@ -29,10 +28,6 @@
- .github/ci_commit_pins/**
- c10/core/Sym*
- torch/fx/experimental/symbolic_shapes.py
- test/distributed/_tensor/test_dtensor_compile.py
- test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
- torch/distributed/_tensor/**
- torch/distributed/fsdp/**
"module: cpu":
- aten/src/ATen/cpu/**
@ -71,10 +66,3 @@
"ciflow/trunk":
- .ci/docker/ci_commit_pins/triton.txt
"module: distributed":
- torch/csrc/distributed/**
- torch/distributed/**
- torch/nn/parallel/**
- test/distributed/**
- torch/testing/_internal/distributed/**


@ -4,19 +4,16 @@
- .ci/onnx/*
- .ci/docker/common/install_onnx.sh
- aten/src/ATen/core/interned_strings.h
- benchmarks/dynamo/**
- docs/source/onnx.rst
- docs/source/onnx*
- docs/source/scripts/onnx/**
- docs/source/_static/img/onnx/**
- scripts/onnx/**
- test/onnx/**
- test/onnx_caffe2/**
- tools/onnx/**
- torch/_dynamo/backends/onnxrt.py
- torch/_C/__init__.pyi.in
- torch/_C/_onnx.pyi
- torch/_logging/**
- torch/csrc/jit/passes/onnx.*
- torch/csrc/jit/passes/onnx/**
- torch/csrc/jit/serialization/export.*
@ -26,6 +23,8 @@
- torch/testing/_internal/common_methods_invocations.py
- third_party/onnx
- caffe2/python/onnx/**
- benchmarks/dynamo/_onnx/**
- torch/_logging/**
approved_by:
- BowenBao
- abock
@ -74,7 +73,6 @@
- name: OSS CI / pytorchbot
patterns:
- .github/ci_commit_pins/audio.txt
- .github/ci_commit_pins/vision.txt
- .github/ci_commit_pins/torchdynamo.txt
- .ci/docker/ci_commit_pins/triton.txt
@ -85,19 +83,6 @@
- EasyCLA
- Lint
- pull
- inductor
- name: OSS CI /pytorchbot / Executorch
patterns:
- .ci/docker/ci_commit_pins/executorch.txt
approved_by:
- pytorchbot
ignore_flaky_failures: false
mandatory_checks_name:
- EasyCLA
- Lint
- pull / linux-jammy-py3-clang12-executorch / build
- pull / linux-jammy-py3-clang12-executorch / test (executorch, 1, 1, linux.2xlarge)
- name: OSS CI / pytorchbot / XLA
patterns:
@ -108,8 +93,8 @@
mandatory_checks_name:
- EasyCLA
- Lint
- pull / linux-focal-py3_8-clang9-xla / build
- pull / linux-focal-py3_8-clang9-xla / test (xla, 1, 1, linux.12xlarge)
- pull / linux-bionic-py3_8-clang8-xla / build
- pull / linux-bionic-py3_8-clang8-xla / test (xla, 1, 1, linux.12xlarge)
- name: Documentation
patterns:
@ -139,6 +124,9 @@
- name: PrimTorch
patterns:
- aten/src/ATen/native_functions.yaml
- aten/src/ATen/native/**
- test/**
- torch/_meta_registrations.py
- torch/_decomp/**
- torch/_refs/**
@ -332,7 +320,6 @@
- XiaobingSuper
- jgong5
- vfdev-5
- leslie-fang-intel
mandatory_checks_name:
- EasyCLA
- Lint
@ -351,21 +338,6 @@
- Lint
- pull
- name: x86 CPU quantization
patterns:
- torch/ao/quantization/quantizer/x86_inductor_quantizer.py
- torch/_inductor/fx_passes/quantization.py
- test/quantization/core/test_quantized_op.py
- test/inductor/test_mkldnn_pattern_matcher.py
- test/quantization/pt2e/test_x86inductor_quantizer.py
approved_by:
- leslie-fang-intel
- jgong5
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: Autocast
patterns:
- torch/amp/**


@ -10,7 +10,6 @@ ciflow_push_tags:
- ciflow/mps
- ciflow/nightly
- ciflow/periodic
- ciflow/rocm
- ciflow/slow
- ciflow/trunk
- ciflow/unstable


@ -1,5 +1,7 @@
blas=1.0
cmake=3.22.1
mkl=2022.1.0
mkl-include=2022.1.0
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0


@ -5,7 +5,7 @@ cmake=3.22.*
typing-extensions=4.3.0
dataclasses=0.8
pip=22.2.2
pillow=10.0.1
pillow=9.2.0
pkg-config=0.29.2
wheel=0.37.1
# NB: This is intentionally held back because anaconda main doesn't


@ -7,7 +7,7 @@ cmake=3.22.*
typing-extensions=4.3.0
dataclasses=0.8
pip=22.2.2
pillow=10.0.1
pillow=9.2.0
libuv=1.40.0
pkg-config=0.29.2
wheel=0.37.1


@ -1,4 +1,3 @@
# iOS simulator requirements
coremltools==5.0b5
protobuf==3.20.2
optree==0.9.1


@ -10,7 +10,6 @@ numba<=0.49.1; platform_machine != "arm64"
opt-einsum>=3.3
psutil==5.9.1
nvidia-ml-py==11.525.84
packaging==23.1
pygments==2.15.0
pytest==7.3.2
pytest-xdist==3.3.1
@ -27,4 +26,3 @@ pytest-cpp==2.3.0
rockset==1.0.3
z3-solver==4.12.2.0
tensorboard==2.13.0
optree==0.9.1


@ -1,2 +1,2 @@
typing-extensions>=4.8.0
typing-extensions
jinja2


@ -67,12 +67,10 @@ def build_triton(
max_jobs = os.cpu_count() or 1
env["MAX_JOBS"] = str(max_jobs)
version_suffix = ""
if not release:
# Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8
# while release build should only include the version, i.e. 2.1.0
version_suffix = f"+{commit_hash[:10]}"
version += version_suffix
version = f"{version}+{commit_hash[:10]}"
with TemporaryDirectory() as tmpdir:
triton_basedir = Path(tmpdir) / "triton"
@ -84,14 +82,7 @@ def build_triton(
triton_repo = "https://github.com/openai/triton"
triton_pkg_name = "pytorch-triton"
check_call(["git", "clone", triton_repo], cwd=tmpdir)
if release:
ver, rev, patch = version.split(".")
check_call(
["git", "checkout", f"release/{ver}.{rev}.x"], cwd=triton_basedir
)
else:
check_call(["git", "checkout", commit_hash], cwd=triton_basedir)
check_call(["git", "checkout", commit_hash], cwd=triton_basedir)
if build_conda:
with open(triton_basedir / "meta.yaml", "w") as meta:
print(
@ -141,21 +132,17 @@ def build_triton(
shutil.copy(conda_path, Path.cwd())
return Path.cwd() / conda_path.name
# change built wheel name and version
env["TRITON_WHEEL_NAME"] = triton_pkg_name
env["TRITON_WHEEL_VERSION_SUFFIX"] = version_suffix
patch_setup_py(
triton_pythondir / "setup.py",
name=triton_pkg_name,
version=f"{version}",
)
patch_init_py(
triton_pythondir / "triton" / "__init__.py",
version=f"{version}",
)
if build_rocm:
# TODO: Remove me when ROCM triton is updated
patch_setup_py(
triton_pythondir / "setup.py",
name=triton_pkg_name,
version=f"{version}",
)
check_call("scripts/amd/setup_rocm_libs.sh", cwd=triton_basedir, shell=True)
print("ROCm libraries setup for triton installation...")
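The comment above already names the intended formats; a short sketch of the suffix rule follows (the full commit hash is hypothetical):
```
# Nightly builds append the first 10 characters of the triton commit hash;
# release builds keep the bare version. Hash below is hypothetical.
version = "2.1.0"
commit_hash = "e6216047b8b0aef1fe8da6ca8667a3ad0a016411"
release = False
version_suffix = "" if release else f"+{commit_hash[:10]}"
print(version + version_suffix)  # nightly: 2.1.0+e6216047b8, release: 2.1.0
```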


@ -1,7 +1,6 @@
#!/usr/bin/env python3
"""Check whether a PR has required labels."""
import sys
from typing import Any
from github_utils import gh_delete_comment, gh_post_pr_comment
@ -47,7 +46,7 @@ def main() -> None:
except Exception as e:
pass
sys.exit(0)
exit(0)
if __name__ == "__main__":

Binary file not shown.


@ -62,9 +62,10 @@ SUPPORTED_PERIODICAL_MODES: Dict[str, Callable[[Optional[str]], bool]] = {
}
# The link to the published list of disabled jobs
DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=jbbJUxI_SSZFssBBGCU6ybH9sxHitHLY"
# Pinning disabled and unstable jobs to Oct 4, 2023.
DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=EniFrNbB6taGjwKyN94j4oqUeeN8ALfI"
# and unstable jobs
UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=hUtTalgnWb1m3AtJyVLUdu7DBrnddRkp"
UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=2voGK5DSv0Hzvxhc23ChGcOLEBIO2vHf"
# Some constants used to handle disabled and unstable jobs
JOB_NAME_SEP = "/"
@ -410,17 +411,16 @@ def process_jobs(
if target_job in (TEST_JOB_NAME, BUILD_AND_TEST_JOB_NAME):
target_cfg = m.group("cfg")
# NB: There can be multiple unstable configurations, i.e. inductor, inductor_huggingface
test_matrix = _filter_jobs(
return _filter_jobs(
test_matrix=test_matrix,
issue_type=issue_type,
target_cfg=target_cfg,
)
else:
warnings.warn(
f"Found a matching {issue_type.value} issue {target_url} for {workflow} / {job_name}, "
+ f"but the name {target_job_cfg} is invalid"
)
warnings.warn(
f"Found a matching {issue_type.value} issue {target_url} for {workflow} / {job_name}, "
+ f"but the name {target_job_cfg} is invalid"
)
# Found no matching target, return the same input test matrix
return test_matrix
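For context, the DISABLED_JOBS_URL and UNSTABLE_JOBS_URL constants above point at versioned S3 objects; a minimal sketch of consuming one, assuming the object remains publicly readable:
```
# Minimal sketch: fetch the pinned disabled-jobs JSON referenced above.
import json
from urllib.request import urlopen

DISABLED_JOBS_URL = (
    "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json"
    "?versionId=EniFrNbB6taGjwKyN94j4oqUeeN8ALfI"
)
with urlopen(DISABLED_JOBS_URL) as conn:
    disabled_jobs = json.load(conn)
print(f"{len(disabled_jobs)} disabled entries")
```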


@ -10,19 +10,13 @@ architectures:
* Latest ROCM
"""
import os
from typing import Dict, List, Optional, Tuple
CUDA_ARCHES = ["11.8", "12.1"]
CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1"}
CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8"}
ROCM_ARCHES = ["5.6", "5.7"]
ROCM_ARCHES = ["5.5", "5.6"]
CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
@ -30,79 +24,20 @@ CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
CPU_AARCH64_ARCH = ["cpu-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"11.8": (
"nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"12.1": (
"nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
}
def get_nccl_submodule_version() -> str:
from pathlib import Path
nccl_version_mk = (
Path(__file__).absolute().parent.parent.parent
/ "third_party"
/ "nccl"
/ "nccl"
/ "makefiles"
/ "version.mk"
)
if not nccl_version_mk.exists():
raise RuntimeError(
"Please make sure that nccl submodule is checked out when importing this script"
)
with nccl_version_mk.open("r") as f:
content = f.read()
d = {}
for l in content.split("\n"):
if not l.startswith("NCCL_"):
continue
(k, v) = l.split(":=")
d[k.strip()] = v.strip()
return f"{d['NCCL_MAJOR']}.{d['NCCL_MINOR']}.{d['NCCL_PATCH']}"
def get_nccl_wheel_version(arch_version: str) -> str:
import re
requirements = map(
str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
)
return [x for x in requirements if x.startswith("nvidia-nccl-cu")][0].split("==")[1]
def validate_nccl_dep_consistency(arch_version: str) -> None:
wheel_ver = get_nccl_wheel_version(arch_version)
submodule_ver = get_nccl_submodule_version()
if wheel_ver != submodule_ver:
raise RuntimeError(
f"NCCL submodule version {submodule_ver} differs from wheel version {wheel_ver}"
)
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = (
"nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"triton==2.1.0; platform_system == 'Linux' and platform_machine == 'x86_64'"
)
def arch_type(arch_version: str) -> str:
@ -118,29 +53,20 @@ def arch_type(arch_version: str) -> str:
return "cpu"
# This can be updated to the release version when cutting release branch, i.e. 2.1
DEFAULT_TAG = os.getenv("RELEASE_VERSION_TAG", "main")
WHEEL_CONTAINER_IMAGES = {
**{
gpu_arch: f"pytorch/manylinux-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in CUDA_ARCHES
},
**{
gpu_arch: f"pytorch/manylinux-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in ROCM_ARCHES
},
"cpu": f"pytorch/manylinux-builder:cpu-{DEFAULT_TAG}",
"cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
"cpu-aarch64": f"pytorch/manylinuxaarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
"11.8": "pytorch/manylinux-builder:cuda11.8-2.1",
"12.1": "pytorch/manylinux-builder:cuda12.1-2.1",
"5.5": "pytorch/manylinux-builder:rocm5.5-2.1",
"5.6": "pytorch/manylinux-builder:rocm5.6-2.1",
"cpu": "pytorch/manylinux-builder:cpu-2.1",
"cpu-cxx11-abi": "pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-2.1",
"cpu-aarch64": "pytorch/manylinuxaarch64-builder:cpu-aarch64-2.1",
}
CONDA_CONTAINER_IMAGES = {
**{
gpu_arch: f"pytorch/conda-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in CUDA_ARCHES
},
"cpu": f"pytorch/conda-builder:cpu-{DEFAULT_TAG}",
"11.8": "pytorch/conda-builder:cuda11.8-2.1",
"12.1": "pytorch/conda-builder:cuda12.1-2.1",
"cpu": "pytorch/conda-builder:cpu-2.1",
}
PRE_CXX11_ABI = "pre-cxx11"
@ -149,39 +75,49 @@ RELEASE = "release"
DEBUG = "debug"
LIBTORCH_CONTAINER_IMAGES: Dict[Tuple[str, str], str] = {
**{
(
gpu_arch,
PRE_CXX11_ABI,
): f"pytorch/manylinux-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in CUDA_ARCHES
},
**{
(
gpu_arch,
CXX11_ABI,
): f"pytorch/libtorch-cxx11-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in CUDA_ARCHES
},
**{
(
gpu_arch,
PRE_CXX11_ABI,
): f"pytorch/manylinux-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in ROCM_ARCHES
},
**{
(
gpu_arch,
CXX11_ABI,
): f"pytorch/libtorch-cxx11-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in ROCM_ARCHES
},
("cpu", PRE_CXX11_ABI): f"pytorch/manylinux-builder:cpu-{DEFAULT_TAG}",
("cpu", CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}",
(
"11.8",
PRE_CXX11_ABI,
): "pytorch/manylinux-builder:cuda11.8-2.1",
(
"12.1",
PRE_CXX11_ABI,
): "pytorch/manylinux-builder:cuda12.1-2.1",
(
"11.8",
CXX11_ABI,
): "pytorch/libtorch-cxx11-builder:cuda11.8-2.1",
(
"12.1",
CXX11_ABI,
): "pytorch/libtorch-cxx11-builder:cuda12.1-2.1",
(
"5.5",
PRE_CXX11_ABI,
): "pytorch/manylinux-builder:rocm5.5-2.1",
(
"5.6",
PRE_CXX11_ABI,
): "pytorch/manylinux-builder:rocm5.6-2.1",
(
"5.5",
CXX11_ABI,
): "pytorch/libtorch-cxx11-builder:rocm5.5-2.1",
(
"5.6",
CXX11_ABI,
): "pytorch/libtorch-cxx11-builder:rocm5.6-2.1",
(
"cpu",
PRE_CXX11_ABI,
): "pytorch/manylinux-builder:cpu-2.1",
(
"cpu",
CXX11_ABI,
): "pytorch/libtorch-cxx11-builder:cpu-2.1",
}
FULL_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12"]
FULL_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11"]
def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
@ -288,6 +224,7 @@ def generate_wheels_matrix(
os: str,
arches: Optional[List[str]] = None,
python_versions: Optional[List[str]] = None,
gen_special_an_non_special_wheel: bool = True,
) -> List[Dict[str, str]]:
package_type = "wheel"
if os == "linux" or os == "linux-aarch64":
@ -321,8 +258,9 @@ def generate_wheels_matrix(
else arch_version
)
# 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
if arch_version in ["12.1", "11.8"] and os == "linux":
# special 12.1 wheels package without dependencies
# dependency downloaded via pip install
if arch_version == "12.1" and os == "linux":
ret.append(
{
"python_version": python_version,
@ -334,36 +272,34 @@ def generate_wheels_matrix(
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version], # fmt: skip
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace( # noqa: B950
"pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS,
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-with-pypi-cudnn".replace( # noqa: B950
".", "_"
),
}
)
else:
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"devtoolset": "cxx11-abi"
if arch_version == "cpu-cxx11-abi"
else "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
".", "_"
),
"pytorch_extra_install_requirements":
PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"] # fmt: skip
if os != "linux" else "",
}
)
if not gen_special_an_non_special_wheel:
continue
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"devtoolset": "cxx11-abi"
if arch_version == "cpu-cxx11-abi"
else "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
".", "_"
),
"pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS
if os != "linux"
else "",
}
)
return ret
validate_nccl_dep_consistency("12.1")
validate_nccl_dep_consistency("11.8")
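To illustrate the parsing that validate_nccl_dep_consistency relies on, here is a self-contained sketch of the wheel-side extraction, using an abbreviated copy of the requirements string above:
```
# Mirrors get_nccl_wheel_version: split on ';' and '|', find the nccl pin.
import re

reqs = (
    "nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' | "
    "nvidia-nccl-cu12==2.18.1; platform_system == 'Linux' | "
    "nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux'"
)
requirements = map(str.strip, re.split("[;|]", reqs))
nccl_pin = [x for x in requirements if x.startswith("nvidia-nccl-cu")][0]
print(nccl_pin.split("==")[1])  # -> 2.18.1
```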


@ -60,7 +60,7 @@ class BinaryBuildWorkflow:
branches: str = "nightly"
# Mainly for macos
cross_compile_arm64: bool = False
macos_runner: str = "macos-12-xl"
xcode_version: str = ""
def __post_init__(self) -> None:
if self.abi_version:
@ -125,9 +125,7 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
package_type="libtorch",
abi_version=generate_binary_build_matrix.CXX11_ABI,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.LINUX,
generate_binary_build_matrix.CXX11_ABI,
libtorch_variants=["shared-with-deps"],
OperatingSystem.LINUX, generate_binary_build_matrix.CXX11_ABI
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@ -139,9 +137,7 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
package_type="libtorch",
abi_version=generate_binary_build_matrix.PRE_CXX11_ABI,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.LINUX,
generate_binary_build_matrix.PRE_CXX11_ABI,
libtorch_variants=["shared-with-deps"],
OperatingSystem.LINUX, generate_binary_build_matrix.PRE_CXX11_ABI
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@ -158,6 +154,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
OperatingSystem.LINUX,
arches=["11.8", "12.1"],
python_versions=["3.8"],
gen_special_an_non_special_wheel=False,
),
branches="main",
),
@ -215,9 +212,7 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
package_type="libtorch",
abi_version=generate_binary_build_matrix.RELEASE,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.RELEASE,
libtorch_variants=["shared-with-deps"],
OperatingSystem.WINDOWS, generate_binary_build_matrix.RELEASE
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@ -229,9 +224,7 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
package_type="libtorch",
abi_version=generate_binary_build_matrix.DEBUG,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.DEBUG,
libtorch_variants=["shared-with-deps"],
OperatingSystem.WINDOWS, generate_binary_build_matrix.DEBUG
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
@ -301,39 +294,20 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
package_type="libtorch",
abi_version=generate_binary_build_matrix.CXX11_ABI,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.MACOS,
generate_binary_build_matrix.CXX11_ABI,
libtorch_variants=["shared-with-deps"],
OperatingSystem.MACOS, generate_binary_build_matrix.CXX11_ABI
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
isolated_workflow=True,
),
),
BinaryBuildWorkflow(
os=OperatingSystem.MACOS_ARM64,
package_type="libtorch",
abi_version=generate_binary_build_matrix.CXX11_ABI,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.MACOS,
generate_binary_build_matrix.CXX11_ABI,
libtorch_variants=["shared-with-deps"],
),
cross_compile_arm64=False,
macos_runner="macos-13-xlarge",
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
isolated_workflow=True,
),
),
BinaryBuildWorkflow(
os=OperatingSystem.MACOS_ARM64,
package_type="wheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.MACOS_ARM64
),
cross_compile_arm64=False,
macos_runner="macos-13-xlarge",
cross_compile_arm64=True,
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
isolated_workflow=True,


@ -1,42 +0,0 @@
#!/usr/bin/env python3
"""Generates a matrix for docker releases through github actions
Will output a condensed version of the matrix. Will include the following:
* CUDA version short
* CUDA full version
* CUDNN version short
* Image type either runtime or devel
* Platform linux/arm64,linux/amd64
"""
import json
from typing import Dict, List
import generate_binary_build_matrix
DOCKER_IMAGE_TYPES = ["runtime", "devel"]
def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]:
ret: List[Dict[str, str]] = []
for cuda, version in generate_binary_build_matrix.CUDA_ARCHES_FULL_VERSION.items():
for image in DOCKER_IMAGE_TYPES:
ret.append(
{
"cuda": cuda,
"cuda_full_version": version,
"cudnn_version": generate_binary_build_matrix.CUDA_ARCHES_CUDNN_VERSION[
cuda
],
"image_type": image,
"platform": "linux/arm64,linux/amd64",
}
)
return {"include": ret}
if __name__ == "__main__":
build_matrix = generate_docker_matrix()
print(json.dumps(build_matrix))
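Based on the CUDA_ARCHES_FULL_VERSION and CUDA_ARCHES_CUDNN_VERSION values shown earlier in this diff, the emitted matrix would have the following shape (reconstructed for illustration, not captured output):
```
# Reconstructed generate_docker_matrix() output; a valid Python literal.
{
    "include": [
        {"cuda": "11.8", "cuda_full_version": "11.8.0", "cudnn_version": "8",
         "image_type": "runtime", "platform": "linux/arm64,linux/amd64"},
        {"cuda": "11.8", "cuda_full_version": "11.8.0", "cudnn_version": "8",
         "image_type": "devel", "platform": "linux/arm64,linux/amd64"},
        # ... plus the same two image_type entries for "12.1" / "12.1.1"
    ]
}
```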


@ -111,7 +111,7 @@ def fetch_jobs(url: str, headers: Dict[str, str]) -> List[Dict[str, str]]:
# running.
def find_job_id_name(args: Any) -> Tuple[str, str]:
def find_job_id(args: Any) -> str:
# From https://docs.github.com/en/actions/learn-github-actions/environment-variables
PYTORCH_REPO = os.environ.get("GITHUB_REPOSITORY", "pytorch/pytorch")
PYTORCH_GITHUB_API = f"https://api.github.com/repos/{PYTORCH_REPO}"
@ -130,28 +130,15 @@ def find_job_id_name(args: Any) -> Tuple[str, str]:
for job in jobs:
if job["runner_name"] == args.runner_name:
return (job["id"], job["name"])
return job["id"]
raise RuntimeError(f"Can't find job id for runner {args.runner_name}")
def set_output(name: str, val: Any) -> None:
if os.getenv("GITHUB_OUTPUT"):
with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
print(f"{name}={val}", file=env)
print(f"setting {name}={val}")
else:
print(f"::set-output name={name}::{val}")
def main() -> None:
args = parse_args()
try:
# Get both the job ID and job name because we have already spent a request
# here to get the job info
job_id, job_name = find_job_id_name(args)
set_output("job-id", job_id)
set_output("job-name", job_name)
print(find_job_id(args))
except Exception as e:
print(repr(e), file=sys.stderr)
print(f"workflow-{args.workflow_run_id}")


@ -5,15 +5,12 @@ import os
import warnings
from dataclasses import dataclass
from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, cast, Dict, List, Optional, Tuple
from urllib.error import HTTPError
from urllib.parse import quote
from urllib.request import Request, urlopen
GITHUB_API_URL = "https://api.github.com"
@dataclass
class GitHubComment:
body_text: str
@ -29,20 +26,16 @@ def gh_fetch_url_and_headers(
url: str,
*,
headers: Optional[Dict[str, str]] = None,
data: Union[Optional[Dict[str, Any]], str] = None,
data: Optional[Dict[str, Any]] = None,
method: Optional[str] = None,
reader: Callable[[Any], Any] = lambda x: x.read(),
) -> Tuple[Any, Any]:
if headers is None:
headers = {}
token = os.environ.get("GITHUB_TOKEN")
if token is not None and url.startswith(f"{GITHUB_API_URL}/"):
if token is not None and url.startswith("https://api.github.com/"):
headers["Authorization"] = f"token {token}"
data_ = None
if data is not None:
data_ = data.encode() if isinstance(data, str) else json.dumps(data).encode()
data_ = json.dumps(data).encode() if data is not None else None
try:
with urlopen(Request(url, headers=headers, data=data_, method=method)) as conn:
return conn.headers, reader(conn)
@ -64,7 +57,7 @@ def gh_fetch_url(
url: str,
*,
headers: Optional[Dict[str, str]] = None,
data: Union[Optional[Dict[str, Any]], str] = None,
data: Optional[Dict[str, Any]] = None,
method: Optional[str] = None,
reader: Callable[[Any], Any] = lambda x: x.read(),
) -> Any:
@ -132,7 +125,7 @@ def gh_post_pr_comment(
org: str, repo: str, pr_num: int, comment: str, dry_run: bool = False
) -> List[Dict[str, Any]]:
return _gh_post_comment(
f"{GITHUB_API_URL}/repos/{org}/{repo}/issues/{pr_num}/comments",
f"https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/comments",
comment,
dry_run,
)
@ -142,14 +135,14 @@ def gh_post_commit_comment(
org: str, repo: str, sha: str, comment: str, dry_run: bool = False
) -> List[Dict[str, Any]]:
return _gh_post_comment(
f"{GITHUB_API_URL}/repos/{org}/{repo}/commits/{sha}/comments",
f"https://api.github.com/repos/{org}/{repo}/commits/{sha}/comments",
comment,
dry_run,
)
def gh_delete_comment(org: str, repo: str, comment_id: int) -> None:
url = f"{GITHUB_API_URL}/repos/{org}/{repo}/issues/comments/{comment_id}"
url = f"https://api.github.com/repos/{org}/{repo}/issues/comments/{comment_id}"
gh_fetch_url(url, method="DELETE")
@ -160,7 +153,7 @@ def gh_fetch_merge_base(org: str, repo: str, base: str, head: str) -> str:
# https://docs.github.com/en/rest/commits/commits?apiVersion=2022-11-28#compare-two-commits
try:
json_data = gh_fetch_url(
f"{GITHUB_API_URL}/repos/{org}/{repo}/compare/{base}...{head}",
f"https://api.github.com/repos/{org}/{repo}/compare/{base}...{head}",
headers={"Accept": "application/vnd.github.v3+json"},
reader=json.load,
)
@ -174,18 +167,3 @@ def gh_fetch_merge_base(org: str, repo: str, base: str, head: str) -> str:
warnings.warn(f"Failed to get merge base for {base}...{head}: {error}")
return merge_base
def gh_update_pr_state(org: str, repo: str, pr_num: int, state: str = "open") -> None:
url = f"{GITHUB_API_URL}/repos/{org}/{repo}/pulls/{pr_num}"
try:
gh_fetch_url(url, method="PATCH", data={"state": state})
except HTTPError as err:
# When trying to open the pull request, error 422 means that the branch
# has been deleted and the API couldn't re-open it
if err.code == 422 and state == "open":
warnings.warn(
f"Failed to open {pr_num} because its head branch has been deleted: {err}"
)
else:
raise

.github/scripts/gql_mocks.json (generated)

File diff suppressed because one or more lines are too long

Binary file not shown.


@ -38,12 +38,6 @@ def parse_args() -> argparse.Namespace:
required=True,
help="A unique job identifier that should be the same for all runs of job",
)
parser.add_argument(
"--sha", required="--upload" in sys.argv, help="SHA of the commit"
) # Only required for upload
parser.add_argument(
"--test_config", required="--upload" in sys.argv, help="The test config"
) # Only required for upload
parser.add_argument(
"--shard", required="--upload" in sys.argv, help="The shard id"
) # Only required for upload
@ -90,8 +84,6 @@ def main() -> None:
pr_identifier=pr_identifier,
repo=repo,
job_identifier=args.job_identifier,
sha=args.sha,
test_config=args.test_config,
shard=args.shard,
cache_dir=cache_dir,
bucket=args.bucket,


@ -56,8 +56,6 @@ def upload_pytest_cache(
pr_identifier: PRIdentifier,
repo: GithubRepo,
job_identifier: str,
sha: str,
test_config: str,
shard: str,
cache_dir: Path,
temp_dir: Path,
@ -81,11 +79,25 @@ def upload_pytest_cache(
if not bucket:
bucket = BUCKET
# Upload the cache
obj_key_prefix = _get_s3_key_prefix(
pr_identifier, repo, job_identifier, sha, test_config, shard
# Merge the current cache with any caches from previous runs before uploading
# We only need to merge it with the cache for the same shard (which will have already been downloaded if it exists)
# since the other shards take care of their own caches
shard_cache_path = _get_temp_cache_dir_path(
temp_dir, pr_identifier, repo, job_identifier, shard
)
zip_file_path = zip_folder(cache_dir, temp_dir / ZIP_UPLOAD / obj_key_prefix)
if shard_cache_path.is_dir():
_merge_pytest_caches(shard_cache_path, cache_dir)
#
# Upload the cache
#
obj_key_prefix = _get_s3_key_prefix(pr_identifier, repo, job_identifier, shard)
# This doesn't include the zip file extension. That'll get added later
zip_file_path = temp_dir / ZIP_UPLOAD / obj_key_prefix
zip_file_path = zip_folder(cache_dir, zip_file_path)
obj_key = f"{obj_key_prefix}{os.path.splitext(zip_file_path)[1]}" # Keep the new file extension
upload_file_to_s3(zip_file_path, bucket, obj_key)
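The os.path.splitext call above keeps whatever extension zip_folder produced. A quick illustration with a hypothetical path:
import os

# splitext splits off the final extension: ("…/1", ".zip")
ext = os.path.splitext("/tmp/zip_upload/pytest_cache/pr123/job/1.zip")[1]
assert ext == ".zip"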
@ -124,22 +136,38 @@ def download_pytest_cache(
)
for downloaded_zip in downloads:
# Unzip into random folder, then merge with the current cache
cache_dir_for_shard = (
temp_dir / UNZIPPED_CACHES / os.urandom(16).hex() / PYTEST_CACHE_DIR_NAME
# the file name of the zip is the shard id
shard = os.path.splitext(os.path.basename(downloaded_zip))[0]
cache_dir_for_shard = _get_temp_cache_dir_path(
temp_dir, pr_identifier, repo, job_identifier, shard
)
unzip_folder(downloaded_zip, cache_dir_for_shard)
print(f"Merging cache from {downloaded_zip}")
print(
f"Merging cache for job_identifier `{job_identifier}`, shard `{shard}` into `{dest_cache_dir}`"
)
_merge_pytest_caches(cache_dir_for_shard, dest_cache_dir)
def _get_temp_cache_dir_path(
temp_dir: Path,
pr_identifier: PRIdentifier,
repo: GithubRepo,
job_identifier: str,
shard: str,
) -> Path:
return (
temp_dir
/ UNZIPPED_CACHES
/ _get_s3_key_prefix(pr_identifier, repo, job_identifier, shard)
/ PYTEST_CACHE_DIR_NAME
)
def _get_s3_key_prefix(
pr_identifier: PRIdentifier,
repo: GithubRepo,
job_identifier: str,
sha: str = "",
test_config: str = "",
shard: str = "",
) -> str:
"""
@ -148,10 +176,6 @@ def _get_s3_key_prefix(
"""
prefix = f"{PYTEST_CACHE_KEY_PREFIX}/{repo.owner}/{repo.name}/{pr_identifier}/{sanitize_for_s3(job_identifier)}"
if sha:
prefix += f"/{sha}"
if test_config:
prefix += f"/{sanitize_for_s3(test_config)}"
if shard:
prefix += f"/{shard}"

.github/scripts/rockset_mocks.json vendored Normal file (47298 lines)

File diff suppressed because it is too large

Binary file not shown.

View File

@ -1,64 +0,0 @@
import argparse
import subprocess
from typing import Dict
import generate_binary_build_matrix
def tag_image(
image: str,
default_tag: str,
release_version: str,
dry_run: str,
tagged_images: Dict[str, bool],
) -> None:
if image in tagged_images:
return
release_image = image.replace(f"-{default_tag}", f"-{release_version}")
print(f"Tagging {image} to {release_image} , dry_run: {dry_run}")
if dry_run == "disabled":
subprocess.check_call(["docker", "pull", image])
subprocess.check_call(["docker", "tag", image, release_image])
subprocess.check_call(["docker", "push", release_image])
tagged_images[image] = True
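A sketch of driving tag_image directly (the image name and versions are made up); with dry_run left as "enabled", no docker commands actually run:
tagged = {}  # tracks images already processed
tag_image(
    "pytorch/manylinux-builder:cuda12.1-main",  # hypothetical image
    "main",     # default tag to replace
    "2.2",      # release version to tag as
    "enabled",  # dry run: only prints the plan
    tagged,
)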
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument(
"--version",
help="Version to tag",
type=str,
default="2.2",
)
parser.add_argument(
"--dry-run",
help="No Runtime Error check",
type=str,
choices=["enabled", "disabled"],
default="enabled",
)
options = parser.parse_args()
tagged_images: Dict[str, bool] = dict()
platform_images = [
generate_binary_build_matrix.WHEEL_CONTAINER_IMAGES,
generate_binary_build_matrix.LIBTORCH_CONTAINER_IMAGES,
generate_binary_build_matrix.CONDA_CONTAINER_IMAGES,
]
default_tag = generate_binary_build_matrix.DEFAULT_TAG
for platform_image in platform_images: # type: ignore[attr-defined]
for arch in platform_image.keys(): # type: ignore[attr-defined]
tag_image(
platform_image[arch], # type: ignore[index]
default_tag,
options.version,
options.dry_run,
tagged_images,
)
if __name__ == "__main__":
main()

View File

@ -102,30 +102,6 @@ MOCKED_DISABLED_UNSTABLE_JOBS = {
"manywheel-py3_8-cuda11_8-build",
"",
],
"inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor)": [
"pytorchbot",
"107079",
"https://github.com/pytorch/pytorch/issues/107079",
"inductor",
"cuda12.1-py3.10-gcc9-sm86",
"test (inductor)",
],
"inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor_huggingface)": [
"pytorchbot",
"109153",
"https://github.com/pytorch/pytorch/issues/109153",
"inductor",
"cuda12.1-py3.10-gcc9-sm86",
"test (inductor_huggingface)",
],
"inductor / cuda12.1-py3.10-gcc9-sm86 / test (inductor_huggingface_dynamic)": [
"pytorchbot",
"109154",
"https://github.com/pytorch/pytorch/issues/109154",
"inductor",
"cuda12.1-py3.10-gcc9-sm86",
"test (inductor_huggingface_dynamic)",
],
}
MOCKED_PR_INFO = {
@ -593,37 +569,6 @@ class TestConfigFilter(TestCase):
"expected": '{"include": [{"config": "default", "unstable": "unstable"}]}',
"description": "Both binary build and test jobs are unstable",
},
{
"workflow": "inductor",
"job_name": "cuda12.1-py3.10-gcc9-sm86 / build",
"test_matrix": """
{ include: [
{ config: "inductor" },
{ config: "inductor_huggingface", shard: 1 },
{ config: "inductor_huggingface", shard: 2 },
{ config: "inductor_timm", shard: 1 },
{ config: "inductor_timm", shard: 2 },
{ config: "inductor_torchbench" },
{ config: "inductor_huggingface_dynamic" },
{ config: "inductor_torchbench_dynamic" },
{ config: "inductor_distributed" },
]}
""",
"expected": """
{ "include": [
{ "config": "inductor", "unstable": "unstable" },
{ "config": "inductor_huggingface", "shard": 1, "unstable": "unstable" },
{ "config": "inductor_huggingface", "shard": 2, "unstable": "unstable" },
{ "config": "inductor_timm", "shard": 1 },
{ "config": "inductor_timm", "shard": 2 },
{ "config": "inductor_torchbench" },
{ "config": "inductor_huggingface_dynamic", "unstable": "unstable" },
{ "config": "inductor_torchbench_dynamic" },
{ "config": "inductor_distributed" }
]}
""",
"description": "Marking multiple unstable configurations",
},
]
for case in testcases:
@ -632,7 +577,7 @@ class TestConfigFilter(TestCase):
test_matrix = yaml.safe_load(case["test_matrix"])
filtered_test_matrix = mark_unstable_jobs(workflow, job_name, test_matrix)
self.assertEqual(json.loads(case["expected"]), filtered_test_matrix)
self.assertEqual(case["expected"], json.dumps(filtered_test_matrix))
@mock.patch("subprocess.check_output")
def test_perform_misc_tasks(self, mocked_subprocess: Any) -> None:

View File

@ -7,12 +7,11 @@
# GraphQL queries in trymerge.py, please make sure to delete `gql_mocks.json`
# and re-run the test locally with one's PAT
import gzip
import json
import os
import warnings
from hashlib import sha256
from typing import Any, Dict, List, Optional
from typing import Any, cast, Dict, List, Optional
from unittest import main, mock, skip, TestCase
from urllib.error import HTTPError
@ -20,20 +19,18 @@ from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from trymerge import (
categorize_checks,
DRCI_CHECKRUN_NAME,
find_matching_merge_rule,
FlakyRule,
get_classifications,
get_drci_classifications,
get_rockset_results,
gh_get_team_members,
gh_graphql,
GitHubPR,
JobCheckState,
is_broken_trunk,
main as trymerge_main,
MandatoryChecksMissingError,
MergeRule,
PostCommentError,
RE_GHSTACK_DESC,
read_merge_rules,
remove_job_name_suffix,
validate_revert,
@ -42,10 +39,6 @@ from trymerge import (
if "GIT_REMOTE_URL" not in os.environ:
os.environ["GIT_REMOTE_URL"] = "https://github.com/pytorch/pytorch"
GQL_MOCKS = "gql_mocks.json.gz"
ROCKSET_MOCKS = "rockset_mocks.json.gz"
DRCI_MOCKS = "drci_mocks.json.gz"
def mock_query(
fallback_function: Any,
@ -58,11 +51,11 @@ def mock_query(
def get_mocked_queries() -> Any:
if not os.path.exists(gql_db_fname):
return {}
with gzip.open(gql_db_fname, encoding="utf-8", mode="rt") as f:
with open(gql_db_fname, encoding="utf-8") as f:
return json.load(f)
def save_mocked_queries(obj: Any) -> None:
with gzip.open(gql_db_fname, encoding="utf-8", mode="wt") as f:
with open(gql_db_fname, encoding="utf-8", mode="w") as f:
json.dump(obj, f, indent=2)
f.write("\n")
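On the gzip side of this hunk the mock store is a gzip-compressed JSON file; a minimal self-contained round trip, assuming a path such as gql_mocks.json.gz:
import gzip
import json

def roundtrip(path: str, obj: dict) -> dict:
    # "wt"/"rt" open text-mode streams over the compressed file,
    # so json can write and read strings directly
    with gzip.open(path, encoding="utf-8", mode="wt") as f:
        json.dump(obj, f, indent=2)
    with gzip.open(path, encoding="utf-8", mode="rt") as f:
        return json.load(f)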
@ -75,20 +68,19 @@ def mock_query(
try:
rc = fallback_function(*args)
except HTTPError as err:
if err.code == 401 or err.code == 403:
if err.code == 401:
err_msg = f"If you are seeing this message during workflow run, please make sure to update {file_name}"
err_msg += f" locally, by deleting it and running {os.path.basename(__file__)} with"
err_msg += " GitHub Personal Access Token passed via GITHUB_TOKEN,"
err_msg += " the rockset api key passed via ROCKSET_API_KEY,"
err_msg += " and drci api key passed via DRCI_BOT_KEY environment variables"
err_msg += f" locally, by deleting it and running {os.path.basename(__file__)} with "
err_msg += " GitHub Personal Access Token passed via GITHUB_TOKEN environment variable"
err_msg += (
" the rockset api key passed via ROCKSET_API_KEY environment variable"
)
if (
os.getenv("GITHUB_TOKEN") is None
or os.getenv("ROCKSET_API_KEY") is None
or os.getenv("DRCI_BOT_KEY") is None
):
err_msg = (
"Failed to update cached queries as GITHUB_TOKEN or ROCKSET_API_KEY or DRCI_BOT_KEY "
+ "is not defined. "
"Failed to update cached GraphQL queries as GITHUB_TOKEN or ROCKSET_API_KEY is not defined."
+ err_msg
)
raise RuntimeError(err_msg) from err
@ -108,29 +100,19 @@ def mocked_gh_graphql(query: str, **kwargs: Any) -> Any:
def gh_graphql_wrapper(query: str, kwargs: Any) -> Any:
return gh_graphql(query, **kwargs)
return mock_query(gh_graphql_wrapper, GQL_MOCKS, key_function, query, kwargs)
return mock_query(gh_graphql_wrapper, "gql_mocks.json", key_function, query, kwargs)
def mocked_rockset_results(head_sha: str, merge_base: str, num_retries: int = 3) -> Any:
return mock_query(
get_rockset_results,
ROCKSET_MOCKS,
"rockset_mocks.json",
lambda x, y: f"{x} {y}",
head_sha,
merge_base,
)
def mocked_drci_classifications(pr_num: int, project: str, num_retries: int = 3) -> Any:
return mock_query(
get_drci_classifications,
DRCI_MOCKS,
lambda x, y: f"{x} {y}",
pr_num,
project,
)
def mock_parse_args(revert: bool = False, force: bool = False) -> Any:
class Object:
def __init__(self) -> None:
@ -207,18 +189,6 @@ def mocked_read_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule
],
ignore_flaky_failures=True,
),
MergeRule(
name="xla",
patterns=[".github/ci_commit_pins/xla.txt"],
approved_by=["pytorchbot"],
mandatory_checks_name=[
"Lint",
"EasyCLA",
"pull / linux-focal-py3_8-clang9-xla / build",
"pull / linux-focal-py3_8-clang9-xla / test (xla, 1, 1, linux.12xlarge)",
],
ignore_flaky_failures=True,
),
]
@ -226,6 +196,16 @@ def mocked_read_merge_rules_raise(repo: Any, org: str, project: str) -> List[Mer
raise RuntimeError("testing")
def empty_flaky_rules() -> List[FlakyRule]:
return []
def xla_is_flaky_rules() -> List[FlakyRule]:
return [
FlakyRule("xla", ["FAILED: Build did NOT complete successfully"]),
]
def xla_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule]:
return [
MergeRule(
@ -237,7 +217,6 @@ def xla_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule]:
"EasyCLA",
"pull / linux-bionic-py3_8-clang8-xla / build",
"pull / linux-bionic-py3_8-clang8-xla / test (xla, 1, 1, linux.4xlarge)",
"inductor / cuda11.8-py3.10-gcc7-sm86 / test (inductor_torchbench_dynamic, 1, 1, linux.g5.4xlarge.nvidia.gpu)",
],
ignore_flaky_failures=False,
),
@ -259,11 +238,9 @@ class DummyGitRepo(GitRepo):
return "super awsome commit message"
@mock.patch("trymerge.read_flaky_rules", side_effect=empty_flaky_rules)
@mock.patch("trymerge.get_rockset_results", side_effect=empty_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch(
"trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
)
class TestTryMerge(TestCase):
def test_merge_rules_valid(self, *args: Any) -> None:
"Test that merge_rules.yaml can be parsed"
@ -274,7 +251,7 @@ class TestTryMerge(TestCase):
@mock.patch("trymerge.read_merge_rules", side_effect=mocked_read_merge_rules)
def test_match_rules(self, *args: Any) -> None:
"Tests that PR passes merge rules"
pr = GitHubPR("pytorch", "pytorch", 109999)
pr = GitHubPR("pytorch", "pytorch", 77700)
repo = DummyGitRepo()
self.assertTrue(find_matching_merge_rule(pr, repo) is not None)
@ -327,9 +304,14 @@ class TestTryMerge(TestCase):
def test_internal_changes(self, *args: Any) -> None:
"Tests that PR with internal changes is detected"
pr = GitHubPR("pytorch", "pytorch", 110140)
pr = GitHubPR("pytorch", "pytorch", 73969)
self.assertTrue(pr.has_internal_changes())
def test_checksuites_pagination(self, *args: Any) -> None:
"Tests that PR with lots of checksuits can be fetched"
pr = GitHubPR("pytorch", "pytorch", 73811)
self.assertEqual(len(pr.get_checkrun_conclusions()), 76)
def test_comments_pagination(self, *args: Any) -> None:
"Tests that PR with 50+ comments can be fetched"
pr = GitHubPR("pytorch", "pytorch", 31093)
@ -341,9 +323,7 @@ class TestTryMerge(TestCase):
# see https://gist.github.com/malfet/9b93bc7eeddeaf1d84546efc4f0c577f
pr = GitHubPR("pytorch", "pytorch", 68111)
self.assertGreater(len(pr.get_comments()), 20)
# NS(09/27/2023): GitHub seems to recycle older checkruns
# https://github.com/pytorch/pytorch/pull/68111/checks shows 0 runs
# self.assertGreater(len(pr.get_checkrun_conclusions()), 3)
self.assertGreater(len(pr.get_checkrun_conclusions()), 3)
self.assertGreater(pr.get_commit_count(), 60)
def test_gql_retrieve_checksuites(self, *args: Any) -> None:
@ -388,16 +368,14 @@ class TestTryMerge(TestCase):
def test_get_checkruns_many_runs(self, *args: Any) -> None:
"""Tests that all checkruns can be fetched"""
pr = GitHubPR("pytorch", "pytorch", 105260)
pr = GitHubPR("pytorch", "pytorch", 77700)
conclusions = pr.get_checkrun_conclusions()
self.assertEqual(len(conclusions), 221)
self.assertTrue(
"pull / linux-docs / build-docs-cpp-false" in conclusions.keys()
)
self.assertEqual(len(conclusions), 79)
self.assertTrue("pull / linux-docs / build-docs (cpp)" in conclusions.keys())
def test_cancelled_gets_ignored(self, *args: Any) -> None:
"""Tests that cancelled workflow does not override existing successfull status"""
pr = GitHubPR("pytorch", "pytorch", 110367)
pr = GitHubPR("pytorch", "pytorch", 82169)
conclusions = pr.get_checkrun_conclusions()
lint_checks = [name for name in conclusions.keys() if "Lint" in name]
self.assertTrue(len(lint_checks) > 0)
@ -545,7 +523,108 @@ class TestTryMerge(TestCase):
for case in test_cases:
self.assertEqual(case["expected"], remove_job_name_suffix(case["name"]))
def test_get_merge_base(self, *args: Any) -> None:
def test_is_broken_trunk(self, *args: Any) -> None:
test_cases: List[Dict[str, Any]] = [
{
"head_job": None,
"base_jobs": {
"job_a": {
"conclusion": "success",
"failure_captures": ["a", "b"],
},
"job_b": {
"conclusion": "failure",
"failure_captures": ["a", "b"],
},
},
"expected": False,
"description": "Invalid input - head job",
},
{
"head_job": {
"conclusion": "failure",
"failure_captures": ["a", "b"],
},
"base_jobs": None,
"expected": False,
"description": "Invalid input - base jobs",
},
{
"head_job": {
"conclusion": "failure",
"failure_captures": ["a", "b"],
},
"base_jobs": {},
"expected": False,
"description": "Invalid input - empty base jobs",
},
{
"head_job": {
"conclusion": "failure",
"failure_captures": ["x", "y"],
},
"base_jobs": {
"job_a": {
"conclusion": "success",
"failure_captures": ["a", "b"],
},
"job_b": {
"conclusion": "failure",
"failure_captures": ["x", "y"],
},
},
"expected": True,
"description": "Found a match",
},
{
"head_job": {
"conclusion": "success",
"failure_captures": ["x", "y"],
},
"base_jobs": {
"job_a": {
"conclusion": "success",
"failure_captures": ["a", "b"],
},
"job_b": {
"conclusion": "failure",
"failure_captures": ["x", "y"],
},
},
"expected": False,
"description": "Not found - different conclusion",
},
{
"head_job": {
"conclusion": "failure",
"failure_captures": ["a", "b"],
},
"base_jobs": {
"job_a": {
"conclusion": "success",
"failure_captures": ["a", "b"],
},
"job_b": {
"conclusion": "failure",
"failure_captures": ["x", "y"],
},
},
"expected": False,
"description": "Not found - different captured failures",
},
]
for case in test_cases:
self.assertEqual(
case["expected"], is_broken_trunk(case["head_job"], case["base_jobs"])
)
def test_get_merge_base(
self,
mock_gh_graphql: Any,
mock_get_rockset_results: Any,
mock_read_flaky_rules: Any,
) -> None:
pr = GitHubPR("pytorch", "pytorch", 104121)
mock_merge_base = "mocked-sha"
@ -563,130 +642,57 @@ class TestTryMerge(TestCase):
@mock.patch("trymerge.get_rockset_results", side_effect=mocked_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(
"trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
)
class TestBypassFailures(TestCase):
def test_get_classifications(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 109584)
flaky_rules = [
# Try a regex rule
FlakyRule("distributed", ["##\\[error\\]The operation [wW]as .+"])
]
pr = GitHubPR("pytorch", "pytorch", 92863)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
checks, pr.last_commit()["oid"], pr.get_merge_base(), flaky_rules, []
)
self.assertTrue(
checks[
"pull / linux-focal-py3.11-clang10 / test (dynamo, 1, 2, linux.2xlarge)"
"pull / linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.4xlarge)"
].classification
== "BROKEN_TRUNK"
)
self.assertTrue(
checks[
"trunk / win-vs2019-cpu-py3 / test (default, 2, 3, windows.4xlarge.nonephemeral)"
"pull / linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)"
].classification
== "FLAKY"
)
self.assertTrue(
checks[
"pull / linux-jammy-py3.8-gcc11 / test (distributed, 1, 2, linux.2xlarge)"
].classification
== "FLAKY"
)
self.assertTrue(
checks[
"pull / linux-focal-cuda11.8-py3.10-gcc9 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)"
].classification
== "FLAKY"
)
# Set the threshold greater than or equal to the number of ok failures
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=6
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 4)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
# Don't set any threshold; it defaults to -1, ignoring all flaky and broken trunk failures
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 4)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
# Set the threshold lower than the number of ok failures
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=1
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 6)
self.assertTrue(len(ignorable["FLAKY"]) == 4)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 2)
# Set the threshold to 0 like when ignore_flaky_failures is on
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=0
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 6)
self.assertTrue(len(ignorable["FLAKY"]) == 4)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 2)
def test_get_classifications_flaky_fullname(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 110362)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
def test_get_classifications_invalid_cancel(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 110367)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 0)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0)
self.assertTrue(len(ignorable["UNSTABLE"]) == 3)
def test_get_classifications_similar_failures(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 109750)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(failed) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
def test_get_classifications_unstable(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 104312)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
checks, pr.last_commit()["oid"], pr.get_merge_base(), [], []
)
workflow_name = "linux-bionic-cuda12.1-py3.10-gcc9-bazel-test"
job_name = "build-and-test (default, 1, 1, linux.4xlarge.nvidia.gpu, unstable)"
@ -700,6 +706,19 @@ class TestBypassFailures(TestCase):
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["UNSTABLE"]) == 1)
def test_get_classifications_pending_unstable(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 105998)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
checks, pr.last_commit()["oid"], pr.get_merge_base(), [], []
)
pending, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=1
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 3)
self.assertTrue(len(ignorable["UNSTABLE"]) == 3)
def test_get_classifications_broken_trunk(self, *args: Any) -> None:
# The mock merge base is the actual value returned by gh_fetch_merge_base
test_cases = [
@ -707,13 +726,13 @@ class TestBypassFailures(TestCase):
# This PR had one broken trunk failure but it was run on a different shard
# than the one on the base commit. This should still count as broken trunk
"pr_num": 104214,
"related_failure_count": 0,
"mock_merge_base": "436d035dc74db9c703297a62163b0cad0c546665",
"unrelated_failure_count": 1,
},
{
# This PR had one broken trunk failure and it used ghstack
"pr_num": 105145,
"related_failure_count": 0,
"mock_merge_base": "194fe1d12f9860734cc28ed21bdabda2fbb06336",
"unrelated_failure_count": 1,
},
{
@ -722,81 +741,112 @@ class TestBypassFailures(TestCase):
# keep the failure record from the merge base so that it can
# be used to detect broken trunk
"pr_num": 107160,
"related_failure_count": 0,
"mock_merge_base": "a5d841ef01e615e2a654fb12cf0cd08697d12ccf",
"unrelated_failure_count": 4,
},
{
# This PR used Dr.CI broken trunk classification
"pr_num": 111253,
"related_failure_count": 1,
"unrelated_failure_count": 2,
},
]
for case in test_cases:
pr_num = case["pr_num"]
related_failure_count = case["related_failure_count"]
mock_merge_base = case["mock_merge_base"]
unrelated_failure_count = case["unrelated_failure_count"]
pr = GitHubPR("pytorch", "pytorch", pr_num)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pr = GitHubPR("pytorch", "pytorch", cast(int, pr_num))
with mock.patch(
"trymerge.gh_fetch_merge_base", return_value=mock_merge_base
) as mocked_gh_fetch_merge_base:
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
checks, pr.last_commit()["oid"], pr.get_merge_base(), [], []
)
pending, failed, _ = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == related_failure_count)
pending, failed, _ = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
# When the ok_failed_checks_threshold is set to 0, the broken trunk failure
# won't be ignored
pending, failed, _ = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=0
)
self.assertTrue(len(pending) == 0)
self.assertTrue(
len(failed) == unrelated_failure_count + related_failure_count
)
# When the ok_failed_checks_threshold is set to 0, the broken trunk failure
# won't be ignored
pending, failed, _ = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=0
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == unrelated_failure_count)
def test_ignore_current(self, *args: Any) -> None:
# Test various interactions of the failure classifier to ensure that ignore
# current checks takes place after other classifications: flaky, unstable,
# or broken trunk. Only genuinely new failures should be kept in the list of
# ignore-current checks, which is used to record a force merge with actual failures
flaky = "pull / linux-focal-cuda11.8-py3.10-gcc9 / test (distributed, 1, 3, linux.8xlarge.nvidia.gpu)"
flaky_rules = [
FlakyRule("distributed", ["##\\[error\\]The operation was canceled."])
]
flaky = (
"pull / linux-focal-py3.7-gcc7 / test (distributed, 1, 2, linux.2xlarge)"
)
broken_trunk = (
"pull / linux-focal-py3.11-clang10 / test (dynamo, 1, 2, linux.2xlarge)"
"pull / linux-bionic-py3_7-clang8-xla / test (xla, 1, 1, linux.4xlarge)"
)
pr = GitHubPR("pytorch", "pytorch", 109584)
pr = GitHubPR("pytorch", "pytorch", 92863)
checks = pr.get_checkrun_conclusions()
# With no broken trunk or flaky rules, all failures are ignored when ignore-current is used
checks = get_classifications(
checks, pr.last_commit()["oid"], None, [], [broken_trunk, flaky]
)
self.assertTrue(checks[flaky].classification == "IGNORE_CURRENT_CHECK")
self.assertTrue(checks[broken_trunk].classification == "IGNORE_CURRENT_CHECK")
_, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["IGNORE_CURRENT_CHECK"]) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 0)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0)
# Known flaky failure takes precedence over ignore current (need to set the
# merge base here to get the results from Rockset, and that categorizes the
# broken trunk failure too
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
pr.last_commit()["oid"],
pr.get_merge_base(),
flaky_rules,
[broken_trunk, flaky],
)
self.assertTrue(checks[flaky].classification == "FLAKY")
self.assertTrue(checks[broken_trunk].classification == "BROKEN_TRUNK")
_, failed, ignorable = categorize_checks(checks, list(checks.keys()))
_, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["IGNORE_CURRENT_CHECK"]) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 4)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 2)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
# Broken trunk takes precedence over ignore current (no flaky rule is set here)
checks = get_classifications(
checks,
pr.last_commit()["oid"],
pr.get_merge_base(),
[],
[broken_trunk, flaky],
)
self.assertTrue(checks[flaky].classification == "IGNORE_CURRENT_CHECK")
self.assertTrue(checks[broken_trunk].classification == "BROKEN_TRUNK")
_, failed, ignorable = categorize_checks(
checks, list(checks.keys()), ok_failed_checks_threshold=2
)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["IGNORE_CURRENT_CHECK"]) == 1)
self.assertTrue(len(ignorable["FLAKY"]) == 0)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
@mock.patch("trymerge.read_flaky_rules", side_effect=xla_is_flaky_rules)
@mock.patch("trymerge.read_merge_rules", side_effect=xla_merge_rules)
def test_dont_ignore_flaky_failures(self, *args: Any) -> None:
"""
Regression test for https://github.com/pytorch/test-infra/issues/4126
"""
pr = GitHubPR("pytorch", "pytorch", 105312)
"""Regression test for https://github.com/pytorch/test-infra/issues/4126"""
pr = GitHubPR("pytorch", "pytorch", 100369)
repo = DummyGitRepo()
# Check that failure is classified as flaky but still raises exception
with warnings.catch_warnings(record=True) as w, self.assertRaises(RuntimeError):
@ -811,97 +861,14 @@ class TestBypassFailures(TestCase):
@mock.patch("trymerge.get_rockset_results", side_effect=mocked_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch("trymerge.get_drci_classifications", return_value={})
class TestBypassFailuresOnSandCastle(TestCase):
def test_get_classifications(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 111467)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 1)
def test_get_classifications_drci_checkrun_not_found(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 111467)
# No summary
checks = pr.get_checkrun_conclusions()
checks[DRCI_CHECKRUN_NAME] = JobCheckState(
DRCI_CHECKRUN_NAME,
"",
"NEUTRAL",
None,
1,
"",
None,
)
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 2)
# Empty summary
checks = pr.get_checkrun_conclusions()
checks[DRCI_CHECKRUN_NAME] = JobCheckState(
DRCI_CHECKRUN_NAME,
"",
"NEUTRAL",
None,
1,
"",
"",
)
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 2)
# No Dr.CI checkrun
checks = pr.get_checkrun_conclusions()
del checks[DRCI_CHECKRUN_NAME]
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(checks, list(checks.keys()))
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 2)
@mock.patch("trymerge.get_rockset_results", side_effect=mocked_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(
"trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
)
class TestGitHubPRGhstackDependencies(TestCase):
class TestGitHubPRGhstackDependencies2(TestCase):
def test_pr_dependencies(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 106068)
msg = pr.gen_commit_message(filter_ghstack=True)
self.assertEqual(
msg,
f"{pr.get_title()} (#106068)\n\n{RE_GHSTACK_DESC.sub('', pr.get_body())}\n"
"Pull Request resolved: https://github.com/pytorch/pytorch/pull/106068\n"
"Approved by: https://github.com/ezyang, https://github.com/fegin\n",
assert msg == (
"[FSDP] Break up `_post_backward_hook` into smaller funcs (#106068)\n\n\nDifferential Revision: ["
"D47852461](https://our.internmc.facebook.com/intern/diff/D47852461)\nPull Request resolved: "
"https://github.com/pytorch/pytorch/pull/106068\nApproved by: \n"
)
def test_pr_dependencies_ghstack(self, *args: Any) -> None:
@ -909,13 +876,13 @@ class TestGitHubPRGhstackDependencies(TestCase):
pr1 = GitHubPR("pytorch", "pytorch", 106033)
pr2 = GitHubPR("pytorch", "pytorch", 106034)
pr = GitHubPR("pytorch", "pytorch", 106068)
msg = pr.gen_commit_message(filter_ghstack=True, ghstack_deps=[pr0, pr1, pr2])
self.assertEqual(
msg,
f"{pr.get_title()} (#106068)\n\n{RE_GHSTACK_DESC.sub('', pr.get_body())}\n"
"Pull Request resolved: https://github.com/pytorch/pytorch/pull/106068\n"
"Approved by: https://github.com/ezyang, https://github.com/fegin\n"
"ghstack dependencies: #106032, #106033, #106034\n",
assert msg == (
"[FSDP] Break up `_post_backward_hook` into smaller funcs (#106068)\n\n\nDifferential Revision: ["
"D47852461](https://our.internmc.facebook.com/intern/diff/D47852461)\nPull Request resolved: "
"https://github.com/pytorch/pytorch/pull/106068\nApproved by: \n"
"ghstack dependencies: #106032, #106033, #106034\n"
)
@skip(
@ -964,7 +931,7 @@ class TestGitHubPRGhstackDependencies(TestCase):
mock_repo.cherry_pick.assert_any_call("rev2")
mock_repo.cherry_pick.assert_any_call("rev123")
self.assertTrue(mock.call("rev1") not in mock_repo.cherry_pick.call_args_list)
assert mock.call("rev1") not in mock_repo.cherry_pick.call_args_list
# Verify the first call
message = mock_repo.amend_commit_message.call_args_list[0].args[0]
@ -977,8 +944,8 @@ class TestGitHubPRGhstackDependencies(TestCase):
"dependencies: #106032, #106033\n"
)
self.assertTrue(message.startswith(prefix))
self.assertTrue(message.endswith(suffix))
assert message.startswith(prefix)
assert message.endswith(suffix)
# Verify the second call
mock_repo.amend_commit_message.assert_any_call(

View File

@ -30,7 +30,6 @@ from github_utils import (
gh_fetch_url,
gh_post_commit_comment,
gh_post_pr_comment,
gh_update_pr_state,
GitHubComment,
)
@ -62,7 +61,6 @@ class JobCheckState(NamedTuple):
classification: Optional[str]
job_id: Optional[int]
title: Optional[str]
summary: Optional[str]
JobNameToStateDict = Dict[str, JobCheckState]
@ -76,6 +74,29 @@ class WorkflowCheckState:
self.jobs: JobNameToStateDict = {}
class FlakyRule:
def __init__(self, name: str, captures: List[str]):
self.name = re.compile(name)
self.captures = [re.compile(r) for r in captures]
def matches(self, job: Optional[Dict[str, Any]]) -> bool:
return (
job is not None
and self.name.search(job.get("name", "")) is not None
and job.get("failure_captures") is not None
and all(
any(
r.search(capture) is not None
for capture in job.get("failure_captures", [])
)
for r in self.captures
)
)
def __repr__(self) -> str:
return f"FlakyRule[name='{self.name}', captures={self.captures}]"
GH_PR_REVIEWS_FRAGMENT = """
fragment PRReviews on PullRequestReviewConnection {
nodes {
@ -120,7 +141,6 @@ fragment PRCheckSuites on CheckSuiteConnection {
detailsUrl
databaseId
title
summary
}
pageInfo {
endCursor
@ -312,7 +332,6 @@ query ($owner: String!, $name: String!, $number: Int!, $cs_cursor: String, $cr_c
detailsUrl
databaseId
title
summary
}
pageInfo {
endCursor
@ -437,7 +456,6 @@ MERGE_RULE_PATH = Path(".github") / "merge_rules.yaml"
ROCKSET_MERGES_COLLECTION = "merges"
ROCKSET_MERGES_WORKSPACE = "commons"
REMOTE_MAIN_BRANCH = "origin/main"
DRCI_CHECKRUN_NAME = "Dr.CI"
INTERNAL_CHANGES_CHECKRUN_NAME = "Meta Internal-Only Changes Check"
HAS_NO_CONNECTED_DIFF_TITLE = (
"There is no internal Diff connected, this can be merged now"
@ -551,7 +569,6 @@ def add_workflow_conclusions(
classification=None,
job_id=checkrun_node["databaseId"],
title=checkrun_node["title"],
summary=checkrun_node["summary"],
)
if bool(checkruns["pageInfo"]["hasNextPage"]):
@ -582,7 +599,6 @@ def add_workflow_conclusions(
classification=None,
job_id=None,
title=None,
summary=None,
)
for job_name, job in no_workflow_obj.jobs.items():
res[job_name] = job
@ -908,7 +924,6 @@ class GitHubPR:
classification=None,
job_id=None,
title=None,
summary=None,
)
return self.conclusions
@ -1246,6 +1261,13 @@ def read_merge_rules(
return [MergeRule(**x) for x in rc]
@lru_cache(maxsize=None)
def read_flaky_rules() -> List[FlakyRule]:
# NOTE: This is currently hardcoded but can be extended to support per-repo rules
FLAKY_RULES_URL = "https://raw.githubusercontent.com/pytorch/test-infra/generated-stats/stats/flaky-rules.json"
return _get_flaky_rules(FLAKY_RULES_URL)
def find_matching_merge_rule(
pr: GitHubPR,
repo: Optional[GitRepo] = None,
@ -1276,15 +1298,25 @@ def find_matching_merge_rule(
reject_reason = f"No rule found to match PR. Please [report]{issue_link} this issue to DevX team."
rules = read_merge_rules(repo, pr.org, pr.project)
flaky_rules = read_flaky_rules()
if not rules:
reject_reason = f"Rejecting the merge as no rules are defined for the repository in {MERGE_RULE_PATH}"
raise RuntimeError(reject_reason)
checks = pr.get_checkrun_conclusions()
base_rev = None
try:
# is allowed to fail if git is not available
base_rev = pr.get_merge_base()
except Exception as e:
print(
f"Failed fetching base git revision for {pr.pr_num}. Skipping additional classifications.\n"
f"{type(e)}\n{e}"
)
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
pr.last_commit()["oid"],
base_rev,
flaky_rules,
ignore_current_checks=ignore_current_checks,
)
@ -1435,6 +1467,11 @@ def checks_to_markdown_bullets(
]
@retries_decorator(rc=[])
def _get_flaky_rules(url: str) -> List[FlakyRule]:
return [FlakyRule(**rule) for rule in gh_fetch_json_list(url)]
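Since each entry is splatted into FlakyRule(**rule), the fetched JSON is presumably a list of objects with name and captures keys, e.g. (illustrative contents only):
# flaky-rules.json, hypothetical:
# [
#   {"name": "distributed", "captures": ["##\\[error\\]The operation [wW]as .+"]},
#   {"name": "xla", "captures": ["FAILED: Build did NOT complete successfully"]}
# ]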
@retries_decorator()
def save_merge_record(
collection: str,
@ -1538,27 +1575,6 @@ where
return []
@retries_decorator()
def get_drci_classifications(pr_num: int, project: str = "pytorch") -> Any:
"""
Query HUD API to find similar failures to decide if they are flaky
"""
# NB: This doesn't work internally atm because this requires making an
# external API call to HUD
failures = gh_fetch_url(
f"https://hud.pytorch.org/api/drci/drci?prNumber={pr_num}",
data=f"repo={project}",
headers={
"Authorization": os.getenv("DRCI_BOT_KEY", ""),
"Accept": "application/vnd.github.v3+json",
},
method="POST",
reader=json.load,
)
return failures.get(str(pr_num), {}) if failures else {}
REMOVE_JOB_NAME_SUFFIX_REGEX = re.compile(r", [0-9]+, [0-9]+, .+\)$")
@ -1567,86 +1583,78 @@ def remove_job_name_suffix(name: str, replacement: str = ")") -> str:
def is_broken_trunk(
name: str,
drci_classifications: Any,
head_job: Optional[Dict[str, Any]], base_jobs: Optional[Dict[str, Dict[str, Any]]]
) -> bool:
if not name or not drci_classifications:
if not head_job or not base_jobs:
return False
# Consult the list of broken trunk failures from Dr.CI
return any(
name == broken_trunk["name"]
for broken_trunk in drci_classifications.get("BROKEN_TRUNK", [])
)
def is_flaky(
name: str,
drci_classifications: Any,
) -> bool:
if not name or not drci_classifications:
return False
# Consult the list of flaky failures from Dr.CI
return any(name == flaky["name"] for flaky in drci_classifications.get("FLAKY", []))
def is_invalid_cancel(
name: str,
conclusion: Optional[str],
drci_classifications: Any,
) -> bool:
"""
After https://github.com/pytorch/test-infra/pull/4579, invalid cancelled
signals have been removed from HUD and Dr.CI. The same needs to be done
here for consistency
"""
if (
not name
or not drci_classifications
or not conclusion
or conclusion.upper() != "CANCELLED"
):
return False
# If a job is cancelled and not listed as a failure by Dr.CI, it's an
# invalid signal and can be ignored
return all(
name != failure["name"] for failure in drci_classifications.get("FAILED", [])
head_job["conclusion"] == base_job["conclusion"]
and head_job["failure_captures"] == base_job["failure_captures"]
for base_job in base_jobs.values()
)
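The head_job/base_jobs variant above declares broken trunk when any job on the merge base failed with the same conclusion and the same failure_captures; a quick sanity check with made-up records:
head = {"conclusion": "failure", "failure_captures": ["error A"]}
base = {"job (shard 1)": {"conclusion": "failure", "failure_captures": ["error A"]}}
assert is_broken_trunk(head, base)    # same failure seen on the base commit
assert not is_broken_trunk(head, {})  # no base jobs -> not broken trunk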
def get_classifications(
pr_num: int,
project: str,
checks: Dict[str, JobCheckState],
head_sha: str,
merge_base: Optional[str],
flaky_rules: List[FlakyRule],
ignore_current_checks: Optional[List[str]],
) -> Dict[str, JobCheckState]:
# Get the failure classification from Dr.CI, which is the source of truth
# going forward. It's preferable to call the Dr.CI API directly first
# to get the latest results as well as to update the Dr.CI PR comment
drci_classifications = get_drci_classifications(pr_num=pr_num, project=project)
print(f"From Dr.CI API: {json.dumps(drci_classifications)}")
# Group by job name without shard id and suffix to correctly identify broken
# trunk failures, e.g. linux-bionic-cuda12.1-py3.10-gcc9-sm86 / test (default)
head_sha_jobs: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(dict)
merge_base_jobs: Dict[str, Dict[str, Dict[str, Any]]] = defaultdict(dict)
# NB: if the latest results from Dr.CI are not available, e.g. when calling from
# SandCastle, we fall back to any results we can find in the Dr.CI check run summary
if (
not drci_classifications
and DRCI_CHECKRUN_NAME in checks
and checks[DRCI_CHECKRUN_NAME]
and checks[DRCI_CHECKRUN_NAME].summary
):
drci_summary = checks[DRCI_CHECKRUN_NAME].summary
try:
print(f"From Dr.CI checkrun summary: {drci_summary}")
drci_classifications = json.loads(str(drci_summary))
except json.JSONDecodeError as error:
warn("Invalid Dr.CI checkrun summary")
drci_classifications = {}
if merge_base is not None:
def insert(
d: Dict[str, Dict[str, Dict[str, Any]]],
key: str,
val: Dict[str, Any],
overwrite_failed_run_attempt: bool,
) -> None:
key_no_suffix = remove_job_name_suffix(key)
if key not in d[key_no_suffix]:
d[key_no_suffix][key] = val
return
# When overwrite_failed_run_attempt is set to True, always overwrite
# the job with the result from the latest attempt. This option is for
# jobs from the pull request head_sha where the latest retry is used
# when merging
#
# When overwrite_failed_run_attempt is False, only overwrite the job
# with the result from the latest attempt if the latest retry failed.
# This option is for jobs from the merge_base where we want to record
# failures for broken trunk
if d[key_no_suffix][key]["id"] < val["id"] and (
overwrite_failed_run_attempt or not is_passing_status(val["conclusion"])
):
d[key_no_suffix][key] = val
rockset_results = get_rockset_results(head_sha, merge_base)
for rockset_result in rockset_results:
name = f"{rockset_result['workflow_name']} / {rockset_result['name']}"
if rockset_result["head_sha"] == head_sha:
insert(
head_sha_jobs,
name,
rockset_result,
overwrite_failed_run_attempt=True,
)
else:
insert(
merge_base_jobs,
name,
rockset_result,
overwrite_failed_run_attempt=False,
)
checks_with_classifications = checks.copy()
for name, check in checks.items():
if check.status == "SUCCESS" or check.status == "NEUTRAL":
if check.status == "SUCCESS":
continue
if "unstable" in name:
@ -1657,13 +1665,13 @@ def get_classifications(
"UNSTABLE",
check.job_id,
check.title,
check.summary,
)
continue
# NB: It's important to note that when it comes to ghstack and broken trunk classification,
# Dr.CI uses the base of the whole stack
if is_broken_trunk(name, drci_classifications):
name_no_suffix = remove_job_name_suffix(name)
head_sha_job = head_sha_jobs.get(name_no_suffix, {}).get(name)
if is_broken_trunk(head_sha_job, merge_base_jobs.get(name_no_suffix)):
checks_with_classifications[name] = JobCheckState(
check.name,
check.url,
@ -1671,34 +1679,12 @@ def get_classifications(
"BROKEN_TRUNK",
check.job_id,
check.title,
check.summary,
)
continue
elif is_flaky(name, drci_classifications):
elif any(rule.matches(head_sha_job) for rule in flaky_rules):
checks_with_classifications[name] = JobCheckState(
check.name,
check.url,
check.status,
"FLAKY",
check.job_id,
check.title,
check.summary,
)
continue
elif is_invalid_cancel(name, check.status, drci_classifications):
# NB: Create a new category here for invalid cancelled signals because
# there are usually many of them when they happen. So, they shouldn't
# be counted toward the ignorable failures threshold
checks_with_classifications[name] = JobCheckState(
check.name,
check.url,
check.status,
"INVALID_CANCEL",
check.job_id,
check.title,
check.summary,
check.name, check.url, check.status, "FLAKY", check.job_id, check.title
)
continue
@ -1710,7 +1696,6 @@ def get_classifications(
"IGNORE_CURRENT_CHECK",
check.job_id,
check.title,
check.summary,
)
return checks_with_classifications
@ -1804,7 +1789,6 @@ def try_revert(
if not dry_run:
pr.add_numbered_label("reverted")
gh_post_commit_comment(pr.org, pr.project, commit_sha, revert_msg)
gh_update_pr_state(pr.org, pr.project, pr.pr_num)
def prefix_with_github_url(suffix_str: str) -> str:
@ -1880,8 +1864,6 @@ def categorize_checks(
# ignored anyway. This avoids having to wait for scarce resources
# like ROCm, which is also frequently in unstable mode
pending_checks.append((checkname, url, job_id))
elif classification == "INVALID_CANCEL":
continue
elif not is_passing_status(check_runs[checkname].status):
target = (
ignorable_failed_checks[classification]
@ -1927,8 +1909,7 @@ def merge(
ignore_current: bool = False,
) -> None:
initial_commit_sha = pr.last_commit()["oid"]
pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}"
print(f"Attempting merge of {initial_commit_sha} ({pr_link})")
print(f"Attempting merge of {initial_commit_sha}")
if MERGE_IN_PROGRESS_LABEL not in pr.get_labels():
gh_add_labels(pr.org, pr.project, pr.pr_num, [MERGE_IN_PROGRESS_LABEL])
@ -1993,6 +1974,7 @@ def merge(
start_time = time.time()
last_exception = ""
elapsed_time = 0.0
flaky_rules = read_flaky_rules()
ignore_current_checks = [
x[0] for x in ignore_current_checks_info
] # convert to List[str] for convenience
@ -2025,9 +2007,10 @@ def merge(
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
pr.last_commit()["oid"],
pr.get_merge_base(),
flaky_rules,
ignore_current_checks=ignore_current_checks,
)
pending, failing, _ = categorize_checks(

View File

@ -51,7 +51,7 @@ def post_already_uptodate(
def rebase_onto(
pr: GitHubPR, repo: GitRepo, onto_branch: str, dry_run: bool = False
) -> bool:
) -> None:
branch = f"pull/{pr.pr_num}/head"
remote_url = f"https://github.com/{pr.info['headRepository']['nameWithOwner']}.git"
refspec = f"{branch}:{pr.head_ref()}"
@ -68,7 +68,6 @@ def rebase_onto(
push_result = repo._run_git("push", "-f", remote_url, refspec)
if "Everything up-to-date" in push_result:
post_already_uptodate(pr, repo, onto_branch, dry_run)
return False
else:
gh_post_comment(
pr.org,
@ -79,21 +78,18 @@ def rebase_onto(
+ "git pull --rebase`)",
dry_run=dry_run,
)
return True
def rebase_ghstack_onto(
pr: GitHubPR, repo: GitRepo, onto_branch: str, dry_run: bool = False
) -> bool:
) -> None:
if (
subprocess.run(
[sys.executable, "-m", "ghstack", "--help"],
capture_output=True,
check=False,
[sys.executable, "-m", "ghstack", "--help"], capture_output=True
).returncode
!= 0
):
subprocess.run([sys.executable, "-m", "pip", "install", "ghstack"], check=True)
subprocess.run([sys.executable, "-m", "pip", "install", "ghstack"])
orig_ref = f"{re.sub(r'/head$', '/orig', pr.head_ref())}"
repo.fetch(orig_ref, orig_ref)
@ -119,9 +115,8 @@ def rebase_ghstack_onto(
if dry_run:
print("Don't know how to dry-run ghstack")
return False
else:
ghstack_result = subprocess.run(["ghstack"], capture_output=True, check=True)
ghstack_result = subprocess.run(["ghstack"], capture_output=True)
push_result = ghstack_result.stdout.decode("utf-8")
print(push_result)
if ghstack_result.returncode != 0:
@ -171,8 +166,6 @@ def rebase_ghstack_onto(
in push_result
):
post_already_uptodate(pr, repo, onto_branch, dry_run)
return False
return True
def additional_rebase_failure_info(e: Exception) -> str:
@ -229,10 +222,9 @@ def main() -> None:
try:
if pr.is_ghstack_pr():
with git_config_guard(repo):
rc = rebase_ghstack_onto(pr, repo, onto_branch, dry_run=args.dry_run)
rebase_ghstack_onto(pr, repo, onto_branch, dry_run=args.dry_run)
else:
rc = rebase_onto(pr, repo, onto_branch, dry_run=args.dry_run)
sys.exit(0 if rc else 1)
rebase_onto(pr, repo, onto_branch, dry_run=args.dry_run)
except Exception as e:
msg = f"Rebase failed due to {e}"

View File

@ -114,8 +114,7 @@ def main() -> None:
# query to see if a pr already exists
params = {
"q": f"is:pr is:open in:title author:pytorchupdatebot repo:{OWNER}/{REPO} {args.repo_name} hash update",
"sort": "created",
"q": f"is:pr is:open in:title author:pytorchmergebot repo:{OWNER}/{REPO} {args.repo_name} hash update"
}
response = git_api("/search/issues", params)
if response["total_count"] != 0:

View File

@ -8,7 +8,7 @@
# NOTE: If testing pytorch/builder changes, you can change this variable to control which pytorch/builder reference
# the binary builds will check out
{%- set builder_repo = "pytorch/builder" -%}
{%- set builder_branch = "release/2.2" -%}
{%- set builder_branch = "release/2.1" -%}
{%- macro concurrency(build_environment) -%}
concurrency:
@ -36,10 +36,10 @@ concurrency:
{%- macro setup_ec2_windows() -%}
!{{ display_ec2_information() }}
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.2
uses: seemethere/add-github-ssh-key@v1
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell

View File

@ -7,7 +7,6 @@
name: !{{ build_environment }}
{%- endblock %}
on:
push:
{%- if branches == "nightly" %}
@ -58,8 +57,6 @@ jobs:
{%- if "aarch64" in build_environment %}
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
{%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
runs_on: linux.24xlarge
{%- endif %}
build_name: !{{ config["build_name"] }}
build_environment: !{{ build_environment }}
@ -106,7 +103,7 @@ jobs:
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.2
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
docker-image: !{{ config["container_image"] }}
- name: Test Pytorch binary

View File

@ -58,7 +58,7 @@ jobs:
{%- for config in build_configs %}
!{{ config["build_name"] }}-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: !{{ macos_runner }}
runs-on: macos-12-xl
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config, true) }}
{%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
@ -72,15 +72,11 @@ jobs:
- name: Install conda and dependencies
run: |
# Install conda; setup-miniconda messes with the path, which breaks the Ruby steps we run later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-x86_64.sh
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
!{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
!{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch, checkout_pr_head=False) }}
- name: Install sccache (only for non-forked PRs, and pushes to trunk)

View File

@ -53,9 +53,6 @@
{%- macro upload_binaries(config, is_windows=False, has_test=True, use_s3=True) -%}
!{{ config["build_name"] }}-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
{%- if has_test %}
needs: !{{ config["build_name"] }}-test
{%- else %}
@ -68,7 +65,8 @@
{%- endif %}
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }}
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
{%- endmacro %}

View File

@ -36,7 +36,7 @@ jobs:
keep-going: ${{ steps.filter.outputs.keep-going }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
with:
fetch-depth: 1
submodules: false
@ -58,25 +58,25 @@ jobs:
runs-on: ${{ matrix.runner }}
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.2
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.1
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.2
uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.1
with:
docker-image-name: ${{ inputs.docker-image-name }}
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.2
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
@ -140,5 +140,5 @@ jobs:
if: always()
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.2
uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.1
if: always()

View File

@ -36,7 +36,7 @@ jobs:
keep-going: ${{ steps.filter.outputs.keep-going }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
with:
fetch-depth: 1
submodules: false
@ -58,25 +58,25 @@ jobs:
runs-on: ${{ matrix.runner }}
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.2
uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.1
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.2
uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.1
with:
docker-image-name: ${{ inputs.docker-image-name }}
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.2
uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
@ -157,7 +157,7 @@ jobs:
# run gradle buildRelease
(echo "./.circleci/scripts/build_android_gradle.sh" | docker exec \
-e BUILD_ENVIRONMENT="pytorch-linux-focal-py3-clang9-android-ndk-r21e-gradle-build" \
-e BUILD_ENVIRONMENT="pytorch-linux-focal-py3-clang7-android-ndk-r19c-gradle-build" \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e PR_NUMBER \
@ -185,5 +185,5 @@ jobs:
if: always()
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.2
uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.1
if: always()

View File

@@ -41,7 +41,7 @@ jobs:
  reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }}
  steps:
  - name: Checkout PyTorch
- uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
+ uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
  with:
  fetch-depth: 1
  submodules: false
@@ -63,30 +63,30 @@ jobs:
  runs-on: ${{ matrix.runner }}
  steps:
  - name: Setup SSH (Click me for login details)
- uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.2
+ uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.1
  with:
  github-secret: ${{ secrets.GITHUB_TOKEN }}
  # [see note: pytorch repo ref]
  - name: Checkout PyTorch
- uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
+ uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
  - name: Setup Linux
  uses: ./.github/actions/setup-linux
  - name: Calculate docker image
  id: calculate-docker-image
- uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.2
+ uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.1
  with:
  docker-image-name: ${{ inputs.docker-image-name }}
  - name: Pull docker image
- uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.2
+ uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
  with:
  docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
  - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
- uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.2
+ uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.1
  if: ${{ inputs.cuda-version != 'cpu' }}
  - name: Output disk space left
@@ -120,7 +120,6 @@ jobs:
  GITHUB_RUN_ID: ${{ github.run_id }}
  GITHUB_RUN_NUMBER: ${{ github.run_number }}
  GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
- JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
  PYTORCH_RETRY_TEST_CASES: 1
  PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
  REENABLED_ISSUES: ${{ needs.filter.outputs.reenabled-issues }}
@@ -148,7 +147,6 @@ jobs:
  -e GITHUB_JOB \
  -e GITHUB_RUN_NUMBER \
  -e GITHUB_RUN_ATTEMPT \
- -e JOB_ID \
  -e GIT_DEFAULT_BRANCH="$GIT_DEFAULT_BRANCH" \
  -e SHARD_NUMBER \
  -e NUM_TEST_SHARDS \
@@ -186,7 +184,7 @@ jobs:
  shell: bash
  if: always() && steps.test.conclusion
  run: |
- cat test/**/*_toprint.log || true
+ cat test/**/*.log || true
  - name: Chown workspace
  uses: ./.github/actions/chown-workspace
@@ -199,5 +197,5 @@ jobs:
  file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }}
  - name: Teardown Linux
- uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.2
+ uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.1
  if: always()

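Besides the ref flips, this file has one real deletion: the release/2.2 side defines JOB_ID in the test step's env, resolved from a get-job-id step (the same step the file-suffix line above references), and forwards it into the container with an extra -e flag; release/2.1 drops both lines. A runnable sketch of that plumbing (the image and command are hypothetical placeholders):

# Sketch of threading the resolved CI job id into a test container (2.2 side).
- name: Test
  id: test
  env:
    JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
    DOCKER_IMAGE: alpine:3.19  # hypothetical image, for illustration only
  shell: bash
  run: |
    # A bare -e NAME exports the variable from the runner's environment
    # into the container, so in-container tooling can tag its results
    # with the CI job that produced them.
    docker run --rm -e JOB_ID "${DOCKER_IMAGE}" sh -c 'echo "running as job ${JOB_ID}"'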

@@ -15,7 +15,7 @@ on:
  required: false
  default: linux.12xlarge
  type: string
- description: Hardware to run this "build"job on, linux.12xlarge or linux.arm64.2xlarge.
+ description: Hardware to run this "build"job on, linux.12xlarge or linux.t4g.2xlarge.
  ALPINE_IMAGE:
  required: false
  type: string
@@ -139,13 +139,13 @@ jobs:
  run: env
  - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
- uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.2
+ uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.1
  continue-on-error: true
  with:
  github-secret: ${{ secrets.github-token }}
  - name: Checkout PyTorch
- uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
+ uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
  with:
  no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' }}
@@ -186,7 +186,7 @@ jobs:
  - name: Checkout pytorch/builder to builder dir
  uses: malfet/checkout@silent-checkout
  with:
- ref: release/2.2
+ ref: release/2.1
  submodules: recursive
  repository: pytorch/builder
  path: builder
@@ -212,7 +212,7 @@ jobs:
  - name: Pull Docker image
  if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }}
- uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.2
+ uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
  with:
  docker-image: ${{ inputs.DOCKER_IMAGE }}
@@ -269,7 +269,7 @@ jobs:
  - name: Teardown Linux
  if: always()
- uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.2
+ uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.1
  - name: Chown workspace
  if: always()

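Binary builds also vendor a second repository, pytorch/builder, checked out into a builder/ subdirectory at a ref matching the release branch. The diff does this through the malfet/checkout@silent-checkout fork; the same inputs exist on upstream actions/checkout, so an equivalent sketch (using the upstream action is an assumption of this sketch, not what the workflow ships) looks like:

# Sketch: vendor pytorch/builder next to the main checkout at a pinned ref.
- name: Checkout pytorch/builder to builder dir
  uses: actions/checkout@v4
  with:
    repository: pytorch/builder
    ref: release/2.1
    submodules: recursive
    path: builder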

@@ -62,7 +62,7 @@ on:
  runs_on:
  required: true
  type: string
- description: Hardware to run this job on. Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu
+ description: Hardware to run this job on. Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.t4g.2xlarge, and linux.rocm.gpu
  secrets:
  github-token:
  required: true
@@ -127,14 +127,14 @@ jobs:
  } >> "${GITHUB_ENV} }}"
  - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
- uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.2
+ uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.1
  continue-on-error: true
  with:
  github-secret: ${{ secrets.github-token }}
  # Setup the environment
  - name: Checkout PyTorch
- uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
+ uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
  with:
  no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' }}
@@ -167,7 +167,7 @@ jobs:
  - name: Checkout pytorch/builder to builder dir
  uses: malfet/checkout@silent-checkout
  with:
- ref: release/2.2
+ ref: release/2.1
  submodules: recursive
  repository: pytorch/builder
  path: builder
@@ -198,12 +198,12 @@ jobs:
  path: "${{ runner.temp }}/artifacts/"
  - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
- uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.2
+ uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.1
  if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}
  - name: Pull Docker image
  if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }}
- uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.2
+ uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.1
  with:
  docker-image: ${{ inputs.DOCKER_IMAGE }}
@@ -213,7 +213,7 @@ jobs:
  - name: Teardown Linux
  if: always()
- uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.2
+ uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.1
  - name: Chown workspace
  if: always()

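Note how GPU provisioning is gated: setup-nvidia only runs when the reusable workflow's inputs say the job needs CUDA (inputs.GPU_ARCH_TYPE == 'cuda' here, inputs.cuda-version != 'cpu' in the earlier test workflow), so CPU-only matrix entries skip the driver install entirely. A minimal reusable-workflow sketch of that gate (the surrounding workflow framing is hypothetical; the input name, condition, and runner label come from this file):

# Sketch of a reusable workflow that installs NVIDIA tooling only for CUDA jobs.
on:
  workflow_call:
    inputs:
      GPU_ARCH_TYPE:
        required: true
        type: string
jobs:
  test:
    runs-on: linux.4xlarge.nvidia.gpu
    steps:
      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.1
        if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' }}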

@@ -59,15 +59,17 @@ on:
  github-token:
  required: true
  description: Github Token
+ aws-pytorch-uploader-access-key-id:
+ required: true
+ description: AWS access key id
+ aws-pytorch-uploader-secret-access-key:
+ required: true
+ description: AWS secret access key
  conda-pytorchbot-token:
  required: true
  description: Conda PyTorchBot token
- conda-pytorchbot-token-test:
- required: true
- description: Conda PyTorchBot token
  jobs:
- upload:
+ build:
  runs-on: ubuntu-22.04
  environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
  container:
@@ -95,24 +97,10 @@ jobs:
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  steps:
  - name: Checkout PyTorch
- uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.2
+ uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.1
  with:
  no-sudo: true
- - name: Configure AWS credentials(PyTorch account) for nightly
- if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/nightly' }}
- uses: aws-actions/configure-aws-credentials@v3
- with:
- role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels
- aws-region: us-east-1
- - name: Configure AWS credentials(PyTorch account) for RC builds
- if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }}
- uses: aws-actions/configure-aws-credentials@v3
- with:
- role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels
- aws-region: us-east-1
  - name: Download Build Artifacts
  id: download-artifacts
  # NB: When the previous build job is skipped, there won't be any artifacts and
@@ -139,19 +127,14 @@ jobs:
  - name: Upload binaries
  if: steps.download-artifacts.outcome && steps.download-artifacts.outcome == 'success'
  shell: bash
  env:
  PKG_DIR: "${{ runner.temp }}/artifacts"
  UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}"
  # When running these on pull_request events these should be blank
- CONDA_PYTORCHBOT_TOKEN: ${{ secrets.conda-pytorchbot-token }}
- CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.conda-pytorchbot-token-test }}
+ AWS_ACCESS_KEY_ID: ${{ secrets.aws-pytorch-uploader-access-key-id }}
+ AWS_SECRET_ACCESS_KEY: ${{ secrets.aws-pytorch-uploader-secret-access-key }}
+ ANACONDA_API_TOKEN: ${{ secrets.conda-pytorchbot-token }}
  BUILD_NAME: ${{ inputs.build_name }}
  run: |
- set -ex
- if [[ "${GITHUB_REF_NAME}" = *-rc[0-9]* ]]; then
- export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN_TEST}"
- else
- export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN}"
- fi
  bash .circleci/scripts/binary_upload.sh

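The upload hunks are the only substantive behavioral change in this diff. On the removed (release/2.2) side, AWS access comes from short-lived assumed roles and the Anaconda token is chosen at runtime: ref names matching *-rc[0-9]* (release candidates) upload with the test-channel token, everything else with the production token. The added (release/2.1) side instead authenticates with long-lived AWS key secrets and wires ANACONDA_API_TOKEN straight to a single secret. The runtime selection, reassembled from the removed lines above into one self-contained step:

# The release/2.2 upload step's token selection, consolidated from the diff.
- name: Upload binaries
  shell: bash
  env:
    # Blank on pull_request events, populated on push/tag events.
    CONDA_PYTORCHBOT_TOKEN: ${{ secrets.conda-pytorchbot-token }}
    CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.conda-pytorchbot-token-test }}
  run: |
    set -ex
    # RC tags (e.g. a hypothetical v2.1.2-rc1) must not land in the
    # production channel, so they get the test token instead.
    if [[ "${GITHUB_REF_NAME}" = *-rc[0-9]* ]]; then
      export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN_TEST}"
    else
      export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN}"
    fi
    bash .circleci/scripts/binary_upload.sh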
Some files were not shown because too many files have changed in this diff.