[dynamo] Allow inlining of hooks for the top module

ghstack-source-id: 51408faf9d8b5f054544107f38316f2ccf1f7a3a Pull Request resolved: https://github.com/pytorch/pytorch/pull/124501
[wip][inductor] Fix batch fusion pass
2025-10-23 23:04:52 +08:00 · 2024-05-10 10:04:37 -07:00 · 2024-05-10 10:04:37 -07:00
5930 changed files with 292680 additions and 196016 deletions
--- a/.ci/docker/aotriton_version.txt
+++ b/.ci/docker/aotriton_version.txt
@ -1,5 +0,0 @@
-0.6b
-manylinux_2_17
-rocm6.1
-7f07e8a1cb1f99627eb6d77f5c0e9295c775f3c7
-77c29fa3f3b614e187d7213d745e989a92708cee2bc6020419ab49019af399d1
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -84,30 +84,16 @@ fi
 # CMake 3.18 is needed to support CUDA17 language variant
 CMAKE_VERSION=3.18.5

-_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
-_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
+_UCX_COMMIT=00bcc6bb18fc282eb160623b4c0d300147f579af
+_UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea

 # It's annoying to rename jobs every time you want to rewrite a
 # configuration, so we hardcode everything here rather than do it
 # from scratch
 case "$image" in
-  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.4.0
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    ;;
-  pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
+  pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
    CUDA_VERSION=12.1.1
-    CUDNN_VERSION=9
+    CUDNN_VERSION=8
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
@ -119,24 +105,9 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.4.0
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
-    ;;
-  pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks)
+  pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks)
    CUDA_VERSION=12.1.1
-    CUDNN_VERSION=9
+    CUDNN_VERSION=8
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
@ -149,39 +120,9 @@ case "$image" in
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.1.1
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.12
-    GCC_VERSION=9
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
-    ;;
-  pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.4.0
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.12
-    GCC_VERSION=9
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
-    ;;
-  pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
+  pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9)
    CUDA_VERSION=11.8.0
-    CUDNN_VERSION=9
+    CUDNN_VERSION=8
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
@ -193,37 +134,9 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.4.0
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    ;;
-  pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
+  pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
    CUDA_VERSION=12.1.1
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    CONDA_CMAKE=yes
-    TRITON=yes
-    ;;
-  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.4.0
-    CUDNN_VERSION=9
+    CUDNN_VERSION=8
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
@ -313,7 +226,7 @@ case "$image" in
    PROTOBUF=yes
    DB=yes
    VISION=yes
-    XPU_VERSION=0.5
+    BASEKIT_VERSION=2024.0.0-49522
    NINJA_VERSION=1.9.0
    CONDA_CMAKE=yes
    TRITON=yes
@ -330,10 +243,10 @@ case "$image" in
    DOCS=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12)
+  pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12)
    ANACONDA_PYTHON_VERSION=3.8
    CUDA_VERSION=11.8
-    CUDNN_VERSION=9
+    CUDNN_VERSION=8
    CLANG_VERSION=12
    PROTOBUF=yes
    DB=yes
@ -373,13 +286,6 @@ case "$image" in
    CONDA_CMAKE=yes
    EXECUTORCH=yes
    ;;
-  pytorch-linux-jammy-py3.12-halide)
-    CUDA_VERSION=12.4
-    ANACONDA_PYTHON_VERSION=3.12
-    GCC_VERSION=11
-    CONDA_CMAKE=yes
-    HALIDE=yes
-    ;;
  pytorch-linux-focal-linter)
    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
    # We will need to update mypy version eventually, but that's for another day. The task
@ -387,7 +293,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    CONDA_CMAKE=yes
    ;;
-  pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
+  pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter)
    ANACONDA_PYTHON_VERSION=3.9
    CUDA_VERSION=11.8
    CONDA_CMAKE=yes
@ -454,7 +360,7 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
 #when using cudnn version 8 install it separately from cuda
 if [[ "$image" == *cuda*  && ${OS} == "ubuntu" ]]; then
  IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
-  if [[ ${CUDNN_VERSION} == 9 ]]; then
+  if [[ ${CUDNN_VERSION} == 8 ]]; then
    IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
  fi
 fi
@ -497,8 +403,7 @@ docker build \
       --build-arg "DOCS=${DOCS}" \
       --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
       --build-arg "EXECUTORCH=${EXECUTORCH}" \
-       --build-arg "HALIDE=${HALIDE}" \
-       --build-arg "XPU_VERSION=${XPU_VERSION}" \
+       --build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \
       --build-arg "ACL=${ACL:-}" \
       --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
       --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
@ -507,7 +412,7 @@ docker build \
       "$@" \
       .

-# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
+# NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
 # for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
 # find the correct image. As a result, here we have to replace the
 #   "$UBUNTU_VERSION" == "18.04-rc"
--- a/.ci/docker/centos-rocm/Dockerfile
+++ b/.ci/docker/centos-rocm/Dockerfile
@ -62,7 +62,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV
+# (optional) Install vision packages like OpenCV and ffmpeg
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -77,9 +77,6 @@ RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh
 RUN rm install_rocm_magma.sh
-COPY ./common/install_amdsmi.sh install_amdsmi.sh
-RUN bash ./install_amdsmi.sh
-RUN rm install_amdsmi.sh
 ENV PATH /opt/rocm/bin:$PATH
 ENV PATH /opt/rocm/hcc/bin:$PATH
 ENV PATH /opt/rocm/hip/bin:$PATH
@ -113,13 +110,6 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt

-# Install AOTriton (Early fail)
-COPY ./aotriton_version.txt aotriton_version.txt
-COPY ./common/common_utils.sh common_utils.sh
-COPY ./common/install_aotriton.sh install_aotriton.sh
-RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
-ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
-
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-9d859653ae916d0a72f6b2b5c5925bed38832140
+d4b3e5cc607e97afdba79dc90f8ef968142f347c
--- a/.ci/docker/ci_commit_pins/halide.txt
+++ b/.ci/docker/ci_commit_pins/halide.txt
@ -1 +0,0 @@
-340136fec6d3ebc73e7a19eba1663e9b0ba8ab2d
--- a/.ci/docker/ci_commit_pins/triton-rocm.txt
+++ b/.ci/docker/ci_commit_pins/triton-rocm.txt
@ -1 +1 @@
-21eae954efa5bf584da70324b640288c3ee7aede
+bbe6246e37d8aa791c67daaf9d9d61b26c9ccfdc
--- a/.ci/docker/ci_commit_pins/triton-xpu.txt
+++ b/.ci/docker/ci_commit_pins/triton-xpu.txt
@ -1 +1 @@
-1b2f15840e0d70eec50d84c7a0575cb835524def
+b8c64f64c18d8cac598b3adb355c21e7439c21de
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-dedb7bdf339a3546896d4820366ca562c586bfa0
+45fff310c891f5a92d55445adf8cc9d29df5841e
--- a/.ci/docker/common/install_acl.sh
+++ b/.ci/docker/common/install_acl.sh
@ -1,6 +1,6 @@
 set -euo pipefail

-readonly version=v24.04
+readonly version=v23.08
 readonly src_host=https://review.mlplatform.org/ml
 readonly src_repo=ComputeLibrary

--- a/.ci/docker/common/install_amdsmi.sh
+++ b/.ci/docker/common/install_amdsmi.sh
@ -1,5 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-cd /opt/rocm/share/amd_smi && pip install .
--- a/.ci/docker/common/install_aotriton.sh
+++ b/.ci/docker/common/install_aotriton.sh
@ -1,23 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
-
-TARBALL='aotriton.tar.bz2'
-# This read command alwasy returns with exit code 1
-read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
-ARCH=$(uname -m)
-AOTRITON_INSTALL_PREFIX="$1"
-AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.bz2"
-
-cd "${AOTRITON_INSTALL_PREFIX}"
-# Must use -L to follow redirects
-curl -L --retry 3 -o "${TARBALL}" "${AOTRITON_URL}"
-ACTUAL_SHA256=$(sha256sum "${TARBALL}" | cut -d " " -f 1)
-if [ "${SHA256}" != "${ACTUAL_SHA256}" ]; then
-  echo -n "Error: The SHA256 of downloaded tarball is ${ACTUAL_SHA256},"
-  echo " which does not match the expected value ${SHA256}."
-  exit
-fi
-tar xf "${TARBALL}" && rm -rf "${TARBALL}"
--- a/.ci/docker/common/install_base.sh
+++ b/.ci/docker/common/install_base.sh
@ -3,7 +3,7 @@
 set -ex

 install_ubuntu() {
-  # NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
+  # NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
  # for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
  # find the correct image. As a result, here we have to check for
  #   "$UBUNTU_VERSION" == "18.04"*
--- a/.ci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@ -85,7 +85,7 @@ fi
  else
    CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"

-    if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.13" ]; then
+    if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then
      conda_install numpy=1.26.0 ${CONDA_COMMON_DEPS}
    else
      conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
--- a/.ci/docker/common/install_cudnn.sh
+++ b/.ci/docker/common/install_cudnn.sh
@ -1,18 +1,20 @@
 #!/bin/bash

-if [[ -n "${CUDNN_VERSION}" ]]; then
+if [[ ${CUDNN_VERSION} == 8 ]]; then
    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
    mkdir tmp_cudnn
    pushd tmp_cudnn
-    if [[ ${CUDA_VERSION:0:2} == "12" ]]; then
-        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"
-    elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
-        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
+    if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
+        CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive"
+        curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
+    elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
+        CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
+        curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz
    else
        print "Unsupported CUDA version ${CUDA_VERSION}"
        exit 1
    fi
-    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
+
    tar xf ${CUDNN_NAME}.tar.xz
    cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
    cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/
--- a/.ci/docker/common/install_cusparselt.sh
+++ b/.ci/docker/common/install_cusparselt.sh
@ -5,14 +5,9 @@ set -ex
 # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
 mkdir tmp_cusparselt && cd tmp_cusparselt

-if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then
-    arch_path='sbsa'
-    export TARGETARCH=${TARGETARCH:-$(uname -m)}
-    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
-        arch_path='x86_64'
-    fi
-    CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.5.2.1-archive"
-    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
+if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
+    CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.5.2.1-archive"
+    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
 elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
    CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
--- a/.ci/docker/common/install_executorch.sh
+++ b/.ci/docker/common/install_executorch.sh
@ -37,9 +37,6 @@ install_conda_dependencies() {

 install_pip_dependencies() {
  pushd executorch/.ci/docker
-  # Install PyTorch CPU build beforehand to avoid installing the much bigger CUDA
-  # binaries later, ExecuTorch only needs CPU
-  pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
  # Install all Python dependencies
  pip_install -r requirements-ci.txt
  popd
@ -47,14 +44,13 @@ install_pip_dependencies() {

 setup_executorch() {
  pushd executorch
-  # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
-  as_jenkins bash .ci/scripts/setup-vulkan-linux-deps.sh
+  source .ci/scripts/utils.sh

-  export PYTHON_EXECUTABLE=python
-  export EXECUTORCH_BUILD_PYBIND=ON
-  export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
+  install_flatc_from_source
+  pip_install .

-  as_jenkins .ci/scripts/setup-linux.sh cmake
+  # Make sure that all the newly generated files are owned by Jenkins
+  chown -R jenkins .
  popd
 }

--- a/.ci/docker/common/install_halide.sh
+++ b/.ci/docker/common/install_halide.sh
@ -1,46 +0,0 @@
-#!/bin/bash
-set -ex
-
-source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
-
-COMMIT=$(get_pinned_commit halide)
-test -n "$COMMIT"
-
-# activate conda to populate CONDA_PREFIX
-test -n "$ANACONDA_PYTHON_VERSION"
-eval "$(conda shell.bash hook)"
-conda activate py_$ANACONDA_PYTHON_VERSION
-
-if [ -n "${UBUNTU_VERSION}" ];then
-    apt update
-    apt-get install -y lld liblld-15-dev libpng-dev libjpeg-dev libgl-dev \
-                  libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev
-fi
-
-conda_install numpy scipy imageio cmake ninja
-
-git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git
-cmake -DCMAKE_BUILD_TYPE=Release \
-        -DLLVM_ENABLE_PROJECTS="clang" \
-        -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
-        -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_ENABLE_ASSERTIONS=ON \
-        -DLLVM_ENABLE_EH=ON -DLLVM_ENABLE_RTTI=ON -DLLVM_BUILD_32_BITS=OFF \
-        -S llvm-project/llvm -B llvm-build -G Ninja
-cmake --build llvm-build
-cmake --install llvm-build --prefix llvm-install
-export LLVM_ROOT=`pwd`/llvm-install
-export LLVM_CONFIG=$LLVM_ROOT/bin/llvm-config
-
-git clone https://github.com/halide/Halide.git
-pushd Halide
-git checkout ${COMMIT} && git submodule update --init --recursive
-pip_install -r requirements.txt
-cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
-cmake --build build
-test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
-cmake --install build --prefix ${CONDA_PREFIX}
-chown -R jenkins ${CONDA_PREFIX}
-popd
-rm -rf Halide llvm-build llvm-project llvm-install
-
-python -c "import halide"  # check for errors
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -30,12 +30,10 @@ pip_install \

 pip_install coloredlogs packaging

-pip_install onnxruntime==1.18
-pip_install onnx==1.16.0
+pip_install onnxruntime==1.17.0
+pip_install onnx==1.15.0
 # pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps
-pip_install onnxscript==0.1.0.dev20240613 --no-deps
-# required by onnxscript
-pip_install ml_dtypes
+pip_install onnxscript==0.1.0.dev20240315 --no-deps

 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -6,6 +6,9 @@ ver() {
    printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
 }

+# Map ROCm version to AMDGPU version
+declare -A AMDGPU_VERSIONS=( ["5.0"]="21.50" ["5.1.1"]="22.10.1" ["5.2"]="22.20" )
+
 install_ubuntu() {
    apt-get update
    if [[ $UBUNTU_VERSION == 18.04 ]]; then
@ -23,14 +26,31 @@ install_ubuntu() {
    apt-get install -y libc++1
    apt-get install -y libc++abi1

-    # Add amdgpu repository
-    UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
-    echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
+    if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
+        # Add amdgpu repository
+        UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
+        local amdgpu_baseurl
+        if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
+        else
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/ubuntu"
+        fi
+        echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
+    fi
+
+    ROCM_REPO="ubuntu"
+    if [[ $(ver $ROCM_VERSION) -lt $(ver 4.2) ]]; then
+        ROCM_REPO="xenial"
+    fi
+
+    if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
+        ROCM_REPO="${UBUNTU_VERSION_NAME}"
+    fi

    # Add rocm repository
    wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
    local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
-    echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/rocm.list
+    echo "deb [arch=amd64] ${rocm_baseurl} ${ROCM_REPO} main" > /etc/apt/sources.list.d/rocm.list
    apt-get update --allow-insecure-repositories

    DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
@ -39,8 +59,7 @@ install_ubuntu() {
                   rocm-libs \
                   rccl \
                   rocprofiler-dev \
-                   roctracer-dev \
-                   amd-smi-lib
+                   roctracer-dev

    if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then
        DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
@ -49,18 +68,29 @@ install_ubuntu() {
    # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
    # search for all unversioned packages
    # if search fails it will abort this script; use true to avoid case where search fails
-    MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
-    if [[ "x${MIOPENHIPGFX}" = x ]]; then
-      echo "miopen-hip-gfx package not available" && exit 1
+    if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
+        MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
+        if [[ "x${MIOPENHIPGFX}" = x ]]; then
+          echo "miopen-hip-gfx package not available" && exit 1
+        else
+          DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
+        fi
    else
-      DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
+        MIOPENKERNELS=$(apt-cache search --names-only miopenkernels | awk '{print $1}' | grep -F -v . || true)
+        if [[ "x${MIOPENKERNELS}" = x ]]; then
+          echo "miopenkernels package not available" && exit 1
+        else
+          DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENKERNELS}
+        fi
    fi

    # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
-    for kdb in /opt/rocm/share/miopen/db/*.kdb
-    do
-        sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
-    done
+    if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
+        for kdb in /opt/rocm/share/miopen/db/*.kdb
+        do
+            sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
+        done
+    fi

    # Cleanup
    apt-get autoclean && apt-get clean
@ -77,19 +107,25 @@ install_centos() {
  yum install -y epel-release
  yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r`

-  # Add amdgpu repository
-  local amdgpu_baseurl
-  if [[ $OS_VERSION == 9 ]]; then
-      amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.0/main/x86_64"
-  else
-      amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
+  if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
+      # Add amdgpu repository
+      local amdgpu_baseurl
+      if [[ $OS_VERSION == 9 ]]; then
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/9.0/main/x86_64"
+      else
+        if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
+        else
+          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/7.9/main/x86_64"
+        fi
+      fi
+      echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
+      echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
+      echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
+      echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
+      echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
+      echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo
  fi
-  echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
-  echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
-  echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
-  echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
-  echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
-  echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo

  local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}"
  echo "[ROCm]" > /etc/yum.repos.d/rocm.repo
@ -107,23 +143,33 @@ install_centos() {
                   rocm-libs \
                   rccl \
                   rocprofiler-dev \
-                   roctracer-dev \
-                   amd-smi-lib
+                   roctracer-dev

  # precompiled miopen kernels; search for all unversioned packages
  # if search fails it will abort this script; use true to avoid case where search fails
-  MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
-  if [[ "x${MIOPENHIPGFX}" = x ]]; then
-    echo "miopen-hip-gfx package not available" && exit 1
+  if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
+      MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
+      if [[ "x${MIOPENHIPGFX}" = x ]]; then
+        echo "miopen-hip-gfx package not available" && exit 1
+      else
+        yum install -y ${MIOPENHIPGFX}
+      fi
  else
-    yum install -y ${MIOPENHIPGFX}
+      MIOPENKERNELS=$(yum -q search miopenkernels | grep miopenkernels- | awk '{print $1}'| grep -F kdb. || true)
+      if [[ "x${MIOPENKERNELS}" = x ]]; then
+        echo "miopenkernels package not available" && exit 1
+      else
+        yum install -y ${MIOPENKERNELS}
+      fi
  fi

  # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
-  for kdb in /opt/rocm/share/miopen/db/*.kdb
-  do
-      sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
-  done
+  if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
+      for kdb in /opt/rocm/share/miopen/db/*.kdb
+      do
+          sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
+      done
+  fi

  # Cleanup
  yum clean all
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -15,7 +15,7 @@ conda_reinstall() {
 if [ -n "${ROCM_VERSION}" ]; then
  TRITON_REPO="https://github.com/openai/triton"
  TRITON_TEXT_FILE="triton-rocm"
-elif [ -n "${XPU_VERSION}" ]; then
+elif [ -n "${BASEKIT_VERSION}" ]; then
  TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
  TRITON_TEXT_FILE="triton-xpu"
 else
--- a/.ci/docker/common/install_vision.sh
+++ b/.ci/docker/common/install_vision.sh
@ -5,7 +5,8 @@ set -ex
 install_ubuntu() {
  apt-get update
  apt-get install -y --no-install-recommends \
-          libopencv-dev
+          libopencv-dev \
+          libavcodec-dev

  # Cleanup
  apt-get autoclean && apt-get clean
@ -18,7 +19,8 @@ install_centos() {
  yum --enablerepo=extras install -y epel-release

  yum install -y \
-      opencv-devel
+      opencv-devel \
+      ffmpeg-devel

  # Cleanup
  yum clean all
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -3,7 +3,10 @@ set -xe


 # Intel® software for general purpose GPU capabilities.
-# Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
+# Refer to https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html
+
+# Intel® oneAPI Base Toolkit (version 2024.0.0) has been updated to include functional and security updates.
+# Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html

 # Users should update to the latest version as it becomes available

@ -14,16 +17,14 @@ function install_ubuntu() {
    # Set up the repository. To do this, download the key to the system keyring
    wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
        | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
-    wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
-        | gpg --dearmor --output /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
+    wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+        | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null

    # Add the signed entry to APT sources and configure the APT client to use the Intel repository
-    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \
-        https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
+    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
        | tee /etc/apt/sources.list.d/intel-gpu-jammy.list
-    echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
-        https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
-        | tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
+    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
+        | tee /etc/apt/sources.list.d/oneAPI.list

    # Update the packages list and repository index
    apt-get update
@ -39,11 +40,11 @@ function install_ubuntu() {
        mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
    # Development Packages
    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
-    # Install Intel Support Packages
-    if [ -n "$XPU_VERSION" ]; then
-        apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION}
+    # Install Intel® oneAPI Base Toolkit
+    if [ -n "$BASEKIT_VERSION" ]; then
+        apt-get install intel-basekit=$BASEKIT_VERSION -y
    else
-        apt-get install -y intel-for-pytorch-gpu-dev
+        apt-get install intel-basekit -y
    fi

    # Cleanup
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -85,10 +85,10 @@ librosa>=0.6.2 ; python_version < "3.11"
 #Pinned versions:
 #test that import:

-mypy==1.10.0
+mypy==1.9.0
 # Pin MyPy version because new errors are likely to appear with each release
 #Description: linter
-#Pinned versions: 1.10.0
+#Pinned versions: 1.9.0
 #test that import: test_typing.py, test_type_hints.py

 networkx==2.8.8
@ -134,9 +134,9 @@ opt-einsum==3.3
 #Pinned versions: 3.3
 #test that import: test_linalg.py

-optree==0.12.1
+optree==0.11.0
 #Description: A library for tree manipulation
-#Pinned versions: 0.12.1
+#Pinned versions: 0.11.0
 #test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
 #test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
 #common_utils.py, test_eager_transforms.py, test_python_dispatch.py,
@ -306,7 +306,7 @@ pywavelets==1.5.0 ; python_version >= "3.12"
 #Pinned versions: 1.4.1
 #test that import:

-lxml==5.0.0
+lxml==5.0.0.
 #Description: This is a requirement of unittest-xml-reporting

 # Python-3.9 binaries
--- a/.ci/docker/ubuntu-cuda/Dockerfile
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@ -56,7 +56,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV
+# (optional) Install vision packages like OpenCV and ffmpeg
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -103,14 +103,6 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt

-ARG HALIDE
-# Build and install halide
-COPY ./common/install_halide.sh install_halide.sh
-COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/halide.txt halide.txt
-RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
-RUN rm install_halide.sh common_utils.sh halide.txt
-
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
@ -147,7 +139,7 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
 ARG CUDNN_VERSION
 ARG CUDA_VERSION
 COPY ./common/install_cudnn.sh install_cudnn.sh
-RUN if [ -n "${CUDNN_VERSION}" ]; then bash install_cudnn.sh; fi
+RUN if [ "${CUDNN_VERSION}" -eq 8 ]; then bash install_cudnn.sh; fi
 RUN rm install_cudnn.sh

 # Install CUSPARSELT
@ -160,7 +152,6 @@ RUN rm install_cusparselt.sh
 RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
 RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
 RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi
-RUN if [ -h /usr/local/cuda-12.4/cuda-12.4 ]; then rm /usr/local/cuda-12.4/cuda-12.4; fi

 USER jenkins
 CMD ["bash"]
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@ -53,7 +53,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV
+# (optional) Install vision packages like OpenCV and ffmpeg
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -78,11 +78,6 @@ ENV MAGMA_HOME /opt/rocm/magma
 ENV LANG C.UTF-8
 ENV LC_ALL C.UTF-8

-# Install amdsmi
-COPY ./common/install_amdsmi.sh install_amdsmi.sh
-RUN bash ./install_amdsmi.sh
-RUN rm install_amdsmi.sh
-
 # (optional) Install non-default CMake version
 ARG CMAKE_VERSION
 COPY ./common/install_cmake.sh install_cmake.sh
@ -105,13 +100,6 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt

-# Install AOTriton
-COPY ./aotriton_version.txt aotriton_version.txt
-COPY ./common/common_utils.sh common_utils.sh
-COPY ./common/install_aotriton.sh install_aotriton.sh
-RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
-ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
-
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@ -62,7 +62,7 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

 # Install XPU Dependencies
-ARG XPU_VERSION
+ARG BASEKIT_VERSION
 COPY ./common/install_xpu.sh install_xpu.sh
 RUN bash ./install_xpu.sh && rm install_xpu.sh

@ -83,7 +83,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV
+# (optional) Install vision packages like OpenCV and ffmpeg
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -80,7 +80,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV
+# (optional) Install vision packages like OpenCV and ffmpeg
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -155,14 +155,6 @@ COPY ci_commit_pins/executorch.txt executorch.txt
 RUN if [ -n "${EXECUTORCH}" ]; then bash ./install_executorch.sh; fi
 RUN rm install_executorch.sh common_utils.sh executorch.txt

-ARG HALIDE
-# Build and install halide
-COPY ./common/install_halide.sh install_halide.sh
-COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/halide.txt halide.txt
-RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
-RUN rm install_halide.sh common_utils.sh halide.txt
-
 ARG ONNX
 # Install ONNX dependencies
 COPY ./common/install_onnx.sh ./common/common_utils.sh ./
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -44,7 +44,15 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
  fi
 fi

-if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
+if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* ]]; then
+  echo "Caffe2 build is ON"
+  export BUILD_CAFFE2=ON
+fi
+
+if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
+  export ATEN_THREADING=TBB
+  export USE_TBB=1
+elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
  export ATEN_THREADING=NATIVE
 fi

@ -230,10 +238,6 @@ if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]
  export BUILD_STATIC_RUNTIME_BENCHMARK=ON
 fi

-if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
-  export CMAKE_BUILD_TYPE=RelWithAssert
-fi
-
 # Do not change workspace permissions for ROCm CI jobs
 # as it can leave workspace with bad permissions for cancelled jobs
 if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
@ -288,26 +292,9 @@ else
        # Which should be backward compatible with Numpy-1.X
        python -mpip install --pre numpy==2.0.0rc1
      fi
-
-      WERROR=1 python setup.py clean
-
-      if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
-        BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel
-        BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 python setup.py bdist_wheel --cmake
-      else
-        WERROR=1 python setup.py bdist_wheel
-      fi
+      WERROR=1 python setup.py bdist_wheel
    else
-      python setup.py clean
-      if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
-        source .ci/pytorch/install_cache_xla.sh
-      fi
-      if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
-        echo "USE_SPLIT_BUILD cannot be used with xla or rocm"
-        exit 1
-      else
-        python setup.py bdist_wheel
-      fi
+      python setup.py bdist_wheel
    fi
    pip_install_whl "$(echo dist/*.whl)"

@ -346,10 +333,9 @@ else
    CUSTOM_OP_TEST="$PWD/test/custom_operator"
    python --version
    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-
    mkdir -p "$CUSTOM_OP_BUILD"
    pushd "$CUSTOM_OP_BUILD"
-    cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
+    cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
@ -362,7 +348,7 @@ else
    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
    mkdir -p "$JIT_HOOK_BUILD"
    pushd "$JIT_HOOK_BUILD"
-    cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
+    cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
@ -374,7 +360,7 @@ else
    python --version
    mkdir -p "$CUSTOM_BACKEND_BUILD"
    pushd "$CUSTOM_BACKEND_BUILD"
-    cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
+    cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -56,29 +56,9 @@ function assert_git_not_dirty() {
 function pip_install_whl() {
  # This is used to install PyTorch and other build artifacts wheel locally
  # without using any network connection
-
-  # Convert the input arguments into an array
-  local args=("$@")
-
-  # Check if the first argument contains multiple paths separated by spaces
-  if [[ "${args[0]}" == *" "* ]]; then
-    # Split the string by spaces into an array
-    IFS=' ' read -r -a paths <<< "${args[0]}"
-    # Loop through each path and install individually
-    for path in "${paths[@]}"; do
-      echo "Installing $path"
-      python3 -mpip install --no-index --no-deps "$path"
-    done
-  else
-    # Loop through each argument and install individually
-    for path in "${args[@]}"; do
-      echo "Installing $path"
-      python3 -mpip install --no-index --no-deps "$path"
-    done
-  fi
+  python3 -mpip install --no-index --no-deps "$@"
 }

-
 function pip_install() {
  # retry 3 times
  # old versions of pip don't have the "--progress-bar" flag
@ -208,6 +188,28 @@ function clone_pytorch_xla() {
  fi
 }

+function checkout_install_torchdeploy() {
+  local commit
+  commit=$(get_pinned_commit multipy)
+  pushd ..
+  git clone --recurse-submodules https://github.com/pytorch/multipy.git
+  pushd multipy
+  git checkout "${commit}"
+  python multipy/runtime/example/generate_examples.py
+  BUILD_CUDA_TESTS=1 pip install -e .
+  popd
+  popd
+}
+
+function test_torch_deploy(){
+ pushd ..
+ pushd multipy
+ ./multipy/runtime/build/test_deploy
+ ./multipy/runtime/build/test_deploy_gpu
+ popd
+ popd
+}
+
 function checkout_install_torchbench() {
  local commit
  commit=$(get_pinned_commit torchbench)
@ -222,8 +224,6 @@ function checkout_install_torchbench() {
    # to install and test other models
    python install.py --continue_on_fail
  fi
-  echo "Print all dependencies after TorchBench is installed"
-  python -mpip freeze
  popd
 }

--- a/.ci/pytorch/create_test_cert.py
+++ b/.ci/pytorch/create_test_cert.py
@ -6,7 +6,6 @@ from cryptography.hazmat.primitives import hashes, serialization
 from cryptography.hazmat.primitives.asymmetric import rsa
 from cryptography.x509.oid import NameOID

-
 temp_dir = mkdtemp()
 print(temp_dir)

--- a/.ci/pytorch/docs-test.sh
+++ b/.ci/pytorch/docs-test.sh
@ -6,4 +6,4 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
 echo "Testing pytorch docs"

 cd docs
-TERM=vt100 make doctest
+make doctest
--- a/.ci/pytorch/install_cache_xla.sh
+++ b/.ci/pytorch/install_cache_xla.sh
@ -1,37 +0,0 @@
-#!/bin/bash
-
-# Script for installing sccache on the xla build job, which uses xla's docker
-# image and doesn't have sccache installed on it.  This is mostly copied from
-# .ci/docker/install_cache.sh.  Changes are: removing checks that will always
-# return the same thing, ex checks for for rocm, CUDA, and changing the path
-# where sccache is installed, and not changing /etc/environment.
-
-set -ex
-
-install_binary() {
-  echo "Downloading sccache binary from S3 repo"
-  curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache
-}
-
-mkdir -p /tmp/cache/bin
-mkdir -p /tmp/cache/lib
-export PATH="/tmp/cache/bin:$PATH"
-
-install_binary
-chmod a+x /tmp/cache/bin/sccache
-
-function write_sccache_stub() {
-  # Unset LD_PRELOAD for ps because of asan + ps issues
-  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
-  # shellcheck disable=SC2086
-  # shellcheck disable=SC2059
-  printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n  exec sccache $(which $1) \"\$@\"\nelse\n  exec $(which $1) \"\$@\"\nfi" > "/tmp/cache/bin/$1"
-  chmod a+x "/tmp/cache/bin/$1"
-}
-
-write_sccache_stub cc
-write_sccache_stub c++
-write_sccache_stub gcc
-write_sccache_stub g++
-write_sccache_stub clang
-write_sccache_stub clang++
--- a/.ci/pytorch/multigpu-test.sh
+++ b/.ci/pytorch/multigpu-test.sh
@ -18,9 +18,7 @@ time python test/run_test.py --verbose -i distributed/test_c10d_gloo
 time python test/run_test.py --verbose -i distributed/test_c10d_nccl
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
-time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
 time python test/run_test.py --verbose -i distributed/test_store
-time python test/run_test.py --verbose -i distributed/test_symmetric_memory
 time python test/run_test.py --verbose -i distributed/test_pg_wrapper
 time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent
 # FSDP tests
@ -52,9 +50,6 @@ time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_ra
 # FSDP2 tests
 time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh

-# Pipelining composability tests
-time python test/run_test.py --verbose -i distributed/pipelining/test_composability.py
-
 # Other tests
 time python test/run_test.py --verbose -i test_cuda_primary_ctx
 time python test/run_test.py --verbose -i test_optim -- -k test_forloop_goes_right_direction_multigpu
--- a/.ci/pytorch/perf_test/compare_with_baseline.py
+++ b/.ci/pytorch/perf_test/compare_with_baseline.py
@ -3,7 +3,6 @@ import json
 import math
 import sys

-
 parser = argparse.ArgumentParser()
 parser.add_argument(
    "--test-name", dest="test_name", action="store", required=True, help="test name"
--- a/.ci/pytorch/perf_test/get_stats.py
+++ b/.ci/pytorch/perf_test/get_stats.py
@ -3,7 +3,6 @@ import sys

 import numpy

-
 sample_data_list = sys.argv[1:]
 sample_data_list = [float(v.strip()) for v in sample_data_list]

--- a/.ci/pytorch/perf_test/update_commit_hash.py
+++ b/.ci/pytorch/perf_test/update_commit_hash.py
@ -1,7 +1,6 @@
 import json
 import sys

-
 data_file_path = sys.argv[1]
 commit_hash = sys.argv[2]

--- a/.ci/pytorch/print_sccache_log.py
+++ b/.ci/pytorch/print_sccache_log.py
@ -1,6 +1,5 @@
 import sys

-
 log_file_path = sys.argv[1]

 with open(log_file_path) as f:
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -249,7 +249,9 @@ fi
 # This tests that the debug asserts are working correctly.
 if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
    echo "We are in debug mode: $BUILD_ENVIRONMENT. Expect the python assertion to fail"
-    (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
+    # TODO: Enable the check after we set up the build to run debug asserts without having
+    #       to do a full (and slow) debug build
+    # (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
 elif [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
    # Noop when debug is disabled. Skip bazel jobs because torch isn't available there yet.
    echo "We are not in debug mode: $BUILD_ENVIRONMENT. Expect the assertion to pass"
@ -275,9 +277,6 @@ test_python_shard() {

  # Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly
  # shellcheck disable=SC2086
-
-  # modify LD_LIBRARY_PATH to ensure it has the conda env.
-  # This set of tests has been shown to be buggy without it for the split-build
  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION

  assert_git_not_dirty
@ -324,11 +323,9 @@ test_inductor_distributed() {
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_state_dict.py -k test_dp_state_dict_save_load --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
  python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose

  # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
@ -337,50 +334,26 @@ test_inductor_distributed() {
  assert_git_not_dirty
 }

-test_inductor_shard() {
-  if [[ -z "$NUM_TEST_SHARDS" ]]; then
-    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
-    exit 1
-  fi
-
+test_inductor() {
  python tools/dynamo/verify_dynamo.py
-  python test/run_test.py --inductor \
-    --include test_modules test_ops test_ops_gradients test_torch \
-    --shard "$1" "$NUM_TEST_SHARDS" \
-    --verbose
-
+  python test/run_test.py --inductor --include test_modules test_ops test_ops_gradients test_torch --verbose
  # Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state
-  python test/run_test.py \
-    --include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor \
-    --shard "$1" "$NUM_TEST_SHARDS" \
-    --verbose
-}
+  python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor --verbose

-test_inductor_aoti() {
  # docker build uses bdist_wheel which does not work with test_aot_inductor
  # TODO: need a faster way to build
  if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
-    BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
-    CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
+      BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
+      CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
  fi
 }

 test_inductor_cpp_wrapper_abi_compatible() {
  export TORCHINDUCTOR_ABI_COMPATIBLE=1
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-
  echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
  # cpu stack allocation causes segfault and needs more investigation
-  PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper
+  TORCHINDUCTOR_STACK_ALLOCATION=0 python test/run_test.py --include inductor/test_cpu_cpp_wrapper
  python test/run_test.py --include inductor/test_cuda_cpp_wrapper
-
-  TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
-    --training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
-    --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
-  python benchmarks/dynamo/check_accuracy.py \
-    --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
-    --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_timm_training.csv"
 }

 # "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@ -405,7 +378,7 @@ if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--dynamic-shapes --dynamic-batch-only)
 fi

-if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
+if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
 else
  DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
@ -530,10 +503,9 @@ test_single_dynamo_benchmark() {
    test_perf_for_dashboard "$suite" \
      "${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}"
  else
-    if [[ "${TEST_CONFIG}" == *aot_inductor* && "${TEST_CONFIG}" != *cpu_aot_inductor* ]]; then
+    if [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
      # Test AOTInductor with the ABI-compatible mode on CI
      # This can be removed once the ABI-compatible mode becomes default.
-      # For CPU device, we perfer non ABI-compatible mode on CI when testing AOTInductor.
      export TORCHINDUCTOR_ABI_COMPATIBLE=1
    fi
    python "benchmarks/dynamo/$suite.py" \
@ -551,13 +523,8 @@ test_single_dynamo_benchmark() {
 }

 test_inductor_micro_benchmark() {
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
-}
-
-test_inductor_halide() {
-  python test/run_test.py --include inductor/test_halide.py --verbose
-  assert_git_not_dirty
+  TEST_REPORTS_DIR=$(pwd)/test/test-micro-reports
+  python benchmarks/gpt_fast/benchmark.py
 }

 test_dynamo_benchmark() {
@ -574,16 +541,8 @@ test_dynamo_benchmark() {
  elif [[ "${TEST_CONFIG}" == *perf* ]]; then
    test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
  else
-    if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
-      local dt="float32"
-      if [[ "${TEST_CONFIG}" == *amp* ]]; then
-        dt="amp"
-      fi
-      if [[ "${TEST_CONFIG}" == *freezing* ]]; then
-        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "$@"
-      else
-        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "$@"
-      fi
+    if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
+      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
    elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
    else
@ -597,16 +556,12 @@ test_inductor_torchbench_smoketest_perf() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

-  # Test some models in the cpp wrapper mode
-  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-    --bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-    --bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-    --bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
+  # smoke test the cpp_wrapper mode
+  TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy --bfloat16 \
+    --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv"
  python benchmarks/dynamo/check_accuracy.py \
-    --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
-    --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
+      --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv" \
+      --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"

  python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
    --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
@ -621,8 +576,7 @@ test_inductor_torchbench_smoketest_perf() {
  # https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314,
  # and thus we lower its threshold to reduce flakiness. If this continues to be a problem,
  # we switch to use some other model.
-  # lowering threshold from 4.9 to 4.7 for cu124. Will bump it up after cuda 12.4.0->12.4.1 update
-  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.7
+  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.9

  # Check memory compression ratio for a few models
  for test in hf_Albert timm_vision_transformer; do
@ -634,15 +588,6 @@ test_inductor_torchbench_smoketest_perf() {
      "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \
      --expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
  done
-
-  # Perform some "warm-start" runs for a few huggingface models.
-  for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
-    python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
-      --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
-    python benchmarks/dynamo/check_accuracy.py \
-      --actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv"
-  done
 }

 test_inductor_torchbench_cpu_smoketest_perf(){
@ -726,6 +671,7 @@ test_aten() {
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libmkldnn* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libnccl* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libtorch* "$TEST_BASE_DIR"
+  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libtbb* "$TEST_BASE_DIR"

  ls "$TEST_BASE_DIR"
  aten/tools/run_tests.sh "$TEST_BASE_DIR"
@ -750,6 +696,21 @@ test_without_numpy() {
  popd
 }

+# PyTorch extensions must include torch/extension.h, which includes all.h,
+# which includes utils.h, which includes Parallel.h.
+# So you can call, for instance, parallel_for() from your extension,
+# but the compilation will fail because Parallel.h has only declarations;
+# the definitions are conditionally included (see the last lines of Parallel.h).
+# I tried to solve it in #39612 and #39881 by including Config.h into Parallel.h.
+# But if PyTorch is built with TBB, it provides a Config.h
+# that has AT_PARALLEL_NATIVE_TBB=1 (see #39612 or #39881), which means that including
+# torch/extension.h transitively includes Parallel.h,
+# which transitively includes tbb.h, which is not available!
+if [[ "${BUILD_ENVIRONMENT}" == *tbb* ]]; then
+  sudo mkdir -p /usr/include/tbb
+  sudo cp -r "$PWD"/third_party/tbb/include/tbb/* /usr/include/tbb
+fi
+
 test_libtorch() {
  local SHARD="$1"

@ -763,6 +724,7 @@ test_libtorch() {
    ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
+    ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libnvfuser* "$TORCH_BIN_DIR"

    export CPP_TESTS_DIR="${TORCH_BIN_DIR}"
@ -899,6 +861,7 @@ test_rpc() {
  # test reporting process to function as expected.
  ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
  ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
+  ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"

  CPP_TESTS_DIR="${TORCH_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_cpp_rpc
 }
@ -1178,21 +1141,15 @@ test_executorch() {

  pushd /executorch

-  export PYTHON_EXECUTABLE=python
-  export EXECUTORCH_BUILD_PYBIND=ON
-  export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
-
-  # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
-  # from the PR
+  # NB: We need to build ExecuTorch runner here and not inside the Docker image
+  # because it depends on PyTorch
  # shellcheck disable=SC1091
-  source .ci/scripts/setup-linux.sh cmake
-
-  echo "Run ExecuTorch unit tests"
-  pytest -v -n auto
-  # shellcheck disable=SC1091
-  LLVM_PROFDATA=llvm-profdata-12 LLVM_COV=llvm-cov-12 bash test/run_oss_cpp_tests.sh
+  source .ci/scripts/utils.sh
+  build_executorch_runner "cmake"

  echo "Run ExecuTorch regression tests for some models"
+  # NB: This is a sample model, more can be added here
+  export PYTHON_EXECUTABLE=python
  # TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
  # shellcheck disable=SC1091
  source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''
@ -1252,10 +1209,11 @@ elif [[ "$TEST_CONFIG" == distributed ]]; then
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
    test_rpc
  fi
+elif [[ "$TEST_CONFIG" == deploy ]]; then
+  checkout_install_torchdeploy
+  test_torch_deploy
 elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
-elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
-  test_inductor_halide
 elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
  test_inductor_micro_benchmark
 elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
@ -1267,14 +1225,13 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
  id=$((SHARD_NUMBER-1))
  test_dynamo_benchmark timm_models "$id"
 elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
-  if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
+  if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
    install_torchaudio cpu
  else
    install_torchaudio cuda
  fi
  install_torchtext
  install_torchvision
-  TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install git+https://github.com/pytorch/ao.git
  id=$((SHARD_NUMBER-1))
  # https://github.com/opencv/opencv-python/issues/885
  pip_install opencv-python==4.8.0.74
@ -1293,7 +1250,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
    checkout_install_torchbench
    # Do this after checkout_install_torchbench to ensure we clobber any
    # nightlies that torchbench may pull in
-    if [[ "${TEST_CONFIG}" != *cpu_inductor* && "${TEST_CONFIG}" != *cpu_aot_inductor* ]]; then
+    if [[ "${TEST_CONFIG}" != *cpu_inductor* ]]; then
      install_torchrec_and_fbgemm
    fi
    PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
@ -1301,23 +1258,17 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
 elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then
  install_torchvision
  test_inductor_cpp_wrapper_abi_compatible
-elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
+elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
  install_torchvision
-  test_inductor_shard "${SHARD_NUMBER}"
-  if [[ "${SHARD_NUMBER}" == 1 ]]; then
-    test_inductor_aoti
-    test_inductor_distributed
-  fi
-elif [[ "${TEST_CONFIG}" == *dynamo* ]]; then
+  test_inductor
+  test_inductor_distributed
+elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
+  install_torchvision
+  test_dynamo_shard 1
+  test_aten
+elif [[ "${TEST_CONFIG}" == *dynamo* && $SHARD_NUMBER -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
  install_torchvision
  test_dynamo_shard "${SHARD_NUMBER}"
-  if [[ "${SHARD_NUMBER}" == 1 ]]; then
-    test_aten
-  fi
-elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
-  install_torchvision
-  test_python_shard "$SHARD_NUMBER"
-  test_aten
 elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
  test_without_numpy
  install_torchvision
@ -1347,6 +1298,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
  test_libtorch
 elif [[ "${TEST_CONFIG}" = docs_test ]]; then
  test_docs_test
+elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
+  install_torchvision
+  test_python
+  test_aten
 elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
  install_torchvision
  test_python
--- a/.ci/pytorch/win-test-helpers/run_python_nn_smoketests.py
+++ b/.ci/pytorch/win-test-helpers/run_python_nn_smoketests.py
@ -4,7 +4,6 @@ import os
 import subprocess
 import sys

-
 COMMON_TESTS = [
    (
        "Checking that torch is available",
--- a/.circleci/codegen_validation/normalize_yaml_fragment.py
+++ b/.circleci/codegen_validation/normalize_yaml_fragment.py
@ -5,7 +5,6 @@ import sys

 import yaml

-
 # Need to import modules that lie on an upward-relative path
 sys.path.append(os.path.join(sys.path[0], ".."))

--- a/.circleci/scripts/binary_linux_test.sh
+++ b/.circleci/scripts/binary_linux_test.sh
@ -46,18 +46,13 @@ if [[ "\$python_nodot" = *310* ]]; then
  PROTOBUF_PACKAGE="protobuf>=3.19.0"
 fi

-if [[ "\$python_nodot" = *39* ]]; then
+if [[ "\$python_nodot" = *39*  ]]; then
  # There's an issue with conda channel priority where it'll randomly pick 1.19 over 1.20
  # we set a lower boundary here just to be safe
  NUMPY_PIN=">=1.20"
 fi

-if [[ "\$python_nodot" = *38* ]]; then
-  # sympy 1.12.1 is the last version that supports Python 3.8
-  SYMPY_PIN="==1.12.1"
-else
-  SYMPY_PIN=">=1.13.0"
-fi
+

 # Move debug wheels out of the package dir so they don't get installed
 mkdir -p /tmp/debug_final_pkgs
@ -88,7 +83,7 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
      "numpy\${NUMPY_PIN}" \
      mkl>=2018 \
      ninja \
-      "sympy\${SYMPY_PIN}" \
+      sympy \
      typing-extensions \
      ${PROTOBUF_PACKAGE}
    if [[ "$DESIRED_CUDA" == 'cpu' ]]; then
@ -101,21 +96,8 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
    conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
  )
 elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
-  if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
-    if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
-      pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)"
-      pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)"
-      # todo: after folder is populated use the pypi_pkg channel instead
-      pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg"
-      retry pip install -q numpy protobuf typing-extensions
-    else
-      pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
-      retry pip install -q numpy protobuf typing-extensions
-    fi
-  else
-    pip install "\$pkg"
-    retry pip install -q numpy protobuf typing-extensions
-  fi
+  pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
+  retry pip install -q numpy protobuf typing-extensions
 fi
 if [[ "$PACKAGE_TYPE" == libtorch ]]; then
  pkg="\$(ls /final_pkgs/*-latest.zip)"
@ -123,18 +105,9 @@ if [[ "$PACKAGE_TYPE" == libtorch ]]; then
  cd /tmp/libtorch
 fi

-if [[ "$GPU_ARCH_TYPE" == xpu ]]; then
-  # Workaround for __mkl_tmp_MOD unbound variable issue, refer https://github.com/pytorch/pytorch/issues/130543
-  set +u
-  source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh
-fi
-
 # Test the package
 /builder/check_binary.sh

-# Clean temp files
-cd /builder && git clean -ffdx
-
 # =================== The above code will be executed inside Docker container ===================
 EOL
 echo
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@ -33,9 +33,9 @@ if [[ -z "$DOCKER_IMAGE" ]]; then
  if [[ "$PACKAGE_TYPE" == conda ]]; then
    export DOCKER_IMAGE="pytorch/conda-cuda"
  elif [[ "$DESIRED_CUDA" == cpu ]]; then
-    export DOCKER_IMAGE="pytorch/manylinux:cpu"
+    export DOCKER_IMAGE="pytorch/manylinux-cpu"
  else
-    export DOCKER_IMAGE="pytorch/manylinux-builder:${DESIRED_CUDA:2}"
+    export DOCKER_IMAGE="pytorch/manylinux-cuda${DESIRED_CUDA:2}"
  fi
 fi

@ -75,9 +75,9 @@ export PYTORCH_BUILD_NUMBER=1
 TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)

 # Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for all the wheel builds, hence append TRITON_CONSTRAINT
-TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
 if [[ "$PACKAGE_TYPE" =~ .*wheel.* &&  -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
-  # Only linux Python < 3.13 are supported wheels for triton
+  # Triton wheels are only supported on Linux x86_64 with Python < 3.12
+  TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.12'"
  TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
  if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
      TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
@ -87,11 +87,11 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* &&  -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:
 fi

 # Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package
-if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
-    TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
+if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then
+    TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}"
    if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
        TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt)
-        TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
+        TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}"
    fi
    if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
        export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
@ -100,18 +100,30 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
    fi
 fi

-# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package
-if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* && $(uname) == "Linux" ]]; then
-    TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}"
-    if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
-        TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt)
-        TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+${TRITON_SHORTHASH}"
-    fi
-    if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
-        export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
-    else
-        export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}"
+JAVA_HOME=
+BUILD_JNI=OFF
+if [[ "$PACKAGE_TYPE" == libtorch ]]; then
+  POSSIBLE_JAVA_HOMES=()
+  POSSIBLE_JAVA_HOMES+=(/usr/local)
+  POSSIBLE_JAVA_HOMES+=(/usr/lib/jvm/java-8-openjdk-amd64)
+  POSSIBLE_JAVA_HOMES+=(/Library/Java/JavaVirtualMachines/*.jdk/Contents/Home)
+  # Add the Windows-specific JNI path
+  POSSIBLE_JAVA_HOMES+=("$PWD/pytorch/.circleci/windows-jni/")
+  for JH in "${POSSIBLE_JAVA_HOMES[@]}" ; do
+    if [[ -e "$JH/include/jni.h" ]] ; then
+      # The bundled windows-jni headers only apply on Windows (msys); elsewhere stop searching and leave JAVA_HOME unset
+      if [[ "$JH" == "$PWD/pytorch/.circleci/windows-jni/" && "$OSTYPE" != "msys" ]] ; then
+        break
+      fi
+      echo "Found jni.h under $JH"
+      JAVA_HOME="$JH"
+      BUILD_JNI=ON
+      break
    fi
+  done
+  if [ -z "$JAVA_HOME" ]; then
+    echo "Did not find jni.h"
+  fi
 fi

 cat >"$envfile" <<EOL
@ -124,7 +136,6 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}"
 export DESIRED_CUDA="$DESIRED_CUDA"
 export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
 export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
-export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}"
 if [[ "${OSTYPE}" == "msys" ]]; then
  export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
  if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then
@ -148,6 +159,8 @@ export TORCH_CONDA_BUILD_FOLDER='pytorch-nightly'
 export ANACONDA_USER='pytorch'

 export USE_FBGEMM=1
+export JAVA_HOME=$JAVA_HOME
+export BUILD_JNI=$BUILD_JNI
 export PIP_UPLOAD_FOLDER="$PIP_UPLOAD_FOLDER"
 export DOCKER_IMAGE="$DOCKER_IMAGE"

--- a/.circleci/scripts/binary_upload.sh
+++ b/.circleci/scripts/binary_upload.sh
@ -25,15 +25,6 @@ if [[ "${DRY_RUN}" = "disabled" ]]; then
  AWS_S3_CP="aws s3 cp"
 fi

-if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
-  UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
-fi
-
-# this is special build with all dependencies packaged
-if [[ ${BUILD_NAME} == *-full* ]]; then
-  UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full"
-fi
-
 # Sleep 5 minutes between retries for conda upload
 retry () {
  "$@"  || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@")
--- a/.circleci/scripts/trigger_azure_pipeline.py
+++ b/.circleci/scripts/trigger_azure_pipeline.py
@ -8,7 +8,6 @@ import time

 import requests

-
 AZURE_PIPELINE_BASE_URL = "https://aiinfra.visualstudio.com/PyTorch/"
 AZURE_DEVOPS_PAT_BASE64 = os.environ.get("AZURE_DEVOPS_PAT_BASE64_SECRET", "")
 PIPELINE_ID = "911"
--- a/.clang-tidy
+++ b/.clang-tidy
@ -61,7 +61,6 @@ readability-simplify-subscript-expr,
 readability-string-compare,
 '
 HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
+AnalyzeTemporaryDtors: false
 WarningsAsErrors: '*'
-CheckOptions:
-  misc-header-include-cycle.IgnoredFilesList: 'format.h;ivalue.h;custom_class.h;Dict.h;List.h'
 ...
--- a/.flake8
+++ b/.flake8
@ -2,7 +2,7 @@
 # NOTE: **Mirror any changes** to this file the [tool.ruff] config in pyproject.toml
 # before we can fully move to use ruff
 enable-extensions = G
-select = B,C,E,F,G,P,SIM1,SIM911,T4,W,B9,TOR0,TOR1,TOR2,TOR9
+select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2,TOR9
 max-line-length = 120
 # C408 ignored because we like the dict keyword argument syntax
 # E501 is not flexible enough, we're using B950 instead
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@ -40,7 +40,3 @@ e6ec0efaf87703c5f889cfc20b29be455885d58d
 a53cda1ddc15336dc1ff0ce1eff2a49cdc5f882e
 # 2024-01-02 clangformat: fused adam #116583
 9dc68d1aa9e554d09344a10fff69f7b50b2d23a0
-# 2024-06-28 enable UFMT in `torch/storage.py`
-d80939e5e9337e8078f11489afefec59fd42f93b
-# 2024-06-28 enable UFMT in `torch.utils.data`
-7cf0b90e49689d45be91aa539fdf54cf2ea8a9a3
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -1,12 +1,9 @@
 self-hosted-runner:
  labels:
-    # GitHub hosted x86 Linux runners
    - linux.20_04.4x
    - linux.20_04.16x
-    # Repo-specific LF hosted ARC runners
-    - linux.large.arc
-    # Organization-wide AWS Linux Runners
    - linux.large
+    - linux.large.arc
    - linux.2xlarge
    - linux.4xlarge
    - linux.12xlarge
@ -16,36 +13,17 @@ self-hosted-runner:
    - linux.8xlarge.nvidia.gpu
    - linux.16xlarge.nvidia.gpu
    - linux.g5.4xlarge.nvidia.gpu
-    # Organization-wide AWS Linux Runners on Linux Foundation account
-    - lf.linux.large
-    - lf.linux.2xlarge
-    - lf.linux.4xlarge
-    - lf.linux.12xlarge
-    - lf.linux.24xlarge
-    - lf.linux.arm64.2xlarge
-    - lf.linux.4xlarge.nvidia.gpu
-    - lf.linux.8xlarge.nvidia.gpu
-    - lf.linux.16xlarge.nvidia.gpu
-    - lf.linux.g5.4xlarge.nvidia.gpu
-    # Repo-specific IBM hosted S390x runner
-    - linux.s390x
-    # Organization wide AWS Windows runners
    - windows.4xlarge.nonephemeral
    - windows.8xlarge.nvidia.gpu
    - windows.8xlarge.nvidia.gpu.nonephemeral
    - windows.g5.4xlarge.nvidia.gpu
-    # Organization-wide AMD hosted MI300 runners
+    - bm-runner
    - linux.rocm.gpu
-    # Repo-specific Apple hosted  runners
-    - macos-m1-ultra
-    - macos-m2-14
-    # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
    - macos-m1-stable
    - macos-m1-13
    - macos-m1-14
-    # GitHub-hosted MacOS runners
+    - macos-12-xl
+    - macos-12
+    - macos12.3-m1
    - macos-latest-xlarge
    - macos-13-xlarge
-    - macos-14-xlarge
-    # Organization-wide Intel hosted XPU runners
-    - linux.idc.xpu
--- a/.github/actions/diskspace-cleanup/action.yml
+++ b/.github/actions/diskspace-cleanup/action.yml
@ -14,14 +14,12 @@ runs:
    - name: Cleans up diskspace
      shell: bash
      run: |
-        set -ex
        diskspace_cutoff=${{ inputs.diskspace-cutoff }}
-        docker_root_dir=$(docker info -f '{{.DockerRootDir}}')
-        diskspace=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
+        diskspace=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
        if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then
            docker system prune -af
-            diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
+            diskspace_new=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
            if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
                echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
                echo "$msg"
--- a/.github/actions/filter-test-configs/action.yml
+++ b/.github/actions/filter-test-configs/action.yml
@ -66,8 +66,7 @@ runs:
        command: |
          set -eux
          # PyYAML 6.0 doesn't work with MacOS x86 anymore
-          # This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2
-          python3 -m pip install requests==2.27.1 pyyaml==6.0.1
+          python3 -m pip install requests==2.26.0 pyyaml==6.0.1

    - name: Parse ref
      id: parse-ref
--- a/.github/actions/linux-build/action.yml
+++ b/.github/actions/linux-build/action.yml
@ -52,13 +52,6 @@ inputs:
    description: Hugging Face Hub token
    required: false
    default: ""
-  use_split_build:
-    description: |
-      [Experimental] Build a libtorch only wheel and build pytorch such that
-      are built from the libtorch wheel.
-    required: false
-    type: boolean
-    default: false
 outputs:
  docker-image:
    value: ${{ steps.calculate-docker-image.outputs.docker-image }}
@ -151,7 +144,6 @@ runs:
        DEBUG: ${{ inputs.build-with-debug == 'true' && '1' || '0' }}
        OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
        HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
-        USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
      shell: bash
      run: |
        # detached container should get cleaned up by teardown_ec2_linux
@ -171,7 +163,6 @@ runs:
          -e PR_LABELS \
          -e OUR_GITHUB_JOB_ID \
          -e HUGGING_FACE_HUB_TOKEN \
-          -e USE_SPLIT_BUILD \
          --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
          --security-opt seccomp=unconfined \
          --cap-add=SYS_PTRACE \
@ -192,7 +183,7 @@ runs:

    - name: Store PyTorch Build Artifacts on S3
      uses: seemethere/upload-artifact-s3@v5
-      if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' && inputs.use_split_build != 'true'
+      if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped'
      with:
        name: ${{ inputs.build-environment }}
        retention-days: 14
@ -200,16 +191,6 @@ runs:
        path: artifacts.zip
        s3-bucket: ${{ inputs.s3-bucket }}

-    - name: Store PyTorch Build Artifacts on S3 for split build
-      uses: seemethere/upload-artifact-s3@v5
-      if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' && inputs.use_split_build == 'true'
-      with:
-        name: ${{ inputs.build-environment }}-experimental-split-build
-        retention-days: 14
-        if-no-files-found: error
-        path: artifacts.zip
-        s3-bucket: ${{ inputs.s3-bucket }}
-
    - name: Upload sccache stats
      if: steps.build.outcome != 'skipped'
      uses: seemethere/upload-artifact-s3@v5
--- a/.github/actions/test-pytorch-binary/action.yml
+++ b/.github/actions/test-pytorch-binary/action.yml
@ -26,7 +26,6 @@ runs:
          -e PYTORCH_FINAL_PACKAGE_DIR \
          -e PYTORCH_ROOT \
          -e SKIP_ALL_TESTS \
-          -e USE_SPLIT_BUILD \
          --tty \
          --detach \
          -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
@ -36,8 +35,7 @@ runs:
          "${DOCKER_IMAGE}"
        )

-        echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
-        if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then
+        if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" ]]; then
          # Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
          grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
        fi
@ -46,11 +44,3 @@ runs:
        # Generate test script
        docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh"
        docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"
-
-    - name: Cleanup docker
-      if: always() && (env.BUILD_ENVIRONMENT == 'linux-s390x-binary-manywheel' || env.GPU_ARCH_TYPE == 'xpu')
-      shell: bash
-      run: |
-        # on s390x or xpu stop the container for clean worker stop
-        # shellcheck disable=SC2046
-        docker stop "${{ env.CONTAINER_NAME }}" || true
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-69b2a0adc2ec03ab99990d7e8be3d4510438c148
+ea437b31ce316ea3d66fe73768c0dcb94edb79ad
--- a/.github/ci_commit_pins/torchbench.txt
+++ b/.github/ci_commit_pins/torchbench.txt
@ -1 +1 @@
-23512dbebd44a11eb84afbf53c3c071dd105297e
+d6015d42d9a1834bc7595c4bd6852562fb80b30b
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-5ea4535f0699f366adb554183a65ebf7dc34a8be
+e3fc03314dab5f44e3ed9ccbba6c15fbca3285cd
--- a/.github/lf-canary-scale-config.yml
+++ b/.github/lf-canary-scale-config.yml
@ -1,281 +0,0 @@
-# Defines runner types that will be provisioned by by LF Self-hosted
-# runners for pytorch/pytorch-canary and their labels.
-#
-# Runners listed here will be available as self hosted runners.
-# Configuration is directly pulled from the main branch.
-#
-# Default values:
-#
-# runner_types:
-#   runner_label: # label to specify in the Github Actions workflow
-#     instance_type: m4.large
-#     os: linux
-#     max_available: 20
-#     disk_size: 50
-#     is_ephemeral: true
-
-runner_types:
-  lf.c.linux.12xlarge:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-  lf.c.linux.24xl.spr-metal:
-    disk_size: 200
-    instance_type: c7i.metal-24xl
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-  lf.c.linux.16xlarge.spr:
-    disk_size: 200
-    instance_type: c7i.16xlarge
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-  lf.c.linux.12xlarge.ephemeral:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: true
-    max_available: 300
-    os: linux
-  lf.c.linux.16xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.16xlarge
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-  lf.c.linux.24xlarge:
-    disk_size: 150
-    instance_type: c5.24xlarge
-    is_ephemeral: false
-    max_available: 250
-    os: linux
-  lf.c.linux.2xlarge:
-    disk_size: 150
-    instance_type: c5.2xlarge
-    is_ephemeral: false
-    max_available: 3120
-    os: linux
-  lf.c.linux.4xlarge:
-    disk_size: 150
-    instance_type: c5.4xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-  lf.c.linux.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.4xlarge
-    is_ephemeral: false
-    max_available: 520
-    os: linux
-  lf.c.linux.8xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.8xlarge
-    is_ephemeral: false
-    max_available: 400
-    os: linux
-  lf.c.linux.g4dn.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.12xlarge
-    is_ephemeral: false
-    max_available: 50
-    os: linux
-  lf.c.linux.g4dn.metal.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.metal
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-  lf.c.linux.g5.48xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.48xlarge
-    is_ephemeral: false
-    max_available: 20
-    os: linux
-  lf.c.linux.g5.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.12xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-  lf.c.linux.g5.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.4xlarge
-    is_ephemeral: false
-    max_available: 1200
-    os: linux
-  lf.c.linux.large:
-    disk_size: 15
-    instance_type: c5.large
-    is_ephemeral: false
-    os: linux
-  lf.c.linux.arm64.2xlarge:
-    disk_size: 256
-    instance_type: t4g.2xlarge
-    is_ephemeral: false
-    max_available: 200
-    os: linux
-  lf.c.linux.arm64.m7g.2xlarge:
-    disk_size: 256
-    instance_type: m7g.2xlarge
-    is_ephemeral: false
-    max_available: 20
-    os: linux
-  lf.c.windows.4xlarge:
-    disk_size: 256
-    instance_type: c5d.4xlarge
-    is_ephemeral: true
-    max_available: 420
-    os: windows
-  lf.c.windows.4xlarge.nonephemeral:
-    disk_size: 256
-    instance_type: c5d.4xlarge
-    is_ephemeral: false
-    max_available: 420
-    os: windows
-  lf.c.windows.8xlarge.nvidia.gpu:
-    disk_size: 256
-    instance_type: p3.2xlarge
-    is_ephemeral: true
-    max_available: 150
-    os: windows
-  lf.c.windows.8xlarge.nvidia.gpu.nonephemeral:
-    disk_size: 256
-    instance_type: p3.2xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: windows
-  lf.c.windows.g5.4xlarge.nvidia.gpu:
-    disk_size: 256
-    instance_type: g5.4xlarge
-    is_ephemeral: false
-    max_available: 250
-    os: windows
-
-  ### Setup runner types to test the Amazon Linux 2023 AMI
-  lf.c.amz2023.linux.12xlarge:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.24xl.spr-metal:
-    disk_size: 200
-    instance_type: c7i.metal-24xl
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.16xlarge.spr:
-    disk_size: 200
-    instance_type: c7i.16xlarge
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.12xlarge.ephemeral:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: true
-    max_available: 300
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.16xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.16xlarge
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.24xlarge:
-    disk_size: 150
-    instance_type: c5.24xlarge
-    is_ephemeral: false
-    max_available: 250
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.2xlarge:
-    disk_size: 150
-    instance_type: c5.2xlarge
-    is_ephemeral: false
-    max_available: 3120
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.4xlarge:
-    disk_size: 150
-    instance_type: c5.4xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.4xlarge
-    is_ephemeral: false
-    max_available: 520
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.8xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.8xlarge
-    is_ephemeral: false
-    max_available: 400
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.g4dn.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.12xlarge
-    is_ephemeral: false
-    max_available: 50
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.g4dn.metal.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.metal
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.g5.48xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.48xlarge
-    is_ephemeral: false
-    max_available: 20
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.g5.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.12xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.g5.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.4xlarge
-    is_ephemeral: false
-    max_available: 1200
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.large:
-    disk_size: 15
-    instance_type: c5.large
-    is_ephemeral: false
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.arm64.2xlarge:
-    disk_size: 256
-    instance_type: t4g.2xlarge
-    is_ephemeral: false
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.c.amz2023.linux.arm64.m7g.2xlarge:
-    disk_size: 256
-    instance_type: m7g.2xlarge
-    is_ephemeral: false
-    max_available: 20
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
--- a/.github/lf-scale-config.yml
+++ b/.github/lf-scale-config.yml
@ -1,281 +0,0 @@
-# Defines runner types that will be provisioned by by LF Self-hosted
-# runners for pytorch/pytorch and their labels.
-#
-# Runners listed here will be available as self hosted runners.
-# Configuration is directly pulled from the main branch.
-#
-# Default values:
-#
-# runner_types:
-#   runner_label: # label to specify in the Github Actions workflow
-#     instance_type: m4.large
-#     os: linux
-#     max_available: 20
-#     disk_size: 50
-#     is_ephemeral: true
-
-runner_types:
-  lf.linux.12xlarge:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-  lf.linux.24xl.spr-metal:
-    disk_size: 200
-    instance_type: c7i.metal-24xl
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-  lf.linux.16xlarge.spr:
-    disk_size: 200
-    instance_type: c7i.16xlarge
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-  lf.linux.12xlarge.ephemeral:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: true
-    max_available: 300
-    os: linux
-  lf.linux.16xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.16xlarge
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-  lf.linux.24xlarge:
-    disk_size: 150
-    instance_type: c5.24xlarge
-    is_ephemeral: false
-    max_available: 250
-    os: linux
-  lf.linux.2xlarge:
-    disk_size: 150
-    instance_type: c5.2xlarge
-    is_ephemeral: false
-    max_available: 3120
-    os: linux
-  lf.linux.4xlarge:
-    disk_size: 150
-    instance_type: c5.4xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-  lf.linux.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.4xlarge
-    is_ephemeral: false
-    max_available: 520
-    os: linux
-  lf.linux.8xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.8xlarge
-    is_ephemeral: false
-    max_available: 400
-    os: linux
-  lf.linux.g4dn.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.12xlarge
-    is_ephemeral: false
-    max_available: 50
-    os: linux
-  lf.linux.g4dn.metal.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.metal
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-  lf.linux.g5.48xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.48xlarge
-    is_ephemeral: false
-    max_available: 20
-    os: linux
-  lf.linux.g5.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.12xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-  lf.linux.g5.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.4xlarge
-    is_ephemeral: false
-    max_available: 1200
-    os: linux
-  lf.linux.large:
-    disk_size: 15
-    instance_type: c5.large
-    is_ephemeral: false
-    os: linux
-  lf.linux.arm64.2xlarge:
-    disk_size: 256
-    instance_type: t4g.2xlarge
-    is_ephemeral: false
-    max_available: 200
-    os: linux
-  lf.linux.arm64.m7g.2xlarge:
-    disk_size: 256
-    instance_type: m7g.2xlarge
-    is_ephemeral: false
-    max_available: 20
-    os: linux
-  lf.windows.4xlarge:
-    disk_size: 256
-    instance_type: c5d.4xlarge
-    is_ephemeral: true
-    max_available: 420
-    os: windows
-  lf.windows.4xlarge.nonephemeral:
-    disk_size: 256
-    instance_type: c5d.4xlarge
-    is_ephemeral: false
-    max_available: 420
-    os: windows
-  lf.windows.8xlarge.nvidia.gpu:
-    disk_size: 256
-    instance_type: p3.2xlarge
-    is_ephemeral: true
-    max_available: 150
-    os: windows
-  lf.windows.8xlarge.nvidia.gpu.nonephemeral:
-    disk_size: 256
-    instance_type: p3.2xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: windows
-  lf.windows.g5.4xlarge.nvidia.gpu:
-    disk_size: 256
-    instance_type: g5.4xlarge
-    is_ephemeral: false
-    max_available: 250
-    os: windows
-
-  ### Setup runner types to test the Amazon Linux 2023 AMI
-  lf.amz2023.linux.12xlarge:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.24xl.spr-metal:
-    disk_size: 200
-    instance_type: c7i.metal-24xl
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.16xlarge.spr:
-    disk_size: 200
-    instance_type: c7i.16xlarge
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.12xlarge.ephemeral:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: true
-    max_available: 300
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.16xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.16xlarge
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.24xlarge:
-    disk_size: 150
-    instance_type: c5.24xlarge
-    is_ephemeral: false
-    max_available: 250
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.2xlarge:
-    disk_size: 150
-    instance_type: c5.2xlarge
-    is_ephemeral: false
-    max_available: 3120
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.4xlarge:
-    disk_size: 150
-    instance_type: c5.4xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.4xlarge
-    is_ephemeral: false
-    max_available: 520
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.8xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.8xlarge
-    is_ephemeral: false
-    max_available: 400
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.g4dn.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.12xlarge
-    is_ephemeral: false
-    max_available: 50
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.g4dn.metal.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.metal
-    is_ephemeral: false
-    max_available: 30
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.g5.48xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.48xlarge
-    is_ephemeral: false
-    max_available: 20
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.g5.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.12xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.g5.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.4xlarge
-    is_ephemeral: false
-    max_available: 1200
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.large:
-    disk_size: 15
-    instance_type: c5.large
-    is_ephemeral: false
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.arm64.2xlarge:
-    disk_size: 256
-    instance_type: t4g.2xlarge
-    is_ephemeral: false
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
-  lf.amz2023.linux.arm64.m7g.2xlarge:
-    disk_size: 256
-    instance_type: m7g.2xlarge
-    is_ephemeral: false
-    max_available: 20
-    os: linux
-    ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@ -27,9 +27,11 @@
  - third_party/onnx
  - caffe2/python/onnx/**
  approved_by:
+  - BowenBao
  - justinchuby
  - liqunfu
  - shubhambhokare1
+  - thiagocrepaldi
  - titaiwangms
  - wschin
  - xadupre
@ -242,9 +244,7 @@
  - torch/csrc/xpu/**
  - torch/xpu/**
  - test/xpu/**
-  - test/test_xpu.py
  - third_party/xpu.txt
-  - .ci/docker/ci_commit_pins/triton-xpu.txt
  approved_by:
  - EikanWang
  - jgong5
@ -286,7 +286,6 @@
  - test/cpp/dist_autograd/**
  - test/cpp/rpc/**
  approved_by:
-  - wconstab
  - mrshenli
  - pritamdamania87
  - zhaojuanmao
@ -313,25 +312,6 @@
  - Lint
  - pull

- name: DCP
-  patterns:
-  - torch/distributed/checkpoint/**
-  approved_by:
-  - LucasLLC
-  - fegin
-  - wz337
-  - saumishr
-  - daulet-askarov
-  - pradeepdfb
-  - kirtiteja
-  - mhorowitz
-  - saiteja64
-  mandatory_checks_name:
-  - EasyCLA
-  - Lint
-  - pull
-
-
 - name: IDEEP
  patterns:
  - third_party/ideep
@ -395,21 +375,13 @@

 - name: CPU inductor
  patterns:
-  - torch/_inductor/mkldnn_ir.py
  - torch/_inductor/mkldnn_lowerings.py
  - torch/_inductor/fx_passes/mkldnn_fusion.py
  - torch/_inductor/fx_passes/quantization.py
-  - torch/_inductor/codegen/cpp_prefix.h
  - torch/_inductor/codegen/cpp.py
-  - torch/_inductor/codegen/cpp_utils.py
-  - torch/_inductor/codegen/cpp_micro_gemm.py
-  - torch/_inductor/codegen/cpp_template_kernel.py
-  - torch/_inductor/codegen/cpp_template.py
-  - torch/_inductor/codegen/cpp_gemm_template.py
  - test/inductor/test_mkldnn_pattern_matcher.py
-  - test/inductor/test_cpu_repro.py
+  - test/inductor/test_cpu_repo.py
  - test/inductor/test_cpu_cpp_wrapper.py
-  - test/inductor/test_cpu_select_algorithm.py
  - aten/src/ATen/cpu/**
  - aten/src/ATen/native/quantized/cpu/**
  - test/quantization/core/test_quantized_op.py
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -1,5 +1,6 @@
 tracking_issue: 24422
 ciflow_tracking_issue: 64124
+TD_rollout_issue: 123120
 ciflow_push_tags:
 - ciflow/binaries
 - ciflow/binaries_conda
@ -8,7 +9,6 @@ ciflow_push_tags:
 - ciflow/inductor
 - ciflow/inductor-perf-compare
 - ciflow/inductor-micro-benchmark
- ciflow/inductor-cu124
 - ciflow/linux-aarch64
 - ciflow/mps
 - ciflow/nightly
@ -20,10 +20,10 @@ ciflow_push_tags:
 - ciflow/xpu
 - ciflow/torchbench
 retryable_workflows:
+- lint
 - pull
 - trunk
 - linux-binary
 - windows-binary
 labeler_config: labeler.yml
 label_to_label_config: label_to_label.yml
-mergebot: True
--- a/.github/requirements-gha-cache.txt
+++ b/.github/requirements-gha-cache.txt
@ -10,6 +10,6 @@ lintrunner==0.10.7
 ninja==1.10.0.post1
 nvidia-ml-py==11.525.84
 pyyaml==6.0
-requests==2.32.2
+requests==2.31.0
 rich==10.9.0
 rockset==1.0.3
--- a/.github/requirements/conda-env-Linux-X64.txt
+++ b/.github/requirements/conda-env-Linux-X64.txt
@ -4,5 +4,6 @@ mkl-include=2022.1.0
 ninja=1.10.2
 numpy=1.23.3
 pyyaml=6.0
+requests=2.31.0
 setuptools=68.2.2
-typing-extensions=4.9.0
+typing-extensions=4.3.0
--- a/.github/requirements/conda-env-iOS.txt
+++ b/.github/requirements/conda-env-iOS.txt
@ -3,5 +3,6 @@ cmake=3.22.1
 ninja=1.10.2
 numpy=1.23.3
 pyyaml=6.0
+requests=2.31.0
 setuptools=68.2.2
-typing-extensions=4.9.0
+typing-extensions=4.3.0
--- a/.github/requirements/conda-env-macOS-ARM64
+++ b/.github/requirements/conda-env-macOS-ARM64
@ -2,7 +2,7 @@ numpy=1.22.3
 pyyaml=6.0
 setuptools=61.2.0
 cmake=3.22.*
-typing-extensions=4.9.0
+typing-extensions=4.3.0
 dataclasses=0.8
 pip=22.2.2
 pillow=10.0.1
--- a/.github/requirements/conda-env-macOS-X64
+++ b/.github/requirements/conda-env-macOS-X64
@ -4,7 +4,7 @@ numpy=1.21.2
 pyyaml=5.3
 setuptools=46.0.0
 cmake=3.22.*
-typing-extensions=4.9.0
+typing-extensions=4.3.0
 dataclasses=0.8
 pip=22.2.2
 pillow=10.0.1
--- a/.github/requirements/pip-requirements-iOS.txt
+++ b/.github/requirements/pip-requirements-iOS.txt
@ -1,4 +1,4 @@
 # iOS simulator requirements
 coremltools==5.0b5
 protobuf==3.20.2
-optree==0.12.1
+optree==0.11.0
--- a/.github/requirements/pip-requirements-macOS.txt
+++ b/.github/requirements/pip-requirements-macOS.txt
@ -17,16 +17,16 @@ pytest-xdist==3.3.1
 pytest-rerunfailures==10.3
 pytest-flakefinder==1.1.0
 scipy==1.10.1
-sympy==1.12.1 ; python_version == "3.8"
-sympy>=1.13.0 ; python_version >= "3.9"
+sympy==1.11.1
 unittest-xml-reporting<=3.2.0,>=2.0.0
 xdoctest==1.1.0
 filelock==3.6.0
+sympy==1.11.1
 pytest-cpp==2.3.0
 rockset==1.0.3
 z3-solver==4.12.2.0
 tensorboard==2.13.0
-optree==0.12.1
+optree==0.11.0
 # NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in
 # which the stringify metadata is wrong when escaping double quote
 protobuf==3.20.2
--- a/.github/scripts/amd/package_triton_wheel.sh
+++ b/.github/scripts/amd/package_triton_wheel.sh
@ -93,8 +93,6 @@ done

 # Copy Include Files
 cp -r $ROCM_HOME/include/hip $TRITON_ROCM_DIR/include
-cp -r $ROCM_HOME/include/roctracer $TRITON_ROCM_DIR/include
-cp -r $ROCM_HOME/include/hsa $TRITON_ROCM_DIR/include

 # Copy linker
 mkdir -p $TRITON_ROCM_DIR/llvm/bin
--- a/.github/scripts/build_triton_wheel.py
+++ b/.github/scripts/build_triton_wheel.py
@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-
 import os
 import shutil
 import sys
@ -8,17 +7,12 @@ from subprocess import check_call
 from tempfile import TemporaryDirectory
 from typing import Optional

-
 SCRIPT_DIR = Path(__file__).parent
 REPO_DIR = SCRIPT_DIR.parent.parent


-def read_triton_pin(device: str = "cuda") -> str:
-    triton_file = "triton.txt"
-    if device == "rocm":
-        triton_file = "triton-rocm.txt"
-    elif device == "xpu":
-        triton_file = "triton-xpu.txt"
+def read_triton_pin(rocm_hash: bool = False) -> str:
+    triton_file = "triton.txt" if not rocm_hash else "triton-rocm.txt"
    with open(REPO_DIR / ".ci" / "docker" / "ci_commit_pins" / triton_file) as f:
        return f.read().strip()

@ -55,7 +49,7 @@ def build_triton(
    version: str,
    commit_hash: str,
    build_conda: bool = False,
-    device: str = "cuda",
+    build_rocm: bool = False,
    py_version: Optional[str] = None,
    release: bool = False,
 ) -> Path:
@ -75,14 +69,11 @@ def build_triton(
        triton_basedir = Path(tmpdir) / "triton"
        triton_pythondir = triton_basedir / "python"
        triton_repo = "https://github.com/openai/triton"
-        if device == "rocm":
+        if build_rocm:
            triton_pkg_name = "pytorch-triton-rocm"
-        elif device == "xpu":
-            triton_pkg_name = "pytorch-triton-xpu"
-            triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton"
        else:
            triton_pkg_name = "pytorch-triton"
-        check_call(["git", "clone", triton_repo, "triton"], cwd=tmpdir)
+        check_call(["git", "clone", triton_repo], cwd=tmpdir)
        if release:
            ver, rev, patch = version.split(".")
            check_call(
@ -149,7 +140,7 @@ def build_triton(
            expected_version=None,
        )

-        if device == "rocm":
+        if build_rocm:
            check_call(
                [f"{SCRIPT_DIR}/amd/package_triton_wheel.sh"],
                cwd=triton_basedir,
@ -164,7 +155,7 @@ def build_triton(
        whl_path = next(iter((triton_pythondir / "dist").glob("*.whl")))
        shutil.copy(whl_path, Path.cwd())

-        if device == "rocm":
+        if build_rocm:
            check_call(
                [f"{SCRIPT_DIR}/amd/patch_triton_wheel.sh", Path.cwd()],
                cwd=triton_basedir,
@ -179,19 +170,17 @@ def main() -> None:
    parser = ArgumentParser("Build Triton binaries")
    parser.add_argument("--release", action="store_true")
    parser.add_argument("--build-conda", action="store_true")
-    parser.add_argument(
-        "--device", type=str, default="cuda", choices=["cuda", "rocm", "xpu"]
-    )
+    parser.add_argument("--build-rocm", action="store_true")
    parser.add_argument("--py-version", type=str)
    parser.add_argument("--commit-hash", type=str)
    parser.add_argument("--triton-version", type=str, default=read_triton_version())
    args = parser.parse_args()

    build_triton(
-        device=args.device,
+        build_rocm=args.build_rocm,
        commit_hash=args.commit_hash
        if args.commit_hash
-        else read_triton_pin(args.device),
+        else read_triton_pin(args.build_rocm),
        version=args.triton_version,
        build_conda=args.build_conda,
        py_version=args.py_version,
--- a/.github/scripts/check_labels.py
+++ b/.github/scripts/check_labels.py
@ -5,6 +5,7 @@ import sys
 from typing import Any

 from github_utils import gh_delete_comment, gh_post_pr_comment
+
 from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
 from label_utils import has_required_labels, is_label_err_comment, LABEL_ERR_MSG
 from trymerge import GitHubPR
--- a/.github/scripts/cherry_pick.py
+++ b/.github/scripts/cherry_pick.py
@ -3,10 +3,12 @@
 import json
 import os
 import re
-from typing import Any, cast, Dict, List, Optional
+from typing import Any, Optional
+
 from urllib.error import HTTPError

-from github_utils import gh_fetch_url, gh_post_pr_comment, gh_query_issues_by_labels
+from github_utils import gh_fetch_url, gh_post_pr_comment
+
 from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
 from trymerge import get_pr_commit_sha, GitHubPR

@ -17,7 +19,6 @@ REQUIRES_ISSUE = {
    "critical",
    "fixnewfeature",
 }
-RELEASE_BRANCH_REGEX = re.compile(r"release/(?P<version>.+)")


 def parse_args() -> Any:
@ -57,33 +58,6 @@ def get_merge_commit_sha(repo: GitRepo, pr: GitHubPR) -> Optional[str]:
    return commit_sha if pr.is_closed() else None


-def get_release_version(onto_branch: str) -> Optional[str]:
-    """
-    Return the release version if the target branch is a release branch
-    """
-    m = re.match(RELEASE_BRANCH_REGEX, onto_branch)
-    return m.group("version") if m else ""
-
-
-def get_tracker_issues(
-    org: str, project: str, onto_branch: str
-) -> List[Dict[str, Any]]:
-    """
-    Find the tracker issue from the repo. The tracker issue needs to have the title
-    like [VERSION] Release Tracker following the convention on PyTorch
-    """
-    version = get_release_version(onto_branch)
-    if not version:
-        return []
-
-    tracker_issues = gh_query_issues_by_labels(org, project, labels=["release tracker"])
-    if not tracker_issues:
-        return []
-
-    # Figure out the tracker issue from the list by looking at the title
-    return [issue for issue in tracker_issues if version in issue.get("title", "")]
-
-
 def cherry_pick(
    github_actor: str,
    repo: GitRepo,
@ -103,49 +77,17 @@ def cherry_pick(
    )

    try:
-        org, project = repo.gh_owner_and_name()
-
-        cherry_pick_pr = ""
        if not dry_run:
+            org, project = repo.gh_owner_and_name()
            cherry_pick_pr = submit_pr(repo, pr, cherry_pick_branch, onto_branch)

-        tracker_issues_comments = []
-        tracker_issues = get_tracker_issues(org, project, onto_branch)
-        for issue in tracker_issues:
-            issue_number = int(str(issue.get("number", "0")))
-            if not issue_number:
-                continue
+            msg = f"The cherry pick PR is at {cherry_pick_pr}"
+            if fixes:
+                msg += f" and it is linked with issue {fixes}"
+            elif classification in REQUIRES_ISSUE:
+                msg += f" and it is recommended to link a {classification} cherry pick PR with an issue"

-            res = cast(
-                Dict[str, Any],
-                post_tracker_issue_comment(
-                    org,
-                    project,
-                    issue_number,
-                    pr.pr_num,
-                    cherry_pick_pr,
-                    classification,
-                    fixes,
-                    dry_run,
-                ),
-            )
-
-            comment_url = res.get("html_url", "")
-            if comment_url:
-                tracker_issues_comments.append(comment_url)
-
-        msg = f"The cherry pick PR is at {cherry_pick_pr}"
-        if fixes:
-            msg += f" and it is linked with issue {fixes}."
-        elif classification in REQUIRES_ISSUE:
-            msg += f" and it is recommended to link a {classification} cherry pick PR with an issue."
-
-        if tracker_issues_comments:
-            msg += " The following tracker issues are updated:\n"
-            for tracker_issues_comment in tracker_issues_comments:
-                msg += f"* {tracker_issues_comment}\n"
-
-        post_pr_comment(org, project, pr.pr_num, msg, dry_run)
+            post_comment(org, project, pr.pr_num, msg)

    finally:
        if current_branch:
@ -217,9 +159,7 @@ def submit_pr(
        raise RuntimeError(msg) from error


-def post_pr_comment(
-    org: str, project: str, pr_num: int, msg: str, dry_run: bool = False
-) -> List[Dict[str, Any]]:
+def post_comment(org: str, project: str, pr_num: int, msg: str) -> None:
    """
    Post a comment on the PR itself to point to the cherry picking PR when success
    or print the error when failure
@ -242,35 +182,7 @@ def post_pr_comment(
    comment = "\n".join(
        (f"### Cherry picking #{pr_num}", f"{msg}", "", f"{internal_debugging}")
    )
-    return gh_post_pr_comment(org, project, pr_num, comment, dry_run)
-
-
-def post_tracker_issue_comment(
-    org: str,
-    project: str,
-    issue_num: int,
-    pr_num: int,
-    cherry_pick_pr: str,
-    classification: str,
-    fixes: str,
-    dry_run: bool = False,
-) -> List[Dict[str, Any]]:
-    """
-    Post a comment on the tracker issue (if any) to record the cherry pick
-    """
-    comment = "\n".join(
-        (
-            "Link to landed trunk PR (if applicable):",
-            f"* https://github.com/{org}/{project}/pull/{pr_num}",
-            "",
-            "Link to release branch PR:",
-            f"* {cherry_pick_pr}",
-            "",
-            "Criteria Category:",
-            " - ".join((classification.capitalize(), fixes.capitalize())),
-        )
-    )
-    return gh_post_pr_comment(org, project, issue_num, comment, dry_run)
+    gh_post_pr_comment(org, project, pr_num, comment)


 def main() -> None:
@ -302,7 +214,7 @@ def main() -> None:

    except RuntimeError as error:
        if not args.dry_run:
-            post_pr_comment(org, project, pr_num, str(error))
+            post_comment(org, project, pr_num, str(error))
        else:
            raise error

--- a/.github/scripts/close_nonexistent_disable_issues.py
+++ b/.github/scripts/close_nonexistent_disable_issues.py
@ -10,7 +10,6 @@ import requests
 import rockset  # type: ignore[import]
 from gitutils import retries_decorator

-
 LOGS_QUERY = """
 with
    shas as (
--- a/.github/scripts/collect_ciflow_labels.py
+++ b/.github/scripts/collect_ciflow_labels.py
@ -1,12 +1,10 @@
 #!/usr/bin/env python3
-
 import sys
 from pathlib import Path
 from typing import Any, cast, Dict, List, Set

 import yaml

-
 GITHUB_DIR = Path(__file__).parent.parent


--- a/.github/scripts/convert_lintrunner_annotations_to_github.py
+++ b/.github/scripts/convert_lintrunner_annotations_to_github.py
@ -1,6 +1,7 @@
 import json
 import subprocess
 import sys
+
 from enum import Enum
 from pathlib import Path
 from typing import NamedTuple, Optional
--- a/.github/scripts/delete_old_branches.py
+++ b/.github/scripts/delete_old_branches.py
@ -2,14 +2,12 @@
 import os
 import re
 from datetime import datetime
-from functools import lru_cache
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Set

 from github_utils import gh_fetch_json_dict, gh_graphql
 from gitutils import GitRepo

-
 SEC_IN_DAY = 24 * 60 * 60
 CLOSED_PR_RETENTION = 30 * SEC_IN_DAY
 NO_PR_RETENTION = 1.5 * 365 * SEC_IN_DAY
@ -189,17 +187,6 @@ def get_recent_prs() -> Dict[str, Any]:
    return prs_by_branch_base


-@lru_cache(maxsize=1)
-def get_open_prs() -> List[Dict[str, Any]]:
-    return paginate_graphql(
-        GRAPHQL_OPEN_PRS,
-        {"owner": "pytorch", "repo": "pytorch"},
-        lambda data: False,
-        lambda res: res["data"]["repository"]["pullRequests"]["nodes"],
-        lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"],
-    )
-
-
 def get_branches_with_magic_label_or_open_pr() -> Set[str]:
    pr_infos: List[Dict[str, Any]] = paginate_graphql(
        GRAPHQL_NO_DELETE_BRANCH_LABEL,
@ -209,7 +196,15 @@ def get_branches_with_magic_label_or_open_pr() -> Set[str]:
        lambda res: res["data"]["repository"]["label"]["pullRequests"]["pageInfo"],
    )

-    pr_infos.extend(get_open_prs())
+    pr_infos.extend(
+        paginate_graphql(
+            GRAPHQL_OPEN_PRS,
+            {"owner": "pytorch", "repo": "pytorch"},
+            lambda data: False,
+            lambda res: res["data"]["repository"]["pullRequests"]["nodes"],
+            lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"],
+        )
+    )

    # Get the most recent PR for each branch base (group gh together)
    branch_bases = set()
@ -275,41 +270,5 @@ def delete_branches() -> None:
        delete_branch(git_repo, branch)


-def delete_old_ciflow_tags() -> None:
-    # Deletes ciflow tags if they are associated with a closed PR or a specific
-    # commit.  Lightweight tags don't have information about the date they were
-    # created, so we can't check how old they are.  The script just assumes that
-    # ciflow tags should be deleted regardless of creation date.
-    git_repo = GitRepo(str(REPO_ROOT), "origin", debug=True)
-
-    def delete_tag(tag: str) -> None:
-        print(f"Deleting tag {tag}")
-        ESTIMATED_TOKENS[0] += 1
-        delete_branch(git_repo, f"refs/tags/{tag}")
-
-    tags = git_repo._run_git("tag").splitlines()
-    open_pr_numbers = [x["number"] for x in get_open_prs()]
-
-    for tag in tags:
-        try:
-            if ESTIMATED_TOKENS[0] > 400:
-                print("Estimated tokens exceeded, exiting")
-                break
-            if not tag.startswith("ciflow/"):
-                continue
-            re_match_pr = re.match(r"^ciflow\/.*\/(\d{5,6})$", tag)
-            re_match_sha = re.match(r"^ciflow\/.*\/([0-9a-f]{40})$", tag)
-            if re_match_pr:
-                pr_number = int(re_match_pr.group(1))
-                if pr_number in open_pr_numbers:
-                    continue
-                delete_tag(tag)
-            elif re_match_sha:
-                delete_tag(tag)
-        except Exception as e:
-            print(f"Failed to check tag {tag}: {e}")
-
-
 if __name__ == "__main__":
    delete_branches()
-    delete_old_ciflow_tags()
--- a/.github/scripts/docathon-label-sync.py
+++ b/.github/scripts/docathon-label-sync.py
@ -1,52 +0,0 @@
-import os
-import re
-import sys
-
-from github import Github
-
-
-def main() -> None:
-    token = os.environ.get("GITHUB_TOKEN")
-
-    repo_owner = "pytorch"
-    repo_name = "pytorch"
-    pull_request_number = int(sys.argv[1])
-
-    g = Github(token)
-    repo = g.get_repo(f"{repo_owner}/{repo_name}")
-    pull_request = repo.get_pull(pull_request_number)
-    pull_request_body = pull_request.body
-    # PR without description
-    if pull_request_body is None:
-        return
-
-    # get issue number from the PR body
-    if not re.search(r"#\d{1,6}", pull_request_body):
-        print("The pull request does not mention an issue.")
-        return
-    issue_number = int(re.findall(r"#(\d{1,6})", pull_request_body)[0])
-    issue = repo.get_issue(issue_number)
-    issue_labels = issue.labels
-    docathon_label_present = any(
-        label.name == "docathon-h1-2024" for label in issue_labels
-    )
-
-    # if the issue has a docathon label, add all labels from the issue to the PR.
-    if not docathon_label_present:
-        print("The 'docathon-h1-2024' label is not present in the issue.")
-        return
-    pull_request_labels = pull_request.get_labels()
-    pull_request_label_names = [label.name for label in pull_request_labels]
-    issue_label_names = [label.name for label in issue_labels]
-    labels_to_add = [
-        label for label in issue_label_names if label not in pull_request_label_names
-    ]
-    if not labels_to_add:
-        print("The pull request already has the same labels.")
-        return
-    pull_request.add_to_labels(*labels_to_add)
-    print("Labels added to the pull request!")
-
-
-if __name__ == "__main__":
-    main()
--- a/.github/scripts/drci_mocks.json.gz
+++ b/.github/scripts/drci_mocks.json.gz
--- a/.github/scripts/ensure_actions_will_cancel.py
+++ b/.github/scripts/ensure_actions_will_cancel.py
@ -1,6 +1,7 @@
 #!/usr/bin/env python3

 import sys
+
 from pathlib import Path

 import yaml
--- a/.github/scripts/export_pytorch_labels.py
+++ b/.github/scripts/export_pytorch_labels.py
@ -14,6 +14,7 @@ import json
 from typing import Any

 import boto3  # type: ignore[import]
+
 from label_utils import gh_get_labels


--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@ -15,7 +15,6 @@ from urllib.request import Request, urlopen

 import yaml

-
 REENABLE_TEST_REGEX = "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) (#|https://github.com/pytorch/pytorch/issues/)([0-9]+)"

 PREFIX = "test-config/"
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -8,25 +8,22 @@ architectures:
    * CPU
    * Latest CUDA
    * Latest ROCM
-    * Latest XPU
 """

 import os
 from typing import Dict, List, Optional, Tuple

-
 CUDA_ARCHES = ["11.8", "12.1", "12.4"]


 CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.0"}


-CUDA_ARCHES_CUDNN_VERSION = {"11.8": "9", "12.1": "9", "12.4": "9"}
+CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8", "12.4": "8"}


 ROCM_ARCHES = ["6.0", "6.1"]

-XPU_ARCHES = ["xpu"]

 CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]

@ -34,50 +31,44 @@ CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
 CPU_AARCH64_ARCH = ["cpu-aarch64"]


-CPU_S390X_ARCH = ["cpu-s390x"]
-
-
-CUDA_AARCH64_ARCH = ["cuda-aarch64"]
-
-
 PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
    "11.8": (
        "nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "  # noqa: B950
        "nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
    "12.1": (
        "nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "  # noqa: B950
        "nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
    "12.4": (
        "nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
@ -135,16 +126,10 @@ def arch_type(arch_version: str) -> str:
        return "cuda"
    elif arch_version in ROCM_ARCHES:
        return "rocm"
-    elif arch_version in XPU_ARCHES:
-        return "xpu"
    elif arch_version in CPU_CXX11_ABI_ARCH:
        return "cpu-cxx11-abi"
    elif arch_version in CPU_AARCH64_ARCH:
        return "cpu-aarch64"
-    elif arch_version in CPU_S390X_ARCH:
-        return "cpu-s390x"
-    elif arch_version in CUDA_AARCH64_ARCH:
-        return "cuda-aarch64"
    else:  # arch_version should always be "cpu" in this case
        return "cpu"

@ -161,12 +146,9 @@ WHEEL_CONTAINER_IMAGES = {
        gpu_arch: f"pytorch/manylinux-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
        for gpu_arch in ROCM_ARCHES
    },
-    "xpu": f"pytorch/manylinux2_28-builder:xpu-{DEFAULT_TAG}",
    "cpu": f"pytorch/manylinux-builder:cpu-{DEFAULT_TAG}",
    "cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
    "cpu-aarch64": f"pytorch/manylinuxaarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
-    "cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}",
-    "cuda-aarch64": f"pytorch/manylinuxaarch64-builder:cuda12.4-{DEFAULT_TAG}",
 }

 CONDA_CONTAINER_IMAGES = {
@ -223,11 +205,8 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
        "cpu": "cpu",
        "cpu-aarch64": "cpu",
        "cpu-cxx11-abi": "cpu-cxx11-abi",
-        "cpu-s390x": "cpu",
        "cuda": f"cu{gpu_arch_version.replace('.', '')}",
-        "cuda-aarch64": "cu124",
        "rocm": f"rocm{gpu_arch_version}",
-        "xpu": "xpu",
    }.get(gpu_arch_type, gpu_arch_version)


@ -307,11 +286,11 @@ def generate_libtorch_matrix(
                    "libtorch_variant": libtorch_variant,
                    "libtorch_config": abi_version if os == "windows" else "",
                    "devtoolset": abi_version if os != "windows" else "",
-                    "container_image": (
-                        LIBTORCH_CONTAINER_IMAGES[(arch_version, abi_version)]
-                        if os != "windows"
-                        else ""
-                    ),
+                    "container_image": LIBTORCH_CONTAINER_IMAGES[
+                        (arch_version, abi_version)
+                    ]
+                    if os != "windows"
+                    else "",
                    "package_type": "libtorch",
                    "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{abi_version}".replace(
                        ".", "_"
@ -327,28 +306,24 @@ def generate_wheels_matrix(
    python_versions: Optional[List[str]] = None,
 ) -> List[Dict[str, str]]:
    package_type = "wheel"
-    if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":
-        # NOTE: We only build manywheel packages for x86_64 and aarch64 and s390x linux
+    if os == "linux" or os == "linux-aarch64":
+        # NOTE: We only build manywheel packages for x86_64 and aarch64 linux
        package_type = "manywheel"

    if python_versions is None:
-        python_versions = FULL_PYTHON_VERSIONS + ["3.13"]
+        python_versions = FULL_PYTHON_VERSIONS

    if arches is None:
        # Define default compute archivectures
        arches = ["cpu"]
        if os == "linux":
-            arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
+            arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES
        elif os == "windows":
            arches += CUDA_ARCHES
        elif os == "linux-aarch64":
            # Only want the one arch as the CPU type is different and
            # uses different build/test scripts
-            arches = ["cpu-aarch64", "cuda-aarch64"]
-        elif os == "linux-s390x":
-            # Only want the one arch as the CPU type is different and
-            # uses different build/test scripts
-            arches = ["cpu-s390x"]
+            arches = ["cpu-aarch64"]

    ret: List[Dict[str, str]] = []
    for python_version in python_versions:
@ -359,24 +334,11 @@ def generate_wheels_matrix(
                if arch_version == "cpu"
                or arch_version == "cpu-cxx11-abi"
                or arch_version == "cpu-aarch64"
-                or arch_version == "cpu-s390x"
-                or arch_version == "cuda-aarch64"
-                or arch_version == "xpu"
                else arch_version
            )

-            # TODO: Enable python 3.13 on rocm, xpu, aarch64, windows
-            if (
-                gpu_arch_type in ["rocm", "xpu"] or os != "linux"
-            ) and python_version == "3.13":
-                continue
-
            # 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
-            if (
-                arch_version in ["12.4", "12.1", "11.8"]
-                and os == "linux"
-                or arch_version == "cuda-aarch64"
-            ):
+            if arch_version in ["12.4", "12.1", "11.8"] and os == "linux":
                ret.append(
                    {
                        "python_version": python_version,
@ -385,64 +347,15 @@ def generate_wheels_matrix(
                        "desired_cuda": translate_desired_cuda(
                            gpu_arch_type, gpu_arch_version
                        ),
-                        "devtoolset": (
-                            "cxx11-abi" if arch_version == "cuda-aarch64" else ""
-                        ),
+                        "devtoolset": "",
                        "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                        "package_type": package_type,
-                        "pytorch_extra_install_requirements": (
-                            PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]  # fmt: skip
-                            if os != "linux-aarch64"
-                            else ""
-                        ),
+                        "pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version],  # fmt: skip
                        "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(  # noqa: B950
                            ".", "_"
                        ),
                    }
                )
-                if arch_version != "cuda-aarch64":
-                    ret.append(
-                        {
-                            "python_version": python_version,
-                            "gpu_arch_type": gpu_arch_type,
-                            "gpu_arch_version": gpu_arch_version,
-                            "desired_cuda": translate_desired_cuda(
-                                gpu_arch_type, gpu_arch_version
-                            ),
-                            "use_split_build": "True",
-                            "devtoolset": "",
-                            "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
-                            "package_type": package_type,
-                            "pytorch_extra_install_requirements": (
-                                PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]  # fmt: skip
-                                if os != "linux-aarch64"
-                                else ""
-                            ),
-                            "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-split".replace(  # noqa: B950
-                                ".", "_"
-                            ),
-                        }
-                    )
-                    # Special build building to use on Colab. PyThon 3.10 for 12.1 CUDA
-                    if python_version == "3.10" and arch_version == "12.1":
-                        ret.append(
-                            {
-                                "python_version": python_version,
-                                "gpu_arch_type": gpu_arch_type,
-                                "gpu_arch_version": gpu_arch_version,
-                                "desired_cuda": translate_desired_cuda(
-                                    gpu_arch_type, gpu_arch_version
-                                ),
-                                "use_split_build": "False",
-                                "devtoolset": "",
-                                "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
-                                "package_type": package_type,
-                                "pytorch_extra_install_requirements": "",
-                                "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace(  # noqa: B950
-                                    ".", "_"
-                                ),
-                            }
-                        )
            else:
                ret.append(
                    {
@ -452,21 +365,17 @@ def generate_wheels_matrix(
                        "desired_cuda": translate_desired_cuda(
                            gpu_arch_type, gpu_arch_version
                        ),
-                        "devtoolset": (
-                            "cxx11-abi"
-                            if arch_version in ["cpu-cxx11-abi", "xpu"]
-                            else ""
-                        ),
+                        "devtoolset": "cxx11-abi"
+                        if arch_version == "cpu-cxx11-abi"
+                        else "",
                        "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                        "package_type": package_type,
                        "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
                            ".", "_"
                        ),
-                        "pytorch_extra_install_requirements": (
-                            PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"]  # fmt: skip
-                            if os != "linux"
-                            else ""
-                        ),
+                        "pytorch_extra_install_requirements":
+                        PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"]  # fmt: skip
+                        if os != "linux" else "",
                    }
                )
    return ret
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -5,11 +5,11 @@ import sys
 from dataclasses import asdict, dataclass, field
 from pathlib import Path
 from typing import Dict, Iterable, List, Literal, Set
-from typing_extensions import TypedDict  # Python 3.11+

 import generate_binary_build_matrix  # type: ignore[import]
-import jinja2

+import jinja2
+from typing_extensions import TypedDict  # Python 3.11+

 Arch = Literal["windows", "linux", "macos"]

@ -60,7 +60,7 @@ class BinaryBuildWorkflow:
    branches: str = "nightly"
    # Mainly for macos
    cross_compile_arm64: bool = False
-    macos_runner: str = "macos-14-xlarge"
+    macos_runner: str = "macos-12-xl"

    def __post_init__(self) -> None:
        if self.abi_version:
@ -95,7 +95,6 @@ class OperatingSystem:
    MACOS = "macos"
    MACOS_ARM64 = "macos-arm64"
    LINUX_AARCH64 = "linux-aarch64"
-    LINUX_S390X = "linux-s390x"


 LINUX_BINARY_BUILD_WORFKLOWS = [
@ -157,7 +156,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
        package_type="manywheel",
        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
            OperatingSystem.LINUX,
-            arches=["11.8", "12.1", "12.4"],
+            arches=["11.8", "12.1"],
            python_versions=["3.8"],
        ),
        branches="main",
@ -285,7 +284,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
            libtorch_variants=["shared-with-deps"],
        ),
        cross_compile_arm64=False,
-        macos_runner="macos-14-xlarge",
+        macos_runner="macos-13-xlarge",
        ciflow_config=CIFlowConfig(
            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
            isolated_workflow=True,
@ -298,7 +297,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
            OperatingSystem.MACOS_ARM64
        ),
        cross_compile_arm64=False,
-        macos_runner="macos-14-xlarge",
+        macos_runner="macos-13-xlarge",
        ciflow_config=CIFlowConfig(
            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
            isolated_workflow=True,
@ -308,7 +307,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
        os=OperatingSystem.MACOS_ARM64,
        package_type="conda",
        cross_compile_arm64=False,
-        macos_runner="macos-14-xlarge",
+        macos_runner="macos-13-xlarge",
        build_configs=generate_binary_build_matrix.generate_conda_matrix(
            OperatingSystem.MACOS_ARM64
        ),
@ -333,20 +332,6 @@ AARCH64_BINARY_BUILD_WORKFLOWS = [
    ),
 ]

-S390X_BINARY_BUILD_WORKFLOWS = [
-    BinaryBuildWorkflow(
-        os=OperatingSystem.LINUX_S390X,
-        package_type="manywheel",
-        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
-            OperatingSystem.LINUX_S390X
-        ),
-        ciflow_config=CIFlowConfig(
-            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
-            isolated_workflow=True,
-        ),
-    ),
-]
-

 def main() -> None:
    jinja_env = jinja2.Environment(
@ -365,10 +350,6 @@ def main() -> None:
            jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
            AARCH64_BINARY_BUILD_WORKFLOWS,
        ),
-        (
-            jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
-            S390X_BINARY_BUILD_WORKFLOWS,
-        ),
        (
            jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
            LINUX_BINARY_SMOKE_WORKFLOWS,
--- a/.github/scripts/generate_docker_release_matrix.py
+++ b/.github/scripts/generate_docker_release_matrix.py
@ -16,7 +16,6 @@ from typing import Dict, List

 import generate_binary_build_matrix

-
 DOCKER_IMAGE_TYPES = ["runtime", "devel"]


--- a/.github/scripts/generate_pytorch_version.py
+++ b/.github/scripts/generate_pytorch_version.py
@ -4,11 +4,11 @@ import argparse
 import os
 import re
 import subprocess
+
 from datetime import datetime
 from distutils.util import strtobool
 from pathlib import Path

-
 LEADING_V_PATTERN = re.compile("^v")
 TRAILING_RC_PATTERN = re.compile("-rc[0-9]*$")
 LEGACY_BASE_VERSION_SUFFIX_PATTERN = re.compile("a0$")
--- a/.github/scripts/get_workflow_job_id.py
+++ b/.github/scripts/get_workflow_job_id.py
@ -11,6 +11,7 @@ import sys
 import time
 import urllib
 import urllib.parse
+
 from typing import Any, Callable, Dict, List, Optional, Tuple
 from urllib.request import Request, urlopen

--- a/.github/scripts/get_workflow_type.py
+++ b/.github/scripts/get_workflow_type.py
@ -0,0 +1,99 @@
+import json
+from argparse import ArgumentParser
+from typing import Any
+
+from github import Auth, Github
+from github.Issue import Issue
+
+
+WORKFLOW_TYPE_LABEL = "label"
+WORKFLOW_TYPE_RG = "rg"
+WORKFLOW_TYPE_BOTH = "both"
+
+
+def parse_args() -> Any:
+    """
+    Parse the command-line options of this script.
+
+    Returns:
+        The namespace produced by ``ArgumentParser.parse_args()``, exposing
+        ``github_token``, ``github_repo``, ``github_issue``, ``github_user``
+        and ``github_branch``.
+    """
+    # NOTE: the string is passed positionally, so argparse takes it as
+    # ``prog`` (the name shown in the usage line) rather than a description.
+    parser = ArgumentParser("Get dynamic rollout settings")
+    parser.add_argument("--github-token", type=str, required=True, help="GitHub token")
+    parser.add_argument(
+        "--github-repo",
+        type=str,
+        required=False,
+        default="pytorch/test-infra",
+        help="GitHub repo to get the issue",
+    )
+    parser.add_argument(
+        "--github-issue", type=int, required=True, help="GitHub issue number"
+    )
+    parser.add_argument(
+        "--github-user", type=str, required=True, help="GitHub username"
+    )
+    parser.add_argument(
+        "--github-branch", type=str, required=True, help="Current GitHub branch"
+    )
+
+    return parser.parse_args()
+
+
+def get_gh_client(github_token: str) -> Github:
+    """Create a ``Github`` client that authenticates with ``github_token``."""
+    return Github(auth=Auth.Token(github_token))
+
+
+def get_issue(gh: Github, repo: str, issue_num: int) -> Issue:
+    """Look up issue number ``issue_num`` in the repository named ``repo``."""
+    # Chain the two lookups instead of rebinding the ``repo`` string parameter.
+    return gh.get_repo(repo).get_issue(number=issue_num)
+
+
+def is_exception_branch(branch: str) -> bool:
+    """
+    Return True when the first "/"-separated component of ``branch`` is one of
+    "main", "nightly", "release" or "landchecks".
+    """
+    exception_prefixes = {"main", "nightly", "release", "landchecks"}
+    first_component, _, _ = branch.partition("/")
+    return first_component in exception_prefixes
+
+
+def get_workflow_type(issue: Issue, username: str) -> str:
+    """
+    Pick the workflow type for ``username`` from the comments of ``issue``.
+
+    The body of the first comment is split on CRLF into ``user_list``; the
+    first line of the second comment, when it can be read, is the run option
+    (otherwise "single" is assumed).
+
+    Returns:
+        WORKFLOW_TYPE_LABEL when the first line is "!" or when ``username`` is
+        not opted in; otherwise WORKFLOW_TYPE_BOTH if the run option equals
+        WORKFLOW_TYPE_BOTH, else WORKFLOW_TYPE_RG.
+
+    Raises:
+        Whatever reading the first comment raises (e.g. when the issue has no
+        comments); the caller is expected to handle that.
+    """
+    # Call get_comments() once and reuse the result for both lookups.
+    comments = issue.get_comments()
+    user_list = comments[0].body.split("\r\n")
+    try:
+        run_option = comments[1].body.split("\r\n")[0]
+    except Exception:
+        # Best effort: without a readable second comment fall back to "single".
+        run_option = "single"
+
+    if user_list[0] == "!":
+        # Use old runners for everyone
+        return WORKFLOW_TYPE_LABEL
+
+    # Guard the index so a one-line comment falls through to the per-user
+    # check instead of raising IndexError.
+    # NOTE(review): "*" is matched on the second line while "!" is matched on
+    # the first one - confirm this against the rollout issue format.
+    everyone_opted_in = len(user_list) > 1 and user_list[1] == "*"
+
+    if everyone_opted_in or username in user_list:
+        if run_option == WORKFLOW_TYPE_BOTH:
+            # Use ARC runners and old runners
+            return WORKFLOW_TYPE_BOTH
+        # Use only ARC runners
+        return WORKFLOW_TYPE_RG
+
+    # Use old runners by default
+    return WORKFLOW_TYPE_LABEL
+
+
+def main() -> None:
+    """
+    Print a JSON object with a single "workflow_type" key to stdout.
+
+    Exception branches, and any failure while consulting the GitHub issue,
+    result in WORKFLOW_TYPE_LABEL.
+    """
+    args = parse_args()
+
+    # Start from the label-based default; only a successful lookup replaces it.
+    workflow_type = WORKFLOW_TYPE_LABEL
+    if not is_exception_branch(args.github_branch):
+        try:
+            gh = get_gh_client(args.github_token)
+            issue = get_issue(gh, args.github_repo, args.github_issue)
+            workflow_type = get_workflow_type(issue, args.github_user)
+        except Exception:
+            # Best effort: any error keeps the label-based default.
+            workflow_type = WORKFLOW_TYPE_LABEL
+
+    print(json.dumps({"workflow_type": workflow_type}))
+
+
+if __name__ == "__main__":
+    main()
--- a/.github/scripts/github_utils.py
+++ b/.github/scripts/github_utils.py
@ -3,6 +3,7 @@
 import json
 import os
 import warnings
+
 from dataclasses import dataclass
 from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union
 from urllib.error import HTTPError
@ -201,12 +202,3 @@ def gh_update_pr_state(org: str, repo: str, pr_num: int, state: str = "open") ->
            )
        else:
            raise
-
-
-def gh_query_issues_by_labels(
-    org: str, repo: str, labels: List[str], state: str = "open"
-) -> List[Dict[str, Any]]:
-    url = f"{GITHUB_API_URL}/repos/{org}/{repo}/issues"
-    return gh_fetch_json(
-        url, method="GET", params={"labels": ",".join(labels), "state": state}
-    )
--- a/.github/scripts/gitutils.py
+++ b/.github/scripts/gitutils.py
@ -19,7 +19,6 @@ from typing import (
    Union,
 )

-
 T = TypeVar("T")

 RE_GITHUB_URL_MATCH = re.compile("^https://.*@?github.com/(.+)/(.+)$")
--- a/.github/scripts/gql_mocks.json.gz
+++ b/.github/scripts/gql_mocks.json.gz
--- a/.github/scripts/label_utils.py
+++ b/.github/scripts/label_utils.py
@ -1,12 +1,12 @@
 """GitHub Label Utilities."""

 import json
+
 from functools import lru_cache
 from typing import Any, List, Tuple, TYPE_CHECKING, Union

 from github_utils import gh_fetch_url_and_headers, GitHubComment

-
 # TODO: this is a temp workaround to avoid circular dependencies,
 #       and should be removed once GitHubPR is refactored out of trymerge script.
 if TYPE_CHECKING:
--- a/.github/scripts/lintrunner.sh
+++ b/.github/scripts/lintrunner.sh
@ -7,7 +7,7 @@ eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)"
 conda activate "${CONDA_ENV}"

 # Use uv to speed up lintrunner init
-python3 -m pip install uv==0.1.45
+python3 -m pip install uv

 CACHE_DIRECTORY="/tmp/.lintbin"
 # Try to recover the cached binaries
@ -29,7 +29,6 @@ python3 -m tools.pyi.gen_pyi \
    --native-functions-path aten/src/ATen/native/native_functions.yaml \
    --tags-path aten/src/ATen/native/tags.yaml \
    --deprecated-functions-path "tools/autograd/deprecated.yaml"
-python3 torch/utils/data/datapipes/gen_pyi.py

 RC=0
 # Run lintrunner on all files
--- a/.github/scripts/pytest_cache.py
+++ b/.github/scripts/pytest_cache.py
@ -9,7 +9,6 @@ from pytest_caching_utils import (
    upload_pytest_cache,
 )

-
 TEMP_DIR = "./tmp"  # a backup location in case one isn't provided


--- a/.github/scripts/pytest_caching_utils.py
+++ b/.github/scripts/pytest_caching_utils.py
@ -14,12 +14,10 @@ from file_io_utils import (
    zip_folder,
 )

-
 PYTEST_CACHE_KEY_PREFIX = "pytest_cache"
 PYTEST_CACHE_DIR_NAME = ".pytest_cache"
 BUCKET = "gha-artifacts"
 LASTFAILED_FILE_PATH = Path("v/cache/lastfailed")
-TD_HEURISTIC_PREVIOUSLY_FAILED_ADDITIONAL = "previous_failures_additional.json"

 # Temp folders
 ZIP_UPLOAD = "zip-upload"
@ -193,10 +191,6 @@ def _merge_pytest_caches(
        pytest_cache_dir_to_merge_from, pytest_cache_dir_to_merge_into
    )

-    _merge_additional_failures_files(
-        pytest_cache_dir_to_merge_from, pytest_cache_dir_to_merge_into
-    )
-

 def _merge_lastfailed_files(source_pytest_cache: Path, dest_pytest_cache: Path) -> None:
    # Simple cases where one of the files doesn't exist
@ -238,27 +232,3 @@ def _merged_lastfailed_content(
            del to_lastfailed[""]

    return to_lastfailed
-
-
-def _merge_additional_failures_files(
-    source_pytest_cache: Path, dest_pytest_cache: Path
-) -> None:
-    # Simple cases where one of the files doesn't exist
-    source_lastfailed_file = (
-        source_pytest_cache / TD_HEURISTIC_PREVIOUSLY_FAILED_ADDITIONAL
-    )
-    dest_lastfailed_file = dest_pytest_cache / TD_HEURISTIC_PREVIOUSLY_FAILED_ADDITIONAL
-
-    if not source_lastfailed_file.exists():
-        return
-    if not dest_lastfailed_file.exists():
-        copy_file(source_lastfailed_file, dest_lastfailed_file)
-        return
-
-    # Both files exist, so we need to merge them
-    from_lastfailed = load_json_file(source_lastfailed_file)
-    to_lastfailed = load_json_file(dest_lastfailed_file)
-    merged_content = list(set(from_lastfailed + to_lastfailed))
-
-    # Save the results
-    write_json_file(dest_lastfailed_file, merged_content)
--- a/.github/scripts/runner_determinator.py
+++ b/.github/scripts/runner_determinator.py
@ -1,215 +0,0 @@
-# flake8: noqa: G004
-
-import logging
-import os
-from argparse import ArgumentParser
-from logging import LogRecord
-from typing import Any, Iterable
-
-from github import Auth, Github
-from github.Issue import Issue
-
-
-WORKFLOW_LABEL_META = ""  # use meta runners
-WORKFLOW_LABEL_LF = "lf."  # use runners from the linux foundation
-WORKFLOW_LABEL_LF_CANARY = "lf.c."  # use canary runners from the linux foundation
-
-GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
-GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
-
-
-class ColorFormatter(logging.Formatter):
-    """Color codes the log messages based on the log level"""
-
-    COLORS = {
-        "WARNING": "\033[33m",  # Yellow
-        "ERROR": "\033[31m",  # Red
-        "CRITICAL": "\033[31m",  # Red
-        "INFO": "\033[0m",  # Reset
-        "DEBUG": "\033[0m",  # Reset
-    }
-
-    def format(self, record: LogRecord) -> str:
-        log_color = self.COLORS.get(record.levelname, "\033[0m")  # Default to reset
-        record.msg = f"{log_color}{record.msg}\033[0m"
-        return super().format(record)
-
-
-handler = logging.StreamHandler()
-handler.setFormatter(ColorFormatter(fmt="%(levelname)-8s: %(message)s"))
-
-log = logging.getLogger(os.path.basename(__file__))
-log.addHandler(handler)
-log.setLevel(logging.INFO)
-
-
-def set_github_output(key: str, value: str) -> None:
-    """
-    Defines outputs of the github action that invokes this script
-    """
-    if not GITHUB_OUTPUT:
-        # See https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ for deprecation notice
-        log.warning(
-            "No env var found for GITHUB_OUTPUT, you must be running this code locally. Falling back to the deprecated print method."
-        )
-        print(f"::set-output name={key}::{value}")
-        return
-
-    with open(GITHUB_OUTPUT, "a") as f:
-        log.info(f"Setting output: {key}='{value}'")
-        f.write(f"{key}={value}\n")
-
-
-def parse_args() -> Any:
-    parser = ArgumentParser("Get dynamic rollout settings")
-    parser.add_argument("--github-token", type=str, required=True, help="GitHub token")
-    parser.add_argument(
-        "--github-issue-repo",
-        type=str,
-        required=False,
-        default="pytorch/test-infra",
-        help="GitHub repo to get the issue",
-    )
-    parser.add_argument(
-        "--github-repo",
-        type=str,
-        required=True,
-        help="GitHub repo where CI is running",
-    )
-    parser.add_argument(
-        "--github-issue", type=int, required=True, help="GitHub issue number"
-    )
-    parser.add_argument(
-        "--github-actor", type=str, required=True, help="GitHub triggering_actor"
-    )
-    parser.add_argument(
-        "--github-issue-owner", type=str, required=True, help="GitHub issue owner"
-    )
-    parser.add_argument(
-        "--github-branch", type=str, required=True, help="Current GitHub branch or tag"
-    )
-    parser.add_argument(
-        "--github-ref-type",
-        type=str,
-        required=True,
-        help="Current GitHub ref type, branch or tag",
-    )
-
-    return parser.parse_args()
-
-
-def get_gh_client(github_token: str) -> Github:
-    auth = Auth.Token(github_token)
-    return Github(auth=auth)
-
-
-def get_issue(gh: Github, repo: str, issue_num: int) -> Issue:
-    repo = gh.get_repo(repo)
-    return repo.get_issue(number=issue_num)
-
-
-def get_potential_pr_author(
-    gh: Github, repo: str, username: str, ref_type: str, ref_name: str
-) -> str:
-    # If the trigger was a new tag added by a bot, this is a ciflow case
-    # Fetch the actual username from the original PR. The PR number is
-    # embedded in the tag name: ciflow/<name>/<pr-number>
-    if username == "pytorch-bot[bot]" and ref_type == "tag":
-        split_tag = ref_name.split("/")
-        if (
-            len(split_tag) == 3
-            and split_tag[0] == "ciflow"
-            and split_tag[2].isnumeric()
-        ):
-            pr_number = split_tag[2]
-            try:
-                repository = gh.get_repo(repo)
-                pull = repository.get_pull(number=int(pr_number))
-            except Exception as e:
-                raise Exception(  # noqa: TRY002
-                    f"issue with pull request {pr_number} from repo {repository}"
-                ) from e
-            return pull.user.login
-    # In all other cases, return the original input username
-    return username
-
-
-def is_exception_branch(branch: str) -> bool:
-    return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
-
-
-def get_workflow_type(issue: Issue, workflow_requestors: Iterable[str]) -> str:
-    try:
-        first_comment = issue.get_comments()[0].body.strip("\n\t ")
-
-        if first_comment[0] == "!":
-            log.info("LF Workflows are disabled for everyone. Using meta runners.")
-            return WORKFLOW_LABEL_META
-        elif first_comment[0] == "*":
-            log.info("LF Workflows are enabled for everyone. Using LF runners.")
-            return WORKFLOW_LABEL_LF
-        else:
-            all_opted_in_users = {
-                usr_raw.strip("\n\t@ ") for usr_raw in first_comment.split()
-            }
-            opted_in_requestors = {
-                usr for usr in workflow_requestors if usr in all_opted_in_users
-            }
-            if opted_in_requestors:
-                log.info(
-                    f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners."
-                )
-                return WORKFLOW_LABEL_LF
-            else:
-                log.info(
-                    f"LF Workflows are disabled for {', '.join(workflow_requestors)}. Using meta runners."
-                )
-                return WORKFLOW_LABEL_META
-
-    except Exception as e:
-        log.error(
-            f"Failed to get determine workflow type. Falling back to meta runners. Exception: {e}"
-        )
-        return WORKFLOW_LABEL_META
-
-
-def main() -> None:
-    args = parse_args()
-
-    if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
-        log.info(f"Exception branch: '{args.github_branch}', using meta runners")
-        label_type = WORKFLOW_LABEL_META
-    else:
-        try:
-            gh = get_gh_client(args.github_token)
-            # The default issue we use - https://github.com/pytorch/test-infra/issues/5132
-            issue = get_issue(gh, args.github_issue_repo, args.github_issue)
-            username = get_potential_pr_author(
-                gh,
-                args.github_repo,
-                args.github_actor,
-                args.github_ref_type,
-                args.github_branch,
-            )
-            label_type = get_workflow_type(
-                issue,
-                (
-                    args.github_issue_owner,
-                    username,
-                ),
-            )
-        except Exception as e:
-            log.error(
-                f"Failed to get issue. Falling back to meta runners. Exception: {e}"
-            )
-            label_type = WORKFLOW_LABEL_META
-
-    # For Canary builds use canary runners
-    if args.github_repo == "pytorch/pytorch-canary" and label_type == WORKFLOW_LABEL_LF:
-        label_type = WORKFLOW_LABEL_LF_CANARY
-
-    set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type)
-
-
-if __name__ == "__main__":
-    main()
--- a/.github/scripts/sync_distributed_folder_prototype.sh
+++ b/.github/scripts/sync_distributed_folder_prototype.sh
@ -1,35 +0,0 @@
-#!/bin/bash
-
-set -eoux pipefail
-
-SYNC_BRANCH=pytorch-stable-prototype
-
-git config user.email "fake@example.com"
-git config user.name  "PyTorch Stable Bot"
-
-git fetch origin main
-git fetch origin "$SYNC_BRANCH"
-git checkout "$SYNC_BRANCH"
-
-# Using a hardcoded SHA here is a massive speedup as we can skip the entire history of the pytorch GitHub repo.
-# This specific SHA was chosen as it was before the "branch point" of the stable branch
-for SHA in $(git log ba3b05fdf37ddbc3c301294d6a560a816335e717..origin/main --pretty="%h" --reverse -- torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed)
-do
-    # `git merge-base --is-ancestor` exits with code 0 if the given SHA is an ancestor, and non-0 otherwise
-    if git merge-base --is-ancestor $SHA HEAD || [[ $(git log --grep="(cherry picked from commit $SHA") ]]
-    then
-        echo "Skipping $SHA"
-        continue
-    fi
-    echo "Copying $SHA"
-    git cherry-pick -x "$SHA" -X theirs
-    git reset --soft HEAD~1
-    git add torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed
-    git checkout .
-    git commit --reuse-message=HEAD@{1}
-    git clean -f
-done
-
-if [[ "${WITH_PUSH}" == true ]]; then
-  git push
-fi
--- a/.github/scripts/tag_docker_images_for_release.py
+++ b/.github/scripts/tag_docker_images_for_release.py
@ -41,7 +41,7 @@ def main() -> None:
    )

    options = parser.parse_args()
-    tagged_images: Dict[str, bool] = {}
+    tagged_images: Dict[str, bool] = dict()
    platform_images = [
        generate_binary_build_matrix.WHEEL_CONTAINER_IMAGES,
        generate_binary_build_matrix.LIBTORCH_CONTAINER_IMAGES,
--- a/.github/scripts/td_llm_indexer.sh
+++ b/.github/scripts/td_llm_indexer.sh
@ -7,7 +7,6 @@ cd llm-target-determinator
 pip install -q -r requirements.txt
 cd ../codellama
 pip install -e .
-pip install numpy==1.26.0

 # Run indexer
 cd ../llm-target-determinator
--- a/.github/scripts/test_trymerge.py
+++ b/.github/scripts/test_trymerge.py
@ -17,7 +17,9 @@ from unittest import main, mock, skip, TestCase
 from urllib.error import HTTPError

 from github_utils import gh_graphql
+
 from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
+
 from trymerge import (
    categorize_checks,
    DRCI_CHECKRUN_NAME,
@ -37,7 +39,6 @@ from trymerge import (
    validate_revert,
 )

-
 if "GIT_REMOTE_URL" not in os.environ:
    os.environ["GIT_REMOTE_URL"] = "https://github.com/pytorch/pytorch"

@ -179,9 +180,6 @@ def mock_gh_get_info() -> Any:
    return {
        "closed": False,
        "isCrossRepository": False,
-        "headRefName": "foo",
-        "baseRefName": "bar",
-        "baseRepository": {"defaultBranchRef": {"name": "bar"}},
        "files": {"nodes": [], "pageInfo": {"hasNextPage": False}},
        "changedFiles": 0,
    }
@ -396,7 +394,6 @@ class TestTryMerge(TestCase):
        # self.assertGreater(len(pr.get_checkrun_conclusions()), 3)
        self.assertGreater(pr.get_commit_count(), 60)

-    @skip("GitHub doesn't keep this data anymore")
    def test_gql_retrieve_checksuites(self, *args: Any) -> None:
        "Fetch comments and conclusions for PR with 60 commits"
        pr = GitHubPR("pytorch", "pytorch", 94787)
@ -776,13 +773,13 @@ class TestBypassFailures(TestCase):
                # than the one on the base commit. This should still count as broken trunk
                "pr_num": 104214,
                "related_failure_count": 0,
-                "flaky_or_broken_trunk": 1,
+                "unrelated_failure_count": 1,
            },
            {
                # This PR had one broken trunk failure and it used ghstack
                "pr_num": 105145,
                "related_failure_count": 0,
-                "flaky_or_broken_trunk": 1,
+                "unrelated_failure_count": 1,
            },
            {
                # The failure on the merge base was retried successfully and
@ -791,20 +788,20 @@ class TestBypassFailures(TestCase):
                # be used to detect broken trunk
                "pr_num": 107160,
                "related_failure_count": 0,
-                "flaky_or_broken_trunk": 1,
+                "unrelated_failure_count": 4,
            },
            {
                # This PR used Dr.CI broken trunk classification
                "pr_num": 111253,
                "related_failure_count": 1,
-                "flaky_or_broken_trunk": 1,
+                "unrelated_failure_count": 2,
            },
        ]

        for case in test_cases:
            pr_num = case["pr_num"]
            related_failure_count = case["related_failure_count"]
-            flaky_or_broken_trunk = case["flaky_or_broken_trunk"]
+            unrelated_failure_count = case["unrelated_failure_count"]

            pr = GitHubPR("pytorch", "pytorch", pr_num)
            checks = pr.get_checkrun_conclusions()
@ -826,7 +823,7 @@ class TestBypassFailures(TestCase):
            )
            self.assertTrue(len(pending) == 0)
            self.assertTrue(
-                len(failed) == flaky_or_broken_trunk + related_failure_count
+                len(failed) == unrelated_failure_count + related_failure_count
            )

    def test_ignore_current(self, *args: Any) -> None:
@ -894,24 +891,6 @@ class TestBypassFailures(TestCase):
        self.assertTrue(len(ignorable["FLAKY"]) == 1)
        self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0)

-    def test_ignore_failures_older_run_same_workflow(self, *args: Any) -> None:
-        pr = GitHubPR("pytorch", "pytorch", 129013)
-        checks = pr.get_checkrun_conclusions()
-        checks = get_classifications(
-            pr.pr_num,
-            pr.project,
-            checks,
-            [],
-        )
-        pending, failed, ignorable = categorize_checks(
-            checks,
-            list(checks.keys()),
-        )
-        self.assertTrue(len(pending) == 0)
-        self.assertTrue(len(failed) == 0)
-        self.assertTrue(len(ignorable["FLAKY"]) == 2)
-        self.assertTrue(len(ignorable["UNSTABLE"]) == 13)
-
    @mock.patch("trymerge.read_merge_rules", side_effect=xla_merge_rules)
    def test_dont_ignore_flaky_failures(self, *args: Any) -> None:
        """
@ -1040,7 +1019,7 @@ class TestGitHubPRGhstackDependencies(TestCase):
        )

    @skip(
-        reason="This test is run against a mutable PR that has changed, so it no longer works. The test should be changed"
+        reason="This test is run against a mutalbe PR that has changed, so it no longer works. The test should be changed"
    )
    @mock.patch("trymerge.read_merge_rules")
    @mock.patch("trymerge.GitRepo")
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -45,6 +45,7 @@ from github_utils import (
    gh_update_pr_state,
    GitHubComment,
 )
+
 from gitutils import (
    are_ghstack_branches_in_sync,
    get_git_remote_name,
@ -61,7 +62,6 @@ from label_utils import (
 )
 from trymerge_explainer import get_revert_message, TryMergeExplainer

-
 # labels
 MERGE_IN_PROGRESS_LABEL = "merging"
 MERGE_COMPLETE_LABEL = "merged"
@ -81,10 +81,9 @@ JobNameToStateDict = Dict[str, JobCheckState]


 class WorkflowCheckState:
-    def __init__(self, name: str, url: str, run_id: int, status: Optional[str]):
+    def __init__(self, name: str, url: str, status: Optional[str]):
        self.name: str = name
        self.url: str = url
-        self.run_id: int = run_id
        self.status: Optional[str] = status
        self.jobs: JobNameToStateDict = {}

@ -123,7 +122,6 @@ fragment PRCheckSuites on CheckSuiteConnection {
      workflowRun {
        workflow {
          name
-          databaseId
        }
        databaseId
        url
@ -514,7 +512,7 @@ def add_workflow_conclusions(
    workflows: Dict[str, WorkflowCheckState] = {}

    # for the jobs that don't have a workflow
-    no_workflow_obj: WorkflowCheckState = WorkflowCheckState("", "", 0, None)
+    no_workflow_obj: WorkflowCheckState = WorkflowCheckState("", "", None)

    def add_conclusions(edges: Any) -> None:
        for edge_idx, edge in enumerate(edges):
@ -525,30 +523,18 @@ def add_workflow_conclusions(
            workflow_obj: WorkflowCheckState = no_workflow_obj

            if workflow_run is not None:
-                # This is the usual workflow run ID we see on GitHub
-                workflow_run_id = workflow_run["databaseId"]
-                # While this is the metadata name and ID of the workflow itself
                workflow_name = workflow_run["workflow"]["name"]
-                workflow_id = workflow_run["workflow"]["databaseId"]
-
                workflow_conclusion = node["conclusion"]
                # Do not override existing status with cancelled
                if workflow_conclusion == "CANCELLED" and workflow_name in workflows:
                    continue
-
-                # Only keep the latest workflow run for each workflow, heuristically,
-                # it's the run with largest run ID
-                if (
-                    workflow_id not in workflows
-                    or workflows[workflow_id].run_id < workflow_run_id
-                ):
-                    workflows[workflow_id] = WorkflowCheckState(
+                if workflow_name not in workflows:
+                    workflows[workflow_name] = WorkflowCheckState(
                        name=workflow_name,
                        status=workflow_conclusion,
                        url=workflow_run["url"],
-                        run_id=workflow_run_id,
                    )
-                workflow_obj = workflows[workflow_id]
+                workflow_obj = workflows[workflow_name]

            while checkruns is not None:
                for checkrun_node in checkruns["nodes"]:
@ -586,12 +572,12 @@ def add_workflow_conclusions(
    # the jobs in but don't put the workflow in.  We care more about the jobs in
    # the workflow that ran than the container workflow.
    res: JobNameToStateDict = {}
-    for workflow in workflows.values():
+    for workflow_name, workflow in workflows.items():
        if len(workflow.jobs) > 0:
            for job_name, job in workflow.jobs.items():
                res[job_name] = job
        else:
-            res[workflow.name] = JobCheckState(
+            res[workflow_name] = JobCheckState(
                workflow.name,
                workflow.url,
                workflow.status,
@ -1177,6 +1163,7 @@ class GitHubPR:
            # Finally, upload the record to Rockset. The lists of pending and failed
            # checks reflect their state at the time of the merge
            save_merge_record(
+                collection=ROCKSET_MERGES_COLLECTION,
                comment_id=comment_id,
                pr_num=self.pr_num,
                owner=self.org,
@ -1192,8 +1179,10 @@ class GitHubPR:
                merge_base_sha=self.get_merge_base(),
                merge_commit_sha=merge_commit_sha,
                is_failed=False,
+                dry_run=dry_run,
                skip_mandatory_checks=skip_mandatory_checks,
                ignore_current=bool(ignore_current_checks),
+                workspace=ROCKSET_MERGES_WORKSPACE,
            )
        else:
            print("Missing comment ID or PR number, couldn't upload to Rockset")
@ -1500,6 +1489,7 @@ def checks_to_markdown_bullets(

@retries_decorator()
 def save_merge_record(
+    collection: str,
    comment_id: int,
    pr_num: int,
    owner: str,
@ -1515,44 +1505,59 @@ def save_merge_record(
    merge_base_sha: str,
    merge_commit_sha: str = "",
    is_failed: bool = False,
+    dry_run: bool = False,
    skip_mandatory_checks: bool = False,
    ignore_current: bool = False,
    error: str = "",
+    workspace: str = "commons",
 ) -> None:
    """
-    This saves the merge records as a json, which can later be uploaded to s3
+    This saves the merge records into Rockset, so we can query them (for fun and profit)
    """
+    if dry_run:
+        # Do not save the record to Rockset when dry_run is set, so that the
+        # collection is not polluted
+        return

-    # Prepare the record to be written into Rockset
-    data = [
-        {
-            "comment_id": comment_id,
-            "pr_num": pr_num,
-            "owner": owner,
-            "project": project,
-            "author": author,
-            "pending_checks": pending_checks,
-            "failed_checks": failed_checks,
-            "ignore_current_checks": ignore_current_checks,
-            "broken_trunk_checks": broken_trunk_checks,
-            "flaky_checks": flaky_checks,
-            "unstable_checks": unstable_checks,
-            "last_commit_sha": last_commit_sha,
-            "merge_base_sha": merge_base_sha,
-            "merge_commit_sha": merge_commit_sha,
-            "is_failed": is_failed,
-            "skip_mandatory_checks": skip_mandatory_checks,
-            "ignore_current": ignore_current,
-            "error": error,
-            # This is a unique identifier for the record for deduping purposes
-            # in rockset.  Any unique string would work
-            "_id": f"{project}-{pr_num}-{comment_id}-{os.environ.get('GITHUB_RUN_ID')}",
-        }
-    ]
-    repo_root = Path(__file__).resolve().parent.parent.parent
+    try:
+        import rockset  # type: ignore[import]

-    with open(repo_root / "merge_record.json", "w") as f:
-        json.dump(data, f)
+        # Prepare the record to be written into Rockset
+        data = [
+            {
+                "comment_id": comment_id,
+                "pr_num": pr_num,
+                "owner": owner,
+                "project": project,
+                "author": author,
+                "pending_checks": pending_checks,
+                "failed_checks": failed_checks,
+                "ignore_current_checks": ignore_current_checks,
+                "broken_trunk_checks": broken_trunk_checks,
+                "flaky_checks": flaky_checks,
+                "unstable_checks": unstable_checks,
+                "last_commit_sha": last_commit_sha,
+                "merge_base_sha": merge_base_sha,
+                "merge_commit_sha": merge_commit_sha,
+                "is_failed": is_failed,
+                "skip_mandatory_checks": skip_mandatory_checks,
+                "ignore_current": ignore_current,
+                "error": error,
+            }
+        ]
+
+        client = rockset.RocksetClient(
+            host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
+        )
+        client.Documents.add_documents(
+            collection=collection,
+            data=data,
+            workspace=workspace,
+        )
+
+    except ModuleNotFoundError:
+        print("Rockset is missing, no record will be saved")
+        return


@retries_decorator(rc=[])
@ -2022,8 +2027,10 @@ def categorize_checks(
    pending_checks: List[Tuple[str, Optional[str], Optional[int]]] = []
    failed_checks: List[Tuple[str, Optional[str], Optional[int]]] = []

-    # failed_checks_categorization is used to keep track of all ignorable failures when saving the merge record on Rockset
-    failed_checks_categorization: Dict[str, List[Any]] = defaultdict(list)
+    # ok_failed_checks is used with ok_failed_checks_threshold while ignorable_failed_checks
+    # is used to keep track of all ignorable failures when saving the merge record on Rockset
+    ok_failed_checks: List[Tuple[str, Optional[str], Optional[int]]] = []
+    ignorable_failed_checks: Dict[str, List[Any]] = defaultdict(list)

    # If required_checks is not set or empty, consider all names are relevant
    relevant_checknames = [
@ -2051,38 +2058,36 @@ def categorize_checks(
            continue
        elif not is_passing_status(check_runs[checkname].status):
            target = (
-                failed_checks_categorization[classification]
+                ignorable_failed_checks[classification]
                if classification
                in ("IGNORE_CURRENT_CHECK", "BROKEN_TRUNK", "FLAKY", "UNSTABLE")
                else failed_checks
            )
            target.append((checkname, url, job_id))

-    flaky_or_broken_trunk = (
-        failed_checks_categorization["BROKEN_TRUNK"]
-        + failed_checks_categorization["FLAKY"]
-    )
+            if classification in ("BROKEN_TRUNK", "FLAKY", "UNSTABLE"):
+                ok_failed_checks.append((checkname, url, job_id))

-    if flaky_or_broken_trunk:
+    if ok_failed_checks:
        warn(
-            f"The following {len(flaky_or_broken_trunk)} checks failed but were likely due flakiness or broken trunk: "
-            + ", ".join([x[0] for x in flaky_or_broken_trunk])
+            f"The following {len(ok_failed_checks)} checks failed but were likely due flakiness or broken trunk: "
+            + ", ".join([x[0] for x in ok_failed_checks])
            + (
                f" but this is greater than the threshold of {ok_failed_checks_threshold} so merge will fail"
                if ok_failed_checks_threshold is not None
-                and len(flaky_or_broken_trunk) > ok_failed_checks_threshold
+                and len(ok_failed_checks) > ok_failed_checks_threshold
                else ""
            )
        )

    if (
        ok_failed_checks_threshold is not None
-        and len(flaky_or_broken_trunk) > ok_failed_checks_threshold
+        and len(ok_failed_checks) > ok_failed_checks_threshold
    ):
-        failed_checks = failed_checks + flaky_or_broken_trunk
+        failed_checks = failed_checks + ok_failed_checks

-    # The list of failed_checks_categorization is returned so that it can be saved into the Rockset merge record
-    return (pending_checks, failed_checks, failed_checks_categorization)
+    # The list of ignorable_failed_checks is returned so that it can be saved into the Rockset merge record
+    return (pending_checks, failed_checks, ignorable_failed_checks)


 def merge(
@ -2325,15 +2330,6 @@ def main() -> None:
            dry_run=args.dry_run,
        )
        return
-    if not pr.is_ghstack_pr() and pr.base_ref() != pr.default_branch():
-        gh_post_pr_comment(
-            org,
-            project,
-            args.pr_num,
-            f"PR targets {pr.base_ref()} rather than {pr.default_branch()}, refusing merge request",
-            dry_run=args.dry_run,
-        )
-        return

    if args.check_mergeability:
        if pr.is_ghstack_pr():
@ -2369,6 +2365,7 @@ def main() -> None:
            # list of pending and failed checks here, but they are not really
            # needed at the moment
            save_merge_record(
+                collection=ROCKSET_MERGES_COLLECTION,
                comment_id=args.comment_id,
                pr_num=args.pr_num,
                owner=org,
@ -2383,9 +2380,11 @@ def main() -> None:
                last_commit_sha=pr.last_commit().get("oid", ""),
                merge_base_sha=pr.get_merge_base(),
                is_failed=True,
+                dry_run=args.dry_run,
                skip_mandatory_checks=args.force,
                ignore_current=args.ignore_current,
                error=str(e),
+                workspace=ROCKSET_MERGES_WORKSPACE,
            )
        else:
            print("Missing comment ID or PR number, couldn't upload to Rockset")
--- a/.github/scripts/tryrebase.py
+++ b/.github/scripts/tryrebase.py
@ -11,7 +11,6 @@ from github_utils import gh_post_pr_comment as gh_post_comment
 from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
 from trymerge import GitHubPR

-
 SAME_SHA_ERROR = (
    "\n```\nAborting rebase because rebasing the branch resulted in the same sha as the target branch.\n"
    + "This usually happens because the PR has already been merged.  Please rebase locally and push.\n```"
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@ -33,8 +33,6 @@ env:
  # Needed for conda builds
  {%- if "aarch64" in build_environment %}
  ALPINE_IMAGE: "arm64v8/alpine"
-  {%- elif "s390x" in build_environment %}
-  ALPINE_IMAGE: "docker.io/s390x/alpine"
  {%- else %}
  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
  {%- endif %}
@ -58,11 +56,8 @@ jobs:
    uses: ./.github/workflows/_binary-build-linux.yml
    with:!{{ upload.binary_env_as_input(config) }}
      {%- if "aarch64" in build_environment %}
-      runs_on: linux.arm64.m7g.4xlarge
+      runs_on: linux.arm64.2xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
-      {%- elif "s390x" in build_environment %}
-      runs_on: linux.s390x
-      ALPINE_IMAGE: "docker.io/s390x/alpine"
      {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
      runs_on: linux.24xlarge
      {%- endif %}
@ -71,17 +66,12 @@ jobs:
      {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0  %}
      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
      {%- endif %}
-      {%- if config["gpu_arch_type"] == "cuda-aarch64" %}
-      timeout-minutes: 420
-      {%- endif %}
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  {%- if config["gpu_arch_type"] != "cuda-aarch64" %}
  !{{ config["build_name"] }}-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: !{{ config["build_name"] }}-build
-    {%- if config["gpu_arch_type"] not in ["rocm", "xpu"] %}
+{%- if config["gpu_arch_type"] != "rocm" %}
    uses: ./.github/workflows/_binary-test-linux.yml
    with:!{{ upload.binary_env_as_input(config) }}
      build_name: !{{ config["build_name"] }}
@ -89,9 +79,6 @@ jobs:
      {%- if "aarch64" in build_environment %}
      runs_on: linux.arm64.2xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
-      {%- elif "s390x" in build_environment %}
-      runs_on: linux.s390x
-      ALPINE_IMAGE: "docker.io/s390x/alpine"
      {%- elif config["gpu_arch_type"] == "rocm" %}
      runs_on: linux.rocm.gpu
      {%- elif config["gpu_arch_type"] == "cuda" %}
@ -101,41 +88,7 @@ jobs:
      {%- endif %}
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
-    {%- elif config["gpu_arch_type"] == "xpu" %}
-    runs-on: linux.idc.xpu
-    timeout-minutes: !{{ common.timeout_minutes }}
-    !{{ upload.binary_env(config) }}
-    permissions:
-      id-token: write
-      contents: read
-    steps:
-      - name: Setup XPU
-        uses: ./.github/actions/setup-xpu
-      - name: configure aws credentials
-        id: aws_creds
-        uses: aws-actions/configure-aws-credentials@v1.7.0
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-          aws-region: us-east-1
-      - name: Login to Amazon ECR
-        id: login-ecr
-        uses: aws-actions/amazon-ecr-login@v2
-      - uses: !{{ common.download_artifact_action }}
-        name: Download Build Artifacts
-        with:
-          name: !{{ config["build_name"] }}
-          path: "${{ runner.temp }}/artifacts/"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
-      !{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch) }}
-      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: !{{ config["container_image"] }}
-      - name: Test Pytorch binary
-        uses: ./pytorch/.github/actions/test-pytorch-binary
-      - name: Teardown XPU
-        uses: ./.github/actions/teardown-xpu
-    {%- else %}
+{%- else %}
    runs-on: linux.rocm.gpu
    timeout-minutes: !{{ common.timeout_minutes }}
    !{{ upload.binary_env(config) }}
@ -160,8 +113,7 @@ jobs:
        uses: ./pytorch/.github/actions/test-pytorch-binary
      - name: Teardown ROCm
        uses: ./.github/actions/teardown-rocm
-    {%- endif %}
-  {%- endif %}
+{%- endif %}

 {%- if branches == "nightly" %}
  !{{ upload.upload_binaries(config) }}
--- a/Show More
+++ b/Show More