Compare commits


4 Commits

SHA1 Message Date
bbf5ebbd4b [compiled autograd] torch.compile API
ghstack-source-id: 28065ffb2ed01641b4dcd31fb8fc0e729192f9ec
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125880
2024-05-16 01:00:03 -07:00
463913e679 [compiled autograd] clear compiled_autograd_verbose once test is done
ghstack-source-id: f817bd618a06b97a86ef1262dd457cf19879c548
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126148
2024-05-14 08:30:58 -07:00
643f57a782 [inductor] Clear cache on ctx manager exit
ghstack-source-id: 9146aea2680868b25af31a9271a7aa0a396668af
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126146
2024-05-14 08:30:57 -07:00
f13bfd8d87 [compiled autograd] Fix flaky tests
ghstack-source-id: 9e999edf4e9a1e41c381fdf20063338a6eb2f313
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126144
2024-05-14 08:30:57 -07:00
3654 changed files with 221049 additions and 82685 deletions

View File

@@ -84,30 +84,16 @@ fi
# CMake 3.18 is needed to support CUDA17 language variant
CMAKE_VERSION=3.18.5
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
_UCX_COMMIT=00bcc6bb18fc282eb160623b4c0d300147f579af
_UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea
# It's annoying to rename jobs every time you want to rewrite a
# configuration, so we hardcode everything here rather than do it
# from scratch
case "$image" in
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@@ -119,24 +105,9 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks)
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@@ -149,39 +120,9 @@ case "$image" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9)
CUDA_VERSION=11.8.0
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@@ -193,37 +134,9 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@@ -313,7 +226,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
XPU_VERSION=0.5
BASEKIT_VERSION=2024.0.0-49522
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@@ -330,10 +243,10 @@ case "$image" in
DOCS=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12)
pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
CUDA_VERSION=11.8
CUDNN_VERSION=9
CUDNN_VERSION=8
CLANG_VERSION=12
PROTOBUF=yes
DB=yes
@@ -380,7 +293,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
CONDA_CMAKE=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=11.8
CONDA_CMAKE=yes
@@ -447,7 +360,7 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
# when using cudnn version 8 install it separately from cuda
if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
if [[ ${CUDNN_VERSION} == 9 ]]; then
if [[ ${CUDNN_VERSION} == 8 ]]; then
IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
fi
fi
@@ -490,7 +403,7 @@ docker build \
--build-arg "DOCS=${DOCS}" \
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
--build-arg "XPU_VERSION=${XPU_VERSION}" \
--build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \
--build-arg "ACL=${ACL:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
@@ -499,7 +412,7 @@ docker build \
"$@" \
.
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
# find the correct image. As a result, here we have to replace the
# "$UBUNTU_VERSION" == "18.04-rc"

View File

@@ -77,9 +77,6 @@ RUN rm install_rocm.sh
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh
RUN rm install_rocm_magma.sh
COPY ./common/install_amdsmi.sh install_amdsmi.sh
RUN bash ./install_amdsmi.sh
RUN rm install_amdsmi.sh
ENV PATH /opt/rocm/bin:$PATH
ENV PATH /opt/rocm/hcc/bin:$PATH
ENV PATH /opt/rocm/hip/bin:$PATH
@@ -118,13 +115,6 @@ COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
RUN bash ./install_cache.sh && rm install_cache.sh
# Install AOTriton
COPY ci_commit_pins/aotriton.txt aotriton.txt
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN bash ./install_aotriton.sh /opt/rocm/aotriton && rm -rf install_aotriton.sh aotriton aotriton.txt common_utils.sh
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
# Include BUILD_ENVIRONMENT environment variable in image
ARG BUILD_ENVIRONMENT
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}

View File

@@ -1 +0,0 @@
24a3fe9cb57e5cda3c923df29743f9767194cc27

View File

@@ -1 +1 @@
01cbe5045a6898c9a925f01435c8277b2fe6afcc
bbe6246e37d8aa791c67daaf9d9d61b26c9ccfdc

View File

@@ -1,6 +1,6 @@
set -euo pipefail
readonly version=v24.04
readonly version=v23.08
readonly src_host=https://review.mlplatform.org/ml
readonly src_repo=ComputeLibrary

View File

@@ -1,5 +0,0 @@
#!/bin/bash
set -ex
cd /opt/rocm/share/amd_smi && pip install .

View File

@@ -1,24 +0,0 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
AOTRITON_DIR="aotriton"
AOTRITON_PINNED_NAME="aotriton" # No .txt extension
AOTRITON_PINNED_COMMIT=$(get_pinned_commit ${AOTRITON_PINNED_NAME})
AOTRITON_INSTALL_PREFIX="$1"
git clone https://github.com/ROCm/aotriton.git "${AOTRITON_DIR}"
cd "${AOTRITON_DIR}"
git checkout "${AOTRITON_PINNED_COMMIT}"
git submodule sync --recursive
git submodule update --init --recursive --force --depth 1
mkdir build
cd build
cmake .. -G Ninja -DCMAKE_INSTALL_PREFIX=./install_dir -DCMAKE_BUILD_TYPE=Release -DAOTRITON_COMPRESS_KERNEL=OFF -DAOTRITON_NO_PYTHON=ON -DAOTRITON_NO_SHARED=ON
ninja install
mkdir -p "${AOTRITON_INSTALL_PREFIX}"
cp -r install_dir/* "${AOTRITON_INSTALL_PREFIX}"
find /tmp/ -mindepth 1 -delete
rm -rf ~/.triton
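For reference, the ROCm Dockerfiles elsewhere in this diff invoke the removed script as follows; the positional argument becomes AOTRITON_INSTALL_PREFIX:

bash ./install_aotriton.sh /opt/rocm/aotriton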

View File

@@ -3,7 +3,7 @@
set -ex
install_ubuntu() {
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
# find the correct image. As a result, here we have to check for
# "$UBUNTU_VERSION" == "18.04"*

View File

@@ -1,18 +1,20 @@
#!/bin/bash
if [[ -n "${CUDNN_VERSION}" ]]; then
if [[ ${CUDNN_VERSION} == 8 ]]; then
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn
pushd tmp_cudnn
if [[ ${CUDA_VERSION:0:2} == "12" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"
elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz
else
print "Unsupported CUDA version ${CUDA_VERSION}"
exit 1
fi
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
tar xf ${CUDNN_NAME}.tar.xz
cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/
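For concreteness, substituting the CUDA 12 values shown above, the cuDNN 9 branch resolves to fetching (URL assembled from the script's own variables, not independently verified):

curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-9.1.0.70_cuda12-archive.tar.xz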

View File

@@ -5,14 +5,9 @@ set -ex
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && cd tmp_cusparselt
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
arch_path='x86_64'
fi
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.5.2.1-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.5.2.1-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz

View File

@@ -30,10 +30,10 @@ pip_install \
pip_install coloredlogs packaging
pip_install onnxruntime==1.18
pip_install onnx==1.16.0
pip_install onnxruntime==1.17.0
pip_install onnx==1.15.0
# pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps
pip_install onnxscript==0.1.0.dev20240523 --no-deps
pip_install onnxscript==0.1.0.dev20240315 --no-deps
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/

View File

@@ -39,8 +39,7 @@ install_ubuntu() {
rocm-libs \
rccl \
rocprofiler-dev \
roctracer-dev \
amd-smi-lib
roctracer-dev
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
@@ -107,8 +106,7 @@ install_centos() {
rocm-libs \
rccl \
rocprofiler-dev \
roctracer-dev \
amd-smi-lib
roctracer-dev
# precompiled miopen kernels; search for all unversioned packages
# if search fails it will abort this script; use true to avoid case where search fails

View File

@@ -15,7 +15,7 @@ conda_reinstall() {
if [ -n "${ROCM_VERSION}" ]; then
TRITON_REPO="https://github.com/openai/triton"
TRITON_TEXT_FILE="triton-rocm"
elif [ -n "${XPU_VERSION}" ]; then
elif [ -n "${BASEKIT_VERSION}" ]; then
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
TRITON_TEXT_FILE="triton-xpu"
else

View File

@@ -3,7 +3,10 @@ set -xe
# Intel® software for general purpose GPU capabilities.
# Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
# Refer to https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html
# Intel® oneAPI Base Toolkit (version 2024.0.0) has been updated to include functional and security updates.
# Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html
# Users should update to the latest version as it becomes available
@@ -14,16 +17,14 @@ function install_ubuntu() {
# Set up the repository. To do this, download the key to the system keyring
wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
| gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor --output /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
# Add the signed entry to APT sources and configure the APT client to use the Intel repository
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \
https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
| tee /etc/apt/sources.list.d/intel-gpu-jammy.list
echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
| tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
| tee /etc/apt/sources.list.d/oneAPI.list
# Update the packages list and repository index
apt-get update
@@ -39,11 +40,11 @@ function install_ubuntu() {
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
# Install Intel Support Packages
if [ -n "$XPU_VERSION" ]; then
apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION}
# Install Intel® oneAPI Base Toolkit
if [ -n "$BASEKIT_VERSION" ]; then
apt-get install intel-basekit=$BASEKIT_VERSION -y
else
apt-get install -y intel-for-pytorch-gpu-dev
apt-get install intel-basekit -y
fi
# Cleanup

View File

@@ -139,7 +139,7 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
ARG CUDNN_VERSION
ARG CUDA_VERSION
COPY ./common/install_cudnn.sh install_cudnn.sh
RUN if [ -n "${CUDNN_VERSION}" ]; then bash install_cudnn.sh; fi
RUN if [ "${CUDNN_VERSION}" -eq 8 ]; then bash install_cudnn.sh; fi
RUN rm install_cudnn.sh
# Install CUSPARSELT
@@ -152,7 +152,6 @@ RUN rm install_cusparselt.sh
RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi
RUN if [ -h /usr/local/cuda-12.4/cuda-12.4 ]; then rm /usr/local/cuda-12.4/cuda-12.4; fi
USER jenkins
CMD ["bash"]

View File

@@ -78,11 +78,6 @@ ENV MAGMA_HOME /opt/rocm/magma
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
# Install amdsmi
COPY ./common/install_amdsmi.sh install_amdsmi.sh
RUN bash ./install_amdsmi.sh
RUN rm install_amdsmi.sh
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh
@@ -110,13 +105,6 @@ COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
RUN bash ./install_cache.sh && rm install_cache.sh
# Install AOTriton
COPY ci_commit_pins/aotriton.txt aotriton.txt
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN bash ./install_aotriton.sh /opt/rocm/aotriton && rm -rf install_aotriton.sh aotriton aotriton.txt common_utils.sh
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
# Include BUILD_ENVIRONMENT environment variable in image
ARG BUILD_ENVIRONMENT
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}

View File

@@ -62,7 +62,7 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
# Install XPU Dependencies
ARG XPU_VERSION
ARG BASEKIT_VERSION
COPY ./common/install_xpu.sh install_xpu.sh
RUN bash ./install_xpu.sh && rm install_xpu.sh

View File

@@ -44,7 +44,10 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
fi
fi
if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
export ATEN_THREADING=TBB
export USE_TBB=1
elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi
@@ -286,9 +289,6 @@ else
fi
WERROR=1 python setup.py bdist_wheel
else
if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
source .ci/pytorch/install_cache_xla.sh
fi
python setup.py bdist_wheel
fi
pip_install_whl "$(echo dist/*.whl)"
@@ -330,7 +330,7 @@ else
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir -p "$CUSTOM_OP_BUILD"
pushd "$CUSTOM_OP_BUILD"
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@@ -343,7 +343,7 @@ else
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir -p "$JIT_HOOK_BUILD"
pushd "$JIT_HOOK_BUILD"
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@@ -355,7 +355,7 @@ else
python --version
mkdir -p "$CUSTOM_BACKEND_BUILD"
pushd "$CUSTOM_BACKEND_BUILD"
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd

View File

@@ -6,4 +6,4 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
echo "Testing pytorch docs"
cd docs
TERM=vt100 make doctest
make doctest

View File

@@ -1,37 +0,0 @@
#!/bin/bash
# Script for installing sccache on the xla build job, which uses xla's docker
# image and doesn't have sccache installed on it. This is mostly copied from
# .ci/docker/install_cache.sh. Changes are: removing checks that will always
# return the same thing (e.g. checks for rocm or CUDA), changing the path
# where sccache is installed, and not changing /etc/environment.
set -ex
install_binary() {
echo "Downloading sccache binary from S3 repo"
curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache
}
mkdir -p /tmp/cache/bin
mkdir -p /tmp/cache/lib
export PATH="/tmp/cache/bin:$PATH"
install_binary
chmod a+x /tmp/cache/bin/sccache
function write_sccache_stub() {
# Unset LD_PRELOAD for ps because of asan + ps issues
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
# shellcheck disable=SC2086
# shellcheck disable=SC2059
printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n exec sccache $(which $1) \"\$@\"\nelse\n exec $(which $1) \"\$@\"\nfi" > "/tmp/cache/bin/$1"
chmod a+x "/tmp/cache/bin/$1"
}
write_sccache_stub cc
write_sccache_stub c++
write_sccache_stub gcc
write_sccache_stub g++
write_sccache_stub clang
write_sccache_stub clang++
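For readability, the stub that the printf above writes for "cc" expands to roughly the following shell script (the /usr/bin/cc path is an assumption; the real script substitutes $(which cc)):

#!/bin/sh
if [ $(env -u LD_PRELOAD ps -p $PPID -o comm=) != sccache ]; then
 exec sccache /usr/bin/cc "$@"
else
 exec /usr/bin/cc "$@"
fi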

View File

@@ -18,7 +18,6 @@ time python test/run_test.py --verbose -i distributed/test_c10d_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_nccl
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
time python test/run_test.py --verbose -i distributed/test_cuda_p2p
time python test/run_test.py --verbose -i distributed/test_store
time python test/run_test.py --verbose -i distributed/test_pg_wrapper
time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent
@@ -51,9 +50,6 @@ time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_ra
# FSDP2 tests
time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh
# Pipelining composability tests
time python test/run_test.py --verbose -i distributed/pipelining/test_composability.py
# Other tests
time python test/run_test.py --verbose -i test_cuda_primary_ctx
time python test/run_test.py --verbose -i test_optim -- -k test_forloop_goes_right_direction_multigpu

View File

@@ -264,18 +264,6 @@ elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then
export ATEN_CPU_CAPABILITY=avx2
fi
# temp workarounds for https://github.com/pytorch/pytorch/issues/126692, remove when fixed
if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
pushd test
CUDA_VERSION=$(python -c "import torch; print(torch.version.cuda)")
if [ "$CUDA_VERSION" == "12.4" ]; then
ISCUDA124="cu124"
else
ISCUDA124=""
fi
popd
fi
test_python_legacy_jit() {
time python test/run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose
assert_git_not_dirty
@@ -338,7 +326,6 @@ test_inductor_distributed() {
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose
# this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
@@ -363,20 +350,10 @@ test_inductor() {
test_inductor_cpp_wrapper_abi_compatible() {
export TORCHINDUCTOR_ABI_COMPATIBLE=1
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
# cpu stack allocation causes segfault and needs more investigation
PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper
TORCHINDUCTOR_STACK_ALLOCATION=0 python test/run_test.py --include inductor/test_cpu_cpp_wrapper
python test/run_test.py --include inductor/test_cuda_cpp_wrapper
TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
--training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
--output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_timm_training.csv"
}
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@@ -538,10 +515,10 @@ test_single_dynamo_benchmark() {
--output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/${TEST_CONFIG}_${name}.csv"
--expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
python benchmarks/dynamo/check_graph_breaks.py \
--actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/${TEST_CONFIG}_${name}.csv"
--expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
fi
}
@@ -565,11 +542,7 @@ test_dynamo_benchmark() {
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *freezing* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 --freezing "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
fi
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
else
@@ -583,16 +556,12 @@ test_inductor_torchbench_smoketest_perf() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
# Test some models in the cpp wrapper mode
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
# smoke test the cpp_wrapper mode
TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy --bfloat16 \
--inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_torchbench_inference.csv"
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
@@ -607,13 +576,7 @@ test_inductor_torchbench_smoketest_perf() {
# https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314,
# and thus we lower its threshold to reduce flakiness. If this continues to be a problem,
# we will switch to some other model.
# Use 4.7 for cuda 12.4, change back to 4.9 after fixing https://github.com/pytorch/pytorch/issues/126692
if [ "$CUDA_VERSION" == "12.4" ]; then
THRESHOLD=4.7
else
THRESHOLD=4.9
fi
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t $THRESHOLD
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.9
# Check memory compression ratio for a few models
for test in hf_Albert timm_vision_transformer; do
@@ -625,15 +588,6 @@ test_inductor_torchbench_smoketest_perf() {
"$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \
--expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
done
# Perform some "warm-start" runs for a few huggingface models.
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_huggingface_training.csv"
done
}
test_inductor_torchbench_cpu_smoketest_perf(){
@@ -717,6 +671,7 @@ test_aten() {
${SUDO} ln -sf "$TORCH_LIB_DIR"/libmkldnn* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libnccl* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libtorch* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libtbb* "$TEST_BASE_DIR"
ls "$TEST_BASE_DIR"
aten/tools/run_tests.sh "$TEST_BASE_DIR"
@@ -741,6 +696,21 @@ test_without_numpy() {
popd
}
# pytorch extensions require including torch/extension.h, which includes all.h,
# which includes utils.h, which includes Parallel.h.
# So you can call, for instance, parallel_for() from your extension,
# but compilation will fail because Parallel.h has only declarations
# and its definitions are only conditionally included (see the last lines of Parallel.h).
# I tried to solve this in #39612 and #39881 by including Config.h in Parallel.h.
# But if Pytorch is built with TBB it provides a Config.h
# that has AT_PARALLEL_NATIVE_TBB=1 (see #3961 or #39881), which means that if you include
# torch/extension.h, it transitively includes Parallel.h,
# which transitively includes tbb.h, which is not available!
if [[ "${BUILD_ENVIRONMENT}" == *tbb* ]]; then
sudo mkdir -p /usr/include/tbb
sudo cp -r "$PWD"/third_party/tbb/include/tbb/* /usr/include/tbb
fi
test_libtorch() {
local SHARD="$1"
@@ -754,6 +724,7 @@ test_libtorch() {
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libnvfuser* "$TORCH_BIN_DIR"
export CPP_TESTS_DIR="${TORCH_BIN_DIR}"
@@ -890,6 +861,7 @@ test_rpc() {
# test reporting process to function as expected.
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
CPP_TESTS_DIR="${TORCH_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_cpp_rpc
}
@@ -1297,10 +1269,6 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHAR
elif [[ "${TEST_CONFIG}" == *dynamo* && $SHARD_NUMBER -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
test_dynamo_shard "${SHARD_NUMBER}"
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
install_torchvision
test_python_shard "$SHARD_NUMBER"
test_aten
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
test_without_numpy
install_torchvision
@@ -1330,6 +1298,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
test_libtorch
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
test_docs_test
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
install_torchvision
test_python
test_aten
elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
install_torchvision
test_python

View File

@@ -76,8 +76,8 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for all the wheel builds, hence append TRITON_CONSTRAINT
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
# Only linux Python < 3.13 are supported wheels for triton
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
# Only linux Python < 3.12 are supported wheels for triton
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.12'"
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
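As an illustration with a hypothetical TRITON_VERSION of 3.0.0, the first constraint shown above yields a requirement of the form:

TRITON_REQUIREMENT="triton==3.0.0; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"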

View File

@@ -1,12 +1,9 @@
self-hosted-runner:
labels:
# GitHub hosted x86 Linux runners
- linux.20_04.4x
- linux.20_04.16x
# Repo-specific LF hosted ARC runners
- linux.large.arc
# Organization-wide AWS Linux Runners
- linux.large
- linux.large.arc
- linux.2xlarge
- linux.4xlarge
- linux.12xlarge
@@ -16,34 +13,18 @@ self-hosted-runner:
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu
- linux.g5.4xlarge.nvidia.gpu
# Organization-wide AWS Linux Runners on Linux Foundation account
- lf.linux.large
- lf.linux.2xlarge
- lf.linux.4xlarge
- lf.linux.12xlarge
- lf.linux.24xlarge
- lf.linux.arm64.2xlarge
- lf.linux.4xlarge.nvidia.gpu
- lf.linux.8xlarge.nvidia.gpu
- lf.linux.16xlarge.nvidia.gpu
- lf.linux.g5.4xlarge.nvidia.gpu
# Repo-specific IBM hosted S390x runner
- linux.s390x
# Organization-wide AWS Windows runners
- windows.4xlarge.nonephemeral
- windows.8xlarge.nvidia.gpu
- windows.8xlarge.nvidia.gpu.nonephemeral
- windows.g5.4xlarge.nvidia.gpu
# Organization-wide AMD hosted MI300 runners
- bm-runner
- linux.rocm.gpu
# Repo-specific Apple hosted runners
- macos-m1-ultra
- macos-m2-14
# Org-wide AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
- macos-m1-stable
- macos-m1-13
- macos-m1-14
# GitHub-hosted MacOS runners
- macos-12-xl
- macos-12
- macos12.3-m1
- macos-latest-xlarge
- macos-13-xlarge
- macos-14-xlarge

View File

@@ -66,8 +66,7 @@ runs:
command: |
set -eux
# PyYAML 6.0 doesn't work with MacOS x86 anymore
# This must run on Python-3.7 (AmazonLinux2) so can't use requests==2.32.2
python3 -m pip install requests==2.27.1 pyyaml==6.0.1
python3 -m pip install requests==2.26.0 pyyaml==6.0.1
- name: Parse ref
id: parse-ref

View File

@@ -1 +1 @@
1980f8af5bcd0bb2ce51965cf79d8d4c25dad8a0
ea437b31ce316ea3d66fe73768c0dcb94edb79ad

View File

@@ -1,154 +0,0 @@
# Defines runner types that will be provisioned by LF Self-hosted
# runners for pytorch/pytorch-canary and their labels.
#
# Runners listed here will be available as self hosted runners.
# Configuration is directly pulled from the main branch.
#
# Default values:
#
# runner_types:
# runner_label: # label to specify in the Github Actions workflow
# instance_type: m4.large
# os: linux
# max_available: 20
# disk_size: 50
# is_ephemeral: true
runner_types:
lf.c.linux.12xlarge:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: false
max_available: 1000
os: linux
lf.c.linux.24xl.spr-metal:
disk_size: 200
instance_type: c7i.metal-24xl
is_ephemeral: false
max_available: 30
os: linux
lf.c.linux.16xlarge.spr:
disk_size: 200
instance_type: c7i.16xlarge
is_ephemeral: false
max_available: 30
os: linux
lf.c.linux.12xlarge.ephemeral:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: true
max_available: 300
os: linux
lf.c.linux.16xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.16xlarge
is_ephemeral: false
max_available: 30
os: linux
lf.c.linux.24xlarge:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: false
max_available: 250
os: linux
lf.c.linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: false
max_available: 3120
os: linux
lf.c.linux.4xlarge:
disk_size: 150
instance_type: c5.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
lf.c.linux.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.4xlarge
is_ephemeral: false
max_available: 520
os: linux
lf.c.linux.8xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.8xlarge
is_ephemeral: false
max_available: 400
os: linux
lf.c.linux.g4dn.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g4dn.12xlarge
is_ephemeral: false
max_available: 50
os: linux
lf.c.linux.g4dn.metal.nvidia.gpu:
disk_size: 150
instance_type: g4dn.metal
is_ephemeral: false
max_available: 30
os: linux
lf.c.linux.g5.48xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.48xlarge
is_ephemeral: false
max_available: 20
os: linux
lf.c.linux.g5.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.12xlarge
is_ephemeral: false
max_available: 150
os: linux
lf.c.linux.g5.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 1200
os: linux
lf.c.linux.large:
disk_size: 15
instance_type: c5.large
is_ephemeral: false
os: linux
lf.c.linux.arm64.2xlarge:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: false
max_available: 200
os: linux
lf.c.linux.arm64.m7g.2xlarge:
disk_size: 256
instance_type: m7g.2xlarge
is_ephemeral: false
max_available: 20
os: linux
lf.c.windows.4xlarge:
disk_size: 256
instance_type: c5d.4xlarge
is_ephemeral: true
max_available: 420
os: windows
lf.c.windows.4xlarge.nonephemeral:
disk_size: 256
instance_type: c5d.4xlarge
is_ephemeral: false
max_available: 420
os: windows
lf.c.windows.8xlarge.nvidia.gpu:
disk_size: 256
instance_type: p3.2xlarge
is_ephemeral: true
max_available: 150
os: windows
lf.c.windows.8xlarge.nvidia.gpu.nonephemeral:
disk_size: 256
instance_type: p3.2xlarge
is_ephemeral: false
max_available: 150
os: windows
lf.c.windows.g5.4xlarge.nvidia.gpu:
disk_size: 256
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 250
os: windows

View File

@@ -1,154 +0,0 @@
# Defines runner types that will be provisioned by LF Self-hosted
# runners for pytorch/pytorch and their labels.
#
# Runners listed here will be available as self hosted runners.
# Configuration is directly pulled from the main branch.
#
# Default values:
#
# runner_types:
# runner_label: # label to specify in the Github Actions workflow
# instance_type: m4.large
# os: linux
# max_available: 20
# disk_size: 50
# is_ephemeral: true
runner_types:
lf.linux.12xlarge:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: false
max_available: 1000
os: linux
lf.linux.24xl.spr-metal:
disk_size: 200
instance_type: c7i.metal-24xl
is_ephemeral: false
max_available: 30
os: linux
lf.linux.16xlarge.spr:
disk_size: 200
instance_type: c7i.16xlarge
is_ephemeral: false
max_available: 30
os: linux
lf.linux.12xlarge.ephemeral:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: true
max_available: 300
os: linux
lf.linux.16xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.16xlarge
is_ephemeral: false
max_available: 30
os: linux
lf.linux.24xlarge:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: false
max_available: 250
os: linux
lf.linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: false
max_available: 3120
os: linux
lf.linux.4xlarge:
disk_size: 150
instance_type: c5.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
lf.linux.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.4xlarge
is_ephemeral: false
max_available: 520
os: linux
lf.linux.8xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.8xlarge
is_ephemeral: false
max_available: 400
os: linux
lf.linux.g4dn.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g4dn.12xlarge
is_ephemeral: false
max_available: 50
os: linux
lf.linux.g4dn.metal.nvidia.gpu:
disk_size: 150
instance_type: g4dn.metal
is_ephemeral: false
max_available: 30
os: linux
lf.linux.g5.48xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.48xlarge
is_ephemeral: false
max_available: 20
os: linux
lf.linux.g5.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.12xlarge
is_ephemeral: false
max_available: 150
os: linux
lf.linux.g5.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 1200
os: linux
lf.linux.large:
disk_size: 15
instance_type: c5.large
is_ephemeral: false
os: linux
lf.linux.arm64.2xlarge:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: false
max_available: 200
os: linux
lf.linux.arm64.m7g.2xlarge:
disk_size: 256
instance_type: m7g.2xlarge
is_ephemeral: false
max_available: 20
os: linux
lf.windows.4xlarge:
disk_size: 256
instance_type: c5d.4xlarge
is_ephemeral: true
max_available: 420
os: windows
lf.windows.4xlarge.nonephemeral:
disk_size: 256
instance_type: c5d.4xlarge
is_ephemeral: false
max_available: 420
os: windows
lf.windows.8xlarge.nvidia.gpu:
disk_size: 256
instance_type: p3.2xlarge
is_ephemeral: true
max_available: 150
os: windows
lf.windows.8xlarge.nvidia.gpu.nonephemeral:
disk_size: 256
instance_type: p3.2xlarge
is_ephemeral: false
max_available: 150
os: windows
lf.windows.g5.4xlarge.nvidia.gpu:
disk_size: 256
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 250
os: windows

View File

@@ -245,7 +245,6 @@
- torch/xpu/**
- test/xpu/**
- third_party/xpu.txt
- .ci/docker/ci_commit_pins/triton-xpu.txt
approved_by:
- EikanWang
- jgong5

View File

@@ -1,5 +1,6 @@
tracking_issue: 24422
ciflow_tracking_issue: 64124
TD_rollout_issue: 123120
ciflow_push_tags:
- ciflow/binaries
- ciflow/binaries_conda
@@ -8,7 +9,6 @@ ciflow_push_tags:
- ciflow/inductor
- ciflow/inductor-perf-compare
- ciflow/inductor-micro-benchmark
- ciflow/inductor-cu124
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
@@ -20,6 +20,7 @@ ciflow_push_tags:
- ciflow/xpu
- ciflow/torchbench
retryable_workflows:
- lint
- pull
- trunk
- linux-binary

View File

@@ -10,6 +10,6 @@ lintrunner==0.10.7
ninja==1.10.0.post1
nvidia-ml-py==11.525.84
pyyaml==6.0
requests==2.32.2
requests==2.31.0
rich==10.9.0
rockset==1.0.3

View File

@@ -4,5 +4,6 @@ mkl-include=2022.1.0
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0
requests=2.31.0
setuptools=68.2.2
typing-extensions=4.9.0
typing-extensions=4.3.0

View File

@@ -3,5 +3,6 @@ cmake=3.22.1
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0
requests=2.31.0
setuptools=68.2.2
typing-extensions=4.9.0
typing-extensions=4.3.0

View File

@@ -2,7 +2,7 @@ numpy=1.22.3
pyyaml=6.0
setuptools=61.2.0
cmake=3.22.*
typing-extensions=4.9.0
typing-extensions=4.3.0
dataclasses=0.8
pip=22.2.2
pillow=10.0.1

View File

@@ -4,7 +4,7 @@ numpy=1.21.2
pyyaml=5.3
setuptools=46.0.0
cmake=3.22.*
typing-extensions=4.9.0
typing-extensions=4.3.0
dataclasses=0.8
pip=22.2.2
pillow=10.0.1

View File

@@ -2,7 +2,6 @@
import os
import re
from datetime import datetime
from functools import lru_cache
from pathlib import Path
from typing import Any, Callable, Dict, List, Set
@@ -188,17 +187,6 @@ def get_recent_prs() -> Dict[str, Any]:
return prs_by_branch_base
@lru_cache(maxsize=1)
def get_open_prs() -> List[Dict[str, Any]]:
return paginate_graphql(
GRAPHQL_OPEN_PRS,
{"owner": "pytorch", "repo": "pytorch"},
lambda data: False,
lambda res: res["data"]["repository"]["pullRequests"]["nodes"],
lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"],
)
def get_branches_with_magic_label_or_open_pr() -> Set[str]:
pr_infos: List[Dict[str, Any]] = paginate_graphql(
GRAPHQL_NO_DELETE_BRANCH_LABEL,
@@ -208,7 +196,15 @@ def get_branches_with_magic_label_or_open_pr() -> Set[str]:
lambda res: res["data"]["repository"]["label"]["pullRequests"]["pageInfo"],
)
pr_infos.extend(get_open_prs())
pr_infos.extend(
paginate_graphql(
GRAPHQL_OPEN_PRS,
{"owner": "pytorch", "repo": "pytorch"},
lambda data: False,
lambda res: res["data"]["repository"]["pullRequests"]["nodes"],
lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"],
)
)
# Get the most recent PR for each branch base (group gh together)
branch_bases = set()
@@ -274,41 +270,5 @@ def delete_branches() -> None:
delete_branch(git_repo, branch)
def delete_old_ciflow_tags() -> None:
# Deletes ciflow tags if they are associated with a closed PR or a specific
# commit. Lightweight tags don't have information about the date they were
# created, so we can't check how old they are. The script just assumes that
# ciflow tags should be deleted regardless of creation date.
git_repo = GitRepo(str(REPO_ROOT), "origin", debug=True)
def delete_tag(tag: str) -> None:
print(f"Deleting tag {tag}")
ESTIMATED_TOKENS[0] += 1
delete_branch(git_repo, f"refs/tags/{tag}")
tags = git_repo._run_git("tag").splitlines()
open_pr_numbers = [x["number"] for x in get_open_prs()]
for tag in tags:
try:
if ESTIMATED_TOKENS[0] > 400:
print("Estimated tokens exceeded, exiting")
break
if not tag.startswith("ciflow/"):
continue
re_match_pr = re.match(r"^ciflow\/.*\/(\d{5,6})$", tag)
re_match_sha = re.match(r"^ciflow\/.*\/([0-9a-f]{40})$", tag)
if re_match_pr:
pr_number = int(re_match_pr.group(1))
if pr_number in open_pr_numbers:
continue
delete_tag(tag)
elif re_match_sha:
delete_tag(tag)
except Exception as e:
print(f"Failed to check tag {tag}: {e}")
if __name__ == "__main__":
delete_branches()
delete_old_ciflow_tags()

View File

@@ -1,52 +0,0 @@
import os
import re
import sys
from github import Github
def main() -> None:
token = os.environ.get("GITHUB_TOKEN")
repo_owner = "pytorch"
repo_name = "pytorch"
pull_request_number = int(sys.argv[1])
g = Github(token)
repo = g.get_repo(f"{repo_owner}/{repo_name}")
pull_request = repo.get_pull(pull_request_number)
pull_request_body = pull_request.body
# PR without description
if pull_request_body is None:
return
# get issue number from the PR body
if not re.search(r"#\d{1,6}", pull_request_body):
print("The pull request does not mention an issue.")
return
issue_number = int(re.findall(r"#(\d{1,6})", pull_request_body)[0])
issue = repo.get_issue(issue_number)
issue_labels = issue.labels
docathon_label_present = any(
label.name == "docathon-h1-2024" for label in issue_labels
)
# if the issue has a docathon label, add all labels from the issue to the PR.
if not docathon_label_present:
print("The 'docathon-h1-2024' label is not present in the issue.")
return
pull_request_labels = pull_request.get_labels()
pull_request_label_names = [label.name for label in pull_request_labels]
issue_label_names = [label.name for label in issue_labels]
labels_to_add = [
label for label in issue_label_names if label not in pull_request_label_names
]
if not labels_to_add:
print("The pull request already has the same labels.")
return
pull_request.add_to_labels(*labels_to_add)
print("Labels added to the pull request!")
if __name__ == "__main__":
main()
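A usage sketch for the removed script (the file name here is hypothetical; the script takes the PR number as its only argument and reads GITHUB_TOKEN from the environment):

GITHUB_TOKEN="<token>" python3 docathon-label-sync.py 105145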

View File

@@ -19,7 +19,7 @@ CUDA_ARCHES = ["11.8", "12.1", "12.4"]
CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.0"}
CUDA_ARCHES_CUDNN_VERSION = {"11.8": "9", "12.1": "9", "12.4": "9"}
CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8", "12.4": "8"}
ROCM_ARCHES = ["6.0", "6.1"]
@@ -34,15 +34,12 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]
CPU_S390X_ARCH = ["cpu-s390x"]
CUDA_AARCH64_ARCH = ["cuda-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"11.8": (
"nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@@ -55,7 +52,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@@ -68,7 +65,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@@ -138,8 +135,6 @@ def arch_type(arch_version: str) -> str:
return "cpu-aarch64"
elif arch_version in CPU_S390X_ARCH:
return "cpu-s390x"
elif arch_version in CUDA_AARCH64_ARCH:
return "cuda-aarch64"
else: # arch_version should always be "cpu" in this case
return "cpu"
@@ -160,7 +155,6 @@ WHEEL_CONTAINER_IMAGES = {
"cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
"cpu-aarch64": f"pytorch/manylinuxaarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
"cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}",
"cuda-aarch64": f"pytorch/manylinuxaarch64-builder:cuda12.4-{DEFAULT_TAG}",
}
CONDA_CONTAINER_IMAGES = {
@@ -219,7 +213,6 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
"cpu-cxx11-abi": "cpu-cxx11-abi",
"cpu-s390x": "cpu",
"cuda": f"cu{gpu_arch_version.replace('.', '')}",
"cuda-aarch64": "cu124",
"rocm": f"rocm{gpu_arch_version}",
}.get(gpu_arch_type, gpu_arch_version)
@@ -300,11 +293,11 @@ def generate_libtorch_matrix(
"libtorch_variant": libtorch_variant,
"libtorch_config": abi_version if os == "windows" else "",
"devtoolset": abi_version if os != "windows" else "",
"container_image": (
LIBTORCH_CONTAINER_IMAGES[(arch_version, abi_version)]
if os != "windows"
else ""
),
"container_image": LIBTORCH_CONTAINER_IMAGES[
(arch_version, abi_version)
]
if os != "windows"
else "",
"package_type": "libtorch",
"build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{abi_version}".replace(
".", "_"
@@ -337,7 +330,7 @@ def generate_wheels_matrix(
elif os == "linux-aarch64":
# Only want the one arch as the CPU type is different and
# uses different build/test scripts
arches = ["cpu-aarch64", "cuda-aarch64"]
arches = ["cpu-aarch64"]
elif os == "linux-s390x":
# Only want the one arch as the CPU type is different and
# uses different build/test scripts
@@ -347,26 +340,17 @@
for python_version in python_versions:
for arch_version in arches:
gpu_arch_type = arch_type(arch_version)
# Disable py3.12 builds for ROCm because of triton dependency
# on llnl-hatchet, which doesn't have py3.12 wheels available
if gpu_arch_type == "rocm" and python_version == "3.12":
continue
gpu_arch_version = (
""
if arch_version == "cpu"
or arch_version == "cpu-cxx11-abi"
or arch_version == "cpu-aarch64"
or arch_version == "cpu-s390x"
or arch_version == "cuda-aarch64"
else arch_version
)
# 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
if (
arch_version in ["12.4", "12.1", "11.8"]
and os == "linux"
or arch_version == "cuda-aarch64"
):
if arch_version in ["12.4", "12.1", "11.8"] and os == "linux":
ret.append(
{
"python_version": python_version,
@@ -375,16 +359,10 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"devtoolset": (
"cxx11-abi" if arch_version == "cuda-aarch64" else ""
),
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": (
PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version] # fmt: skip
if os != "linux-aarch64"
else ""
),
"pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version], # fmt: skip
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace( # noqa: B950
".", "_"
),
@@ -399,19 +377,17 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"devtoolset": (
"cxx11-abi" if arch_version == "cpu-cxx11-abi" else ""
),
"devtoolset": "cxx11-abi"
if arch_version == "cpu-cxx11-abi"
else "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
".", "_"
),
"pytorch_extra_install_requirements": (
PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"] # fmt: skip
if os != "linux"
else ""
),
"pytorch_extra_install_requirements":
PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"] # fmt: skip
if os != "linux" else "",
}
)
return ret

View File

@@ -5,11 +5,11 @@ import sys
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Dict, Iterable, List, Literal, Set
from typing_extensions import TypedDict # Python 3.11+
import generate_binary_build_matrix # type: ignore[import]
import jinja2
from typing_extensions import TypedDict # Python 3.11+
Arch = Literal["windows", "linux", "macos"]
@@ -60,7 +60,7 @@ class BinaryBuildWorkflow:
branches: str = "nightly"
# Mainly for macos
cross_compile_arm64: bool = False
macos_runner: str = "macos-14-xlarge"
macos_runner: str = "macos-12-xl"
def __post_init__(self) -> None:
if self.abi_version:
@@ -157,7 +157,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["11.8", "12.1", "12.4"],
arches=["11.8", "12.1"],
python_versions=["3.8"],
),
branches="main",
@@ -285,7 +285,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
libtorch_variants=["shared-with-deps"],
),
cross_compile_arm64=False,
macos_runner="macos-14-xlarge",
macos_runner="macos-13-xlarge",
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
isolated_workflow=True,
@@ -298,7 +298,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
OperatingSystem.MACOS_ARM64
),
cross_compile_arm64=False,
macos_runner="macos-14-xlarge",
macos_runner="macos-13-xlarge",
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
isolated_workflow=True,
@@ -308,7 +308,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
os=OperatingSystem.MACOS_ARM64,
package_type="conda",
cross_compile_arm64=False,
macos_runner="macos-14-xlarge",
macos_runner="macos-13-xlarge",
build_configs=generate_binary_build_matrix.generate_conda_matrix(
OperatingSystem.MACOS_ARM64
),

View File

@@ -7,7 +7,7 @@ eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)"
conda activate "${CONDA_ENV}"
# Use uv to speed up lintrunner init
python3 -m pip install uv==0.1.45
python3 -m pip install uv
CACHE_DIRECTORY="/tmp/.lintbin"
# Try to recover the cached binaries

View File

@@ -18,7 +18,6 @@ PYTEST_CACHE_KEY_PREFIX = "pytest_cache"
PYTEST_CACHE_DIR_NAME = ".pytest_cache"
BUCKET = "gha-artifacts"
LASTFAILED_FILE_PATH = Path("v/cache/lastfailed")
TD_HEURISTIC_PREVIOUSLY_FAILED_ADDITIONAL = "previous_failures_additional.json"
# Temp folders
ZIP_UPLOAD = "zip-upload"
@@ -192,10 +191,6 @@ def _merge_pytest_caches(
pytest_cache_dir_to_merge_from, pytest_cache_dir_to_merge_into
)
_merge_additional_failures_files(
pytest_cache_dir_to_merge_from, pytest_cache_dir_to_merge_into
)
def _merge_lastfailed_files(source_pytest_cache: Path, dest_pytest_cache: Path) -> None:
# Simple cases where one of the files doesn't exist
@@ -237,27 +232,3 @@ def _merged_lastfailed_content(
del to_lastfailed[""]
return to_lastfailed
def _merge_additional_failures_files(
source_pytest_cache: Path, dest_pytest_cache: Path
) -> None:
# Simple cases where one of the files doesn't exist
source_lastfailed_file = (
source_pytest_cache / TD_HEURISTIC_PREVIOUSLY_FAILED_ADDITIONAL
)
dest_lastfailed_file = dest_pytest_cache / TD_HEURISTIC_PREVIOUSLY_FAILED_ADDITIONAL
if not source_lastfailed_file.exists():
return
if not dest_lastfailed_file.exists():
copy_file(source_lastfailed_file, dest_lastfailed_file)
return
# Both files exist, so we need to merge them
from_lastfailed = load_json_file(source_lastfailed_file)
to_lastfailed = load_json_file(dest_lastfailed_file)
merged_content = list(set(from_lastfailed + to_lastfailed))
# Save the results
write_json_file(dest_lastfailed_file, merged_content)

View File

@@ -1,28 +0,0 @@
#!/bin/bash
set -eoux pipefail
SYNC_BRANCH=fbcode/pytorch-stable-prototype
git config user.email "fake@example.com"
git config user.name "PyTorch Stable Bot"
git fetch origin main
git fetch origin "$SYNC_BRANCH"
git checkout "$SYNC_BRANCH"
for SHA in $(git log 4333e122d4b74cdf84351ed2907045c6a767b4cd..origin/main --pretty="%h" --reverse -- torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed)
do
# `git merge-base --is-ancestor` exits with code 0 if the given SHA is an ancestor, and non-0 otherwise
if git merge-base --is-ancestor $SHA HEAD || [[ $(git log --grep="(cherry picked from commit $SHA") ]]
then
echo "Skipping $SHA"
continue
fi
echo "Copying $SHA"
git cherry-pick -x "$SHA"
done
if [[ "${WITH_PUSH}" == true ]]; then
git push
fi
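The skip condition in the loop combines two checks: the commit is already an ancestor of HEAD, or some commit message carries a "(cherry picked from commit <sha>" trailer. A hedged Python equivalent of that predicate, for illustration only:

    import subprocess

    def already_synced(sha: str) -> bool:
        # `git merge-base --is-ancestor A B` exits 0 iff A is an ancestor of B.
        ancestor = subprocess.run(
            ["git", "merge-base", "--is-ancestor", sha, "HEAD"]
        ).returncode == 0
        # `git log --grep=...` prints matching commits; empty output means the
        # SHA was never cherry-picked onto this branch.
        picked = subprocess.run(
            ["git", "log", f"--grep=(cherry picked from commit {sha}"],
            capture_output=True, text=True,
        ).stdout.strip() != ""
        return ancestor or picked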

@ -773,13 +773,13 @@ class TestBypassFailures(TestCase):
# than the one on the base commit. This should still count as broken trunk
"pr_num": 104214,
"related_failure_count": 0,
"flaky_or_broken_trunk": 1,
"unrelated_failure_count": 1,
},
{
# This PR had one broken trunk failure and it used ghstack
"pr_num": 105145,
"related_failure_count": 0,
"flaky_or_broken_trunk": 1,
"unrelated_failure_count": 1,
},
{
# The failure on the merge base was retried successfully and
@ -788,20 +788,20 @@ class TestBypassFailures(TestCase):
# be used to detect broken trunk
"pr_num": 107160,
"related_failure_count": 0,
"flaky_or_broken_trunk": 1,
"unrelated_failure_count": 4,
},
{
# This PR used Dr.CI broken trunk classification
"pr_num": 111253,
"related_failure_count": 1,
"flaky_or_broken_trunk": 1,
"unrelated_failure_count": 2,
},
]
for case in test_cases:
pr_num = case["pr_num"]
related_failure_count = case["related_failure_count"]
flaky_or_broken_trunk = case["flaky_or_broken_trunk"]
unrelated_failure_count = case["unrelated_failure_count"]
pr = GitHubPR("pytorch", "pytorch", pr_num)
checks = pr.get_checkrun_conclusions()
@ -823,7 +823,7 @@ class TestBypassFailures(TestCase):
)
self.assertTrue(len(pending) == 0)
self.assertTrue(
len(failed) == flaky_or_broken_trunk + related_failure_count
len(failed) == unrelated_failure_count + related_failure_count
)
def test_ignore_current(self, *args: Any) -> None:

@ -2027,8 +2027,10 @@ def categorize_checks(
pending_checks: List[Tuple[str, Optional[str], Optional[int]]] = []
failed_checks: List[Tuple[str, Optional[str], Optional[int]]] = []
# failed_checks_categorization is used to keep track of all ignorable failures when saving the merge record on Rockset
failed_checks_categorization: Dict[str, List[Any]] = defaultdict(list)
# ok_failed_checks is used with ok_failed_checks_threshold while ignorable_failed_checks
# is used to keep track of all ignorable failures when saving the merge record on Rockset
ok_failed_checks: List[Tuple[str, Optional[str], Optional[int]]] = []
ignorable_failed_checks: Dict[str, List[Any]] = defaultdict(list)
# If required_checks is not set or empty, consider all names are relevant
relevant_checknames = [
@ -2056,38 +2058,36 @@ def categorize_checks(
continue
elif not is_passing_status(check_runs[checkname].status):
target = (
failed_checks_categorization[classification]
ignorable_failed_checks[classification]
if classification
in ("IGNORE_CURRENT_CHECK", "BROKEN_TRUNK", "FLAKY", "UNSTABLE")
else failed_checks
)
target.append((checkname, url, job_id))
flaky_or_broken_trunk = (
failed_checks_categorization["BROKEN_TRUNK"]
+ failed_checks_categorization["FLAKY"]
)
if classification in ("BROKEN_TRUNK", "FLAKY", "UNSTABLE"):
ok_failed_checks.append((checkname, url, job_id))
if flaky_or_broken_trunk:
if ok_failed_checks:
warn(
f"The following {len(flaky_or_broken_trunk)} checks failed but were likely due flakiness or broken trunk: "
+ ", ".join([x[0] for x in flaky_or_broken_trunk])
f"The following {len(ok_failed_checks)} checks failed but were likely due flakiness or broken trunk: "
+ ", ".join([x[0] for x in ok_failed_checks])
+ (
f" but this is greater than the threshold of {ok_failed_checks_threshold} so merge will fail"
if ok_failed_checks_threshold is not None
and len(flaky_or_broken_trunk) > ok_failed_checks_threshold
and len(ok_failed_checks) > ok_failed_checks_threshold
else ""
)
)
if (
ok_failed_checks_threshold is not None
and len(flaky_or_broken_trunk) > ok_failed_checks_threshold
and len(ok_failed_checks) > ok_failed_checks_threshold
):
failed_checks = failed_checks + flaky_or_broken_trunk
failed_checks = failed_checks + ok_failed_checks
# The list of failed_checks_categorization is returned so that it can be saved into the Rockset merge record
return (pending_checks, failed_checks, failed_checks_categorization)
# The list of ignorable_failed_checks is returned so that it can be saved into the Rockset merge record
return (pending_checks, failed_checks, ignorable_failed_checks)
def merge(
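Net effect of this hunk: ignorable failures (IGNORE_CURRENT_CHECK, BROKEN_TRUNK, FLAKY, UNSTABLE) land in a separate bucket for the Rockset record, while the BROKEN_TRUNK/FLAKY/UNSTABLE subset is tolerated only up to ok_failed_checks_threshold, beyond which it is folded back into the hard failures. A distilled sketch of that threshold rule (simplified types assumed):

    from typing import List, Optional, Tuple

    Check = Tuple[str, Optional[str], Optional[int]]  # (name, url, job_id)

    def apply_threshold(
        failed: List[Check], ok_failed: List[Check], threshold: Optional[int]
    ) -> List[Check]:
        # Ignorable failures only sink the merge once they outnumber the threshold.
        if threshold is not None and len(ok_failed) > threshold:
            return failed + ok_failed
        return failed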

@ -58,7 +58,7 @@ jobs:
uses: ./.github/workflows/_binary-build-linux.yml
with:!{{ upload.binary_env_as_input(config) }}
{%- if "aarch64" in build_environment %}
runs_on: linux.arm64.m7g.4xlarge
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
{%- elif "s390x" in build_environment %}
runs_on: linux.s390x
@ -71,17 +71,12 @@ jobs:
{%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
{%- endif %}
{%- if config["gpu_arch_type"] == "cuda-aarch64" %}
timeout-minutes: 420
{%- endif %}
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
{%- if config["gpu_arch_type"] != "cuda-aarch64" %}
!{{ config["build_name"] }}-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: !{{ config["build_name"] }}-build
{%- if config["gpu_arch_type"] != "rocm" %}
{%- if config["gpu_arch_type"] != "rocm" %}
uses: ./.github/workflows/_binary-test-linux.yml
with:!{{ upload.binary_env_as_input(config) }}
build_name: !{{ config["build_name"] }}
@ -101,7 +96,7 @@ jobs:
{%- endif %}
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
{%- else %}
{%- else %}
runs-on: linux.rocm.gpu
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config) }}
@ -126,8 +121,7 @@ jobs:
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
{%- endif %}
{%- endif %}
{%- endif %}
{%- if branches == "nightly" %}
!{{ upload.upload_binaries(config) }}

@ -57,11 +57,7 @@
id-token: write
contents: read
{%- if has_test %}
{%- if config["gpu_arch_type"] == "cuda-aarch64" %}
needs: !{{ config["build_name"] }}-build
{%- else %}
needs: !{{ config["build_name"] }}-test
{%- endif %}
{%- else %}
needs: !{{ config["build_name"] }}-build
{%- endif %}

@ -12,15 +12,10 @@ on:
type: string
description: The build environment
runs_on:
required: false
default: linux.12xlarge
type: string
description: Hardware to run this "build"job on, linux.12xlarge or linux.arm64.2xlarge.
timeout-minutes:
required: false
default: 210
type: number
description: timeout for the job
required: false
default: linux.12xlarge
type: string
description: Hardware to run this "build"job on, linux.12xlarge or linux.arm64.2xlarge.
ALPINE_IMAGE:
required: false
type: string
@ -83,7 +78,7 @@ on:
jobs:
build:
runs-on: ${{ inputs.runs_on }}
timeout-minutes: ${{ inputs.timeout-minutes }}
timeout-minutes: 210
env:
PYTORCH_ROOT: ${{ inputs.PYTORCH_ROOT }}
BUILDER_ROOT: ${{ inputs.BUILDER_ROOT }}

@ -8,8 +8,6 @@ on:
jobs:
assign:
runs-on: ubuntu-latest
permissions:
issues: write
steps:
- name: Check for "/assigntome" in comment
uses: actions/github-script@v6
@ -28,14 +26,14 @@ jobs:
repo: context.repo.repo,
issue_number: issueNumber
});
const hasLabel = issue.labels.some(label => label.name === 'docathon-h1-2024');
const hasLabel = issue.labels.some(label => label.name === 'docathon-h2-2023');
if (hasLabel) {
if (issue.assignee !== null) {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: issueNumber,
body: "The issue is already assigned. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)."
body: "The issue is already assigned. Please pick an opened and unnasigned issue with the [docathon-h2-2023 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h2-2023)."
});
} else {
await github.rest.issues.addAssignees({
@ -46,7 +44,7 @@ jobs:
});
}
} else {
const commmentMessage = "This issue does not have the correct label. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)."
const commmentMessage = "This issue does not have the correct label. Please pick an opened and unnasigned issue with the [docathon-h2-2023 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h2-2023)."
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
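The workflow above gates assignment on the docathon label and an empty assignee list. The same flow as a hedged PyGithub sketch (PyGithub appears elsewhere in this compare; the commenter login here is a placeholder):

    from github import Github  # PyGithub

    def try_assign(token: str, issue_number: int, commenter: str) -> str:
        issue = Github(token).get_repo("pytorch/pytorch").get_issue(issue_number)
        if not any(label.name == "docathon-h1-2024" for label in issue.labels):
            return "wrong label"       # ask for a correctly labeled issue
        if issue.assignees:
            return "already assigned"  # point to another open issue
        issue.add_to_assignees(commenter)
        return "assigned"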

@ -49,7 +49,7 @@ jobs:
{ config: "default",
shard: 1,
num_shards: 1,
runner: "macos-14-xlarge",
runner: "macos-13-xlarge",
ios_platform: "SIMULATOR",
ios_arch: "arm64",
use_lite_interpreter: ${{ inputs.use_lite_interpreter || 1 }},
@ -60,7 +60,7 @@ jobs:
{ config: "default",
shard: 1,
num_shards: 1,
runner: "macos-14-xlarge",
runner: "macos-13-xlarge",
ios_platform: "OS",
ios_arch: "arm64",
use_lite_interpreter: ${{ inputs.use_lite_interpreter || 1 }},

@ -18,6 +18,6 @@ jobs:
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
pip3 install requests==2.32.2
pip3 install requests==2.26
pip3 install rockset==1.0.3
python3 .github/scripts/close_nonexistent_disable_issues.py

@ -29,7 +29,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: '3.11'
python-version: '3.8'
architecture: x64
check-latest: false

@ -1,30 +0,0 @@
name: Docathon Labels Sync
on:
pull_request_target:
types: [opened, synchronize, edited]
branches: [main]
jobs:
check-labels:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- name: Check out the repo
uses: actions/checkout@v2
with:
fetch-depth: 1
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.x
- name: Install dependencies
run: |
pip install requests==2.32.3
pip install PyGithub==2.3.0
- name: Run Python script
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: python ./.github/scripts/docathon-label-sync.py ${{ github.event.pull_request.number }}

@ -38,19 +38,15 @@ jobs:
matrix:
runner: [linux.12xlarge]
docker-image-name: [
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9,
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9,
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9,
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9,
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9,
pytorch-linux-focal-py3.8-clang10,
pytorch-linux-focal-py3.11-clang10,
pytorch-linux-focal-py3.12-clang10,
pytorch-linux-focal-rocm-n-1-py3,
pytorch-linux-focal-rocm-n-py3,
pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12,
pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12,
pytorch-linux-focal-py3-clang9-android-ndk-r21e,
pytorch-linux-jammy-py3.8-gcc11,
pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks,
@ -58,7 +54,7 @@ jobs:
pytorch-linux-jammy-py3-clang15-asan,
pytorch-linux-focal-py3-clang10-onnx,
pytorch-linux-focal-linter,
pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter,
pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter,
pytorch-linux-jammy-py3-clang12-executorch
]
include:

@ -149,10 +149,3 @@ jobs:
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
validate:
needs: build
uses: pytorch/builder/.github/workflows/validate-docker-images.yml@main
with:
channel: nightly
ref: main

@ -50,11 +50,11 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
DESIRED_PYTHON: "3.8"
runs_on: linux.arm64.m7g.4xlarge
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_8-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cpu-aarch64-test: # Testing
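The PYTORCH_EXTRA_INSTALL_REQUIREMENTS strings above are "|"-separated PEP 508 requirements, each guarded by an environment marker so the CUDA userspace wheels are only pulled in on x86_64 Linux. A sketch of splitting and evaluating such a string with the packaging library:

    from packaging.requirements import Requirement

    extra = (
        "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64'"
        " | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
    )

    for spec in extra.split(" | "):
        req = Requirement(spec)
        # marker.evaluate() checks the marker against the running interpreter's
        # platform; on an aarch64 box these CUDA wheels would be skipped.
        wanted = req.marker is None or req.marker.evaluate()
        print(req.name, str(req.specifier), "install" if wanted else "skip")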
@ -100,51 +100,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_8-cuda-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.8"
runs_on: linux.arm64.m7g.4xlarge
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_8-cuda-aarch64
build_environment: linux-aarch64-binary-manywheel
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_8-cuda-aarch64-build
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -158,11 +113,11 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
DESIRED_PYTHON: "3.9"
runs_on: linux.arm64.m7g.4xlarge
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-aarch64-test: # Testing
@ -208,51 +163,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_9-cuda-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.9"
runs_on: linux.arm64.m7g.4xlarge
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cuda-aarch64
build_environment: linux-aarch64-binary-manywheel
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_9-cuda-aarch64-build
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -266,11 +176,11 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
DESIRED_PYTHON: "3.10"
runs_on: linux.arm64.m7g.4xlarge
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cpu-aarch64-test: # Testing
@ -316,51 +226,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_10-cuda-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.10"
runs_on: linux.arm64.m7g.4xlarge
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64
build_environment: linux-aarch64-binary-manywheel
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_10-cuda-aarch64-build
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -374,11 +239,11 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
DESIRED_PYTHON: "3.11"
runs_on: linux.arm64.m7g.4xlarge
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cpu-aarch64-test: # Testing
@ -424,51 +289,6 @@ jobs:
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_11-cuda-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.11"
runs_on: linux.arm64.m7g.4xlarge
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64
build_environment: linux-aarch64-binary-manywheel
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda-aarch64-build
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -482,11 +302,11 @@ jobs:
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
DESIRED_PYTHON: "3.12"
runs_on: linux.arm64.m7g.4xlarge
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cpu-aarch64-test: # Testing
@ -531,48 +351,3 @@ jobs:
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-cuda-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.12"
runs_on: linux.arm64.m7g.4xlarge
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64
build_environment: linux-aarch64-binary-manywheel
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-cuda-aarch64-build
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
DESIRED_DEVTOOLSET: cxx11-abi
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml

@ -48,7 +48,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda11_8-test: # Testing
@ -88,7 +88,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_1-test: # Testing
@ -111,43 +111,3 @@ jobs:
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_4-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_8-cuda12_4-build
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_4
build_environment: linux-binary-manywheel
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

@ -174,7 +174,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda11_8-test: # Testing
@ -237,7 +237,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_1-test: # Testing
@ -300,7 +300,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_4-test: # Testing
@ -690,7 +690,7 @@ jobs:
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda11_8-test: # Testing
@ -753,7 +753,7 @@ jobs:
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_1-test: # Testing
@ -816,7 +816,7 @@ jobs:
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_4-test: # Testing
@ -1206,7 +1206,7 @@ jobs:
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda11_8-test: # Testing
@@ -1269,7 +1269,7 @@ jobs:
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_1-test: # Testing
@@ -1332,7 +1332,7 @@ jobs:
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_4-test: # Testing
@@ -1722,7 +1722,7 @@ jobs:
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda11_8-test: # Testing
@@ -1785,7 +1785,7 @@ jobs:
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_1-test: # Testing
@@ -1848,7 +1848,7 @@ jobs:
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_4-test: # Testing
@@ -2238,7 +2238,7 @@ jobs:
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda11_8-test: # Testing
@@ -2301,7 +2301,7 @@ jobs:
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_1-test: # Testing
@@ -2364,7 +2364,7 @@ jobs:
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_4-test: # Testing
@@ -2410,3 +2410,209 @@ jobs:
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-rocm6_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.0
GPU_ARCH_VERSION: 6.0
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-rocm6_0
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-rocm6_0-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_12-rocm6_0-build
runs-on: linux.rocm.gpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.0
GPU_ARCH_VERSION: 6.0
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main
DESIRED_PYTHON: "3.12"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_12-rocm6_0
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux-builder:rocm6.0-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
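
Note: the "ROCm set GPU_FLAG" step above exports GPU_FLAG to the later steps by appending a KEY=VALUE line to the file named by $GITHUB_ENV; that runner mechanism is not bash-specific. A sketch of the same export done from Python, for illustration only (the real step is the shell one shown above):

    import os

    # Compose the same flag string the step writes.
    gpu_flag = ("--device=/dev/mem --device=/dev/kfd --device=/dev/dri "
                "--group-add video --group-add daemon")

    # Appending KEY=VALUE to the file named by $GITHUB_ENV makes the runner
    # export it to every subsequent step in the job.
    with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as env_file:
        env_file.write(f"GPU_FLAG={gpu_flag}\n")
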
manywheel-py3_12-rocm6_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-rocm6_0-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.0
GPU_ARCH_VERSION: 6.0
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-rocm6_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-rocm6_1-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.1
GPU_ARCH_VERSION: 6.1
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-rocm6_1
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-rocm6_1-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_12-rocm6_1-build
runs-on: linux.rocm.gpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.1
GPU_ARCH_VERSION: 6.1
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main
DESIRED_PYTHON: "3.12"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_12-rocm6_1
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux-builder:rocm6.1-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
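
Note: GPU_FLAG presumably ends up splatted into the docker run arguments when the "Test Pytorch binary" action launches the manylinux-builder container. How ./pytorch/.github/actions/test-pytorch-binary actually invokes docker is an assumption here; this rough Python sketch only shows the flag-splitting and device mapping:

    import os
    import shlex
    import subprocess

    # GPU_FLAG arrives via the job environment (set through $GITHUB_ENV above).
    gpu_flags = shlex.split(os.environ.get("GPU_FLAG", ""))
    image = os.environ["DOCKER_IMAGE"]  # e.g. pytorch/manylinux-builder:rocm6.1-main

    # Launch the container with the ROCm device nodes and groups mapped in.
    # "rocminfo" is a stand-in command, not what the real action runs.
    subprocess.run(["docker", "run", "--rm", *gpu_flags, image, "rocminfo"], check=True)
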
manywheel-py3_12-rocm6_1-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-rocm6_1-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.1
GPU_ARCH_VERSION: 6.1
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-rocm6_1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml

View File

@@ -54,7 +54,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_8-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cpu-s390x-test: # Testing
@@ -117,7 +117,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-s390x-test: # Testing
@@ -180,7 +180,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_10-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cpu-s390x-test: # Testing
@@ -243,7 +243,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_11-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cpu-s390x-test: # Testing
@@ -306,7 +306,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_12-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cpu-s390x-test: # Testing
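
Note: the s390x jobs above reuse the same x86_64-gated requirement string, and the macOS wheel jobs further down do as well; the environment markers make every entry a no-op off Linux/x86_64, so carrying the string unchanged across workflows is harmless. A small Python sketch evaluating one marker against explicit environments:

    from packaging.markers import Marker

    marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
    # Explicit environments override the defaults packaging derives from the host.
    print(marker.evaluate({"platform_system": "Linux", "platform_machine": "x86_64"}))  # True
    print(marker.evaluate({"platform_system": "Linux", "platform_machine": "s390x"}))   # False
    print(marker.evaluate({"platform_system": "Darwin", "platform_machine": "arm64"}))  # False
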

View File

@@ -34,7 +34,7 @@ concurrency:
jobs:
conda-py3_8-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
runs-on: macos-13-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -152,7 +152,7 @@ jobs:
uses: ./.github/workflows/_binary-upload.yml
conda-py3_9-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
runs-on: macos-13-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -270,7 +270,7 @@ jobs:
uses: ./.github/workflows/_binary-upload.yml
conda-py3_10-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
runs-on: macos-13-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -388,7 +388,7 @@ jobs:
uses: ./.github/workflows/_binary-upload.yml
conda-py3_11-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
runs-on: macos-13-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -506,7 +506,7 @@ jobs:
uses: ./.github/workflows/_binary-upload.yml
conda-py3_12-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
runs-on: macos-13-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch

View File

@@ -34,7 +34,7 @@ concurrency:
jobs:
libtorch-cpu-shared-with-deps-cxx11-abi-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
runs-on: macos-13-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch

View File

@@ -34,7 +34,7 @@ concurrency:
jobs:
wheel-py3_8-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
runs-on: macos-13-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -46,7 +46,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -153,7 +153,7 @@ jobs:
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_9-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
runs-on: macos-13-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -165,7 +165,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -272,7 +272,7 @@ jobs:
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_10-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
runs-on: macos-13-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -284,7 +284,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -391,7 +391,7 @@ jobs:
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_11-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
runs-on: macos-13-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -403,7 +403,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -510,7 +510,7 @@ jobs:
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_12-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
runs-on: macos-13-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@@ -522,7 +522,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}

View File

@@ -46,7 +46,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -290,7 +290,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -536,7 +536,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -782,7 +782,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -1027,7 +1027,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -1271,7 +1271,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -1517,7 +1517,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -1763,7 +1763,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2008,7 +2008,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2252,7 +2252,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2498,7 +2498,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2744,7 +2744,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2989,7 +2989,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -3233,7 +3233,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -3479,7 +3479,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -3725,7 +3725,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -3970,7 +3970,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4214,7 +4214,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4460,7 +4460,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4706,7 +4706,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
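
The near-identical hunks above differ only in DESIRED_PYTHON and GPU_ARCH_TYPE: PyTorch's binary-build workflows are script-generated rather than hand-edited, which is why a single cuDNN pin change fans out across every job in the file. A toy generator under that assumption (the "manywheel-" job names and arch tags here are illustrative, not the real generator's output):

import itertools

pythons = ["3.8", "3.9", "3.10", "3.11", "3.12"]
arches = ["cpu", "cu118", "cu121"]  # illustrative arch tags

for py, arch in itertools.product(pythons, arches):
    gpu_arch_type = "cpu" if arch == "cpu" else "cuda"
    print(f"manywheel-py{py.replace('.', '_')}-{arch}:",
          f"DESIRED_PYTHON={py!r}, GPU_ARCH_TYPE={gpu_arch_type}")
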

View File

@ -1,108 +0,0 @@
name: inductor-cu124
on:
push:
tags:
- ciflow/inductor-cu124/*
workflow_dispatch:
schedule:
# Run every 4 hours during the week and every 12 hours on the weekend
- cron: 45 0,4,8,12,16,20 * * 1-5
- cron: 45 4,12 * * 0,6
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions: read-all
jobs:
linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
# Should be synced with the one in inductor.yml, but this doesn't run inductor_timm
name: cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
with:
sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_cpp_wrapper_abi_compatible", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-test:
name: cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
with:
sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-test
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp:
name: cuda12.4-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-test-gcp:
name: cuda12.4-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp.outputs.test-matrix }}
use-gha: anything-non-empty-to-use-gha
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_12-gcc9-inductor-build:
name: cuda12.4-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.4-py3.12-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_4-py3_12-gcc9-inductor-test:
name: cuda12.4-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_4-py3_12-gcc9-inductor-build
with:
build-environment: linux-focal-cuda12.4-py3.12-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_4-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_12-gcc9-inductor-build.outputs.test-matrix }}
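
A note on the schedule in the deleted workflow above: "45 4,12 * * 0,6" fires at 04:45 and 12:45 UTC on Sunday and Saturday, i.e. twice a day at 8- and 16-hour gaps, so the "every 12 hours on the weekend" comment is approximate. A small pure-Python expansion of the cron fields (order: minute, hour, day-of-month, month, day-of-week) to check the coverage; this is a sanity-check sketch, not CI code:

def expand(field: str) -> list[int]:
    # Expand a cron field such as "0,4,8,12,16,20" or "1-5" into sorted ints.
    values = set()
    for part in field.split(","):
        if "-" in part:
            lo, hi = map(int, part.split("-"))
            values.update(range(lo, hi + 1))
        else:
            values.add(int(part))
    return sorted(values)

# "45 0,4,8,12,16,20 * * 1-5": minute 45, every 4 hours, Monday through Friday
print(expand("0,4,8,12,16,20"), expand("1-5"))  # [0, 4, 8, 12, 16, 20] [1, 2, 3, 4, 5]
# "45 4,12 * * 0,6": 04:45 and 12:45 on Sunday and Saturday
print(expand("4,12"), expand("0,6"))            # [4, 12] [0, 6]
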
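
The deleted workflow's concurrency group is also worth unpacking. GitHub Actions expressions evaluate || and && like JavaScript, returning operand values rather than booleans: for a push of a ciflow/inductor-cu124/* tag, github.ref_type == 'branch' is false, so the SHA segment collapses to the literal false and re-pushing the same tag lands in the same group, letting cancel-in-progress supersede the older run; for a branch push, the SHA keeps every run in its own group. A rough Python emulation (the helper names and the sample tag are mine, not GitHub's):

def gh_or(a, b):
    # GitHub expression "a || b": first truthy operand
    return a if a else b

def gh_and(a, b):
    # GitHub expression "a && b": b when a is truthy, otherwise a
    return b if a else a

def group(workflow, pr_number, ref_name, ref_type, sha, event_name):
    return "-".join(str(part) for part in (
        workflow,
        gh_or(pr_number, ref_name),
        gh_and(ref_type == "branch", sha),
        event_name == "workflow_dispatch",
    ))

# Tag push: the SHA segment is False, so re-pushing the same made-up ciflow tag
# reuses the group and cancel-in-progress stops the older run.
print(group("inductor-cu124", None, "ciflow/inductor-cu124/12345", "tag", "abc123", "push"))
# Branch push: the SHA makes each run's group unique, so nothing is cancelled.
print(group("inductor-cu124", None, "main", "branch", "abc123", "push"))
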

View File

@ -21,7 +21,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [

View File

@ -18,7 +18,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [

View File

@ -71,7 +71,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [

View File

@ -23,7 +23,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [

View File

@ -44,7 +44,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
@ -86,7 +86,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
@ -107,56 +107,6 @@ jobs:
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_1-py3_12-gcc9-inductor-build:
name: cuda12.1-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_12-gcc9-inductor-test:
name: cuda12.1-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_1-py3_12-gcc9-inductor-build
with:
build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.test-matrix }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
# Should be synced with the one in inductor-periodic.yml but this only runs inductor_timm
name: cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
with:
sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-test:
name: cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
with:
sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-test
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-jammy-cpu-py3_8-gcc11-inductor-build:
name: linux-jammy-cpu-py3.8-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
@ -170,17 +120,12 @@ jobs:
{ config: "cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
{ config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "cpu_inductor_huggingface_freezing", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
{ config: "cpu_inductor_timm_freezing", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
{ config: "cpu_inductor_timm_freezing", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "cpu_inductor_torchbench_freezing", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
{ config: "cpu_inductor_torchbench_freezing", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
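
On the shard/num_shards fields in these matrices: each config row fans out into num_shards parallel jobs, and the test harness on each runner selects only its shard's slice of the test list. PyTorch's real harness balances shards using recorded test runtimes; the sketch below just deals tests out round-robin, with made-up test names:

def select_shard(tests: list[str], shard: int, num_shards: int) -> list[str]:
    # shard is 1-indexed in the workflow matrix
    return [t for i, t in enumerate(sorted(tests)) if i % num_shards == shard - 1]

tests = ["test_ops", "test_nn", "test_autograd", "test_jit"]
print(select_shard(tests, shard=1, num_shards=2))  # ['test_autograd', 'test_nn']
print(select_shard(tests, shard=2, num_shards=2))  # ['test_jit', 'test_ops']
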

View File

@ -20,7 +20,7 @@ jobs:
with:
timeout: 120
runner: linux.2xlarge
docker-image: pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter
docker-image: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0
@ -36,7 +36,7 @@ jobs:
with:
timeout: 120
runner: linux.2xlarge
docker-image: pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter
docker-image: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0
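
Why fetch-depth: 0 matters here: "git rev-parse <rev>:<path>" resolves the tree object hash of a directory at a given commit, so comparing it between HEAD and HEAD~ is a cheap way to detect whether .ci/docker changed and a new image must be built. In a shallow clone of depth 1, HEAD~ does not exist and the command fails. A minimal sketch of that check (illustrative, not the calculate-docker-image implementation):

import subprocess

def tree_hash(rev: str, path: str = ".ci/docker") -> str:
    # Raises subprocess.CalledProcessError in a shallow clone where rev is absent.
    return subprocess.check_output(
        ["git", "rev-parse", f"{rev}:{path}"], text=True
    ).strip()

if tree_hash("HEAD") != tree_hash("HEAD~"):
    print("docker folder changed; a new image needs to be built")
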

View File

@ -13,31 +13,29 @@ concurrency:
permissions: read-all
jobs:
macos-py3-arm64-build:
name: macos-py3-arm64
macos-13-py3-arm64-build:
name: macos-13-py3-arm64
uses: ./.github/workflows/_mac-build.yml
with:
sync-tag: macos-py3-arm64-build
build-environment: macos-py3-arm64
build-environment: macos-13-py3-arm64
runner-type: macos-m1-stable
build-generates-artifacts: true
# To match the one pre-installed in the m1 runners
python-version: 3.9.12
# The runner macos-m2-14 is not a typo; it's a custom runner that is different
# from our AWS macos-m1-14 runners
test-matrix: |
{ include: [
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-stable" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-14" },
]}
macos-py3-arm64-mps-test:
name: macos-py3-arm64-mps
uses: ./.github/workflows/_mac-test-mps.yml
needs: macos-py3-arm64-build
needs: macos-13-py3-arm64-build
with:
sync-tag: macos-py3-arm64-mps-test
build-environment: macos-py3-arm64
build-environment: macos-13-py3-arm64
# Same as the build job
python-version: 3.9.12
test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }}
test-matrix: ${{ needs.macos-13-py3-arm64-build.outputs.test-matrix }}

View File

@ -32,7 +32,7 @@ jobs:
cache: pip
- run: |
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
- name: Upload external contribution stats
uses: nick-fields/retry@v2.8.2

View File

@ -37,59 +37,6 @@ jobs:
permissions:
id-token: write
contents: read
linux-focal-cuda12_1-py3_10-gcc9-build:
name: linux-focal-cuda12.1-py3.10-gcc9
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_10-gcc9-test:
name: linux-focal-cuda12.1-py3.10-gcc9
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-focal-cuda12_1-py3_10-gcc9-build
- target-determination
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-build.outputs.test-matrix }}
linux-focal-cuda12_4-py3_10-gcc9-build:
name: linux-focal-cuda12.4-py3.10-gcc9
uses: ./.github/workflows/_linux-build-label.yml
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_4-py3_10-gcc9-test:
name: linux-focal-cuda12.4-py3.10-gcc9
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-focal-cuda12_4-py3_10-gcc9-build
- target-determination
with:
timeout-minutes: 360
build-environment: linux-focal-cuda12.4-py3.10-gcc9
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-build.outputs.test-matrix }}
parallelnative-linux-jammy-py3_8-gcc11-build:
name: parallelnative-linux-jammy-py3.8-gcc11
@ -120,7 +67,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda11.8-py3.9-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [
@ -142,7 +89,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda11.8-py3.10-gcc9-debug
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9
build-with-debug: true
test-matrix: |
{ include: [
@ -204,7 +151,7 @@ jobs:
{ config: "default",
shard: 1,
num_shards: 1,
runner: "macos-14-xlarge",
runner: "macos-13-xlarge",
ios_platform: "SIMULATOR",
ios_arch: "arm64",
use_lite_interpreter: 1,
@ -215,7 +162,7 @@ jobs:
{ config: "default",
shard: 1,
num_shards: 1,
runner: "macos-14-xlarge",
runner: "macos-13-xlarge",
ios_platform: "OS",
ios_arch: "arm64",
use_lite_interpreter: 1,


@ -237,7 +237,7 @@ jobs:
uses: ./.github/workflows/_linux-build-label.yml
with:
build-environment: linux-focal-cuda11.8-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
@ -262,7 +262,7 @@ jobs:
uses: ./.github/workflows/_linux-build-label.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
@ -297,12 +297,12 @@ jobs:
{ config: "default", shard: 1, num_shards: 1 },
]}
linux-jammy-cuda-11_8-cudnn9-py3_8-clang12-build:
name: linux-jammy-cuda11.8-cudnn9-py3.8-clang12
linux-jammy-cuda-11_8-cudnn8-py3_8-clang12-build:
name: linux-jammy-cuda11.8-cudnn8-py3.8-clang12
uses: ./.github/workflows/_linux-build-label.yml
with:
build-environment: linux-jammy-cuda11.8-cudnn9-py3.8-clang12
docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12
build-environment: linux-jammy-cuda11.8-cudnn8-py3.8-clang12
docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -361,7 +361,7 @@ jobs:
uses: ./.github/workflows/_bazel-build-test.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
cuda-version: cpu
test-matrix: |
{ include: [
@ -373,25 +373,13 @@ jobs:
uses: ./.github/workflows/_bazel-build-test.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
cuda-version: "12.1"
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_4-py3_10-gcc9-bazel-test:
name: linux-focal-cuda12.4-py3.10-gcc9-bazel-test
uses: ./.github/workflows/_bazel-build-test.yml
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-bazel-test
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
cuda-version: "12.4"
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
]}
linux-focal-py3-clang9-android-ndk-r21e-gradle-custom-build-single:
name: linux-focal-py3-clang9-android-ndk-r21e-gradle-custom-build-single
uses: ./.github/workflows/_android-build-test.yml
@ -447,7 +435,7 @@ jobs:
uses: ./.github/workflows/_linux-build-label.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [


@ -41,7 +41,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3-gcc9-slow-gradcheck
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [
@ -70,7 +70,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [


@ -1,30 +0,0 @@
name: Sync Distributed Folder
on:
#push:
# branches:
# - 'main'
# paths:
# - 'torch/distributed/**'
workflow_dispatch:
pull_request:
paths:
- '.github/scripts/sync_distributed_folder_prototype.sh'
- '.github/workflows/sync_distributed_folder_prototype.yml'
env:
WITH_PUSH: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
permissions:
contents: write
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
sync:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: .github/scripts/sync_distributed_folder_prototype.sh


@ -26,7 +26,7 @@ jobs:
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
working-directory: pytorch
- name: Use following to pull public copy of the image


@ -16,7 +16,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [


@ -34,39 +34,36 @@ jobs:
id-token: write
contents: read
linux-focal-cuda12_4-py3_10-gcc9-sm86-build:
name: linux-focal-cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build-label.yml
linux-focal-cuda12_1-py3_10-gcc9-build:
name: linux-focal-cuda12.1-py3.10-gcc9
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
cuda-arch-list: 8.6
build-environment: linux-focal-cuda12.1-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_4-py3_10-gcc9-sm86-test:
name: linux-focal-cuda12.4-py3.10-gcc9-sm86
linux-focal-cuda12_1-py3_10-gcc9-test:
name: linux-focal-cuda12.1-py3.10-gcc9
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-focal-cuda12_4-py3_10-gcc9-sm86-build
- linux-focal-cuda12_1-py3_10-gcc9-build
- target-determination
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-sm86-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-sm86-build.outputs.test-matrix }}
build-environment: linux-focal-cuda12.1-py3.10-gcc9
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-build.outputs.test-matrix }}
libtorch-linux-focal-cuda12_1-py3_7-gcc9-debug-build:
name: libtorch-linux-focal-cuda12.1-py3.7-gcc9-debug
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: libtorch-linux-focal-cuda12.1-py3.7-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
build-generates-artifacts: false
runner: linux.4xlarge
test-matrix: |
@ -80,32 +77,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-no-ops
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}
libtorch-linux-focal-cuda12_4-py3_7-gcc9-debug-build:
name: libtorch-linux-focal-cuda12.4-py3.7-gcc9-debug
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: libtorch-linux-focal-cuda12.4-py3.7-gcc9
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
build-generates-artifacts: false
runner: linux.4xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}
# no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated
linux-focal-cuda12_4-py3_10-gcc9-no-ops-build:
name: linux-focal-cuda12.4-py3.10-gcc9-no-ops
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-no-ops
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -122,12 +94,12 @@ jobs:
{ config: "default", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
]}
macos-py3-arm64-build:
name: macos-py3-arm64
macos-13-py3-arm64-build:
name: macos-13-py3-arm64
uses: ./.github/workflows/_mac-build.yml
with:
sync-tag: macos-py3-arm64-build
build-environment: macos-py3-arm64
build-environment: macos-13-py3-arm64
runner-type: macos-m1-stable
build-generates-artifacts: true
# To match the one pre-installed in the m1 runners
@ -142,30 +114,31 @@ jobs:
macos-py3-arm64-mps-test:
name: macos-py3-arm64-mps
uses: ./.github/workflows/_mac-test-mps.yml
needs: macos-py3-arm64-build
if: needs.macos-py3-arm64-build.outputs.build-outcome == 'success'
needs: macos-13-py3-arm64-build
if: needs.macos-13-py3-arm64-build.outputs.build-outcome == 'success'
with:
sync-tag: macos-py3-arm64-mps-test
build-environment: macos-py3-arm64
build-environment: macos-13-py3-arm64
# Same as the build job
python-version: 3.9.12
test-matrix: |
{ include: [
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-stable" },
{ config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
]}
macos-py3-arm64-test:
name: macos-py3-arm64
macos-13-py3-arm64-test:
name: macos-13-py3-arm64
uses: ./.github/workflows/_mac-test.yml
needs:
- macos-py3-arm64-build
- macos-13-py3-arm64-build
- target-determination
with:
build-environment: macos-py3-arm64
build-environment: macos-13-py3-arm64
# Same as the build job
python-version: 3.9.12
test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }}
test-matrix: ${{ needs.macos-13-py3-arm64-build.outputs.test-matrix }}
win-vs2019-cpu-py3-build:
name: win-vs2019-cpu-py3
@ -219,9 +192,7 @@ jobs:
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
{ config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
{ config: "default", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
]}
linux-focal-rocm6_1-py3_8-test:
@ -237,4 +208,4 @@ jobs:
build-environment: linux-focal-rocm6.1-py3.8
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"


@ -32,3 +32,174 @@ jobs:
echo
echo "Once the jobs are deemed stable enough (% red signal < 5% and TTS < 3h),"
echo " they can graduate and move back to pull or trunk."
#
# Experimental ARC jobs
#
llm-td:
name: before-test
uses: ./.github/workflows/llm_td_retrieval.yml
permissions:
id-token: write
contents: read
target-determination:
name: before-test
uses: ./.github/workflows/target_determination.yml
needs: llm-td
permissions:
id-token: write
contents: read
linux-jammy-py3_8-gcc11-build:
name: linux-jammy-py3.8-gcc11
uses: ./.github/workflows/_linux-build-rg.yml
with:
build-environment: linux-jammy-py3.8-gcc11
docker-image-name: pytorch-linux-jammy-py3.8-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "default", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "default", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "docs_test", shard: 1, num_shards: 1, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "backwards_compat", shard: 1, num_shards: 1, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "distributed", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "distributed", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
]}
linux-jammy-py3_8-gcc11-test:
name: linux-jammy-py3.8-gcc11
uses: ./.github/workflows/_linux-test-rg.yml
needs:
- linux-jammy-py3_8-gcc11-build
- target-determination
with:
build-environment: linux-jammy-py3.8-gcc11
docker-image: ${{ needs.linux-jammy-py3_8-gcc11-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_8-gcc11-build.outputs.test-matrix }}
linux-jammy-py3_8-gcc11-no-ops:
name: linux-jammy-py3.8-gcc11-no-ops
uses: ./.github/workflows/_linux-build-rg.yml
with:
build-environment: linux-jammy-py3.8-gcc11-no-ops
docker-image-name: pytorch-linux-jammy-py3.8-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}
linux-jammy-py3_8-gcc11-pch:
name: linux-jammy-py3.8-gcc11-pch
uses: ./.github/workflows/_linux-build-rg.yml
with:
build-environment: linux-jammy-py3.8-gcc11-pch
docker-image-name: pytorch-linux-jammy-py3.8-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
]}
linux-focal-py3_8-clang10-onnx-build:
name: linux-focal-py3.8-clang10-onnx
uses: ./.github/workflows/_linux-build-rg.yml
with:
build-environment: linux-focal-py3.8-clang10-onnx
docker-image-name: pytorch-linux-focal-py3-clang10-onnx
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "default", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
]}
linux-focal-py3_8-clang10-onnx-test:
name: linux-focal-py3.8-clang10-onnx
uses: ./.github/workflows/_linux-test-rg.yml
needs:
- linux-focal-py3_8-clang10-onnx-build
- target-determination
with:
build-environment: linux-focal-py3.8-clang10-onnx
docker-image: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.test-matrix }}
linux-jammy-py3_10-clang15-asan-build:
name: linux-jammy-py3.10-clang15-asan
uses: ./.github/workflows/_linux-build-rg.yml
with:
build-environment: linux-jammy-py3.10-clang15-asan
docker-image-name: pytorch-linux-jammy-py3-clang15-asan
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.4xlarge" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.4xlarge" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.4xlarge" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.4xlarge" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.4xlarge" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.4xlarge" },
]}
sync-tag: asan-build-arc
linux-focal-py3_8-clang10-build:
name: linux-focal-py3.8-clang10
uses: ./.github/workflows/_linux-build-rg.yml
with:
build-environment: linux-focal-py3.8-clang10
docker-image-name: pytorch-linux-focal-py3.8-clang10
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "default", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "default", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "crossref", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "crossref", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "dynamo", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "dynamo", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "dynamo", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
]}
linux-focal-py3_8-clang10-test:
name: linux-focal-py3.8-clang10
uses: ./.github/workflows/_linux-test-rg.yml
needs:
- linux-focal-py3_8-clang10-build
- target-determination
with:
build-environment: linux-focal-py3.8-clang10
docker-image: ${{ needs.linux-focal-py3_8-clang10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-py3_8-clang10-build.outputs.test-matrix }}
linux-focal-py3_11-clang10-build:
name: linux-focal-py3.11-clang10
uses: ./.github/workflows/_linux-build-rg.yml
with:
build-environment: linux-focal-py3.11-clang10
docker-image-name: pytorch-linux-focal-py3.11-clang10
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "default", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "default", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "crossref", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "crossref", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "dynamo", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "dynamo", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
{ config: "dynamo", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
]}
linux-focal-py3_11-clang10-test:
name: linux-focal-py3.11-clang10
uses: ./.github/workflows/_linux-test-rg.yml
needs:
- linux-focal-py3_11-clang10-build
- target-determination
with:
build-environment: linux-focal-py3.11-clang10
docker-image: ${{ needs.linux-focal-py3_11-clang10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-py3_11-clang10-build.outputs.test-matrix }}
#
# End of Experimental ARC jobs
#


@ -28,7 +28,7 @@ jobs:
- name: Install Python Packages
run: |
pip3 install rockset==1.0.3 boto3==1.19.12 requests==2.32.2
pip3 install rockset==1.0.3 boto3==1.19.12 requests==2.27.1
- name: Create alerts
run: |


@ -47,7 +47,7 @@ jobs:
cache: pip
- run: |
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
- name: Upload test artifacts
id: upload-s3


@ -40,7 +40,7 @@ jobs:
cache: pip
- run: |
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
- name: Upload torch dynamo performance stats to S3
id: upload-s3


@ -1,43 +0,0 @@
name: Upload test stats intermediate
on:
workflow_dispatch:
inputs:
workflow_id:
description: workflow_id of the run
required: true
workflow_run_attempt:
description: workflow_run_attempt of the run
required: true
jobs:
intermediate_upload_test_stats:
name: Intermediate upload test stats for ${{ inputs.workflow_id }}
runs-on: ubuntu-22.04
environment: upload-stats
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
fetch-depth: 1
submodules: false
- uses: actions/setup-python@v4
with:
python-version: '3.11'
cache: pip
- run: |
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
- name: Upload test stats
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
WORKFLOW_RUN_ID: ${{ inputs.workflow_id }}
WORKFLOW_RUN_ATTEMPT: ${{ inputs.workflow_run_attempt }}
run: |
python3 -m tools.stats.upload_test_stats_intermediate \
--workflow-run-id "${WORKFLOW_RUN_ID}" \
--workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" \

.gitmodules vendored

@ -2,6 +2,10 @@
ignore = dirty
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "third_party/cub"]
ignore = dirty
path = third_party/cub
url = https://github.com/NVlabs/cub.git
[submodule "third_party/eigen"]
ignore = dirty
path = third_party/eigen
@ -18,6 +22,10 @@
ignore = dirty
path = third_party/protobuf
url = https://github.com/protocolbuffers/protobuf.git
[submodule "third_party/ios-cmake"]
ignore = dirty
path = third_party/ios-cmake
url = https://github.com/Yangqing/ios-cmake.git
[submodule "third_party/NNPACK"]
ignore = dirty
path = third_party/NNPACK
@ -42,6 +50,10 @@
ignore = dirty
path = third_party/psimd
url = https://github.com/Maratyszcza/psimd.git
[submodule "third_party/zstd"]
ignore = dirty
path = third_party/zstd
url = https://github.com/facebook/zstd.git
[submodule "third_party/cpuinfo"]
ignore = dirty
path = third_party/cpuinfo
@ -54,6 +66,10 @@
ignore = dirty
path = third_party/onnx
url = https://github.com/onnx/onnx.git
[submodule "third_party/onnx-tensorrt"]
ignore = dirty
path = third_party/onnx-tensorrt
url = https://github.com/onnx/onnx-tensorrt
[submodule "third_party/sleef"]
ignore = dirty
path = third_party/sleef
@ -70,6 +86,14 @@
ignore = dirty
path = third_party/gemmlowp/gemmlowp
url = https://github.com/google/gemmlowp.git
[submodule "third_party/QNNPACK"]
ignore = dirty
path = third_party/QNNPACK
url = https://github.com/pytorch/QNNPACK
[submodule "third_party/neon2sse"]
ignore = dirty
path = third_party/neon2sse
url = https://github.com/intel/ARM_NEON_2_x86_SSE.git
[submodule "third_party/fbgemm"]
ignore = dirty
path = third_party/fbgemm
@ -78,6 +102,10 @@
ignore = dirty
path = third_party/foxi
url = https://github.com/houseroad/foxi.git
[submodule "third_party/tbb"]
path = third_party/tbb
url = https://github.com/01org/tbb
branch = tbb_2018
[submodule "android/libs/fbjni"]
ignore = dirty
path = android/libs/fbjni
@ -124,7 +152,3 @@
[submodule "third_party/opentelemetry-cpp"]
path = third_party/opentelemetry-cpp
url = https://github.com/open-telemetry/opentelemetry-cpp.git
[submodule "third_party/cpp-httplib"]
path = third_party/cpp-httplib
url = https://github.com/yhirose/cpp-httplib.git
branch = v0.15.3


@ -193,7 +193,6 @@ include_patterns = [
'aten/src/ATen/*.cpp',
'aten/src/ATen/core/*.h',
'aten/src/ATen/core/*.cpp',
'aten/src/ATen/detail/*',
'aten/src/ATen/functorch/*.h',
'aten/src/ATen/functorch/*.cpp',
'c10/**/*.cpp',
@ -235,6 +234,7 @@ exclude_patterns = [
'torch/csrc/jit/serialization/import_legacy.cpp',
'torch/csrc/jit/serialization/export.cpp',
'torch/csrc/lazy/**/*',
'torch/csrc/onnx/init.cpp',
'torch/csrc/mps/**/*',
]
init_command = [
@ -1052,18 +1052,19 @@ exclude_patterns = [
'test/quantization/fx/test_numeric_suite_fx.py',
'test/quantization/fx/test_quantize_fx.py',
'test/quantization/fx/test_subgraph_rewriter.py',
'test/test_datapipe.py',
'test/test_fake_tensor.py',
'test/test_flop_counter.py',
'test/test_function_schema.py',
'test/test_functional_autograd_benchmark.py',
'test/test_functional_optim.py',
'test/test_functionalization_of_rng_ops.py',
'test/test_datapipe.py',
'test/test_futures.py',
'test/test_fx.py',
'test/test_fx_experimental.py',
'test/test_fx_passes.py',
'test/test_fx_reinplace_pass.py',
'test/test_hub.py',
'test/test_import_stats.py',
'test/test_itt.py',
'test/test_jit.py',
@ -1072,6 +1073,7 @@ exclude_patterns = [
'test/test_jit_disabled.py',
'test/test_jit_fuser.py',
'test/test_jit_fuser_legacy.py',
'test/test_jit_fuser_te.py',
'test/test_jit_legacy.py',
'test/test_jit_llga_fuser.py',
'test/test_jit_profiling.py',
@ -1101,6 +1103,13 @@ exclude_patterns = [
'test/test_native_mha.py',
'test/test_nestedtensor.py',
'test/test_nn.py',
'test/test_nnapi.py',
'test/test_numba_integration.py',
'test/test_numpy_interop.py',
'test/test_nvfuser_dynamo.py',
'test/test_nvfuser_frontend.py',
'test/test_openmp.py',
'test/test_optim.py',
'test/test_out_dtype_op.py',
'test/test_overrides.py',
'test/test_prims.py',
@ -1114,6 +1123,9 @@ exclude_patterns = [
'test/test_segment_reductions.py',
'test/test_serialization.py',
'test/test_set_default_mobile_cpu_allocator.py',
'test/test_shape_ops.py',
'test/test_show_pickle.py',
'test/test_sort_and_select.py',
'test/test_sparse.py',
'test/test_sparse_csr.py',
'test/test_sparse_semi_structured.py',
@ -1131,9 +1143,23 @@ exclude_patterns = [
'test/test_transformers.py',
'test/test_type_promotion.py',
'test/test_unary_ufuncs.py',
'test/test_utils.py',
'test/test_vulkan.py',
'test/test_xnnpack_integration.py',
'test/torch_np/numpy_test/**/*.py',
'test/typing/fail/bitwise_ops.py',
'test/typing/fail/creation_ops.py',
'test/typing/fail/random.py',
'test/typing/pass/creation_ops.py',
'test/typing/pass/math_ops.py',
'test/typing/reveal/module_list.py',
'test/typing/reveal/namedtuple.py',
'test/typing/reveal/opt_size.py',
'test/typing/reveal/size.py',
'test/typing/reveal/tensor_constructors.py',
'test/typing/reveal/tensor_copy.py',
'test/typing/reveal/tensor_sampling.py',
'test/typing/reveal/torch_optim.py',
'torch/_awaits/__init__.py',
'torch/_custom_op/__init__.py',
'torch/_custom_op/autograd.py',
@ -1532,6 +1558,28 @@ exclude_patterns = [
'torch/distributed/optim/post_localSGD_optimizer.py',
'torch/distributed/optim/utils.py',
'torch/distributed/optim/zero_redundancy_optimizer.py',
'torch/distributed/pipeline/__init__.py',
'torch/distributed/pipeline/sync/__init__.py',
'torch/distributed/pipeline/sync/_balance/__init__.py',
'torch/distributed/pipeline/sync/_balance/blockpartition.py',
'torch/distributed/pipeline/sync/_balance/profile.py',
'torch/distributed/pipeline/sync/batchnorm.py',
'torch/distributed/pipeline/sync/checkpoint.py',
'torch/distributed/pipeline/sync/copy.py',
'torch/distributed/pipeline/sync/dependency.py',
'torch/distributed/pipeline/sync/microbatch.py',
'torch/distributed/pipeline/sync/phony.py',
'torch/distributed/pipeline/sync/pipe.py',
'torch/distributed/pipeline/sync/pipeline.py',
'torch/distributed/pipeline/sync/skip/__init__.py',
'torch/distributed/pipeline/sync/skip/layout.py',
'torch/distributed/pipeline/sync/skip/namespace.py',
'torch/distributed/pipeline/sync/skip/portal.py',
'torch/distributed/pipeline/sync/skip/skippable.py',
'torch/distributed/pipeline/sync/skip/tracker.py',
'torch/distributed/pipeline/sync/stream.py',
'torch/distributed/pipeline/sync/utils.py',
'torch/distributed/pipeline/sync/worker.py',
'torch/distributed/remote_device.py',
'torch/distributed/rendezvous.py',
'torch/distributed/rpc/__init__.py',
@ -1556,6 +1604,7 @@ exclude_patterns = [
'torch/distributed/tensor/parallel/input_reshard.py',
'torch/distributed/tensor/parallel/multihead_attention_tp.py',
'torch/distributed/tensor/parallel/style.py',
'torch/distributed/utils.py',
'torch/fft/__init__.py',
'torch/func/__init__.py',
'torch/functional.py',
@ -1647,6 +1696,18 @@ exclude_patterns = [
'torch/hub.py',
'torch/library.py',
'torch/linalg/__init__.py',
# UFMT causes import cycle on masked
'torch/masked/__init__.py',
'torch/masked/_docs.py',
'torch/masked/_ops.py',
'torch/masked/maskedtensor/__init__.py',
'torch/masked/maskedtensor/_ops_refs.py',
'torch/masked/maskedtensor/binary.py',
'torch/masked/maskedtensor/core.py',
'torch/masked/maskedtensor/creation.py',
'torch/masked/maskedtensor/passthrough.py',
'torch/masked/maskedtensor/reductions.py',
'torch/masked/maskedtensor/unary.py',
'torch/monitor/__init__.py',
'torch/nested/__init__.py',
'torch/nn/__init__.py',
@ -1825,6 +1886,8 @@ exclude_patterns = [
'torch/testing/_internal/distributed/nn/__init__.py',
'torch/testing/_internal/distributed/nn/api/__init__.py',
'torch/testing/_internal/distributed/nn/api/remote_module_test.py',
'torch/testing/_internal/distributed/pipe_with_ddp_test.py',
'torch/testing/_internal/distributed/pipeline/__init__.py',
'torch/testing/_internal/distributed/rpc/__init__.py',
'torch/testing/_internal/distributed/rpc/dist_autograd_test.py',
'torch/testing/_internal/distributed/rpc/dist_optimizer_test.py',
@ -1867,6 +1930,8 @@ exclude_patterns = [
'torch/utils/_mode_utils.py',
'torch/utils/_python_dispatch.py',
'torch/utils/_stats.py',
'torch/utils/_sympy/__init__.py',
'torch/utils/_sympy/functions.py',
'torch/utils/_traceback.py',
'torch/utils/_zip.py',
'torch/utils/backcompat/__init__.py',
@ -2079,7 +2144,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'ruff==0.4.8',
'ruff==0.4.4',
]
is_formatter = true


@ -125,6 +125,10 @@ filegroup(
data = [":generate-code"],
)
exports_files(
srcs = ["aten/src/ATen/cpu/tbb/extra/version_string.ver.in"],
)
# ATen
filegroup(
name = "aten_base_cpp",
@ -271,6 +275,7 @@ header_template_rule(
"@AT_BUILD_WITH_LAPACK@": "1",
"@AT_PARALLEL_OPENMP@": "0",
"@AT_PARALLEL_NATIVE@": "1",
"@AT_PARALLEL_NATIVE_TBB@": "0",
"@AT_BLAS_F2C@": "0",
"@AT_BLAS_USE_CBLAS_DOT@": "1",
},
@ -354,9 +359,6 @@ cc_library(
":aten_src_ATen_config",
] + generated_cpu_cpp + aten_ufunc_generated_cpu_sources("aten/src/ATen/{}"),
copts = ATEN_COPTS,
linkopts = [
"-ldl",
],
data = if_cuda(
[":libcaffe2_nvrtc.so"],
[],
@ -454,7 +456,50 @@ CAFFE2_COPTS = COMMON_COPTS + [
filegroup(
name = "caffe2_core_srcs",
srcs = [
"caffe2/core/allocator.cc",
"caffe2/core/blob_serialization.cc",
"caffe2/core/blob_stats.cc",
"caffe2/core/common.cc",
"caffe2/core/context.cc",
"caffe2/core/context_base.cc",
"caffe2/core/db.cc",
"caffe2/core/event.cc",
"caffe2/core/export_c10_op_to_caffe2.cc",
"caffe2/core/graph.cc",
"caffe2/core/init.cc",
"caffe2/core/init_denormals.cc",
"caffe2/core/init_intrinsics_check.cc",
"caffe2/core/init_omp.cc",
"caffe2/core/int8_serialization.cc",
"caffe2/core/memonger.cc",
"caffe2/core/module.cc",
"caffe2/core/net.cc",
"caffe2/core/net_async_base.cc",
"caffe2/core/net_async_scheduling.cc",
"caffe2/core/net_async_task.cc",
"caffe2/core/net_async_task_future.cc",
"caffe2/core/net_async_task_graph.cc",
"caffe2/core/net_async_tracing.cc",
"caffe2/core/net_dag_utils.cc",
"caffe2/core/net_parallel.cc",
"caffe2/core/net_simple.cc",
"caffe2/core/net_simple_refcount.cc",
"caffe2/core/nomnigraph/Representations/NeuralNet.cc",
"caffe2/core/nomnigraph/tests/test_util.cc",
"caffe2/core/numa.cc",
"caffe2/core/operator.cc",
"caffe2/core/operator_schema.cc",
"caffe2/core/plan_executor.cc",
"caffe2/core/prof_dag_counters.cc",
"caffe2/core/qtensor.cc",
"caffe2/core/qtensor_serialization.cc",
"caffe2/core/stats.cc",
"caffe2/core/tensor.cc",
"caffe2/core/tensor_int8.cc",
"caffe2/core/test_utils.cc",
"caffe2/core/transform.cc",
"caffe2/core/types.cc",
"caffe2/core/workspace.cc",
],
)
@ -487,7 +532,19 @@ filegroup(
filegroup(
name = "caffe2_utils_srcs",
srcs = [
"caffe2/utils/bench_utils.cc",
"caffe2/utils/cpuid.cc",
"caffe2/utils/math/broadcast.cc",
"caffe2/utils/math/elementwise.cc",
"caffe2/utils/math/reduce.cc",
"caffe2/utils/math/transpose.cc",
"caffe2/utils/math/utils.cc",
"caffe2/utils/math_cpu.cc",
"caffe2/utils/murmur_hash3.cc",
"caffe2/utils/proto_utils.cc",
"caffe2/utils/proto_wrap.cc",
"caffe2/utils/signal_handler.cc",
"caffe2/utils/smart_tensor_printer.cc",
"caffe2/utils/string_utils.cc",
"caffe2/utils/threadpool/ThreadPool.cc",
"caffe2/utils/threadpool/pthreadpool.cc",
@ -505,9 +562,12 @@ cc_library(
name = "caffe2_for_aten_headers",
hdrs = [
"caffe2/core/common.h",
"caffe2/core/logging.h",
"caffe2/core/types.h",
"caffe2/perfkernels/common.h",
"caffe2/perfkernels/embedding_lookup.h",
"caffe2/perfkernels/embedding_lookup_idx.h",
"caffe2/utils/cpuid.h",
"caffe2/utils/fixed_divisor.h",
] + glob([
"caffe2/utils/threadpool/*.h",
@ -517,6 +577,7 @@ cc_library(
deps = [
":caffe2_core_macros",
"//c10",
"//caffe2/proto:caffe2_pb",
],
)
@ -524,9 +585,18 @@ cc_library(
name = "caffe2_headers",
hdrs = glob(
[
"caffe2/core/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Converters/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Generated/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Graph/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Representations/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Support/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Transformations/*.h",
"caffe2/core/nomnigraph/tests/*.h",
"caffe2/perfkernels/*.h",
"caffe2/serialize/*.h",
"caffe2/utils/*.h",
"caffe2/utils/math/*.h",
"caffe2/utils/threadpool/*.h",
"modules/**/*.h",
],
@ -535,12 +605,18 @@ cc_library(
],
) + if_cuda(glob([
"caffe2/**/*.cuh",
"caffe2/image/*.h",
])),
copts = CAFFE2_COPTS,
includes = [
"caffe2/core/nomnigraph/include",
],
visibility = ["//visibility:public"],
deps = [
":caffe2_core_macros",
":caffe2_for_aten_headers",
"//caffe2/proto:caffe2_pb",
"//caffe2/proto:cc_proto",
],
)
@ -561,6 +637,8 @@ cc_library(
":caffe2_perfkernels_avx",
":caffe2_perfkernels_avx2",
":caffe2_perfkernels_avx512",
"//caffe2/proto:caffe2_pb",
"//caffe2/proto:cc_proto",
"//third_party/miniz-2.1.0:miniz",
"@com_google_protobuf//:protobuf",
"@eigen",
@ -585,7 +663,6 @@ cu_library(
name = "torch_cuda",
srcs = [
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
],
copts = torch_cuda_half_options,
@ -694,7 +771,7 @@ cc_library(
[
"torch/*.h",
"torch/csrc/**/*.h",
"torch/csrc/distributed/c10d/**/*.hpp",
"torch/csrc/distributed/c10d/*.hpp",
"torch/lib/libshm/*.h",
],
exclude = [
@ -753,14 +830,10 @@ cc_library(
"torch/csrc/cuda/python_nccl.cpp",
"torch/csrc/cuda/nccl.cpp",
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
],
)) + torch_sources,
copts = TORCH_COPTS,
linkopts = [
"-lrt",
],
defines = [
"CAFFE2_NIGHTLY_VERSION=20200115",
],
@ -768,8 +841,8 @@ cc_library(
deps = [
":caffe2",
":torch_headers",
"//caffe2/proto:torch_cc_proto",
"@kineto",
"@cpp-httplib",
] + if_cuda([
"@cuda//:nvToolsExt",
"@cutlass",
@ -781,9 +854,6 @@ cc_library(
cc_library(
name = "shm",
srcs = glob(["torch/lib/libshm/*.cpp"]),
linkopts = [
"-lrt",
],
deps = [
":torch",
],

File diff suppressed because it is too large.


@ -1,4 +1,4 @@
![PyTorch Logo](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/pytorch-logo-dark.png)
![PyTorch Logo](https://github.com/pytorch/pytorch/blob/main/docs/source/_static/img/pytorch-logo-dark.png)
--------------------------------------------------------------------------------
@ -24,9 +24,6 @@ Our trunk health (Continuous Integration signals) can be found at [hud.pytorch.o
- [NVIDIA Jetson Platforms](#nvidia-jetson-platforms)
- [From Source](#from-source)
- [Prerequisites](#prerequisites)
- [NVIDIA CUDA Support](#nvidia-cuda-support)
- [AMD ROCm Support](#amd-rocm-support)
- [Intel GPU Support](#intel-gpu-support)
- [Install Dependencies](#install-dependencies)
- [Get the PyTorch Source](#get-the-pytorch-source)
- [Install PyTorch](#install-pytorch)
@ -98,7 +95,7 @@ from several research papers on this topic, as well as current and past work suc
While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
You get the best of speed and flexibility for your crazy research.
![Dynamic graph](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/dynamic_graph.gif)
![Dynamic graph](https://github.com/pytorch/pytorch/blob/main/docs/source/_static/img/dynamic_graph.gif)
### Python First
@ -165,7 +162,6 @@ If you are installing from source, you will need:
We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.
##### NVIDIA CUDA Support
If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following:
- [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above
@ -178,7 +174,6 @@ Other potentially useful environment variables may be found in `setup.py`.
If you are building for NVIDIA's Jetson platforms (Jetson Nano, TX1, TX2, AGX Xavier), instructions to install PyTorch for Jetson Nano are [available here](https://devtalk.nvidia.com/default/topic/1049071/jetson-nano/pytorch-for-jetson-nano/)
##### AMD ROCm Support
If you want to compile with ROCm support, install
- [AMD ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) 4.0 and above installation
- ROCm is currently supported only for Linux systems.
@ -186,14 +181,6 @@ If you want to compile with ROCm support, install
If you want to disable ROCm support, export the environment variable `USE_ROCM=0`.
Other potentially useful environment variables may be found in `setup.py`.
##### Intel GPU Support
If you want to compile with Intel GPU support, follow these
- [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) instructions.
- Intel GPU is supported for Linux and Windows.
If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
Other potentially useful environment variables may be found in `setup.py`.
#### Install Dependencies
**Common**
@ -209,11 +196,10 @@ pip install -r requirements.txt
```bash
conda install intel::mkl-static intel::mkl-include
# CUDA only: Add LAPACK support for the GPU if needed
conda install -c pytorch magma-cuda121 # or the magma-cuda* that matches your CUDA version from https://anaconda.org/pytorch/repo
conda install -c pytorch magma-cuda110 # or the magma-cuda* that matches your CUDA version from https://anaconda.org/pytorch/repo
# (optional) If using torch.compile with inductor/triton, install the matching version of triton
# Run from the pytorch directory after cloning
# For Intel GPU support, please explicitly `export USE_XPU=1` before running command.
make triton
```
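A quick smoke test that a Triton install plays well with `torch.compile`'s default inductor backend — a minimal sketch, assuming a working build (CPU works too; on GPU, inductor generates Triton kernels):

```python
import torch

@torch.compile  # uses the inductor backend by default
def f(x):
    return torch.sin(x) + torch.cos(x)

x = torch.randn(8)
print(torch.allclose(f(x), torch.sin(x) + torch.cos(x)))  # True
```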


@ -37,7 +37,6 @@
- [TL;DR](#tldr)
- [Accelerator Software](#accelerator-software)
- [Special support cases](#special-support-cases)
- [Operating Systems](#operating-systems)
- [Submitting Tutorials](#submitting-tutorials)
- [Special Topics](#special-topics)
- [Updating submodules for a release](#updating-submodules-for-a-release)
@ -427,15 +426,6 @@ the size restrictions for publishing on PyPI so the default version that is publ
These special support cases will be handled on a case by case basis and support may be continued if current PyTorch maintainers feel as though there may still be a
need to support these particular versions of software.
## Operating Systems
Supported OS flavors are summarized in the table below:
| Operating System family | Architecture | Notes |
| --- | --- | --- |
| Linux | aarch64, x86_64 | Wheels are manylinux2014 compatible, i.e. they should be runnable on any Linux system with glibc-2.17 or above. |
| MacOS | arm64 | Builds should be compatible with MacOS 11 (Big Sur) or newer, but are actively tested against MacOS 14 (Sonoma). |
| MacOS | x86_64 | Requires MacOS Catalina or above, not supported after 2.2, see https://github.com/pytorch/pytorch/issues/114602 |
| Windows | x86_64 | Builds are compatible with Windows 10 or newer. |
# Submitting Tutorials
Tutorials in support of a release feature must be submitted to the [pytorch/tutorials](https://github.com/pytorch/tutorials) repo at least two weeks before the release date to allow for editorial and technical review. There is no cherry-pick process for tutorials. All tutorials will be merged around the release day and published at [pytorch.org/tutorials](https://pytorch.org/tutorials/).


@ -5,7 +5,6 @@
- [Untrusted models](#untrusted-models)
- [Untrusted inputs](#untrusted-inputs)
- [Data privacy](#data-privacy)
- [Using distributed features](#using-distributed-features)
## Reporting Security Issues
@ -55,9 +54,3 @@ If applicable, prepare your model against bad inputs and prompt injections. Some
**Take special security measures if you train models with sensitive data**. Prioritize [sandboxing](https://developers.google.com/code-sandboxing) your models and:
- Do not feed sensitive data to an untrusted model (even if it runs in a sandboxed environment)
- If you consider publishing a model that was partially trained with sensitive data, be aware that data can potentially be recovered from the trained weights (especially if the model overfits).
### Using distributed features
PyTorch can be used for distributed computing, and as such there is a `torch.distributed` package. PyTorch Distributed features are intended for internal communication only. They are not built for use in untrusted environments or networks.
For performance reasons, none of the PyTorch Distributed primitives (including c10d, RPC, and TCPStore) include any authorization protocol and will send messages unencrypted. They accept connections from anywhere, and execute the workload sent without performing any checks. Therefore, if you run a PyTorch Distributed program on your network, anybody with access to the network can execute arbitrary code with the privileges of the user running PyTorch.
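A minimal Python sketch of why this matters, using the real `torch.distributed.TCPStore` API with an illustrative host and port: any client that can reach the store's port can read and write coordination state with no authentication.

```python
from datetime import timedelta

from torch.distributed import TCPStore

# "Server" side: binding to 0.0.0.0 exposes the store to the whole network.
server = TCPStore("0.0.0.0", 29500, world_size=2, is_master=True,
                  wait_for_workers=False, timeout=timedelta(seconds=30))

# Any "client" that can reach the port can mutate state, no credentials needed.
client = TCPStore("127.0.0.1", 29500, world_size=2, is_master=False,
                  timeout=timedelta(seconds=30))
client.set("job_state", "poisoned")   # no authorization check is performed
print(server.get("job_state"))        # b'poisoned'
```

Hence the guidance above: bind rendezvous endpoints to trusted interfaces and networks only.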


@ -168,10 +168,14 @@ new_local_repository(
path = "third_party/opentelemetry-cpp",
)
new_local_repository(
name = "cpp-httplib",
build_file = "//third_party:cpp-httplib.BUILD",
path = "third_party/cpp-httplib",
new_patched_local_repository(
name = "tbb",
build_file = "//third_party:tbb.BUILD",
patch_strip = 1,
patches = [
"@//third_party:tbb.patch",
],
path = "third_party/tbb",
)
new_local_repository(
@ -351,4 +355,9 @@ local_repository(
path = "third_party/onnx/third_party/benchmark",
)
local_repository(
name = "unused_onnx_tensorrt_benchmark",
path = "third_party/onnx-tensorrt/third_party/onnx/third_party/benchmark",
)
### Unused repos end


@ -1,6 +1,5 @@
from torchvision import models
import torch
from torchvision import models
print(torch.version.__version__)


@ -4,11 +4,10 @@ MobileNetV2 TorchScript model, and dumps root ops used by the model for custom
build script to create a tailored build which only contains these used ops.
"""
import torch
import yaml
from torchvision import models
import torch
# Download and trace the model.
model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)
model.eval()


@ -349,6 +349,16 @@ endif()
list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/..)
if(USE_TBB)
if(USE_SYSTEM_TBB)
message("ATen is compiled with system-provided Intel TBB.")
else()
message("ATen is compiled with Intel TBB (${TBB_ROOT_DIR}).")
endif()
list(APPEND ATen_CPU_INCLUDE ${TBB_INCLUDE_DIR})
list(APPEND ATen_CPU_DEPENDENCY_LIBS TBB::tbb)
endif()
if(BLAS_FOUND)
if($ENV{TH_BINARY_BUILD})
message(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.")
@ -386,7 +396,6 @@ if(UNIX AND NOT APPLE)
endif(UNIX AND NOT APPLE)
if(UNIX)
include(CheckFunctionExists)
set(CMAKE_EXTRA_INCLUDE_FILES "sys/mman.h")
CHECK_FUNCTION_EXISTS(mmap HAVE_MMAP)
if(HAVE_MMAP)


@ -81,8 +81,8 @@ inline uint64_t make64BitsFrom32Bits(uint32_t hi, uint32_t lo) {
CPUGeneratorImpl::CPUGeneratorImpl(uint64_t seed_in)
: c10::GeneratorImpl{Device(DeviceType::CPU), DispatchKeySet(c10::DispatchKey::CPU)},
engine_{seed_in},
next_float_normal_sample_{std::optional<float>()},
next_double_normal_sample_{std::optional<double>()} { }
next_float_normal_sample_{c10::optional<float>()},
next_double_normal_sample_{c10::optional<double>()} { }
/**
* Manually seeds the engine with the seed input
@ -151,8 +151,8 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
detail::check_rng_state(new_state);
at::mt19937 engine;
auto float_normal_sample = std::optional<float>();
auto double_normal_sample = std::optional<double>();
auto float_normal_sample = c10::optional<float>();
auto double_normal_sample = c10::optional<double>();
// Construct the state of at::CPUGeneratorImpl based on input byte tensor size.
CPUGeneratorImplStateLegacy* legacy_pod{nullptr};
@ -160,7 +160,7 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
if (new_state_size == size_legacy) {
legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data();
// Note that in CPUGeneratorImplStateLegacy, we didn't have float version
// of normal sample and hence we leave the std::optional<float> as is
// of normal sample and hence we leave the c10::optional<float> as is
// Update next_double_normal_sample.
// Note that CPUGeneratorImplStateLegacy stores two uniform values (normal_x, normal_y)
@ -171,14 +171,14 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
auto r = legacy_pod->normal_rho;
auto theta = 2.0 * c10::pi<double> * legacy_pod->normal_x;
// we return the sin version of the normal sample when in caching mode
double_normal_sample = std::optional<double>(r * ::sin(theta));
double_normal_sample = c10::optional<double>(r * ::sin(theta));
}
} else if (new_state_size == size_current) {
auto rng_state = (CPUGeneratorImplState*)new_state.data();
legacy_pod = &rng_state->legacy_pod;
// update next_float_normal_sample
if (rng_state->is_next_float_normal_sample_valid) {
float_normal_sample = std::optional<float>(rng_state->next_float_normal_sample);
float_normal_sample = c10::optional<float>(rng_state->next_float_normal_sample);
}
// Update next_double_normal_sample.
@ -186,7 +186,7 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) {
// and if it's valid in normal_is_valid. The redundant normal_x and normal_rho
// are squashed to 0.0.
if (legacy_pod->normal_is_valid) {
double_normal_sample = std::optional<double>(legacy_pod->normal_y);
double_normal_sample = c10::optional<double>(legacy_pod->normal_y);
}
} else {
AT_ERROR("Expected either a CPUGeneratorImplStateLegacy of size ", size_legacy,
@ -283,14 +283,14 @@ uint64_t CPUGeneratorImpl::random64() {
/**
* Get the cached normal random in float
*/
std::optional<float> CPUGeneratorImpl::next_float_normal_sample() {
c10::optional<float> CPUGeneratorImpl::next_float_normal_sample() {
return next_float_normal_sample_;
}
/**
* Get the cached normal random in double
*/
std::optional<double> CPUGeneratorImpl::next_double_normal_sample() {
c10::optional<double> CPUGeneratorImpl::next_double_normal_sample() {
return next_double_normal_sample_;
}
@ -299,7 +299,7 @@ std::optional<double> CPUGeneratorImpl::next_double_normal_sample() {
*
* See Note [Acquire lock when using random generators]
*/
void CPUGeneratorImpl::set_next_float_normal_sample(std::optional<float> randn) {
void CPUGeneratorImpl::set_next_float_normal_sample(c10::optional<float> randn) {
next_float_normal_sample_ = randn;
}
@ -308,7 +308,7 @@ void CPUGeneratorImpl::set_next_float_normal_sample(std::optional<float> randn)
*
* See Note [Acquire lock when using random generators]
*/
void CPUGeneratorImpl::set_next_double_normal_sample(std::optional<double> randn) {
void CPUGeneratorImpl::set_next_double_normal_sample(c10::optional<double> randn) {
next_double_normal_sample_ = randn;
}
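The cached `next_double_normal_sample` restored above is the second half of a Box–Muller pair: each pair of uniforms yields two normal samples, one is returned and the other (the "sin version", as the `set_state` comment puts it) is cached. A rough Python sketch of that caching scheme, not the exact mt19937 pipeline PyTorch uses:

```python
import math
import random

class CachedNormalGenerator:
    def __init__(self, seed=0):
        self._engine = random.Random(seed)
        self._cached = None  # plays the role of next_double_normal_sample_

    def normal(self):
        # Serve the cached sample first, mirroring the generator state above.
        if self._cached is not None:
            out, self._cached = self._cached, None
            return out
        u1 = 1.0 - self._engine.random()  # shift away from 0 to keep log finite
        u2 = self._engine.random()
        r = math.sqrt(-2.0 * math.log(u1))
        theta = 2.0 * math.pi * u2
        self._cached = r * math.sin(theta)  # the cached "sin version"
        return r * math.cos(theta)          # returned immediately

g = CachedNormalGenerator(seed=42)
print([round(g.normal(), 4) for _ in range(4)])
```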


@ -24,18 +24,18 @@ struct TORCH_API CPUGeneratorImpl : public c10::GeneratorImpl {
static c10::DeviceType device_type();
uint32_t random();
uint64_t random64();
std::optional<float> next_float_normal_sample();
std::optional<double> next_double_normal_sample();
void set_next_float_normal_sample(std::optional<float> randn);
void set_next_double_normal_sample(std::optional<double> randn);
c10::optional<float> next_float_normal_sample();
c10::optional<double> next_double_normal_sample();
void set_next_float_normal_sample(c10::optional<float> randn);
void set_next_double_normal_sample(c10::optional<double> randn);
at::mt19937 engine();
void set_engine(at::mt19937 engine);
private:
CPUGeneratorImpl* clone_impl() const override;
at::mt19937 engine_;
std::optional<float> next_float_normal_sample_;
std::optional<double> next_double_normal_sample_;
c10::optional<float> next_float_normal_sample_;
c10::optional<double> next_double_normal_sample_;
};
namespace detail {


@ -17,5 +17,6 @@
#define AT_BUILD_WITH_LAPACK() @AT_BUILD_WITH_LAPACK@
#define AT_PARALLEL_OPENMP @AT_PARALLEL_OPENMP@
#define AT_PARALLEL_NATIVE @AT_PARALLEL_NATIVE@
#define AT_PARALLEL_NATIVE_TBB @AT_PARALLEL_NATIVE_TBB@
#define AT_BLAS_F2C() @AT_BLAS_F2C@
#define AT_BLAS_USE_CBLAS_DOT() @AT_BLAS_USE_CBLAS_DOT@
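These `AT_PARALLEL_*` macros select ATen's intra-op parallel backend at build time (the `AT_PARALLEL_NATIVE_TBB` line is the TBB variant touched by this diff). Which backend a given build chose can be inspected at runtime:

```python
import torch

# Prints the selected ATen parallel backend (OpenMP or the native thread
# pool) along with thread counts and related build settings.
print(torch.__config__.parallel_info())
```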


@ -59,7 +59,7 @@ class TORCH_API Context {
}
}
const AcceleratorHooksInterface& getAcceleratorHooksInterface(
std::optional<c10::DeviceType> opt_device_type = c10::nullopt) {
c10::optional<c10::DeviceType> opt_device_type = c10::nullopt) {
c10::DeviceType device_type = opt_device_type.has_value()
? opt_device_type.value()
: at::getAccelerator(true).value();
@ -395,7 +395,7 @@ class TORCH_API Context {
bool release_original_weights = false;
#endif
bool display_vmap_fallback_warnings_ = false;
std::optional<at::QEngine> quantized_engine = c10::nullopt;
c10::optional<at::QEngine> quantized_engine = c10::nullopt;
bool enable_sparse_tensor_invariant_checks = false;
bool allow_fp16_reduction_cpu = false;
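The `quantized_engine` member above backs the backend switch exposed in Python; which engines are listed depends on how PyTorch was built:

```python
import torch

# Inspect and select the quantized backend held by Context::quantized_engine.
print(torch.backends.quantized.supported_engines)   # e.g. ['none', 'fbgemm']
torch.backends.quantized.engine = torch.backends.quantized.supported_engines[-1]
print(torch.backends.quantized.engine)
```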


@ -15,7 +15,7 @@ namespace at {
// OptionalDeviceGuard guard(device_of(tensor));
/// Return the Device of a Tensor, if the Tensor is defined.
inline std::optional<Device> device_of(const Tensor& t) {
inline c10::optional<Device> device_of(const Tensor& t) {
if (t.defined()) {
return c10::make_optional(t.device());
} else {
@ -23,14 +23,14 @@ inline std::optional<Device> device_of(const Tensor& t) {
}
}
inline std::optional<Device> device_of(const std::optional<Tensor>& t) {
inline c10::optional<Device> device_of(const c10::optional<Tensor>& t) {
return t.has_value() ? device_of(t.value()) : c10::nullopt;
}
/// Return the Device of a TensorList, if the list is non-empty and
/// the first Tensor is defined. (This function implicitly assumes
/// that all tensors in the list have the same device.)
inline std::optional<Device> device_of(ITensorListRef t) {
inline c10::optional<Device> device_of(ITensorListRef t) {
if (!t.empty()) {
return device_of(t.front());
} else {


@ -163,7 +163,7 @@ TensorBase _empty_generic(
c10::Allocator* allocator,
c10::DispatchKeySet ks,
ScalarType scalar_type,
std::optional<c10::MemoryFormat> memory_format_opt) {
c10::optional<c10::MemoryFormat> memory_format_opt) {
at::detail::check_size_nonnegative(size);
at::detail::raise_warning_for_complex_half(scalar_type);
caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type);
@ -197,7 +197,7 @@ TensorBase empty_generic(
c10::Allocator* allocator,
c10::DispatchKeySet ks,
ScalarType scalar_type,
std::optional<c10::MemoryFormat> memory_format_opt) {
c10::optional<c10::MemoryFormat> memory_format_opt) {
return _empty_generic(size, allocator, ks, scalar_type, memory_format_opt);
}
@ -206,7 +206,7 @@ TensorBase empty_generic_symint(
c10::Allocator* allocator,
c10::DispatchKeySet ks,
ScalarType scalar_type,
std::optional<c10::MemoryFormat> memory_format_opt) {
c10::optional<c10::MemoryFormat> memory_format_opt) {
return _empty_generic(size, allocator, ks, scalar_type, memory_format_opt);
}
@ -252,7 +252,7 @@ TensorBase empty_strided_symint_generic(
}
TensorBase empty_cpu(IntArrayRef size, ScalarType dtype, bool pin_memory,
std::optional<c10::MemoryFormat> memory_format_opt) {
c10::optional<c10::MemoryFormat> memory_format_opt) {
auto allocator = GetCPUAllocatorMaybePinned(pin_memory);
constexpr c10::DispatchKeySet cpu_ks(c10::DispatchKey::CPU);
return empty_generic(size, allocator, cpu_ks, dtype, memory_format_opt);
@ -260,11 +260,11 @@ TensorBase empty_cpu(IntArrayRef size, ScalarType dtype, bool pin_memory,
TensorBase empty_cpu(
IntArrayRef size,
std::optional<ScalarType> dtype_opt,
std::optional<Layout> layout_opt,
std::optional<Device> device_opt,
std::optional<bool> pin_memory_opt,
std::optional<c10::MemoryFormat> memory_format_opt) {
c10::optional<ScalarType> dtype_opt,
c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt,
c10::optional<bool> pin_memory_opt,
c10::optional<c10::MemoryFormat> memory_format_opt) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::CPU);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
@ -295,10 +295,10 @@ TensorBase empty_strided_cpu(IntArrayRef size, IntArrayRef stride,
TensorBase empty_strided_cpu(
IntArrayRef size,
IntArrayRef stride,
std::optional<ScalarType> dtype_opt,
std::optional<Layout> layout_opt,
std::optional<Device> device_opt,
std::optional<bool> pin_memory_opt) {
c10::optional<ScalarType> dtype_opt,
c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt,
c10::optional<bool> pin_memory_opt) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::CPU);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
@ -342,7 +342,7 @@ static MetaAllocator g_meta_alloc;
REGISTER_ALLOCATOR(kMeta, &g_meta_alloc);
TensorBase empty_meta(IntArrayRef size, ScalarType dtype,
std::optional<c10::MemoryFormat> memory_format_opt) {
c10::optional<c10::MemoryFormat> memory_format_opt) {
auto *allocator = GetAllocator(kMeta);
constexpr c10::DispatchKeySet meta_dks(c10::DispatchKey::Meta);
return at::detail::empty_generic(
@ -351,11 +351,11 @@ TensorBase empty_meta(IntArrayRef size, ScalarType dtype,
TensorBase empty_meta(
IntArrayRef size,
std::optional<ScalarType> dtype_opt,
std::optional<Layout> layout_opt,
std::optional<Device> device_opt,
std::optional<bool> pin_memory_opt,
std::optional<c10::MemoryFormat> memory_format_opt
c10::optional<ScalarType> dtype_opt,
c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt,
c10::optional<bool> pin_memory_opt,
c10::optional<c10::MemoryFormat> memory_format_opt
) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta);
// NB: because there is no SparseMeta (yet), non-strided layout is
@ -371,11 +371,11 @@ TensorBase empty_meta(
TensorBase empty_symint_meta(
SymIntArrayRef size,
std::optional<ScalarType> dtype_opt,
std::optional<Layout> layout_opt,
std::optional<Device> device_opt,
std::optional<bool> pin_memory_opt,
std::optional<c10::MemoryFormat> memory_format_opt
c10::optional<ScalarType> dtype_opt,
c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt,
c10::optional<bool> pin_memory_opt,
c10::optional<c10::MemoryFormat> memory_format_opt
) {
auto *allocator = GetAllocator(kMeta);
constexpr c10::DispatchKeySet ks(c10::DispatchKey::Meta);
@ -405,10 +405,10 @@ TensorBase empty_strided_meta(IntArrayRef size, IntArrayRef stride,
TensorBase empty_strided_meta(
IntArrayRef size,
IntArrayRef stride,
std::optional<ScalarType> dtype_opt,
std::optional<Layout> layout_opt,
std::optional<Device> device_opt,
std::optional<bool> pin_memory_opt) {
c10::optional<ScalarType> dtype_opt,
c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt,
c10::optional<bool> pin_memory_opt) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);
@ -440,10 +440,10 @@ TensorBase empty_strided_symint_meta(SymIntArrayRef size, SymIntArrayRef stride,
TensorBase empty_strided_symint_meta(
SymIntArrayRef size,
SymIntArrayRef stride,
std::optional<ScalarType> dtype_opt,
std::optional<Layout> layout_opt,
std::optional<Device> device_opt,
std::optional<bool> pin_memory_opt) {
c10::optional<ScalarType> dtype_opt,
c10::optional<Layout> layout_opt,
c10::optional<Device> device_opt,
c10::optional<bool> pin_memory_opt) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(device_or_default(device_opt).type() == DeviceType::Meta);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(layout_or_default(layout_opt) == Layout::Strided);

Some files were not shown because too many files have changed in this diff.