[pipelining][doc] Remove duplicated words (#128368 )

"for execution" is used in both step titles Pull Request resolved: https://github.com/pytorch/pytorch/pull/128368 Approved by: https://github.com/wconstab ghstack dependencies: #128361
Support aten operations with out tensor (#124926 )
2025-10-22 22:25:10 +08:00 · 2024-06-11 04:52:57 +00:00 · 2024-06-11 04:35:27 +00:00 · 2024-06-11 04:27:09 +00:00 · 2024-06-11 03:12:11 +00:00 · 2024-06-11 03:02:51 +00:00
4630 changed files with 95885 additions and 233318 deletions
--- a/.ci/docker/aotriton_version.txt
+++ b/.ci/docker/aotriton_version.txt
@ -0,0 +1,5 @@
+0.6b
+manylinux_2_17
+rocm6
+04b5df8c8123f90cba3ede7e971e6fbc6040d506
+3db6ecbc915893ff967abd6e1b43bd5f54949868873be60dc802086c3863e648
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -84,16 +84,16 @@ fi
 # CMake 3.18 is needed to support CUDA17 language variant
 CMAKE_VERSION=3.18.5

-_UCX_COMMIT=00bcc6bb18fc282eb160623b4c0d300147f579af
-_UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea
+_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
+_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b

 # It's annoying to rename jobs every time you want to rewrite a
 # configuration, so we hardcode everything here rather than do it
 # from scratch
 case "$image" in
-  pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
-    CUDA_VERSION=12.1.1
-    CUDNN_VERSION=8
+  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
+    CUDA_VERSION=12.4.0
+    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
@ -105,9 +105,23 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks)
+  pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
    CUDA_VERSION=12.1.1
-    CUDNN_VERSION=8
+    CUDNN_VERSION=9
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    ;;
+  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.4.0
+    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
@ -120,9 +134,54 @@ case "$image" in
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9)
+  pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.1.1
+    CUDNN_VERSION=9
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
+  pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.1.1
+    CUDNN_VERSION=9
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
+  pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.4.0
+    CUDNN_VERSION=9
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
+  pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
    CUDA_VERSION=11.8.0
-    CUDNN_VERSION=8
+    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
@ -134,9 +193,37 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
+  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
+    CUDA_VERSION=12.4.0
+    CUDNN_VERSION=9
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    ;;
+  pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
    CUDA_VERSION=12.1.1
-    CUDNN_VERSION=8
+    CUDNN_VERSION=9
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    ;;
+  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
+    CUDA_VERSION=12.4.0
+    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
@ -226,7 +313,7 @@ case "$image" in
    PROTOBUF=yes
    DB=yes
    VISION=yes
-    BASEKIT_VERSION=2024.0.0-49522
+    XPU_VERSION=0.5
    NINJA_VERSION=1.9.0
    CONDA_CMAKE=yes
    TRITON=yes
@ -243,10 +330,10 @@ case "$image" in
    DOCS=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12)
+  pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12)
    ANACONDA_PYTHON_VERSION=3.8
    CUDA_VERSION=11.8
-    CUDNN_VERSION=8
+    CUDNN_VERSION=9
    CLANG_VERSION=12
    PROTOBUF=yes
    DB=yes
@ -293,7 +380,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    CONDA_CMAKE=yes
    ;;
-  pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter)
+  pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
    ANACONDA_PYTHON_VERSION=3.9
    CUDA_VERSION=11.8
    CONDA_CMAKE=yes
@ -360,7 +447,7 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
 #when using cudnn version 8 install it separately from cuda
 if [[ "$image" == *cuda*  && ${OS} == "ubuntu" ]]; then
  IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
-  if [[ ${CUDNN_VERSION} == 8 ]]; then
+  if [[ ${CUDNN_VERSION} == 9 ]]; then
    IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
  fi
 fi
@ -403,7 +490,7 @@ docker build \
       --build-arg "DOCS=${DOCS}" \
       --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
       --build-arg "EXECUTORCH=${EXECUTORCH}" \
-       --build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \
+       --build-arg "XPU_VERSION=${XPU_VERSION}" \
       --build-arg "ACL=${ACL:-}" \
       --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
       --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
@ -412,7 +499,7 @@ docker build \
       "$@" \
       .

-# NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
+# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
 # for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
 # find the correct image. As a result, here we have to replace the
 #   "$UBUNTU_VERSION" == "18.04-rc"
--- a/.ci/docker/centos-rocm/Dockerfile
+++ b/.ci/docker/centos-rocm/Dockerfile
@ -62,7 +62,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV and ffmpeg
+# (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -77,6 +77,9 @@ RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh
 RUN rm install_rocm_magma.sh
+COPY ./common/install_amdsmi.sh install_amdsmi.sh
+RUN bash ./install_amdsmi.sh
+RUN rm install_amdsmi.sh
 ENV PATH /opt/rocm/bin:$PATH
 ENV PATH /opt/rocm/hcc/bin:$PATH
 ENV PATH /opt/rocm/hip/bin:$PATH
@ -110,6 +113,13 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt

+# Install AOTriton (Early fail)
+COPY ./aotriton_version.txt aotriton_version.txt
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./common/install_aotriton.sh install_aotriton.sh
+RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
+ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
+
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
--- a/.ci/docker/ci_commit_pins/triton-rocm.txt
+++ b/.ci/docker/ci_commit_pins/triton-rocm.txt
@ -1 +1 @@
-bbe6246e37d8aa791c67daaf9d9d61b26c9ccfdc
+01cbe5045a6898c9a925f01435c8277b2fe6afcc
--- a/.ci/docker/common/install_acl.sh
+++ b/.ci/docker/common/install_acl.sh
@ -1,6 +1,6 @@
 set -euo pipefail

-readonly version=v23.08
+readonly version=v24.04
 readonly src_host=https://review.mlplatform.org/ml
 readonly src_repo=ComputeLibrary

--- a/.ci/docker/common/install_amdsmi.sh
+++ b/.ci/docker/common/install_amdsmi.sh
@ -0,0 +1,5 @@
+#!/bin/bash
+
+set -ex
+
+cd /opt/rocm/share/amd_smi && pip install .
--- a/.ci/docker/common/install_aotriton.sh
+++ b/.ci/docker/common/install_aotriton.sh
@ -0,0 +1,23 @@
+#!/bin/bash
+
+set -ex
+
+source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
+
+TARBALL='aotriton.tar.bz2'
+# This read command alwasy returns with exit code 1
+read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
+ARCH=$(uname -m)
+AOTRITON_INSTALL_PREFIX="$1"
+AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}.tar.bz2"
+
+cd "${AOTRITON_INSTALL_PREFIX}"
+# Must use -L to follow redirects
+curl -L --retry 3 -o "${TARBALL}" "${AOTRITON_URL}"
+ACTUAL_SHA256=$(sha256sum "${TARBALL}" | cut -d " " -f 1)
+if [ "${SHA256}" != "${ACTUAL_SHA256}" ]; then
+  echo -n "Error: The SHA256 of downloaded tarball is ${ACTUAL_SHA256},"
+  echo " which does not match the expected value ${SHA256}."
+  exit
+fi
+tar xf "${TARBALL}" && rm -rf "${TARBALL}"
--- a/.ci/docker/common/install_base.sh
+++ b/.ci/docker/common/install_base.sh
@ -3,7 +3,7 @@
 set -ex

 install_ubuntu() {
-  # NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
+  # NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
  # for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
  # find the correct image. As a result, here we have to check for
  #   "$UBUNTU_VERSION" == "18.04"*
--- a/.ci/docker/common/install_cudnn.sh
+++ b/.ci/docker/common/install_cudnn.sh
@ -1,20 +1,18 @@
 #!/bin/bash

-if [[ ${CUDNN_VERSION} == 8 ]]; then
+if [[ -n "${CUDNN_VERSION}" ]]; then
    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
    mkdir tmp_cudnn
    pushd tmp_cudnn
-    if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
-        CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive"
-        curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
-    elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
-        CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
-        curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz
+    if [[ ${CUDA_VERSION:0:2} == "12" ]]; then
+        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"
+    elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
+        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
    else
        print "Unsupported CUDA version ${CUDA_VERSION}"
        exit 1
    fi
-
+    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
    tar xf ${CUDNN_NAME}.tar.xz
    cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
    cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/
--- a/.ci/docker/common/install_cusparselt.sh
+++ b/.ci/docker/common/install_cusparselt.sh
@ -5,9 +5,14 @@ set -ex
 # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
 mkdir tmp_cusparselt && cd tmp_cusparselt

-if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
-    CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.5.2.1-archive"
-    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
+if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then
+    arch_path='sbsa'
+    export TARGETARCH=${TARGETARCH:-$(uname -m)}
+    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
+        arch_path='x86_64'
+    fi
+    CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.5.2.1-archive"
+    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
 elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
    CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -30,10 +30,10 @@ pip_install \

 pip_install coloredlogs packaging

-pip_install onnxruntime==1.17.0
-pip_install onnx==1.15.0
+pip_install onnxruntime==1.18
+pip_install onnx==1.16.0
 # pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps
-pip_install onnxscript==0.1.0.dev20240315 --no-deps
+pip_install onnxscript==0.1.0.dev20240523 --no-deps

 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -6,9 +6,6 @@ ver() {
    printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
 }

-# Map ROCm version to AMDGPU version
-declare -A AMDGPU_VERSIONS=( ["5.0"]="21.50" ["5.1.1"]="22.10.1" ["5.2"]="22.20" )
-
 install_ubuntu() {
    apt-get update
    if [[ $UBUNTU_VERSION == 18.04 ]]; then
@ -26,31 +23,14 @@ install_ubuntu() {
    apt-get install -y libc++1
    apt-get install -y libc++abi1

-    if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
-        # Add amdgpu repository
-        UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
-        local amdgpu_baseurl
-        if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
-          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
-        else
-          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/ubuntu"
-        fi
-        echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
-    fi
-
-    ROCM_REPO="ubuntu"
-    if [[ $(ver $ROCM_VERSION) -lt $(ver 4.2) ]]; then
-        ROCM_REPO="xenial"
-    fi
-
-    if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
-        ROCM_REPO="${UBUNTU_VERSION_NAME}"
-    fi
+    # Add amdgpu repository
+    UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
+    echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list

    # Add rocm repository
    wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
    local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
-    echo "deb [arch=amd64] ${rocm_baseurl} ${ROCM_REPO} main" > /etc/apt/sources.list.d/rocm.list
+    echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/rocm.list
    apt-get update --allow-insecure-repositories

    DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
@ -59,7 +39,8 @@ install_ubuntu() {
                   rocm-libs \
                   rccl \
                   rocprofiler-dev \
-                   roctracer-dev
+                   roctracer-dev \
+                   amd-smi-lib

    if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then
        DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
@ -68,29 +49,18 @@ install_ubuntu() {
    # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
    # search for all unversioned packages
    # if search fails it will abort this script; use true to avoid case where search fails
-    if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
-        MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
-        if [[ "x${MIOPENHIPGFX}" = x ]]; then
-          echo "miopen-hip-gfx package not available" && exit 1
-        else
-          DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
-        fi
+    MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
+    if [[ "x${MIOPENHIPGFX}" = x ]]; then
+      echo "miopen-hip-gfx package not available" && exit 1
    else
-        MIOPENKERNELS=$(apt-cache search --names-only miopenkernels | awk '{print $1}' | grep -F -v . || true)
-        if [[ "x${MIOPENKERNELS}" = x ]]; then
-          echo "miopenkernels package not available" && exit 1
-        else
-          DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENKERNELS}
-        fi
+      DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
    fi

    # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
-    if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
-        for kdb in /opt/rocm/share/miopen/db/*.kdb
-        do
-            sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
-        done
-    fi
+    for kdb in /opt/rocm/share/miopen/db/*.kdb
+    do
+        sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
+    done

    # Cleanup
    apt-get autoclean && apt-get clean
@ -107,25 +77,19 @@ install_centos() {
  yum install -y epel-release
  yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r`

-  if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
-      # Add amdgpu repository
-      local amdgpu_baseurl
-      if [[ $OS_VERSION == 9 ]]; then
-          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/9.0/main/x86_64"
-      else
-        if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
-          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
-        else
-          amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/7.9/main/x86_64"
-        fi
-      fi
-      echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
-      echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
-      echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
-      echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
-      echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
-      echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo
+  # Add amdgpu repository
+  local amdgpu_baseurl
+  if [[ $OS_VERSION == 9 ]]; then
+      amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.0/main/x86_64"
+  else
+      amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
  fi
+  echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
+  echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
+  echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
+  echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
+  echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
+  echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo

  local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}"
  echo "[ROCm]" > /etc/yum.repos.d/rocm.repo
@ -143,33 +107,23 @@ install_centos() {
                   rocm-libs \
                   rccl \
                   rocprofiler-dev \
-                   roctracer-dev
+                   roctracer-dev \
+                   amd-smi-lib

  # precompiled miopen kernels; search for all unversioned packages
  # if search fails it will abort this script; use true to avoid case where search fails
-  if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
-      MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
-      if [[ "x${MIOPENHIPGFX}" = x ]]; then
-        echo "miopen-hip-gfx package not available" && exit 1
-      else
-        yum install -y ${MIOPENHIPGFX}
-      fi
+  MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
+  if [[ "x${MIOPENHIPGFX}" = x ]]; then
+    echo "miopen-hip-gfx package not available" && exit 1
  else
-      MIOPENKERNELS=$(yum -q search miopenkernels | grep miopenkernels- | awk '{print $1}'| grep -F kdb. || true)
-      if [[ "x${MIOPENKERNELS}" = x ]]; then
-        echo "miopenkernels package not available" && exit 1
-      else
-        yum install -y ${MIOPENKERNELS}
-      fi
+    yum install -y ${MIOPENHIPGFX}
  fi

  # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
-  if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
-      for kdb in /opt/rocm/share/miopen/db/*.kdb
-      do
-          sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
-      done
-  fi
+  for kdb in /opt/rocm/share/miopen/db/*.kdb
+  do
+      sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
+  done

  # Cleanup
  yum clean all
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -15,7 +15,7 @@ conda_reinstall() {
 if [ -n "${ROCM_VERSION}" ]; then
  TRITON_REPO="https://github.com/openai/triton"
  TRITON_TEXT_FILE="triton-rocm"
-elif [ -n "${BASEKIT_VERSION}" ]; then
+elif [ -n "${XPU_VERSION}" ]; then
  TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
  TRITON_TEXT_FILE="triton-xpu"
 else
--- a/.ci/docker/common/install_vision.sh
+++ b/.ci/docker/common/install_vision.sh
@ -5,8 +5,7 @@ set -ex
 install_ubuntu() {
  apt-get update
  apt-get install -y --no-install-recommends \
-          libopencv-dev \
-          libavcodec-dev
+          libopencv-dev

  # Cleanup
  apt-get autoclean && apt-get clean
@ -19,8 +18,7 @@ install_centos() {
  yum --enablerepo=extras install -y epel-release

  yum install -y \
-      opencv-devel \
-      ffmpeg-devel
+      opencv-devel

  # Cleanup
  yum clean all
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -3,10 +3,7 @@ set -xe


 # Intel® software for general purpose GPU capabilities.
-# Refer to https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html
-
-# Intel® oneAPI Base Toolkit (version 2024.0.0) has been updated to include functional and security updates.
-# Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html
+# Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html

 # Users should update to the latest version as it becomes available

@ -17,14 +14,16 @@ function install_ubuntu() {
    # Set up the repository. To do this, download the key to the system keyring
    wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
        | gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
-    wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
-        | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+    wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
+        | gpg --dearmor --output /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg

    # Add the signed entry to APT sources and configure the APT client to use the Intel repository
-    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
+    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \
+        https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
        | tee /etc/apt/sources.list.d/intel-gpu-jammy.list
-    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
-        | tee /etc/apt/sources.list.d/oneAPI.list
+    echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
+        https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
+        | tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list

    # Update the packages list and repository index
    apt-get update
@ -40,11 +39,11 @@ function install_ubuntu() {
        mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
    # Development Packages
    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
-    # Install Intel® oneAPI Base Toolkit
-    if [ -n "$BASEKIT_VERSION" ]; then
-        apt-get install intel-basekit=$BASEKIT_VERSION -y
+    # Install Intel Support Packages
+    if [ -n "$XPU_VERSION" ]; then
+        apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION}
    else
-        apt-get install intel-basekit -y
+        apt-get install -y intel-for-pytorch-gpu-dev
    fi

    # Cleanup
--- a/.ci/docker/ubuntu-cuda/Dockerfile
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@ -56,7 +56,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV and ffmpeg
+# (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -139,7 +139,7 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
 ARG CUDNN_VERSION
 ARG CUDA_VERSION
 COPY ./common/install_cudnn.sh install_cudnn.sh
-RUN if [ "${CUDNN_VERSION}" -eq 8 ]; then bash install_cudnn.sh; fi
+RUN if [ -n "${CUDNN_VERSION}" ]; then bash install_cudnn.sh; fi
 RUN rm install_cudnn.sh

 # Install CUSPARSELT
@ -152,6 +152,7 @@ RUN rm install_cusparselt.sh
 RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
 RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
 RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi
+RUN if [ -h /usr/local/cuda-12.4/cuda-12.4 ]; then rm /usr/local/cuda-12.4/cuda-12.4; fi

 USER jenkins
 CMD ["bash"]
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@ -53,7 +53,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV and ffmpeg
+# (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -78,6 +78,11 @@ ENV MAGMA_HOME /opt/rocm/magma
 ENV LANG C.UTF-8
 ENV LC_ALL C.UTF-8

+# Install amdsmi
+COPY ./common/install_amdsmi.sh install_amdsmi.sh
+RUN bash ./install_amdsmi.sh
+RUN rm install_amdsmi.sh
+
 # (optional) Install non-default CMake version
 ARG CMAKE_VERSION
 COPY ./common/install_cmake.sh install_cmake.sh
@ -100,6 +105,13 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt

+# Install AOTriton
+COPY ./aotriton_version.txt aotriton_version.txt
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./common/install_aotriton.sh install_aotriton.sh
+RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
+ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
+
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@ -62,7 +62,7 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

 # Install XPU Dependencies
-ARG BASEKIT_VERSION
+ARG XPU_VERSION
 COPY ./common/install_xpu.sh install_xpu.sh
 RUN bash ./install_xpu.sh && rm install_xpu.sh

@ -83,7 +83,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV and ffmpeg
+# (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -80,7 +80,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
 RUN rm install_db.sh
 ENV INSTALLED_DB ${DB}

-# (optional) Install vision packages like OpenCV and ffmpeg
+# (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -44,15 +44,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
  fi
 fi

-if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* ]]; then
-  echo "Caffe2 build is ON"
-  export BUILD_CAFFE2=ON
-fi
-
-if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
-  export ATEN_THREADING=TBB
-  export USE_TBB=1
-elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
+if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
  export ATEN_THREADING=NATIVE
 fi

@ -294,6 +286,9 @@ else
      fi
      WERROR=1 python setup.py bdist_wheel
    else
+      if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
+        source .ci/pytorch/install_cache_xla.sh
+      fi
      python setup.py bdist_wheel
    fi
    pip_install_whl "$(echo dist/*.whl)"
@ -335,7 +330,7 @@ else
    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
    mkdir -p "$CUSTOM_OP_BUILD"
    pushd "$CUSTOM_OP_BUILD"
-    cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
+    cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
@ -348,7 +343,7 @@ else
    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
    mkdir -p "$JIT_HOOK_BUILD"
    pushd "$JIT_HOOK_BUILD"
-    cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
+    cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
@ -360,7 +355,7 @@ else
    python --version
    mkdir -p "$CUSTOM_BACKEND_BUILD"
    pushd "$CUSTOM_BACKEND_BUILD"
-    cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
+    cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
--- a/.ci/pytorch/docs-test.sh
+++ b/.ci/pytorch/docs-test.sh
@ -6,4 +6,4 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
 echo "Testing pytorch docs"

 cd docs
-make doctest
+TERM=vt100 make doctest
--- a/.ci/pytorch/install_cache_xla.sh
+++ b/.ci/pytorch/install_cache_xla.sh
@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Script for installing sccache on the xla build job, which uses xla's docker
+# image and doesn't have sccache installed on it.  This is mostly copied from
+# .ci/docker/install_cache.sh.  Changes are: removing checks that will always
+# return the same thing, ex checks for for rocm, CUDA, and changing the path
+# where sccache is installed, and not changing /etc/environment.
+
+set -ex
+
+install_binary() {
+  echo "Downloading sccache binary from S3 repo"
+  curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache
+}
+
+mkdir -p /tmp/cache/bin
+mkdir -p /tmp/cache/lib
+export PATH="/tmp/cache/bin:$PATH"
+
+install_binary
+chmod a+x /tmp/cache/bin/sccache
+
+function write_sccache_stub() {
+  # Unset LD_PRELOAD for ps because of asan + ps issues
+  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
+  # shellcheck disable=SC2086
+  # shellcheck disable=SC2059
+  printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n  exec sccache $(which $1) \"\$@\"\nelse\n  exec $(which $1) \"\$@\"\nfi" > "/tmp/cache/bin/$1"
+  chmod a+x "/tmp/cache/bin/$1"
+}
+
+write_sccache_stub cc
+write_sccache_stub c++
+write_sccache_stub gcc
+write_sccache_stub g++
+write_sccache_stub clang
+write_sccache_stub clang++
--- a/.ci/pytorch/multigpu-test.sh
+++ b/.ci/pytorch/multigpu-test.sh
@ -18,6 +18,7 @@ time python test/run_test.py --verbose -i distributed/test_c10d_gloo
 time python test/run_test.py --verbose -i distributed/test_c10d_nccl
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
+time python test/run_test.py --verbose -i distributed/test_cuda_p2p
 time python test/run_test.py --verbose -i distributed/test_store
 time python test/run_test.py --verbose -i distributed/test_pg_wrapper
 time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent
@ -50,6 +51,9 @@ time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_ra
 # FSDP2 tests
 time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh

+# Pipelining composability tests
+time python test/run_test.py --verbose -i distributed/pipelining/test_composability.py
+
 # Other tests
 time python test/run_test.py --verbose -i test_cuda_primary_ctx
 time python test/run_test.py --verbose -i test_optim -- -k test_forloop_goes_right_direction_multigpu
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -264,6 +264,18 @@ elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then
  export ATEN_CPU_CAPABILITY=avx2
 fi

+# temp workarounds for https://github.com/pytorch/pytorch/issues/126692, remove when fixed
+if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
+  pushd test
+  CUDA_VERSION=$(python -c "import torch; print(torch.version.cuda)")
+  if [ "$CUDA_VERSION" == "12.4" ]; then
+    ISCUDA124="cu124"
+  else
+    ISCUDA124=""
+  fi
+  popd
+fi
+
 test_python_legacy_jit() {
  time python test/run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose
  assert_git_not_dirty
@ -326,6 +338,7 @@ test_inductor_distributed() {
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
  python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose

  # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
@ -350,10 +363,20 @@ test_inductor() {

 test_inductor_cpp_wrapper_abi_compatible() {
  export TORCHINDUCTOR_ABI_COMPATIBLE=1
+  TEST_REPORTS_DIR=$(pwd)/test/test-reports
+  mkdir -p "$TEST_REPORTS_DIR"
+
  echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
  # cpu stack allocation causes segfault and needs more investigation
-  TORCHINDUCTOR_STACK_ALLOCATION=0 python test/run_test.py --include inductor/test_cpu_cpp_wrapper
+  PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper
  python test/run_test.py --include inductor/test_cuda_cpp_wrapper
+
+  TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
+    --training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
+    --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
+  python benchmarks/dynamo/check_accuracy.py \
+    --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
+    --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_timm_training.csv"
 }

 # "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@ -515,16 +538,16 @@ test_single_dynamo_benchmark() {
      --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
    python benchmarks/dynamo/check_accuracy.py \
      --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
+      --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/${TEST_CONFIG}_${name}.csv"
    python benchmarks/dynamo/check_graph_breaks.py \
      --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
+      --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/${TEST_CONFIG}_${name}.csv"
  fi
 }

 test_inductor_micro_benchmark() {
-  TEST_REPORTS_DIR=$(pwd)/test/test-micro-reports
-  python benchmarks/gpt_fast/benchmark.py
+  TEST_REPORTS_DIR=$(pwd)/test/test-reports
+  python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
 }

 test_dynamo_benchmark() {
@ -542,7 +565,11 @@ test_dynamo_benchmark() {
    test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
  else
    if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
-      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
+      if [[ "${TEST_CONFIG}" == *freezing* ]]; then
+        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 --freezing "$@"
+      else
+        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
+      fi
    elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
    else
@ -556,12 +583,16 @@ test_inductor_torchbench_smoketest_perf() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

-  # smoke test the cpp_wrapper mode
-  TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy --bfloat16 \
-    --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv"
+  # Test some models in the cpp wrapper mode
+  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
+    --bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
+  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
+    --bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
+  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
+    --bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
  python benchmarks/dynamo/check_accuracy.py \
-      --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
+    --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
+    --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_torchbench_inference.csv"

  python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
    --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
@ -576,7 +607,13 @@ test_inductor_torchbench_smoketest_perf() {
  # https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314,
  # and thus we lower its threshold to reduce flakiness. If this continues to be a problem,
  # we switch to use some other model.
-  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.9
+  # Use 4.7 for cuda 12.4, change back to 4.9 after fixing https://github.com/pytorch/pytorch/issues/126692
+  if [ "$CUDA_VERSION" == "12.4" ]; then
+    THRESHOLD=4.7
+  else
+    THRESHOLD=4.9
+  fi
+  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t $THRESHOLD

  # Check memory compression ratio for a few models
  for test in hf_Albert timm_vision_transformer; do
@ -588,6 +625,15 @@ test_inductor_torchbench_smoketest_perf() {
      "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \
      --expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
  done
+
+  # Perform some "warm-start" runs for a few huggingface models.
+  for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
+    python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
+      --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
+    python benchmarks/dynamo/check_accuracy.py \
+      --actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \
+      --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_huggingface_training.csv"
+  done
 }

 test_inductor_torchbench_cpu_smoketest_perf(){
@ -671,7 +717,6 @@ test_aten() {
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libmkldnn* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libnccl* "$TEST_BASE_DIR"
  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libtorch* "$TEST_BASE_DIR"
-  ${SUDO} ln -sf "$TORCH_LIB_DIR"/libtbb* "$TEST_BASE_DIR"

  ls "$TEST_BASE_DIR"
  aten/tools/run_tests.sh "$TEST_BASE_DIR"
@ -696,21 +741,6 @@ test_without_numpy() {
  popd
 }

-# pytorch extensions require including torch/extension.h which includes all.h
-# which includes utils.h which includes Parallel.h.
-# So you can call for instance parallel_for() from your extension,
-# but the compilation will fail because of Parallel.h has only declarations
-# and definitions are conditionally included Parallel.h(see last lines of Parallel.h).
-# I tried to solve it #39612 and #39881 by including Config.h into Parallel.h
-# But if Pytorch is built with TBB it provides Config.h
-# that has AT_PARALLEL_NATIVE_TBB=1(see #3961 or #39881) and it means that if you include
-# torch/extension.h which transitively includes Parallel.h
-# which transitively includes tbb.h which is not available!
-if [[ "${BUILD_ENVIRONMENT}" == *tbb* ]]; then
-  sudo mkdir -p /usr/include/tbb
-  sudo cp -r "$PWD"/third_party/tbb/include/tbb/* /usr/include/tbb
-fi
-
 test_libtorch() {
  local SHARD="$1"

@ -724,7 +754,6 @@ test_libtorch() {
    ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
-    ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
    ln -sf "$TORCH_LIB_DIR"/libnvfuser* "$TORCH_BIN_DIR"

    export CPP_TESTS_DIR="${TORCH_BIN_DIR}"
@ -861,7 +890,6 @@ test_rpc() {
  # test reporting process to function as expected.
  ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
  ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
-  ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"

  CPP_TESTS_DIR="${TORCH_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_cpp_rpc
 }
@ -1269,6 +1297,10 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHAR
 elif [[ "${TEST_CONFIG}" == *dynamo* && $SHARD_NUMBER -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
  install_torchvision
  test_dynamo_shard "${SHARD_NUMBER}"
+elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
+  install_torchvision
+  test_python_shard "$SHARD_NUMBER"
+  test_aten
 elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
  test_without_numpy
  install_torchvision
@ -1298,10 +1330,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
  test_libtorch
 elif [[ "${TEST_CONFIG}" = docs_test ]]; then
  test_docs_test
-elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
-  install_torchvision
-  test_python
-  test_aten
 elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
  install_torchvision
  test_python
--- a/.circleci/scripts/binary_linux_test.sh
+++ b/.circleci/scripts/binary_linux_test.sh
@ -96,8 +96,13 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
    conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
  )
 elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
-  pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
-  retry pip install -q numpy protobuf typing-extensions
+  if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
+    pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
+    retry pip install -q numpy protobuf typing-extensions
+  else
+    pip install "\$pkg"
+    retry pip install -q numpy protobuf typing-extensions
+  fi
 fi
 if [[ "$PACKAGE_TYPE" == libtorch ]]; then
  pkg="\$(ls /final_pkgs/*-latest.zip)"
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@ -76,8 +76,8 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)

 # Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
 if [[ "$PACKAGE_TYPE" =~ .*wheel.* &&  -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
-  # Only linux Python < 3.12 are supported wheels for triton
-  TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.12'"
+  # Only linux Python < 3.13 are supported wheels for triton
+  TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
  TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
  if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
      TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
--- a/.clang-tidy
+++ b/.clang-tidy
@ -61,6 +61,7 @@ readability-simplify-subscript-expr,
 readability-string-compare,
 '
 HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
-AnalyzeTemporaryDtors: false
 WarningsAsErrors: '*'
+CheckOptions:
+  misc-header-include-cycle.IgnoredFilesList: 'format.h;ivalue.h;custom_class.h;Dict.h;List.h'
 ...
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -1,9 +1,12 @@
 self-hosted-runner:
  labels:
+    # GitHub hosted x86 Linux runners
    - linux.20_04.4x
    - linux.20_04.16x
-    - linux.large
+    # Repo-specific LF hosted ARC runners
    - linux.large.arc
+    # Organization-wide AWS Linux Runners
+    - linux.large
    - linux.2xlarge
    - linux.4xlarge
    - linux.12xlarge
@ -13,17 +16,34 @@ self-hosted-runner:
    - linux.8xlarge.nvidia.gpu
    - linux.16xlarge.nvidia.gpu
    - linux.g5.4xlarge.nvidia.gpu
+    # Organization-wide AWS Linux Runners on Linux Foundation account
+    - lf.linux.large
+    - lf.linux.2xlarge
+    - lf.linux.4xlarge
+    - lf.linux.12xlarge
+    - lf.linux.24xlarge
+    - lf.linux.arm64.2xlarge
+    - lf.linux.4xlarge.nvidia.gpu
+    - lf.linux.8xlarge.nvidia.gpu
+    - lf.linux.16xlarge.nvidia.gpu
+    - lf.linux.g5.4xlarge.nvidia.gpu
+    # Repo-specific IBM hosted S390x runner
+    - linux.s390x
+    # Organization wide AWS Windows runners
    - windows.4xlarge.nonephemeral
    - windows.8xlarge.nvidia.gpu
    - windows.8xlarge.nvidia.gpu.nonephemeral
    - windows.g5.4xlarge.nvidia.gpu
-    - bm-runner
+    # Organization-wide AMD hosted MI300 runners
    - linux.rocm.gpu
+    # Repo-specific Apple hosted  runners
+    - macos-m1-ultra
+    - macos-m2-14
+    # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
    - macos-m1-stable
    - macos-m1-13
    - macos-m1-14
-    - macos-12-xl
-    - macos-12
-    - macos12.3-m1
+    # GitHub-hosted MacOS runners
    - macos-latest-xlarge
    - macos-13-xlarge
+    - macos-14-xlarge
--- a/.github/actions/filter-test-configs/action.yml
+++ b/.github/actions/filter-test-configs/action.yml
@ -66,7 +66,8 @@ runs:
        command: |
          set -eux
          # PyYAML 6.0 doesn't work with MacOS x86 anymore
-          python3 -m pip install requests==2.26.0 pyyaml==6.0.1
+          # This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2
+          python3 -m pip install requests==2.27.1 pyyaml==6.0.1

    - name: Parse ref
      id: parse-ref
--- a/.github/actions/test-pytorch-binary/action.yml
+++ b/.github/actions/test-pytorch-binary/action.yml
@ -35,7 +35,7 @@ runs:
          "${DOCKER_IMAGE}"
        )

-        if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" ]]; then
+        if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" ]]; then
          # Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
          grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
        fi
@ -44,3 +44,12 @@ runs:
        # Generate test script
        docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh"
        docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"
+
+    - name: Cleanup docker
+      if: always() && env.BUILD_ENVIRONMENT == 'linux-s390x-binary-manywheel'
+      shell: bash
+      run: |
+        # on s390x stop the container for clean worker stop
+        # ignore expansion of "docker ps -q" since it could be empty
+        # shellcheck disable=SC2046
+        docker stop $(docker ps -q) || true
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-ea437b31ce316ea3d66fe73768c0dcb94edb79ad
+b829e936f7cc61b48149f5f957a451a38bf2a178
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-e3fc03314dab5f44e3ed9ccbba6c15fbca3285cd
+6f0b61e5d782913a0fc7743812f2a8e522189111
--- a/.github/lf-canary-scale-config.yml
+++ b/.github/lf-canary-scale-config.yml
@ -0,0 +1,154 @@
+# Defines runner types that will be provisioned by by LF Self-hosted
+# runners for pytorch/pytorch-canary and their labels.
+#
+# Runners listed here will be available as self hosted runners.
+# Configuration is directly pulled from the main branch.
+#
+# Default values:
+#
+# runner_types:
+#   runner_label: # label to specify in the Github Actions workflow
+#     instance_type: m4.large
+#     os: linux
+#     max_available: 20
+#     disk_size: 50
+#     is_ephemeral: true
+
+runner_types:
+  lf.c.linux.12xlarge:
+    disk_size: 200
+    instance_type: c5.12xlarge
+    is_ephemeral: false
+    max_available: 1000
+    os: linux
+  lf.c.linux.24xl.spr-metal:
+    disk_size: 200
+    instance_type: c7i.metal-24xl
+    is_ephemeral: false
+    max_available: 30
+    os: linux
+  lf.c.linux.16xlarge.spr:
+    disk_size: 200
+    instance_type: c7i.16xlarge
+    is_ephemeral: false
+    max_available: 30
+    os: linux
+  lf.c.linux.12xlarge.ephemeral:
+    disk_size: 200
+    instance_type: c5.12xlarge
+    is_ephemeral: true
+    max_available: 300
+    os: linux
+  lf.c.linux.16xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.16xlarge
+    is_ephemeral: false
+    max_available: 30
+    os: linux
+  lf.c.linux.24xlarge:
+    disk_size: 150
+    instance_type: c5.24xlarge
+    is_ephemeral: false
+    max_available: 250
+    os: linux
+  lf.c.linux.2xlarge:
+    disk_size: 150
+    instance_type: c5.2xlarge
+    is_ephemeral: false
+    max_available: 3120
+    os: linux
+  lf.c.linux.4xlarge:
+    disk_size: 150
+    instance_type: c5.4xlarge
+    is_ephemeral: false
+    max_available: 1000
+    os: linux
+  lf.c.linux.4xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.4xlarge
+    is_ephemeral: false
+    max_available: 520
+    os: linux
+  lf.c.linux.8xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.8xlarge
+    is_ephemeral: false
+    max_available: 400
+    os: linux
+  lf.c.linux.g4dn.12xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g4dn.12xlarge
+    is_ephemeral: false
+    max_available: 50
+    os: linux
+  lf.c.linux.g4dn.metal.nvidia.gpu:
+    disk_size: 150
+    instance_type: g4dn.metal
+    is_ephemeral: false
+    max_available: 30
+    os: linux
+  lf.c.linux.g5.48xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.48xlarge
+    is_ephemeral: false
+    max_available: 20
+    os: linux
+  lf.c.linux.g5.12xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.12xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+  lf.c.linux.g5.4xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.4xlarge
+    is_ephemeral: false
+    max_available: 1200
+    os: linux
+  lf.c.linux.large:
+    disk_size: 15
+    instance_type: c5.large
+    is_ephemeral: false
+    os: linux
+  lf.c.linux.arm64.2xlarge:
+    disk_size: 256
+    instance_type: t4g.2xlarge
+    is_ephemeral: false
+    max_available: 200
+    os: linux
+  lf.c.linux.arm64.m7g.2xlarge:
+    disk_size: 256
+    instance_type: m7g.2xlarge
+    is_ephemeral: false
+    max_available: 20
+    os: linux
+  lf.c.windows.4xlarge:
+    disk_size: 256
+    instance_type: c5d.4xlarge
+    is_ephemeral: true
+    max_available: 420
+    os: windows
+  lf.c.windows.4xlarge.nonephemeral:
+    disk_size: 256
+    instance_type: c5d.4xlarge
+    is_ephemeral: false
+    max_available: 420
+    os: windows
+  lf.c.windows.8xlarge.nvidia.gpu:
+    disk_size: 256
+    instance_type: p3.2xlarge
+    is_ephemeral: true
+    max_available: 150
+    os: windows
+  lf.c.windows.8xlarge.nvidia.gpu.nonephemeral:
+    disk_size: 256
+    instance_type: p3.2xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: windows
+  lf.c.windows.g5.4xlarge.nvidia.gpu:
+    disk_size: 256
+    instance_type: g5.4xlarge
+    is_ephemeral: false
+    max_available: 250
+    os: windows
--- a/.github/lf-scale-config.yml
+++ b/.github/lf-scale-config.yml
@ -0,0 +1,154 @@
+# Defines runner types that will be provisioned by by LF Self-hosted
+# runners for pytorch/pytorch and their labels.
+#
+# Runners listed here will be available as self hosted runners.
+# Configuration is directly pulled from the main branch.
+#
+# Default values:
+#
+# runner_types:
+#   runner_label: # label to specify in the Github Actions workflow
+#     instance_type: m4.large
+#     os: linux
+#     max_available: 20
+#     disk_size: 50
+#     is_ephemeral: true
+
+runner_types:
+  lf.linux.12xlarge:
+    disk_size: 200
+    instance_type: c5.12xlarge
+    is_ephemeral: false
+    max_available: 1000
+    os: linux
+  lf.linux.24xl.spr-metal:
+    disk_size: 200
+    instance_type: c7i.metal-24xl
+    is_ephemeral: false
+    max_available: 30
+    os: linux
+  lf.linux.16xlarge.spr:
+    disk_size: 200
+    instance_type: c7i.16xlarge
+    is_ephemeral: false
+    max_available: 30
+    os: linux
+  lf.linux.12xlarge.ephemeral:
+    disk_size: 200
+    instance_type: c5.12xlarge
+    is_ephemeral: true
+    max_available: 300
+    os: linux
+  lf.linux.16xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.16xlarge
+    is_ephemeral: false
+    max_available: 30
+    os: linux
+  lf.linux.24xlarge:
+    disk_size: 150
+    instance_type: c5.24xlarge
+    is_ephemeral: false
+    max_available: 250
+    os: linux
+  lf.linux.2xlarge:
+    disk_size: 150
+    instance_type: c5.2xlarge
+    is_ephemeral: false
+    max_available: 3120
+    os: linux
+  lf.linux.4xlarge:
+    disk_size: 150
+    instance_type: c5.4xlarge
+    is_ephemeral: false
+    max_available: 1000
+    os: linux
+  lf.linux.4xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.4xlarge
+    is_ephemeral: false
+    max_available: 520
+    os: linux
+  lf.linux.8xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.8xlarge
+    is_ephemeral: false
+    max_available: 400
+    os: linux
+  lf.linux.g4dn.12xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g4dn.12xlarge
+    is_ephemeral: false
+    max_available: 50
+    os: linux
+  lf.linux.g4dn.metal.nvidia.gpu:
+    disk_size: 150
+    instance_type: g4dn.metal
+    is_ephemeral: false
+    max_available: 30
+    os: linux
+  lf.linux.g5.48xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.48xlarge
+    is_ephemeral: false
+    max_available: 20
+    os: linux
+  lf.linux.g5.12xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.12xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+  lf.linux.g5.4xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.4xlarge
+    is_ephemeral: false
+    max_available: 1200
+    os: linux
+  lf.linux.large:
+    disk_size: 15
+    instance_type: c5.large
+    is_ephemeral: false
+    os: linux
+  lf.linux.arm64.2xlarge:
+    disk_size: 256
+    instance_type: t4g.2xlarge
+    is_ephemeral: false
+    max_available: 200
+    os: linux
+  lf.linux.arm64.m7g.2xlarge:
+    disk_size: 256
+    instance_type: m7g.2xlarge
+    is_ephemeral: false
+    max_available: 20
+    os: linux
+  lf.windows.4xlarge:
+    disk_size: 256
+    instance_type: c5d.4xlarge
+    is_ephemeral: true
+    max_available: 420
+    os: windows
+  lf.windows.4xlarge.nonephemeral:
+    disk_size: 256
+    instance_type: c5d.4xlarge
+    is_ephemeral: false
+    max_available: 420
+    os: windows
+  lf.windows.8xlarge.nvidia.gpu:
+    disk_size: 256
+    instance_type: p3.2xlarge
+    is_ephemeral: true
+    max_available: 150
+    os: windows
+  lf.windows.8xlarge.nvidia.gpu.nonephemeral:
+    disk_size: 256
+    instance_type: p3.2xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: windows
+  lf.windows.g5.4xlarge.nvidia.gpu:
+    disk_size: 256
+    instance_type: g5.4xlarge
+    is_ephemeral: false
+    max_available: 250
+    os: windows
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@ -245,6 +245,7 @@
  - torch/xpu/**
  - test/xpu/**
  - third_party/xpu.txt
+  - .ci/docker/ci_commit_pins/triton-xpu.txt
  approved_by:
  - EikanWang
  - jgong5
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -1,6 +1,5 @@
 tracking_issue: 24422
 ciflow_tracking_issue: 64124
-TD_rollout_issue: 123120
 ciflow_push_tags:
 - ciflow/binaries
 - ciflow/binaries_conda
@ -9,6 +8,7 @@ ciflow_push_tags:
 - ciflow/inductor
 - ciflow/inductor-perf-compare
 - ciflow/inductor-micro-benchmark
+- ciflow/inductor-cu124
 - ciflow/linux-aarch64
 - ciflow/mps
 - ciflow/nightly
@ -20,7 +20,6 @@ ciflow_push_tags:
 - ciflow/xpu
 - ciflow/torchbench
 retryable_workflows:
- lint
 - pull
 - trunk
 - linux-binary
--- a/.github/requirements-gha-cache.txt
+++ b/.github/requirements-gha-cache.txt
@ -10,6 +10,6 @@ lintrunner==0.10.7
 ninja==1.10.0.post1
 nvidia-ml-py==11.525.84
 pyyaml==6.0
-requests==2.31.0
+requests==2.32.2
 rich==10.9.0
 rockset==1.0.3
--- a/.github/requirements/conda-env-Linux-X64.txt
+++ b/.github/requirements/conda-env-Linux-X64.txt
@ -4,6 +4,5 @@ mkl-include=2022.1.0
 ninja=1.10.2
 numpy=1.23.3
 pyyaml=6.0
-requests=2.31.0
 setuptools=68.2.2
-typing-extensions=4.3.0
+typing-extensions=4.9.0
--- a/.github/requirements/conda-env-iOS.txt
+++ b/.github/requirements/conda-env-iOS.txt
@ -3,6 +3,5 @@ cmake=3.22.1
 ninja=1.10.2
 numpy=1.23.3
 pyyaml=6.0
-requests=2.31.0
 setuptools=68.2.2
-typing-extensions=4.3.0
+typing-extensions=4.9.0
--- a/.github/requirements/conda-env-macOS-ARM64
+++ b/.github/requirements/conda-env-macOS-ARM64
@ -2,7 +2,7 @@ numpy=1.22.3
 pyyaml=6.0
 setuptools=61.2.0
 cmake=3.22.*
-typing-extensions=4.3.0
+typing-extensions=4.9.0
 dataclasses=0.8
 pip=22.2.2
 pillow=10.0.1
--- a/.github/requirements/conda-env-macOS-X64
+++ b/.github/requirements/conda-env-macOS-X64
@ -4,7 +4,7 @@ numpy=1.21.2
 pyyaml=5.3
 setuptools=46.0.0
 cmake=3.22.*
-typing-extensions=4.3.0
+typing-extensions=4.9.0
 dataclasses=0.8
 pip=22.2.2
 pillow=10.0.1
--- a/.github/scripts/delete_old_branches.py
+++ b/.github/scripts/delete_old_branches.py
@ -2,6 +2,7 @@
 import os
 import re
 from datetime import datetime
+from functools import lru_cache
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Set

@ -187,6 +188,17 @@ def get_recent_prs() -> Dict[str, Any]:
    return prs_by_branch_base


+@lru_cache(maxsize=1)
+def get_open_prs() -> List[Dict[str, Any]]:
+    return paginate_graphql(
+        GRAPHQL_OPEN_PRS,
+        {"owner": "pytorch", "repo": "pytorch"},
+        lambda data: False,
+        lambda res: res["data"]["repository"]["pullRequests"]["nodes"],
+        lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"],
+    )
+
+
 def get_branches_with_magic_label_or_open_pr() -> Set[str]:
    pr_infos: List[Dict[str, Any]] = paginate_graphql(
        GRAPHQL_NO_DELETE_BRANCH_LABEL,
@ -196,15 +208,7 @@ def get_branches_with_magic_label_or_open_pr() -> Set[str]:
        lambda res: res["data"]["repository"]["label"]["pullRequests"]["pageInfo"],
    )

-    pr_infos.extend(
-        paginate_graphql(
-            GRAPHQL_OPEN_PRS,
-            {"owner": "pytorch", "repo": "pytorch"},
-            lambda data: False,
-            lambda res: res["data"]["repository"]["pullRequests"]["nodes"],
-            lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"],
-        )
-    )
+    pr_infos.extend(get_open_prs())

    # Get the most recent PR for each branch base (group gh together)
    branch_bases = set()
@ -270,5 +274,41 @@ def delete_branches() -> None:
        delete_branch(git_repo, branch)


+def delete_old_ciflow_tags() -> None:
+    # Deletes ciflow tags if they are associated with a closed PR or a specific
+    # commit.  Lightweight tags don't have information about the date they were
+    # created, so we can't check how old they are.  The script just assumes that
+    # ciflow tags should be deleted regardless of creation date.
+    git_repo = GitRepo(str(REPO_ROOT), "origin", debug=True)
+
+    def delete_tag(tag: str) -> None:
+        print(f"Deleting tag {tag}")
+        ESTIMATED_TOKENS[0] += 1
+        delete_branch(git_repo, f"refs/tags/{tag}")
+
+    tags = git_repo._run_git("tag").splitlines()
+    open_pr_numbers = [x["number"] for x in get_open_prs()]
+
+    for tag in tags:
+        try:
+            if ESTIMATED_TOKENS[0] > 400:
+                print("Estimated tokens exceeded, exiting")
+                break
+            if not tag.startswith("ciflow/"):
+                continue
+            re_match_pr = re.match(r"^ciflow\/.*\/(\d{5,6})$", tag)
+            re_match_sha = re.match(r"^ciflow\/.*\/([0-9a-f]{40})$", tag)
+            if re_match_pr:
+                pr_number = int(re_match_pr.group(1))
+                if pr_number in open_pr_numbers:
+                    continue
+                delete_tag(tag)
+            elif re_match_sha:
+                delete_tag(tag)
+        except Exception as e:
+            print(f"Failed to check tag {tag}: {e}")
+
+
 if __name__ == "__main__":
    delete_branches()
+    delete_old_ciflow_tags()
--- a/.github/scripts/docathon-label-sync.py
+++ b/.github/scripts/docathon-label-sync.py
@ -0,0 +1,52 @@
+import os
+import re
+import sys
+
+from github import Github
+
+
+def main() -> None:
+    token = os.environ.get("GITHUB_TOKEN")
+
+    repo_owner = "pytorch"
+    repo_name = "pytorch"
+    pull_request_number = int(sys.argv[1])
+
+    g = Github(token)
+    repo = g.get_repo(f"{repo_owner}/{repo_name}")
+    pull_request = repo.get_pull(pull_request_number)
+    pull_request_body = pull_request.body
+    # PR without description
+    if pull_request_body is None:
+        return
+
+    # get issue number from the PR body
+    if not re.search(r"#\d{1,6}", pull_request_body):
+        print("The pull request does not mention an issue.")
+        return
+    issue_number = int(re.findall(r"#(\d{1,6})", pull_request_body)[0])
+    issue = repo.get_issue(issue_number)
+    issue_labels = issue.labels
+    docathon_label_present = any(
+        label.name == "docathon-h1-2024" for label in issue_labels
+    )
+
+    # if the issue has a docathon label, add all labels from the issue to the PR.
+    if not docathon_label_present:
+        print("The 'docathon-h1-2024' label is not present in the issue.")
+        return
+    pull_request_labels = pull_request.get_labels()
+    pull_request_label_names = [label.name for label in pull_request_labels]
+    issue_label_names = [label.name for label in issue_labels]
+    labels_to_add = [
+        label for label in issue_label_names if label not in pull_request_label_names
+    ]
+    if not labels_to_add:
+        print("The pull request already has the same labels.")
+        return
+    pull_request.add_to_labels(*labels_to_add)
+    print("Labels added to the pull request!")
+
+
+if __name__ == "__main__":
+    main()
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -19,7 +19,7 @@ CUDA_ARCHES = ["11.8", "12.1", "12.4"]
 CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.0"}


-CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8", "12.4": "8"}
+CUDA_ARCHES_CUDNN_VERSION = {"11.8": "9", "12.1": "9", "12.4": "9"}


 ROCM_ARCHES = ["6.0", "6.1"]
@ -31,12 +31,18 @@ CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
 CPU_AARCH64_ARCH = ["cpu-aarch64"]


+CPU_S390X_ARCH = ["cpu-s390x"]
+
+
+CUDA_AARCH64_ARCH = ["cuda-aarch64"]
+
+
 PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
    "11.8": (
        "nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "  # noqa: B950
        "nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@ -49,7 +55,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "  # noqa: B950
        "nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@ -62,7 +68,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@ -130,6 +136,10 @@ def arch_type(arch_version: str) -> str:
        return "cpu-cxx11-abi"
    elif arch_version in CPU_AARCH64_ARCH:
        return "cpu-aarch64"
+    elif arch_version in CPU_S390X_ARCH:
+        return "cpu-s390x"
+    elif arch_version in CUDA_AARCH64_ARCH:
+        return "cuda-aarch64"
    else:  # arch_version should always be "cpu" in this case
        return "cpu"

@ -149,6 +159,8 @@ WHEEL_CONTAINER_IMAGES = {
    "cpu": f"pytorch/manylinux-builder:cpu-{DEFAULT_TAG}",
    "cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
    "cpu-aarch64": f"pytorch/manylinuxaarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
+    "cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}",
+    "cuda-aarch64": f"pytorch/manylinuxaarch64-builder:cuda12.4-{DEFAULT_TAG}",
 }

 CONDA_CONTAINER_IMAGES = {
@ -205,7 +217,9 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
        "cpu": "cpu",
        "cpu-aarch64": "cpu",
        "cpu-cxx11-abi": "cpu-cxx11-abi",
+        "cpu-s390x": "cpu",
        "cuda": f"cu{gpu_arch_version.replace('.', '')}",
+        "cuda-aarch64": "cu124",
        "rocm": f"rocm{gpu_arch_version}",
    }.get(gpu_arch_type, gpu_arch_version)

@ -286,11 +300,11 @@ def generate_libtorch_matrix(
                    "libtorch_variant": libtorch_variant,
                    "libtorch_config": abi_version if os == "windows" else "",
                    "devtoolset": abi_version if os != "windows" else "",
-                    "container_image": LIBTORCH_CONTAINER_IMAGES[
-                        (arch_version, abi_version)
-                    ]
-                    if os != "windows"
-                    else "",
+                    "container_image": (
+                        LIBTORCH_CONTAINER_IMAGES[(arch_version, abi_version)]
+                        if os != "windows"
+                        else ""
+                    ),
                    "package_type": "libtorch",
                    "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{abi_version}".replace(
                        ".", "_"
@ -306,8 +320,8 @@ def generate_wheels_matrix(
    python_versions: Optional[List[str]] = None,
 ) -> List[Dict[str, str]]:
    package_type = "wheel"
-    if os == "linux" or os == "linux-aarch64":
-        # NOTE: We only build manywheel packages for x86_64 and aarch64 linux
+    if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":
+        # NOTE: We only build manywheel packages for x86_64 and aarch64 and s390x linux
        package_type = "manywheel"

    if python_versions is None:
@ -323,22 +337,36 @@ def generate_wheels_matrix(
        elif os == "linux-aarch64":
            # Only want the one arch as the CPU type is different and
            # uses different build/test scripts
-            arches = ["cpu-aarch64"]
+            arches = ["cpu-aarch64", "cuda-aarch64"]
+        elif os == "linux-s390x":
+            # Only want the one arch as the CPU type is different and
+            # uses different build/test scripts
+            arches = ["cpu-s390x"]

    ret: List[Dict[str, str]] = []
    for python_version in python_versions:
        for arch_version in arches:
            gpu_arch_type = arch_type(arch_version)
+            # Disable py3.12 builds for ROCm because of triton dependency
+            # on llnl-hatchet, which doesn't have py3.12 wheels available
+            if gpu_arch_type == "rocm" and python_version == "3.12":
+                continue
            gpu_arch_version = (
                ""
                if arch_version == "cpu"
                or arch_version == "cpu-cxx11-abi"
                or arch_version == "cpu-aarch64"
+                or arch_version == "cpu-s390x"
+                or arch_version == "cuda-aarch64"
                else arch_version
            )

            # 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
-            if arch_version in ["12.4", "12.1", "11.8"] and os == "linux":
+            if (
+                arch_version in ["12.4", "12.1", "11.8"]
+                and os == "linux"
+                or arch_version == "cuda-aarch64"
+            ):
                ret.append(
                    {
                        "python_version": python_version,
@ -347,10 +375,16 @@ def generate_wheels_matrix(
                        "desired_cuda": translate_desired_cuda(
                            gpu_arch_type, gpu_arch_version
                        ),
-                        "devtoolset": "",
+                        "devtoolset": (
+                            "cxx11-abi" if arch_version == "cuda-aarch64" else ""
+                        ),
                        "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                        "package_type": package_type,
-                        "pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version],  # fmt: skip
+                        "pytorch_extra_install_requirements": (
+                            PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]  # fmt: skip
+                            if os != "linux-aarch64"
+                            else ""
+                        ),
                        "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(  # noqa: B950
                            ".", "_"
                        ),
@ -365,17 +399,19 @@ def generate_wheels_matrix(
                        "desired_cuda": translate_desired_cuda(
                            gpu_arch_type, gpu_arch_version
                        ),
-                        "devtoolset": "cxx11-abi"
-                        if arch_version == "cpu-cxx11-abi"
-                        else "",
+                        "devtoolset": (
+                            "cxx11-abi" if arch_version == "cpu-cxx11-abi" else ""
+                        ),
                        "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                        "package_type": package_type,
                        "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
                            ".", "_"
                        ),
-                        "pytorch_extra_install_requirements":
-                        PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"]  # fmt: skip
-                        if os != "linux" else "",
+                        "pytorch_extra_install_requirements": (
+                            PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"]  # fmt: skip
+                            if os != "linux"
+                            else ""
+                        ),
                    }
                )
    return ret
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -5,11 +5,11 @@ import sys
 from dataclasses import asdict, dataclass, field
 from pathlib import Path
 from typing import Dict, Iterable, List, Literal, Set
+from typing_extensions import TypedDict  # Python 3.11+

 import generate_binary_build_matrix  # type: ignore[import]

 import jinja2
-from typing_extensions import TypedDict  # Python 3.11+

 Arch = Literal["windows", "linux", "macos"]

@ -60,7 +60,7 @@ class BinaryBuildWorkflow:
    branches: str = "nightly"
    # Mainly for macos
    cross_compile_arm64: bool = False
-    macos_runner: str = "macos-12-xl"
+    macos_runner: str = "macos-14-xlarge"

    def __post_init__(self) -> None:
        if self.abi_version:
@ -95,6 +95,7 @@ class OperatingSystem:
    MACOS = "macos"
    MACOS_ARM64 = "macos-arm64"
    LINUX_AARCH64 = "linux-aarch64"
+    LINUX_S390X = "linux-s390x"


 LINUX_BINARY_BUILD_WORFKLOWS = [
@ -156,7 +157,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
        package_type="manywheel",
        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
            OperatingSystem.LINUX,
-            arches=["11.8", "12.1"],
+            arches=["11.8", "12.1", "12.4"],
            python_versions=["3.8"],
        ),
        branches="main",
@ -284,7 +285,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
            libtorch_variants=["shared-with-deps"],
        ),
        cross_compile_arm64=False,
-        macos_runner="macos-13-xlarge",
+        macos_runner="macos-14-xlarge",
        ciflow_config=CIFlowConfig(
            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
            isolated_workflow=True,
@ -297,7 +298,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
            OperatingSystem.MACOS_ARM64
        ),
        cross_compile_arm64=False,
-        macos_runner="macos-13-xlarge",
+        macos_runner="macos-14-xlarge",
        ciflow_config=CIFlowConfig(
            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
            isolated_workflow=True,
@ -307,7 +308,7 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
        os=OperatingSystem.MACOS_ARM64,
        package_type="conda",
        cross_compile_arm64=False,
-        macos_runner="macos-13-xlarge",
+        macos_runner="macos-14-xlarge",
        build_configs=generate_binary_build_matrix.generate_conda_matrix(
            OperatingSystem.MACOS_ARM64
        ),
@ -332,6 +333,20 @@ AARCH64_BINARY_BUILD_WORKFLOWS = [
    ),
 ]

+S390X_BINARY_BUILD_WORKFLOWS = [
+    BinaryBuildWorkflow(
+        os=OperatingSystem.LINUX_S390X,
+        package_type="manywheel",
+        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+            OperatingSystem.LINUX_S390X
+        ),
+        ciflow_config=CIFlowConfig(
+            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
+            isolated_workflow=True,
+        ),
+    ),
+]
+

 def main() -> None:
    jinja_env = jinja2.Environment(
@ -350,6 +365,10 @@ def main() -> None:
            jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
            AARCH64_BINARY_BUILD_WORKFLOWS,
        ),
+        (
+            jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
+            S390X_BINARY_BUILD_WORKFLOWS,
+        ),
        (
            jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
            LINUX_BINARY_SMOKE_WORKFLOWS,
--- a/.github/scripts/lintrunner.sh
+++ b/.github/scripts/lintrunner.sh
@ -7,7 +7,7 @@ eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)"
 conda activate "${CONDA_ENV}"

 # Use uv to speed up lintrunner init
-python3 -m pip install uv
+python3 -m pip install uv==0.1.45

 CACHE_DIRECTORY="/tmp/.lintbin"
 # Try to recover the cached binaries
--- a/.github/scripts/pytest_caching_utils.py
+++ b/.github/scripts/pytest_caching_utils.py
@ -18,6 +18,7 @@ PYTEST_CACHE_KEY_PREFIX = "pytest_cache"
 PYTEST_CACHE_DIR_NAME = ".pytest_cache"
 BUCKET = "gha-artifacts"
 LASTFAILED_FILE_PATH = Path("v/cache/lastfailed")
+TD_HEURISTIC_PREVIOUSLY_FAILED_ADDITIONAL = "previous_failures_additional.json"

 # Temp folders
 ZIP_UPLOAD = "zip-upload"
@ -191,6 +192,10 @@ def _merge_pytest_caches(
        pytest_cache_dir_to_merge_from, pytest_cache_dir_to_merge_into
    )

+    _merge_additional_failures_files(
+        pytest_cache_dir_to_merge_from, pytest_cache_dir_to_merge_into
+    )
+

 def _merge_lastfailed_files(source_pytest_cache: Path, dest_pytest_cache: Path) -> None:
    # Simple cases where one of the files doesn't exist
@ -232,3 +237,27 @@ def _merged_lastfailed_content(
            del to_lastfailed[""]

    return to_lastfailed
+
+
+def _merge_additional_failures_files(
+    source_pytest_cache: Path, dest_pytest_cache: Path
+) -> None:
+    # Simple cases where one of the files doesn't exist
+    source_lastfailed_file = (
+        source_pytest_cache / TD_HEURISTIC_PREVIOUSLY_FAILED_ADDITIONAL
+    )
+    dest_lastfailed_file = dest_pytest_cache / TD_HEURISTIC_PREVIOUSLY_FAILED_ADDITIONAL
+
+    if not source_lastfailed_file.exists():
+        return
+    if not dest_lastfailed_file.exists():
+        copy_file(source_lastfailed_file, dest_lastfailed_file)
+        return
+
+    # Both files exist, so we need to merge them
+    from_lastfailed = load_json_file(source_lastfailed_file)
+    to_lastfailed = load_json_file(dest_lastfailed_file)
+    merged_content = list(set(from_lastfailed + to_lastfailed))
+
+    # Save the results
+    write_json_file(dest_lastfailed_file, merged_content)
--- a/.github/scripts/sync_distributed_folder_prototype.sh
+++ b/.github/scripts/sync_distributed_folder_prototype.sh
@ -0,0 +1,28 @@
+#!/bin/bash
+
+set -eoux pipefail
+
+SYNC_BRANCH=fbcode/pytorch-stable-prototype
+
+git config user.email "fake@example.com"
+git config user.name  "PyTorch Stable Bot"
+
+git fetch origin main
+git fetch origin "$SYNC_BRANCH"
+git checkout "$SYNC_BRANCH"
+
+for SHA in $(git log 4333e122d4b74cdf84351ed2907045c6a767b4cd..origin/main --pretty="%h" --reverse -- torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed)
+do
+    # `git merge-base --is-ancestor` exits with code 0 if the given SHA is an ancestor, and non-0 otherwise
+    if git merge-base --is-ancestor $SHA HEAD || [[ $(git log --grep="(cherry picked from commit $SHA") ]]
+    then
+        echo "Skipping $SHA"
+        continue
+    fi
+    echo "Copying $SHA"
+    git cherry-pick -x "$SHA"
+done
+
+if [[ "${WITH_PUSH}" == true ]]; then
+  git push
+fi
--- a/.github/scripts/test_trymerge.py
+++ b/.github/scripts/test_trymerge.py
@ -773,13 +773,13 @@ class TestBypassFailures(TestCase):
                # than the one on the base commit. This should still count as broken trunk
                "pr_num": 104214,
                "related_failure_count": 0,
-                "unrelated_failure_count": 1,
+                "flaky_or_broken_trunk": 1,
            },
            {
                # This PR had one broken trunk failure and it used ghstack
                "pr_num": 105145,
                "related_failure_count": 0,
-                "unrelated_failure_count": 1,
+                "flaky_or_broken_trunk": 1,
            },
            {
                # The failure on the merge base was retried successfully and
@ -788,20 +788,20 @@ class TestBypassFailures(TestCase):
                # be used to detect broken trunk
                "pr_num": 107160,
                "related_failure_count": 0,
-                "unrelated_failure_count": 4,
+                "flaky_or_broken_trunk": 1,
            },
            {
                # This PR used Dr.CI broken trunk classification
                "pr_num": 111253,
                "related_failure_count": 1,
-                "unrelated_failure_count": 2,
+                "flaky_or_broken_trunk": 1,
            },
        ]

        for case in test_cases:
            pr_num = case["pr_num"]
            related_failure_count = case["related_failure_count"]
-            unrelated_failure_count = case["unrelated_failure_count"]
+            flaky_or_broken_trunk = case["flaky_or_broken_trunk"]

            pr = GitHubPR("pytorch", "pytorch", pr_num)
            checks = pr.get_checkrun_conclusions()
@ -823,7 +823,7 @@ class TestBypassFailures(TestCase):
            )
            self.assertTrue(len(pending) == 0)
            self.assertTrue(
-                len(failed) == unrelated_failure_count + related_failure_count
+                len(failed) == flaky_or_broken_trunk + related_failure_count
            )

    def test_ignore_current(self, *args: Any) -> None:
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -2027,10 +2027,8 @@ def categorize_checks(
    pending_checks: List[Tuple[str, Optional[str], Optional[int]]] = []
    failed_checks: List[Tuple[str, Optional[str], Optional[int]]] = []

-    # ok_failed_checks is used with ok_failed_checks_threshold while ignorable_failed_checks
-    # is used to keep track of all ignorable failures when saving the merge record on Rockset
-    ok_failed_checks: List[Tuple[str, Optional[str], Optional[int]]] = []
-    ignorable_failed_checks: Dict[str, List[Any]] = defaultdict(list)
+    # failed_checks_categorization is used to keep track of all ignorable failures when saving the merge record on Rockset
+    failed_checks_categorization: Dict[str, List[Any]] = defaultdict(list)

    # If required_checks is not set or empty, consider all names are relevant
    relevant_checknames = [
@ -2058,36 +2056,38 @@ def categorize_checks(
            continue
        elif not is_passing_status(check_runs[checkname].status):
            target = (
-                ignorable_failed_checks[classification]
+                failed_checks_categorization[classification]
                if classification
                in ("IGNORE_CURRENT_CHECK", "BROKEN_TRUNK", "FLAKY", "UNSTABLE")
                else failed_checks
            )
            target.append((checkname, url, job_id))

-            if classification in ("BROKEN_TRUNK", "FLAKY", "UNSTABLE"):
-                ok_failed_checks.append((checkname, url, job_id))
+    flaky_or_broken_trunk = (
+        failed_checks_categorization["BROKEN_TRUNK"]
+        + failed_checks_categorization["FLAKY"]
+    )

-    if ok_failed_checks:
+    if flaky_or_broken_trunk:
        warn(
-            f"The following {len(ok_failed_checks)} checks failed but were likely due flakiness or broken trunk: "
-            + ", ".join([x[0] for x in ok_failed_checks])
+            f"The following {len(flaky_or_broken_trunk)} checks failed but were likely due flakiness or broken trunk: "
+            + ", ".join([x[0] for x in flaky_or_broken_trunk])
            + (
                f" but this is greater than the threshold of {ok_failed_checks_threshold} so merge will fail"
                if ok_failed_checks_threshold is not None
-                and len(ok_failed_checks) > ok_failed_checks_threshold
+                and len(flaky_or_broken_trunk) > ok_failed_checks_threshold
                else ""
            )
        )

    if (
        ok_failed_checks_threshold is not None
-        and len(ok_failed_checks) > ok_failed_checks_threshold
+        and len(flaky_or_broken_trunk) > ok_failed_checks_threshold
    ):
-        failed_checks = failed_checks + ok_failed_checks
+        failed_checks = failed_checks + flaky_or_broken_trunk

-    # The list of ignorable_failed_checks is returned so that it can be saved into the Rockset merge record
-    return (pending_checks, failed_checks, ignorable_failed_checks)
+    # The list of failed_checks_categorization is returned so that it can be saved into the Rockset merge record
+    return (pending_checks, failed_checks, failed_checks_categorization)


 def merge(
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@ -33,6 +33,8 @@ env:
  # Needed for conda builds
  {%- if "aarch64" in build_environment %}
  ALPINE_IMAGE: "arm64v8/alpine"
+  {%- elif "s390x" in build_environment %}
+  ALPINE_IMAGE: "docker.io/s390x/alpine"
  {%- else %}
  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
  {%- endif %}
@ -56,8 +58,11 @@ jobs:
    uses: ./.github/workflows/_binary-build-linux.yml
    with:!{{ upload.binary_env_as_input(config) }}
      {%- if "aarch64" in build_environment %}
-      runs_on: linux.arm64.2xlarge
+      runs_on: linux.arm64.m7g.4xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
+      {%- elif "s390x" in build_environment %}
+      runs_on: linux.s390x
+      ALPINE_IMAGE: "docker.io/s390x/alpine"
      {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
      runs_on: linux.24xlarge
      {%- endif %}
@ -66,12 +71,17 @@ jobs:
      {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0  %}
      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
      {%- endif %}
+      {%- if config["gpu_arch_type"] == "cuda-aarch64" %}
+      timeout-minutes: 420
+      {%- endif %}
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
+
+  {%- if config["gpu_arch_type"] != "cuda-aarch64" %}
  !{{ config["build_name"] }}-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: !{{ config["build_name"] }}-build
-{%- if config["gpu_arch_type"] != "rocm" %}
+    {%- if config["gpu_arch_type"] != "rocm" %}
    uses: ./.github/workflows/_binary-test-linux.yml
    with:!{{ upload.binary_env_as_input(config) }}
      build_name: !{{ config["build_name"] }}
@ -79,6 +89,9 @@ jobs:
      {%- if "aarch64" in build_environment %}
      runs_on: linux.arm64.2xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
+      {%- elif "s390x" in build_environment %}
+      runs_on: linux.s390x
+      ALPINE_IMAGE: "docker.io/s390x/alpine"
      {%- elif config["gpu_arch_type"] == "rocm" %}
      runs_on: linux.rocm.gpu
      {%- elif config["gpu_arch_type"] == "cuda" %}
@ -88,7 +101,7 @@ jobs:
      {%- endif %}
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
-{%- else %}
+    {%- else %}
    runs-on: linux.rocm.gpu
    timeout-minutes: !{{ common.timeout_minutes }}
    !{{ upload.binary_env(config) }}
@ -113,7 +126,8 @@ jobs:
        uses: ./pytorch/.github/actions/test-pytorch-binary
      - name: Teardown ROCm
        uses: ./.github/actions/teardown-rocm
-{%- endif %}
+    {%- endif %}
+  {%- endif %}

 {%- if branches == "nightly" %}
  !{{ upload.upload_binaries(config) }}
--- a/.github/templates/upload.yml.j2
+++ b/.github/templates/upload.yml.j2
@ -57,7 +57,11 @@
      id-token: write
      contents: read
 {%- if has_test %}
+    {%- if config["gpu_arch_type"] == "cuda-aarch64" %}
+    needs: !{{ config["build_name"] }}-build
+    {%- else %}
    needs: !{{ config["build_name"] }}-test
+    {%- endif %}
 {%- else %}
    needs: !{{ config["build_name"] }}-build
 {%- endif %}
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@ -12,10 +12,15 @@ on:
        type: string
        description: The build environment
      runs_on:
-          required: false
-          default: linux.12xlarge
-          type: string
-          description: Hardware to run this "build"job on, linux.12xlarge or linux.arm64.2xlarge.
+        required: false
+        default: linux.12xlarge
+        type: string
+        description: Hardware to run this "build"job on, linux.12xlarge or linux.arm64.2xlarge.
+      timeout-minutes:
+        required: false
+        default: 210
+        type: number
+        description: timeout for the job
      ALPINE_IMAGE:
        required: false
        type: string
@ -78,7 +83,7 @@ on:
 jobs:
  build:
    runs-on: ${{ inputs.runs_on }}
-    timeout-minutes: 210
+    timeout-minutes: ${{ inputs.timeout-minutes }}
    env:
      PYTORCH_ROOT: ${{ inputs.PYTORCH_ROOT }}
      BUILDER_ROOT: ${{ inputs.BUILDER_ROOT }}
@ -139,6 +144,7 @@ jobs:
        run: env

      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        if: inputs.build_environment != 'linux-s390x-binary-manywheel'
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        continue-on-error: true
        with:
@ -147,12 +153,14 @@ jobs:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
-          no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' }}
+          no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }}

      - name: Setup Linux
+        if: inputs.build_environment != 'linux-s390x-binary-manywheel'
        uses: ./.github/actions/setup-linux

      - name: Chown workspace
+        if: inputs.build_environment != 'linux-s390x-binary-manywheel'
        uses: ./.github/actions/chown-workspace
        with:
          ALPINE_IMAGE: ${{ inputs.ALPINE_IMAGE }}
@ -165,7 +173,7 @@ jobs:
          rm -rf "${GITHUB_WORKSPACE}"
          mkdir "${GITHUB_WORKSPACE}"

-          if [[ ${{ inputs.build_environment }} == 'linux-aarch64-binary-manywheel' ]]; then
+          if [[ ${{ inputs.build_environment }} == 'linux-aarch64-binary-manywheel' ]] || [[ ${{ inputs.build_environment }} == 'linux-s390x-binary-manywheel' ]] ; then
            rm -rf "${RUNNER_TEMP}/artifacts"
            mkdir "${RUNNER_TEMP}/artifacts"
          fi
@ -212,7 +220,7 @@ jobs:
            ]}

      - name: Pull Docker image
-        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }}
+        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ inputs.DOCKER_IMAGE }}
@ -254,7 +262,7 @@ jobs:
          fi

      - name: Chown artifacts
-        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }}
+        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
        shell: bash
        run: |
          # Ensure the working directory gets chowned back to the current user
@ -269,11 +277,20 @@ jobs:
            ${{ runner.temp }}/artifacts/*

      - name: Teardown Linux
-        if: always()
+        if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
        uses: pytorch/test-infra/.github/actions/teardown-linux@main

      - name: Chown workspace
-        if: always()
+        if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
        uses: ./pytorch/.github/actions/chown-workspace
        with:
          ALPINE_IMAGE: ${{ inputs.ALPINE_IMAGE }}
+
+      - name: Cleanup docker
+        if: always() && inputs.build_environment == 'linux-s390x-binary-manywheel'
+        shell: bash
+        run: |
+          # on s390x stop the container for clean worker stop
+          # ignore expansion of "docker ps -q" since it could be empty
+          # shellcheck disable=SC2046
+          docker stop $(docker ps -q) || true
--- a/.github/workflows/_binary-test-linux.yml
+++ b/.github/workflows/_binary-test-linux.yml
@ -127,6 +127,7 @@ jobs:
          } >> "${GITHUB_ENV} }}"

      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        if: inputs.build_environment != 'linux-s390x-binary-manywheel'
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
        continue-on-error: true
        with:
@ -136,12 +137,14 @@ jobs:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
-          no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' }}
+          no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }}

      - name: Setup Linux
+        if: inputs.build_environment != 'linux-s390x-binary-manywheel'
        uses: ./.github/actions/setup-linux

      - name: Chown workspace
+        if: inputs.build_environment != 'linux-s390x-binary-manywheel'
        uses: ./.github/actions/chown-workspace
        with:
          ALPINE_IMAGE: ${{ inputs.ALPINE_IMAGE }}
@ -203,7 +206,7 @@ jobs:
        if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}

      - name: Pull Docker image
-        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }}
+        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ inputs.DOCKER_IMAGE }}
@ -213,11 +216,11 @@ jobs:
        uses: ./pytorch/.github/actions/test-pytorch-binary

      - name: Teardown Linux
-        if: always()
+        if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
        uses: pytorch/test-infra/.github/actions/teardown-linux@main

      - name: Chown workspace
-        if: always()
+        if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
        uses: ./pytorch/.github/actions/chown-workspace
        with:
          ALPINE_IMAGE: ${{ inputs.ALPINE_IMAGE }}
--- a/.github/workflows/assigntome-docathon.yml
+++ b/.github/workflows/assigntome-docathon.yml
@ -8,6 +8,8 @@ on:
 jobs:
  assign:
    runs-on: ubuntu-latest
+    permissions:
+      issues: write
    steps:
      - name: Check for "/assigntome" in comment
        uses: actions/github-script@v6
@ -26,14 +28,14 @@ jobs:
                  repo: context.repo.repo,
                  issue_number: issueNumber
                });
-              const hasLabel = issue.labels.some(label => label.name === 'docathon-h2-2023');
+              const hasLabel = issue.labels.some(label => label.name === 'docathon-h1-2024');
              if (hasLabel) {
                if (issue.assignee !== null) {
                  await github.rest.issues.createComment({
                    owner: context.repo.owner,
                    repo: context.repo.repo,
                    issue_number: issueNumber,
-                    body: "The issue is already assigned. Please pick an opened and unnasigned issue with the [docathon-h2-2023 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h2-2023)."
+                    body: "The issue is already assigned. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)."
                  });
                } else {
                  await github.rest.issues.addAssignees({
@ -44,7 +46,7 @@ jobs:
                  });
                }
              } else {
-                const commmentMessage = "This issue does not have the correct label. Please pick an opened and unnasigned issue with the [docathon-h2-2023 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h2-2023)."
+                const commmentMessage = "This issue does not have the correct label. Please pick an opened and unnasigned issue with the [docathon-h1-2024 label](https://github.com/pytorch/pytorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-h1-2024)."
                await github.rest.issues.createComment({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
--- a/.github/workflows/build-ios-binaries.yml
+++ b/.github/workflows/build-ios-binaries.yml
@ -49,7 +49,7 @@ jobs:
          { config: "default",
            shard: 1,
            num_shards: 1,
-            runner: "macos-13-xlarge",
+            runner: "macos-14-xlarge",
            ios_platform: "SIMULATOR",
            ios_arch: "arm64",
            use_lite_interpreter: ${{ inputs.use_lite_interpreter || 1 }},
@ -60,7 +60,7 @@ jobs:
          { config: "default",
            shard: 1,
            num_shards: 1,
-            runner: "macos-13-xlarge",
+            runner: "macos-14-xlarge",
            ios_platform: "OS",
            ios_arch: "arm64",
            use_lite_interpreter: ${{ inputs.use_lite_interpreter || 1 }},
--- a/.github/workflows/close-nonexistent-disable-issues.yml
+++ b/.github/workflows/close-nonexistent-disable-issues.yml
@ -18,6 +18,6 @@ jobs:
          ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
-          pip3 install requests==2.26
+          pip3 install requests==2.32.2
          pip3 install rockset==1.0.3
          python3 .github/scripts/close_nonexistent_disable_issues.py
--- a/.github/workflows/delete_old_branches.yml
+++ b/.github/workflows/delete_old_branches.yml
@ -29,7 +29,7 @@ jobs:
      - name: Setup Python
        uses: actions/setup-python@v4
        with:
-          python-version: '3.8'
+          python-version: '3.11'
          architecture: x64
          check-latest: false

--- a/.github/workflows/docathon-sync-label.yml
+++ b/.github/workflows/docathon-sync-label.yml
@ -0,0 +1,30 @@
+name: Docathon Labels Sync
+
+on:
+  pull_request_target:
+    types: [opened, synchronize, edited]
+    branches: [main]
+
+jobs:
+  check-labels:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 1
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.x
+      - name: Install dependencies
+        run: |
+          pip install requests==2.32.3
+          pip install PyGithub==2.3.0
+      - name: Run Python script
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: python ./.github/scripts/docathon-label-sync.py ${{ github.event.pull_request.number }}
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -38,15 +38,19 @@ jobs:
      matrix:
        runner: [linux.12xlarge]
        docker-image-name: [
-          pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9,
-          pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks,
-          pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9,
+          pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9,
+          pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks,
+          pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks,
+          pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9,
+          pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks,
+          pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks,
+          pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9,
          pytorch-linux-focal-py3.8-clang10,
          pytorch-linux-focal-py3.11-clang10,
          pytorch-linux-focal-py3.12-clang10,
          pytorch-linux-focal-rocm-n-1-py3,
          pytorch-linux-focal-rocm-n-py3,
-          pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12,
+          pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12,
          pytorch-linux-focal-py3-clang9-android-ndk-r21e,
          pytorch-linux-jammy-py3.8-gcc11,
          pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks,
@ -54,7 +58,7 @@ jobs:
          pytorch-linux-jammy-py3-clang15-asan,
          pytorch-linux-focal-py3-clang10-onnx,
          pytorch-linux-focal-linter,
-          pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter,
+          pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter,
          pytorch-linux-jammy-py3-clang12-executorch
          ]
        include:
--- a/.github/workflows/docker-release.yml
+++ b/.github/workflows/docker-release.yml
@ -149,3 +149,10 @@ jobs:
      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
        if: always()
+
+  validate:
+    needs: build
+    uses: pytorch/builder/.github/workflows/validate-docker-images.yml@main
+    with:
+      channel: nightly
+      ref: main
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -50,11 +50,11 @@ jobs:
      GPU_ARCH_TYPE: cpu-aarch64
      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
      DESIRED_PYTHON: "3.8"
-      runs_on: linux.arm64.2xlarge
+      runs_on: linux.arm64.m7g.4xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_8-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_8-cpu-aarch64-test:  # Testing
@ -100,6 +100,51 @@ jobs:
      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_8-cuda-aarch64-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
+      DESIRED_DEVTOOLSET: cxx11-abi
+      DESIRED_PYTHON: "3.8"
+      runs_on: linux.arm64.m7g.4xlarge
+      ALPINE_IMAGE: "arm64v8/alpine"
+      build_name: manywheel-py3_8-cuda-aarch64
+      build_environment: linux-aarch64-binary-manywheel
+      timeout-minutes: 420
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_8-cuda-aarch64-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_8-cuda-aarch64-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
+      DESIRED_DEVTOOLSET: cxx11-abi
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cuda-aarch64
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_9-cpu-aarch64-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -113,11 +158,11 @@ jobs:
      GPU_ARCH_TYPE: cpu-aarch64
      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
      DESIRED_PYTHON: "3.9"
-      runs_on: linux.arm64.2xlarge
+      runs_on: linux.arm64.m7g.4xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_9-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cpu-aarch64-test:  # Testing
@ -163,6 +208,51 @@ jobs:
      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_9-cuda-aarch64-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
+      DESIRED_DEVTOOLSET: cxx11-abi
+      DESIRED_PYTHON: "3.9"
+      runs_on: linux.arm64.m7g.4xlarge
+      ALPINE_IMAGE: "arm64v8/alpine"
+      build_name: manywheel-py3_9-cuda-aarch64
+      build_environment: linux-aarch64-binary-manywheel
+      timeout-minutes: 420
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_9-cuda-aarch64-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_9-cuda-aarch64-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
+      DESIRED_DEVTOOLSET: cxx11-abi
+      DESIRED_PYTHON: "3.9"
+      build_name: manywheel-py3_9-cuda-aarch64
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_10-cpu-aarch64-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -176,11 +266,11 @@ jobs:
      GPU_ARCH_TYPE: cpu-aarch64
      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
      DESIRED_PYTHON: "3.10"
-      runs_on: linux.arm64.2xlarge
+      runs_on: linux.arm64.m7g.4xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cpu-aarch64-test:  # Testing
@ -226,6 +316,51 @@ jobs:
      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_10-cuda-aarch64-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
+      DESIRED_DEVTOOLSET: cxx11-abi
+      DESIRED_PYTHON: "3.10"
+      runs_on: linux.arm64.m7g.4xlarge
+      ALPINE_IMAGE: "arm64v8/alpine"
+      build_name: manywheel-py3_10-cuda-aarch64
+      build_environment: linux-aarch64-binary-manywheel
+      timeout-minutes: 420
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_10-cuda-aarch64-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_10-cuda-aarch64-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
+      DESIRED_DEVTOOLSET: cxx11-abi
+      DESIRED_PYTHON: "3.10"
+      build_name: manywheel-py3_10-cuda-aarch64
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_11-cpu-aarch64-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -239,11 +374,11 @@ jobs:
      GPU_ARCH_TYPE: cpu-aarch64
      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
      DESIRED_PYTHON: "3.11"
-      runs_on: linux.arm64.2xlarge
+      runs_on: linux.arm64.m7g.4xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cpu-aarch64-test:  # Testing
@ -289,6 +424,51 @@ jobs:
      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_11-cuda-aarch64-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
+      DESIRED_DEVTOOLSET: cxx11-abi
+      DESIRED_PYTHON: "3.11"
+      runs_on: linux.arm64.m7g.4xlarge
+      ALPINE_IMAGE: "arm64v8/alpine"
+      build_name: manywheel-py3_11-cuda-aarch64
+      build_environment: linux-aarch64-binary-manywheel
+      timeout-minutes: 420
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_11-cuda-aarch64-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_11-cuda-aarch64-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
+      DESIRED_DEVTOOLSET: cxx11-abi
+      DESIRED_PYTHON: "3.11"
+      build_name: manywheel-py3_11-cuda-aarch64
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_12-cpu-aarch64-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -302,11 +482,11 @@ jobs:
      GPU_ARCH_TYPE: cpu-aarch64
      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
      DESIRED_PYTHON: "3.12"
-      runs_on: linux.arm64.2xlarge
+      runs_on: linux.arm64.m7g.4xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cpu-aarch64-test:  # Testing
@ -351,3 +531,48 @@ jobs:
      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
    uses: ./.github/workflows/_binary-upload.yml
+
+  manywheel-py3_12-cuda-aarch64-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
+      DESIRED_DEVTOOLSET: cxx11-abi
+      DESIRED_PYTHON: "3.12"
+      runs_on: linux.arm64.m7g.4xlarge
+      ALPINE_IMAGE: "arm64v8/alpine"
+      build_name: manywheel-py3_12-cuda-aarch64
+      build_environment: linux-aarch64-binary-manywheel
+      timeout-minutes: 420
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_12-cuda-aarch64-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_12-cuda-aarch64-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.4-main
+      DESIRED_DEVTOOLSET: cxx11-abi
+      DESIRED_PYTHON: "3.12"
+      build_name: manywheel-py3_12-cuda-aarch64
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
--- a/.github/workflows/generated-linux-binary-manywheel-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-main.yml
@ -48,7 +48,7 @@ jobs:
      DESIRED_PYTHON: "3.8"
      build_name: manywheel-py3_8-cuda11_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_8-cuda11_8-test:  # Testing
@ -88,7 +88,7 @@ jobs:
      DESIRED_PYTHON: "3.8"
      build_name: manywheel-py3_8-cuda12_1
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_8-cuda12_1-test:  # Testing
@ -111,3 +111,43 @@ jobs:
      runs_on: linux.4xlarge.nvidia.gpu
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
+
+  manywheel-py3_8-cuda12_4-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_VERSION: 12.4
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cuda12_4
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_8-cuda12_4-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: manywheel-py3_8-cuda12_4-build
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_VERSION: 12.4
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cuda12_4
+      build_environment: linux-binary-manywheel
+      runs_on: linux.4xlarge.nvidia.gpu
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@ -174,7 +174,7 @@ jobs:
      DESIRED_PYTHON: "3.8"
      build_name: manywheel-py3_8-cuda11_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_8-cuda11_8-test:  # Testing
@ -237,7 +237,7 @@ jobs:
      DESIRED_PYTHON: "3.8"
      build_name: manywheel-py3_8-cuda12_1
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_8-cuda12_1-test:  # Testing
@ -300,7 +300,7 @@ jobs:
      DESIRED_PYTHON: "3.8"
      build_name: manywheel-py3_8-cuda12_4
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_8-cuda12_4-test:  # Testing
@ -690,7 +690,7 @@ jobs:
      DESIRED_PYTHON: "3.9"
      build_name: manywheel-py3_9-cuda11_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda11_8-test:  # Testing
@ -753,7 +753,7 @@ jobs:
      DESIRED_PYTHON: "3.9"
      build_name: manywheel-py3_9-cuda12_1
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_1-test:  # Testing
@ -816,7 +816,7 @@ jobs:
      DESIRED_PYTHON: "3.9"
      build_name: manywheel-py3_9-cuda12_4
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_4-test:  # Testing
@ -1206,7 +1206,7 @@ jobs:
      DESIRED_PYTHON: "3.10"
      build_name: manywheel-py3_10-cuda11_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda11_8-test:  # Testing
@ -1269,7 +1269,7 @@ jobs:
      DESIRED_PYTHON: "3.10"
      build_name: manywheel-py3_10-cuda12_1
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_1-test:  # Testing
@ -1332,7 +1332,7 @@ jobs:
      DESIRED_PYTHON: "3.10"
      build_name: manywheel-py3_10-cuda12_4
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_4-test:  # Testing
@ -1722,7 +1722,7 @@ jobs:
      DESIRED_PYTHON: "3.11"
      build_name: manywheel-py3_11-cuda11_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda11_8-test:  # Testing
@ -1785,7 +1785,7 @@ jobs:
      DESIRED_PYTHON: "3.11"
      build_name: manywheel-py3_11-cuda12_1
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_1-test:  # Testing
@ -1848,7 +1848,7 @@ jobs:
      DESIRED_PYTHON: "3.11"
      build_name: manywheel-py3_11-cuda12_4
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_4-test:  # Testing
@ -2238,7 +2238,7 @@ jobs:
      DESIRED_PYTHON: "3.12"
      build_name: manywheel-py3_12-cuda11_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda11_8-test:  # Testing
@ -2301,7 +2301,7 @@ jobs:
      DESIRED_PYTHON: "3.12"
      build_name: manywheel-py3_12-cuda12_1
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_1-test:  # Testing
@ -2364,7 +2364,7 @@ jobs:
      DESIRED_PYTHON: "3.12"
      build_name: manywheel-py3_12-cuda12_4
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_4-test:  # Testing
@ -2410,209 +2410,3 @@ jobs:
      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
    uses: ./.github/workflows/_binary-upload.yml
-
-  manywheel-py3_12-rocm6_0-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.0
-      GPU_ARCH_VERSION: 6.0
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main
-      DESIRED_PYTHON: "3.12"
-      build_name: manywheel-py3_12-rocm6_0
-      build_environment: linux-binary-manywheel
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_12-rocm6_0-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_12-rocm6_0-build
-    runs-on: linux.rocm.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.0
-      GPU_ARCH_VERSION: 6.0
-      GPU_ARCH_TYPE: rocm
-      SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main
-      DESIRED_PYTHON: "3.12"
-    steps:
-      - name: Setup ROCm
-        uses: ./.github/actions/setup-rocm
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: manywheel-py3_12-rocm6_0
-          path: "${{ runner.temp }}/artifacts/"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: ROCm set GPU_FLAG
-        run: |
-          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
-      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: pytorch/manylinux-builder:rocm6.0-main
-      - name: Test Pytorch binary
-        uses: ./pytorch/.github/actions/test-pytorch-binary
-      - name: Teardown ROCm
-        uses: ./.github/actions/teardown-rocm
-  manywheel-py3_12-rocm6_0-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_12-rocm6_0-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.0
-      GPU_ARCH_VERSION: 6.0
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main
-      DESIRED_PYTHON: "3.12"
-      build_name: manywheel-py3_12-rocm6_0
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
-    uses: ./.github/workflows/_binary-upload.yml
-
-  manywheel-py3_12-rocm6_1-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.1
-      GPU_ARCH_VERSION: 6.1
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main
-      DESIRED_PYTHON: "3.12"
-      build_name: manywheel-py3_12-rocm6_1
-      build_environment: linux-binary-manywheel
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_12-rocm6_1-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: manywheel-py3_12-rocm6_1-build
-    runs-on: linux.rocm.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.1
-      GPU_ARCH_VERSION: 6.1
-      GPU_ARCH_TYPE: rocm
-      SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main
-      DESIRED_PYTHON: "3.12"
-    steps:
-      - name: Setup ROCm
-        uses: ./.github/actions/setup-rocm
-      - uses: actions/download-artifact@v3
-        name: Download Build Artifacts
-        with:
-          name: manywheel-py3_12-rocm6_1
-          path: "${{ runner.temp }}/artifacts/"
-      - name: Checkout PyTorch
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          quiet-checkout: true
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Checkout pytorch/builder
-        uses: malfet/checkout@silent-checkout
-        with:
-          ref: main
-          submodules: recursive
-          repository: pytorch/builder
-          path: builder
-          quiet-checkout: true
-      - name: Clean pytorch/builder checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: builder
-      - name: ROCm set GPU_FLAG
-        run: |
-          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
-      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: pytorch/manylinux-builder:rocm6.1-main
-      - name: Test Pytorch binary
-        uses: ./pytorch/.github/actions/test-pytorch-binary
-      - name: Teardown ROCm
-        uses: ./.github/actions/teardown-rocm
-  manywheel-py3_12-rocm6_1-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_12-rocm6_1-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.1
-      GPU_ARCH_VERSION: 6.1
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main
-      DESIRED_PYTHON: "3.12"
-      build_name: manywheel-py3_12-rocm6_1
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
-    uses: ./.github/workflows/_binary-upload.yml
--- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml
@ -0,0 +1,353 @@
+# @generated DO NOT EDIT MANUALLY
+
+# Template is at:    .github/templates/linux_binary_build_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: linux-s390x-binary-manywheel
+
+
+on:
+  push:
+    # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build
+    branches:
+      - nightly
+    tags:
+      # NOTE: Binary build pipelines should only get triggered on release candidate builds
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+      - 'ciflow/binaries/*'
+      - 'ciflow/binaries_wheel/*'
+  workflow_dispatch:
+
+env:
+  # Needed for conda builds
+  ALPINE_IMAGE: "docker.io/s390x/alpine"
+  ANACONDA_USER: pytorch
+  AWS_DEFAULT_REGION: us-east-1
+  BINARY_ENV_FILE: /tmp/env
+  BUILD_ENVIRONMENT: linux-s390x-binary-manywheel
+  BUILDER_ROOT: /builder
+  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  PR_NUMBER: ${{ github.event.pull_request.number }}
+  PYTORCH_FINAL_PACKAGE_DIR: /artifacts
+  PYTORCH_ROOT: /pytorch
+  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+  SKIP_ALL_TESTS: 0
+concurrency:
+  group: linux-s390x-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  manywheel-py3_8-cpu-s390x-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.8"
+      runs_on: linux.s390x
+      ALPINE_IMAGE: "docker.io/s390x/alpine"
+      build_name: manywheel-py3_8-cpu-s390x
+      build_environment: linux-s390x-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_8-cpu-s390x-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: manywheel-py3_8-cpu-s390x-build
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cpu-s390x
+      build_environment: linux-s390x-binary-manywheel
+      runs_on: linux.s390x
+      ALPINE_IMAGE: "docker.io/s390x/alpine"
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_8-cpu-s390x-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_8-cpu-s390x-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cpu-s390x
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
+
+  manywheel-py3_9-cpu-s390x-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.9"
+      runs_on: linux.s390x
+      ALPINE_IMAGE: "docker.io/s390x/alpine"
+      build_name: manywheel-py3_9-cpu-s390x
+      build_environment: linux-s390x-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_9-cpu-s390x-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: manywheel-py3_9-cpu-s390x-build
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.9"
+      build_name: manywheel-py3_9-cpu-s390x
+      build_environment: linux-s390x-binary-manywheel
+      runs_on: linux.s390x
+      ALPINE_IMAGE: "docker.io/s390x/alpine"
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_9-cpu-s390x-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_9-cpu-s390x-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.9"
+      build_name: manywheel-py3_9-cpu-s390x
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
+
+  manywheel-py3_10-cpu-s390x-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.10"
+      runs_on: linux.s390x
+      ALPINE_IMAGE: "docker.io/s390x/alpine"
+      build_name: manywheel-py3_10-cpu-s390x
+      build_environment: linux-s390x-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_10-cpu-s390x-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: manywheel-py3_10-cpu-s390x-build
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.10"
+      build_name: manywheel-py3_10-cpu-s390x
+      build_environment: linux-s390x-binary-manywheel
+      runs_on: linux.s390x
+      ALPINE_IMAGE: "docker.io/s390x/alpine"
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_10-cpu-s390x-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_10-cpu-s390x-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.10"
+      build_name: manywheel-py3_10-cpu-s390x
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
+
+  manywheel-py3_11-cpu-s390x-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.11"
+      runs_on: linux.s390x
+      ALPINE_IMAGE: "docker.io/s390x/alpine"
+      build_name: manywheel-py3_11-cpu-s390x
+      build_environment: linux-s390x-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_11-cpu-s390x-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: manywheel-py3_11-cpu-s390x-build
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.11"
+      build_name: manywheel-py3_11-cpu-s390x
+      build_environment: linux-s390x-binary-manywheel
+      runs_on: linux.s390x
+      ALPINE_IMAGE: "docker.io/s390x/alpine"
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_11-cpu-s390x-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_11-cpu-s390x-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.11"
+      build_name: manywheel-py3_11-cpu-s390x
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
+
+  manywheel-py3_12-cpu-s390x-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.12"
+      runs_on: linux.s390x
+      ALPINE_IMAGE: "docker.io/s390x/alpine"
+      build_name: manywheel-py3_12-cpu-s390x
+      build_environment: linux-s390x-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_12-cpu-s390x-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: manywheel-py3_12-cpu-s390x-build
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.12"
+      build_name: manywheel-py3_12-cpu-s390x
+      build_environment: linux-s390x-binary-manywheel
+      runs_on: linux.s390x
+      ALPINE_IMAGE: "docker.io/s390x/alpine"
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_12-cpu-s390x-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_12-cpu-s390x-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DESIRED_PYTHON: "3.12"
+      build_name: manywheel-py3_12-cpu-s390x
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
--- a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml
@ -34,7 +34,7 @@ concurrency:
 jobs:
  conda-py3_8-cpu-build:
    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-13-xlarge
+    runs-on: macos-14-xlarge
    timeout-minutes: 240
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -152,7 +152,7 @@ jobs:
    uses: ./.github/workflows/_binary-upload.yml
  conda-py3_9-cpu-build:
    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-13-xlarge
+    runs-on: macos-14-xlarge
    timeout-minutes: 240
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -270,7 +270,7 @@ jobs:
    uses: ./.github/workflows/_binary-upload.yml
  conda-py3_10-cpu-build:
    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-13-xlarge
+    runs-on: macos-14-xlarge
    timeout-minutes: 240
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -388,7 +388,7 @@ jobs:
    uses: ./.github/workflows/_binary-upload.yml
  conda-py3_11-cpu-build:
    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-13-xlarge
+    runs-on: macos-14-xlarge
    timeout-minutes: 240
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -506,7 +506,7 @@ jobs:
    uses: ./.github/workflows/_binary-upload.yml
  conda-py3_12-cpu-build:
    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-13-xlarge
+    runs-on: macos-14-xlarge
    timeout-minutes: 240
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
--- a/.github/workflows/generated-macos-arm64-binary-libtorch-cxx11-abi-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-libtorch-cxx11-abi-nightly.yml
@ -34,7 +34,7 @@ concurrency:
 jobs:
  libtorch-cpu-shared-with-deps-cxx11-abi-build:
    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-13-xlarge
+    runs-on: macos-14-xlarge
    timeout-minutes: 240
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
--- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
@ -34,7 +34,7 @@ concurrency:
 jobs:
  wheel-py3_8-cpu-build:
    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-13-xlarge
+    runs-on: macos-14-xlarge
    timeout-minutes: 240
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -46,7 +46,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.8"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
      # For sccache access (only on non-forked PRs)
      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -153,7 +153,7 @@ jobs:
    uses: ./.github/workflows/_binary-upload.yml
  wheel-py3_9-cpu-build:
    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-13-xlarge
+    runs-on: macos-14-xlarge
    timeout-minutes: 240
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -165,7 +165,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
      # For sccache access (only on non-forked PRs)
      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -272,7 +272,7 @@ jobs:
    uses: ./.github/workflows/_binary-upload.yml
  wheel-py3_10-cpu-build:
    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-13-xlarge
+    runs-on: macos-14-xlarge
    timeout-minutes: 240
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -284,7 +284,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
      # For sccache access (only on non-forked PRs)
      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -391,7 +391,7 @@ jobs:
    uses: ./.github/workflows/_binary-upload.yml
  wheel-py3_11-cpu-build:
    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-13-xlarge
+    runs-on: macos-14-xlarge
    timeout-minutes: 240
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -403,7 +403,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
      # For sccache access (only on non-forked PRs)
      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -510,7 +510,7 @@ jobs:
    uses: ./.github/workflows/_binary-upload.yml
  wheel-py3_12-cpu-build:
    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-13-xlarge
+    runs-on: macos-14-xlarge
    timeout-minutes: 240
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -522,7 +522,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
      # For sccache access (only on non-forked PRs)
      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
--- a/.github/workflows/generated-windows-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml
@ -46,7 +46,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.8"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -290,7 +290,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.8"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -536,7 +536,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.8"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -782,7 +782,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.8"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -1027,7 +1027,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -1271,7 +1271,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -1517,7 +1517,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -1763,7 +1763,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -2008,7 +2008,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -2252,7 +2252,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -2498,7 +2498,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -2744,7 +2744,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -2989,7 +2989,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -3233,7 +3233,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -3479,7 +3479,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -3725,7 +3725,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -3970,7 +3970,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -4214,7 +4214,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -4460,7 +4460,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -4706,7 +4706,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
--- a/.github/workflows/inductor-cu124.yml
+++ b/.github/workflows/inductor-cu124.yml
@ -0,0 +1,108 @@
+name: inductor-cu124
+
+on:
+  push:
+    tags:
+      - ciflow/inductor-cu124/*
+  workflow_dispatch:
+  schedule:
+    # Run every 4 hours during the week and every 12 hours on the weekend
+    - cron: 45 0,4,8,12,16,20 * * 1-5
+    - cron: 45 4,12 * * 0,6
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+permissions: read-all
+
+jobs:
+  linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
+    # Should be synced with the one in inductor.yml, but this doesn't run inductor_timm
+    name: cuda12.4-py3.10-gcc9-sm86
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
+      cuda-arch-list: '8.6'
+      test-matrix: |
+        { include: [
+          { config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
+          { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_cpp_wrapper_abi_compatible", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+        ]}
+    secrets:
+      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+
+  linux-focal-cuda12_4-py3_10-gcc9-inductor-test:
+    name: cuda12.4-py3.10-gcc9-sm86
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
+    with:
+      sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-test
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
+      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+    secrets:
+      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+
+  linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp:
+    name: cuda12.4-py3.10-gcc9-sm80
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
+      cuda-arch-list: '8.0'
+      test-matrix: |
+        { include: [
+          { config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
+        ]}
+    secrets:
+      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+
+  linux-focal-cuda12_4-py3_10-gcc9-inductor-test-gcp:
+    name: cuda12.4-py3.10-gcc9-sm80
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp
+    with:
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80
+      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp.outputs.test-matrix }}
+      use-gha: anything-non-empty-to-use-gha
+    secrets:
+      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+
+  linux-focal-cuda12_4-py3_12-gcc9-inductor-build:
+    name: cuda12.4-py3.12-gcc9-sm86
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-focal-cuda12.4-py3.12-gcc9-sm86
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks
+      cuda-arch-list: '8.6'
+      test-matrix: |
+        { include: [
+          { config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+        ]}
+
+  linux-focal-cuda12_4-py3_12-gcc9-inductor-test:
+    name: cuda12.4-py3.12-gcc9-sm86
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-focal-cuda12_4-py3_12-gcc9-inductor-build
+    with:
+      build-environment: linux-focal-cuda12.4-py3.12-gcc9-sm86
+      docker-image: ${{ needs.linux-focal-cuda12_4-py3_12-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_12-gcc9-inductor-build.outputs.test-matrix }}
--- a/.github/workflows/inductor-micro-benchmark.yml
+++ b/.github/workflows/inductor-micro-benchmark.yml
@ -21,7 +21,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.0'
      test-matrix: |
        { include: [
--- a/.github/workflows/inductor-perf-compare.yml
+++ b/.github/workflows/inductor-perf-compare.yml
@ -18,7 +18,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.0'
      test-matrix: |
        { include: [
--- a/.github/workflows/inductor-perf-test-nightly.yml
+++ b/.github/workflows/inductor-perf-test-nightly.yml
@ -71,7 +71,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.0'
      test-matrix: |
        { include: [
--- a/.github/workflows/inductor-periodic.yml
+++ b/.github/workflows/inductor-periodic.yml
@ -23,7 +23,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.6'
      test-matrix: |
        { include: [
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@ -44,7 +44,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.6'
      test-matrix: |
        { include: [
@ -86,7 +86,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.0'
      test-matrix: |
        { include: [
@ -107,6 +107,56 @@ jobs:
    secrets:
      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

+  linux-focal-cuda12_1-py3_12-gcc9-inductor-build:
+    name: cuda12.1-py3.12-gcc9-sm86
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks
+      cuda-arch-list: '8.6'
+      test-matrix: |
+        { include: [
+          { config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+        ]}
+
+  linux-focal-cuda12_1-py3_12-gcc9-inductor-test:
+    name: cuda12.1-py3.12-gcc9-sm86
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-focal-cuda12_1-py3_12-gcc9-inductor-build
+    with:
+      build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
+      docker-image: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.test-matrix }}
+
+  linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
+    # Should be synced with the one in inductor-periodic.yml but this only runs inductor_timm
+    name: cuda12.4-py3.10-gcc9-sm86
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
+      cuda-arch-list: '8.6'
+      test-matrix: |
+        { include: [
+          { config: "inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+        ]}
+    secrets:
+      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+
+  linux-focal-cuda12_4-py3_10-gcc9-inductor-test:
+    name: cuda12.4-py3.10-gcc9-sm86
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
+    with:
+      sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-test
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
+      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+    secrets:
+      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+
  linux-jammy-cpu-py3_8-gcc11-inductor-build:
    name: linux-jammy-cpu-py3.8-gcc11-inductor
    uses: ./.github/workflows/_linux-build.yml
@ -120,12 +170,17 @@ jobs:
          { config: "cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
          { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
          { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
+          { config: "cpu_inductor_huggingface_freezing", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
+          { config: "cpu_inductor_timm_freezing", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
+          { config: "cpu_inductor_timm_freezing", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
+          { config: "cpu_inductor_torchbench_freezing", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
+          { config: "cpu_inductor_torchbench_freezing", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
          { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
          { config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
          { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
          { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
          { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
-          { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
+          { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
        ]}
    secrets:
      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@ -20,7 +20,7 @@ jobs:
    with:
      timeout: 120
      runner: linux.2xlarge
-      docker-image: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter
+      docker-image: pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter
      # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
      # to run git rev-parse HEAD~:.ci/docker when a new image is needed
      fetch-depth: 0
@ -36,7 +36,7 @@ jobs:
    with:
      timeout: 120
      runner: linux.2xlarge
-      docker-image: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter
+      docker-image: pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter
      # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
      # to run git rev-parse HEAD~:.ci/docker when a new image is needed
      fetch-depth: 0
--- a/.github/workflows/mac-mps.yml
+++ b/.github/workflows/mac-mps.yml
@ -13,29 +13,31 @@ concurrency:
 permissions: read-all

 jobs:
-  macos-13-py3-arm64-build:
-    name: macos-13-py3-arm64
+  macos-py3-arm64-build:
+    name: macos-py3-arm64
    uses: ./.github/workflows/_mac-build.yml
    with:
      sync-tag: macos-py3-arm64-build
-      build-environment: macos-13-py3-arm64
+      build-environment: macos-py3-arm64
      runner-type: macos-m1-stable
      build-generates-artifacts: true
      # To match the one pre-installed in the m1 runners
      python-version: 3.9.12
+      # The runner macos-m2-14 is not a typo, it's a custom runner that is different
+      # than our AWS macos-m1-14 runners
      test-matrix: |
        { include: [
-          { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-stable" },
+          { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
          { config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-14" },
        ]}

  macos-py3-arm64-mps-test:
    name: macos-py3-arm64-mps
    uses: ./.github/workflows/_mac-test-mps.yml
-    needs: macos-13-py3-arm64-build
+    needs: macos-py3-arm64-build
    with:
      sync-tag: macos-py3-arm64-mps-test
-      build-environment: macos-13-py3-arm64
+      build-environment: macos-py3-arm64
      # Same as the build job
      python-version: 3.9.12
-      test-matrix: ${{ needs.macos-13-py3-arm64-build.outputs.test-matrix }}
+      test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }}
--- a/.github/workflows/nightly-rockset-uploads.yml
+++ b/.github/workflows/nightly-rockset-uploads.yml
@ -32,7 +32,7 @@ jobs:
          cache: pip

      - run: |
-          pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
+          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12

      - name: Upload external contribution stats
        uses: nick-fields/retry@v2.8.2
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@ -37,6 +37,59 @@ jobs:
    permissions:
      id-token: write
      contents: read
+  linux-focal-cuda12_1-py3_10-gcc9-build:
+    name: linux-focal-cuda12.1-py3.10-gcc9
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-focal-cuda12.1-py3.10-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
+      test-matrix: |
+        { include: [
+          { config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+          { config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
+        ]}
+  linux-focal-cuda12_1-py3_10-gcc9-test:
+    name: linux-focal-cuda12.1-py3.10-gcc9
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-focal-cuda12_1-py3_10-gcc9-build
+      - target-determination
+    with:
+      build-environment: linux-focal-cuda12.1-py3.10-gcc9
+      docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-build.outputs.test-matrix }}
+
+  linux-focal-cuda12_4-py3_10-gcc9-build:
+    name: linux-focal-cuda12.4-py3.10-gcc9
+    uses: ./.github/workflows/_linux-build-label.yml
+    with:
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+          { config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
+        ]}
+
+  linux-focal-cuda12_4-py3_10-gcc9-test:
+    name: linux-focal-cuda12.4-py3.10-gcc9
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-focal-cuda12_4-py3_10-gcc9-build
+      - target-determination
+    with:
+      timeout-minutes: 360
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9
+      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-build.outputs.test-matrix }}

  parallelnative-linux-jammy-py3_8-gcc11-build:
    name: parallelnative-linux-jammy-py3.8-gcc11
@ -67,7 +120,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda11.8-py3.9-gcc9
-      docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9
+      docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
      cuda-arch-list: 8.6
      test-matrix: |
        { include: [
@ -89,7 +142,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda11.8-py3.10-gcc9-debug
-      docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9
+      docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
      build-with-debug: true
      test-matrix: |
        { include: [
@ -151,7 +204,7 @@ jobs:
          { config: "default",
            shard: 1,
            num_shards: 1,
-            runner: "macos-13-xlarge",
+            runner: "macos-14-xlarge",
            ios_platform: "SIMULATOR",
            ios_arch: "arm64",
            use_lite_interpreter: 1,
@ -162,7 +215,7 @@ jobs:
          { config: "default",
            shard: 1,
            num_shards: 1,
-            runner: "macos-13-xlarge",
+            runner: "macos-14-xlarge",
            ios_platform: "OS",
            ios_arch: "arm64",
            use_lite_interpreter: 1,
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -237,7 +237,7 @@ jobs:
    uses: ./.github/workflows/_linux-build-label.yml
    with:
      build-environment: linux-focal-cuda11.8-py3.10-gcc9
-      docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9
+      docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
      test-matrix: |
        { include: [
          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
@ -262,7 +262,7 @@ jobs:
    uses: ./.github/workflows/_linux-build-label.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
@ -297,12 +297,12 @@ jobs:
          { config: "default", shard: 1, num_shards: 1 },
        ]}

-  linux-jammy-cuda-11_8-cudnn8-py3_8-clang12-build:
-    name: linux-jammy-cuda11.8-cudnn8-py3.8-clang12
+  linux-jammy-cuda-11_8-cudnn9-py3_8-clang12-build:
+    name: linux-jammy-cuda11.8-cudnn9-py3.8-clang12
    uses: ./.github/workflows/_linux-build-label.yml
    with:
-      build-environment: linux-jammy-cuda11.8-cudnn8-py3.8-clang12
-      docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12
+      build-environment: linux-jammy-cuda11.8-cudnn9-py3.8-clang12
+      docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 1 },
@ -361,7 +361,7 @@ jobs:
    uses: ./.github/workflows/_bazel-build-test.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
      cuda-version: cpu
      test-matrix: |
        { include: [
@ -373,13 +373,25 @@ jobs:
    uses: ./.github/workflows/_bazel-build-test.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
      cuda-version: "12.1"
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
        ]}

+  linux-focal-cuda12_4-py3_10-gcc9-bazel-test:
+    name: linux-focal-cuda12.4-py3.10-gcc9-bazel-test
+    uses: ./.github/workflows/_bazel-build-test.yml
+    with:
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-bazel-test
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      cuda-version: "12.4"
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
+        ]}
+
  linux-focal-py3-clang9-android-ndk-r21e-gradle-custom-build-single:
    name: linux-focal-py3-clang9-android-ndk-r21e-gradle-custom-build-single
    uses: ./.github/workflows/_android-build-test.yml
@ -435,7 +447,7 @@ jobs:
    uses: ./.github/workflows/_linux-build-label.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
      cuda-arch-list: 8.6
      test-matrix: |
        { include: [
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -41,7 +41,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda12.1-py3-gcc9-slow-gradcheck
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
      cuda-arch-list: 8.6
      test-matrix: |
        { include: [
@ -70,7 +70,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
      cuda-arch-list: 8.6
      test-matrix: |
        { include: [
--- a/.github/workflows/sync_distributed_folder_prototype.yml
+++ b/.github/workflows/sync_distributed_folder_prototype.yml
@ -0,0 +1,30 @@
+name: Sync Distributed Folder
+
+on:
+  #push:
+  #  branches:
+  #    - 'main'
+  #  paths:
+  #    - 'torch/distributed/**'
+  workflow_dispatch:
+  pull_request:
+    paths:
+      - '.github/scripts/sync_distributed_folder_prototype.sh'
+      - '.github/workflows/sync_distributed_folder_prototype.yml'
+
+env:
+  WITH_PUSH: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+
+permissions:
+  contents: write
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  sync:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - run: .github/scripts/sync_distributed_folder_prototype.sh
--- a/.github/workflows/target-determination-indexer.yml
+++ b/.github/workflows/target-determination-indexer.yml
@ -26,7 +26,7 @@ jobs:
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        with:
-          docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
+          docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
          working-directory: pytorch

      - name: Use following to pull public copy of the image
--- a/.github/workflows/torchbench.yml
+++ b/.github/workflows/torchbench.yml
@ -16,7 +16,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.0'
      test-matrix: |
        { include: [
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@ -34,36 +34,39 @@ jobs:
      id-token: write
      contents: read

-  linux-focal-cuda12_1-py3_10-gcc9-build:
-    name: linux-focal-cuda12.1-py3.10-gcc9
-    uses: ./.github/workflows/_linux-build.yml
+  linux-focal-cuda12_4-py3_10-gcc9-sm86-build:
+    name: linux-focal-cuda12.4-py3.10-gcc9-sm86
+    uses: ./.github/workflows/_linux-build-label.yml
    with:
-      build-environment: linux-focal-cuda12.1-py3.10-gcc9
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      cuda-arch-list: 8.6
      test-matrix: |
        { include: [
-          { config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-          { config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 1, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
        ]}

-  linux-focal-cuda12_1-py3_10-gcc9-test:
-    name: linux-focal-cuda12.1-py3.10-gcc9
+  linux-focal-cuda12_4-py3_10-gcc9-sm86-test:
+    name: linux-focal-cuda12.4-py3.10-gcc9-sm86
    uses: ./.github/workflows/_linux-test.yml
    needs:
-      - linux-focal-cuda12_1-py3_10-gcc9-build
+      - linux-focal-cuda12_4-py3_10-gcc9-sm86-build
      - target-determination
    with:
-      build-environment: linux-focal-cuda12.1-py3.10-gcc9
-      docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-build.outputs.test-matrix }}
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
+      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-sm86-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-sm86-build.outputs.test-matrix }}

  libtorch-linux-focal-cuda12_1-py3_7-gcc9-debug-build:
    name: libtorch-linux-focal-cuda12.1-py3.7-gcc9-debug
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: libtorch-linux-focal-cuda12.1-py3.7-gcc9
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
      build-generates-artifacts: false
      runner: linux.4xlarge
      test-matrix: |
@ -77,7 +80,32 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-no-ops
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1 },
+        ]}
+
+  libtorch-linux-focal-cuda12_4-py3_7-gcc9-debug-build:
+    name: libtorch-linux-focal-cuda12.4-py3.7-gcc9-debug
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: libtorch-linux-focal-cuda12.4-py3.7-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      build-generates-artifacts: false
+      runner: linux.4xlarge
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1 },
+        ]}
+
+  # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated
+  linux-focal-cuda12_4-py3_10-gcc9-no-ops-build:
+    name: linux-focal-cuda12.4-py3.10-gcc9-no-ops
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-no-ops
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 1 },
@ -94,12 +122,12 @@ jobs:
          { config: "default", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
        ]}

-  macos-13-py3-arm64-build:
-    name: macos-13-py3-arm64
+  macos-py3-arm64-build:
+    name: macos-py3-arm64
    uses: ./.github/workflows/_mac-build.yml
    with:
      sync-tag: macos-py3-arm64-build
-      build-environment: macos-13-py3-arm64
+      build-environment: macos-py3-arm64
      runner-type: macos-m1-stable
      build-generates-artifacts: true
      # To match the one pre-installed in the m1 runners
@ -114,31 +142,30 @@ jobs:
  macos-py3-arm64-mps-test:
    name: macos-py3-arm64-mps
    uses: ./.github/workflows/_mac-test-mps.yml
-    needs: macos-13-py3-arm64-build
-    if: needs.macos-13-py3-arm64-build.outputs.build-outcome == 'success'
+    needs: macos-py3-arm64-build
+    if: needs.macos-py3-arm64-build.outputs.build-outcome == 'success'
    with:
      sync-tag: macos-py3-arm64-mps-test
-      build-environment: macos-13-py3-arm64
+      build-environment: macos-py3-arm64
      # Same as the build job
      python-version: 3.9.12
      test-matrix: |
        { include: [
-          { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-stable" },
+          { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-13" },
          { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
-
        ]}

-  macos-13-py3-arm64-test:
-    name: macos-13-py3-arm64
+  macos-py3-arm64-test:
+    name: macos-py3-arm64
    uses: ./.github/workflows/_mac-test.yml
    needs:
-      - macos-13-py3-arm64-build
+      - macos-py3-arm64-build
      - target-determination
    with:
-      build-environment: macos-13-py3-arm64
+      build-environment: macos-py3-arm64
      # Same as the build job
      python-version: 3.9.12
-      test-matrix: ${{ needs.macos-13-py3-arm64-build.outputs.test-matrix }}
+      test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }}

  win-vs2019-cpu-py3-build:
    name: win-vs2019-cpu-py3
@ -192,7 +219,9 @@ jobs:
      sync-tag: rocm-build
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
+          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
+          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
+          { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
        ]}

  linux-focal-rocm6_1-py3_8-test:
@ -208,4 +237,4 @@ jobs:
      build-environment: linux-focal-rocm6.1-py3.8
      docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
-      tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
+      tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
--- a/.github/workflows/unstable.yml
+++ b/.github/workflows/unstable.yml
@ -32,174 +32,3 @@ jobs:
          echo
          echo "Once the jobs are deemed stable enough (% red signal < 5% and TTS < 3h),"
          echo " they can graduate and move back to pull or trunk."
-
-  #
-  # Experimental ARC jobs
-  #
-  llm-td:
-    name: before-test
-    uses: ./.github/workflows/llm_td_retrieval.yml
-    permissions:
-      id-token: write
-      contents: read
-
-  target-determination:
-    name: before-test
-    uses: ./.github/workflows/target_determination.yml
-    needs: llm-td
-    permissions:
-      id-token: write
-      contents: read
-
-  linux-jammy-py3_8-gcc11-build:
-    name: linux-jammy-py3.8-gcc11
-    uses: ./.github/workflows/_linux-build-rg.yml
-    with:
-      build-environment: linux-jammy-py3.8-gcc11
-      docker-image-name: pytorch-linux-jammy-py3.8-gcc11
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "default", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "default", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "docs_test", shard: 1, num_shards: 1,  runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "backwards_compat", shard: 1, num_shards: 1, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "distributed", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "distributed", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
-        ]}
-
-  linux-jammy-py3_8-gcc11-test:
-    name: linux-jammy-py3.8-gcc11
-    uses: ./.github/workflows/_linux-test-rg.yml
-    needs:
-      - linux-jammy-py3_8-gcc11-build
-      - target-determination
-    with:
-      build-environment: linux-jammy-py3.8-gcc11
-      docker-image: ${{ needs.linux-jammy-py3_8-gcc11-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-py3_8-gcc11-build.outputs.test-matrix }}
-
-  linux-jammy-py3_8-gcc11-no-ops:
-    name: linux-jammy-py3.8-gcc11-no-ops
-    uses: ./.github/workflows/_linux-build-rg.yml
-    with:
-      build-environment: linux-jammy-py3.8-gcc11-no-ops
-      docker-image-name: pytorch-linux-jammy-py3.8-gcc11
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 1 },
-        ]}
-
-  linux-jammy-py3_8-gcc11-pch:
-    name: linux-jammy-py3.8-gcc11-pch
-    uses: ./.github/workflows/_linux-build-rg.yml
-    with:
-      build-environment: linux-jammy-py3.8-gcc11-pch
-      docker-image-name: pytorch-linux-jammy-py3.8-gcc11
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 1 },
-        ]}
-
-  linux-focal-py3_8-clang10-onnx-build:
-    name: linux-focal-py3.8-clang10-onnx
-    uses: ./.github/workflows/_linux-build-rg.yml
-    with:
-      build-environment: linux-focal-py3.8-clang10-onnx
-      docker-image-name: pytorch-linux-focal-py3-clang10-onnx
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "default", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
-        ]}
-
-  linux-focal-py3_8-clang10-onnx-test:
-    name: linux-focal-py3.8-clang10-onnx
-    uses: ./.github/workflows/_linux-test-rg.yml
-    needs:
-      - linux-focal-py3_8-clang10-onnx-build
-      - target-determination
-    with:
-      build-environment: linux-focal-py3.8-clang10-onnx
-      docker-image: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.test-matrix }}
-
-  linux-jammy-py3_10-clang15-asan-build:
-    name: linux-jammy-py3.10-clang15-asan
-    uses: ./.github/workflows/_linux-build-rg.yml
-    with:
-      build-environment: linux-jammy-py3.10-clang15-asan
-      docker-image-name: pytorch-linux-jammy-py3-clang15-asan
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 6, runner: "linux.4xlarge" },
-          { config: "default", shard: 2, num_shards: 6, runner: "linux.4xlarge" },
-          { config: "default", shard: 3, num_shards: 6, runner: "linux.4xlarge" },
-          { config: "default", shard: 4, num_shards: 6, runner: "linux.4xlarge" },
-          { config: "default", shard: 5, num_shards: 6, runner: "linux.4xlarge" },
-          { config: "default", shard: 6, num_shards: 6, runner: "linux.4xlarge" },
-        ]}
-      sync-tag: asan-build-arc
-
-  linux-focal-py3_8-clang10-build:
-    name: linux-focal-py3.8-clang10
-    uses: ./.github/workflows/_linux-build-rg.yml
-    with:
-      build-environment: linux-focal-py3.8-clang10
-      docker-image-name: pytorch-linux-focal-py3.8-clang10
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "default", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "default", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "crossref", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "crossref", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "dynamo", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "dynamo", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "dynamo", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-        ]}
-
-  linux-focal-py3_8-clang10-test:
-    name: linux-focal-py3.8-clang10
-    uses: ./.github/workflows/_linux-test-rg.yml
-    needs:
-      - linux-focal-py3_8-clang10-build
-      - target-determination
-    with:
-      build-environment: linux-focal-py3.8-clang10
-      docker-image: ${{ needs.linux-focal-py3_8-clang10-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_8-clang10-build.outputs.test-matrix }}
-
-  linux-focal-py3_11-clang10-build:
-    name: linux-focal-py3.11-clang10
-    uses: ./.github/workflows/_linux-build-rg.yml
-    with:
-      build-environment: linux-focal-py3.11-clang10
-      docker-image-name: pytorch-linux-focal-py3.11-clang10
-      test-matrix: |
-        { include: [
-          { config: "default", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "default", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "default", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "crossref", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "crossref", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "dynamo", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "dynamo", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-          { config: "dynamo", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" },
-        ]}
-
-  linux-focal-py3_11-clang10-test:
-    name: linux-focal-py3.11-clang10
-    uses: ./.github/workflows/_linux-test-rg.yml
-    needs:
-      - linux-focal-py3_11-clang10-build
-      - target-determination
-    with:
-      build-environment: linux-focal-py3.11-clang10
-      docker-image: ${{ needs.linux-focal-py3_11-clang10-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-py3_11-clang10-build.outputs.test-matrix }}
-
-  #
-  # End of Experimental ARC jobs
-  #
--- a/.github/workflows/upload-alerts.yml
+++ b/.github/workflows/upload-alerts.yml
@ -28,7 +28,7 @@ jobs:

      - name: Install Python Packages
        run: |
-          pip3 install rockset==1.0.3 boto3==1.19.12 requests==2.27.1
+          pip3 install rockset==1.0.3 boto3==1.19.12 requests==2.32.2

      - name: Create alerts
        run: |
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@ -2,7 +2,7 @@ name: Upload test stats

 on:
  workflow_run:
-    workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm]
+    workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, inductor-micro-benchmark]
    types:
      - completed

@ -47,9 +47,10 @@ jobs:
          cache: pip

      - run: |
-          pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
+          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12

      - name: Upload test artifacts
+        id: upload-s3
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
@ -94,6 +95,19 @@ jobs:
          # Analyze the results from disable tests rerun and upload them to S3
          python3 -m tools.stats.check_disabled_tests --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}"

+      - name: Upload gpt-fast benchmark results to Rockset
+        if: steps.upload-s3.outcome && steps.upload-s3.outcome == 'success' && github.event.workflow_run.name == 'inductor-micro-benchmark'
+        env:
+          ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
+          WORKFLOW_RUN_ATTEMPT: ${{ github.event.workflow_run.run_attempt }}
+          REPO_FULLNAME: ${{ github.event.workflow_run.repository.full_name }}
+          HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
+        run: |
+          python3 -m tools.stats.upload_dynamo_perf_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}" --head-branch "${HEAD_BRANCH}" --rockset-collection oss_ci_benchmark --rockset-workspace benchmarks --match-filename "^gpt_fast_benchmark"
+
  check-api-rate:
    if: ${{ always() && github.repository_owner == 'pytorch' }}
    runs-on: ubuntu-latest
--- a/.github/workflows/upload-torch-dynamo-perf-stats.yml
+++ b/.github/workflows/upload-torch-dynamo-perf-stats.yml
@ -40,7 +40,7 @@ jobs:
          cache: pip

      - run: |
-          pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12
+          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12

      - name: Upload torch dynamo performance stats to S3
        id: upload-s3
@ -68,4 +68,4 @@ jobs:
          REPO_FULLNAME: ${{ github.event.workflow_run.repository.full_name }}
          HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
        run: |
-          python3 -m tools.stats.upload_dynamo_perf_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}" --head-branch "${HEAD_BRANCH}"
+          python3 -m tools.stats.upload_dynamo_perf_stats --workflow-run-id "${WORKFLOW_RUN_ID}" --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" --repo "${REPO_FULLNAME}" --head-branch "${HEAD_BRANCH}" --rockset-collection torch_dynamo_perf_stats --rockset-workspace inductor --match-filename "^inductor_"
--- a/.github/workflows/upload_test_stats_intermediate.yml
+++ b/.github/workflows/upload_test_stats_intermediate.yml
@ -0,0 +1,43 @@
+name: Upload test stats intermediate
+
+on:
+  workflow_dispatch:
+    inputs:
+      workflow_id:
+        description: workflow_id of the run
+        required: true
+      workflow_run_attempt:
+        description: workflow_run_attempt of the run
+        required: true
+
+jobs:
+  intermediate_upload_test_stats:
+    name: Intermediate upload test stats for ${{ inputs.workflow_id }}
+    runs-on: ubuntu-22.04
+    environment: upload-stats
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          fetch-depth: 1
+          submodules: false
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.11'
+          cache: pip
+
+      - run: |
+          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
+
+      - name: Upload test stats
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          WORKFLOW_RUN_ID: ${{ inputs.workflow_id }}
+          WORKFLOW_RUN_ATTEMPT: ${{ inputs.workflow_run_attempt }}
+        run: |
+          python3 -m tools.stats.upload_test_stats_intermediate \
+            --workflow-run-id "${WORKFLOW_RUN_ID}" \
+            --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" \
--- a/.gitmodules
+++ b/.gitmodules
@ -2,10 +2,6 @@
    ignore = dirty
    path = third_party/pybind11
    url = https://github.com/pybind/pybind11.git
-[submodule "third_party/cub"]
-    ignore = dirty
-    path = third_party/cub
-    url = https://github.com/NVlabs/cub.git
 [submodule "third_party/eigen"]
    ignore = dirty
    path = third_party/eigen
@ -22,10 +18,6 @@
    ignore = dirty
    path = third_party/protobuf
    url = https://github.com/protocolbuffers/protobuf.git
-[submodule "third_party/ios-cmake"]
-    ignore = dirty
-    path = third_party/ios-cmake
-    url = https://github.com/Yangqing/ios-cmake.git
 [submodule "third_party/NNPACK"]
    ignore = dirty
    path = third_party/NNPACK
@ -50,10 +42,6 @@
    ignore = dirty
    path = third_party/psimd
    url = https://github.com/Maratyszcza/psimd.git
-[submodule "third_party/zstd"]
-    ignore = dirty
-    path = third_party/zstd
-    url = https://github.com/facebook/zstd.git
 [submodule "third_party/cpuinfo"]
    ignore = dirty
    path = third_party/cpuinfo
@ -66,10 +54,6 @@
    ignore = dirty
    path = third_party/onnx
    url = https://github.com/onnx/onnx.git
-[submodule "third_party/onnx-tensorrt"]
-    ignore = dirty
-    path = third_party/onnx-tensorrt
-    url = https://github.com/onnx/onnx-tensorrt
 [submodule "third_party/sleef"]
    ignore = dirty
    path = third_party/sleef
@ -86,14 +70,6 @@
    ignore = dirty
    path = third_party/gemmlowp/gemmlowp
    url = https://github.com/google/gemmlowp.git
-[submodule "third_party/QNNPACK"]
-    ignore = dirty
-    path = third_party/QNNPACK
-    url = https://github.com/pytorch/QNNPACK
-[submodule "third_party/neon2sse"]
-    ignore = dirty
-    path = third_party/neon2sse
-    url = https://github.com/intel/ARM_NEON_2_x86_SSE.git
 [submodule "third_party/fbgemm"]
    ignore = dirty
    path = third_party/fbgemm
@ -102,10 +78,6 @@
    ignore = dirty
    path = third_party/foxi
    url = https://github.com/houseroad/foxi.git
-[submodule "third_party/tbb"]
-    path = third_party/tbb
-    url = https://github.com/01org/tbb
-    branch = tbb_2018
 [submodule "android/libs/fbjni"]
    ignore = dirty
    path = android/libs/fbjni
@ -152,3 +124,7 @@
 [submodule "third_party/opentelemetry-cpp"]
 	path = third_party/opentelemetry-cpp
 	url = https://github.com/open-telemetry/opentelemetry-cpp.git
+[submodule "third_party/cpp-httplib"]
+	path = third_party/cpp-httplib
+	url = https://github.com/yhirose/cpp-httplib.git
+	branch = v0.15.3
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -193,6 +193,7 @@ include_patterns = [
    'aten/src/ATen/*.cpp',
    'aten/src/ATen/core/*.h',
    'aten/src/ATen/core/*.cpp',
+    'aten/src/ATen/detail/*',
    'aten/src/ATen/functorch/*.h',
    'aten/src/ATen/functorch/*.cpp',
    'c10/**/*.cpp',
@ -234,7 +235,6 @@ exclude_patterns = [
    'torch/csrc/jit/serialization/import_legacy.cpp',
    'torch/csrc/jit/serialization/export.cpp',
    'torch/csrc/lazy/**/*',
-    'torch/csrc/onnx/init.cpp',
    'torch/csrc/mps/**/*',
 ]
 init_command = [
@ -1052,19 +1052,18 @@ exclude_patterns = [
    'test/quantization/fx/test_numeric_suite_fx.py',
    'test/quantization/fx/test_quantize_fx.py',
    'test/quantization/fx/test_subgraph_rewriter.py',
-    'test/test_datapipe.py',
    'test/test_fake_tensor.py',
    'test/test_flop_counter.py',
    'test/test_function_schema.py',
    'test/test_functional_autograd_benchmark.py',
    'test/test_functional_optim.py',
    'test/test_functionalization_of_rng_ops.py',
+    'test/test_datapipe.py',
    'test/test_futures.py',
    'test/test_fx.py',
    'test/test_fx_experimental.py',
    'test/test_fx_passes.py',
    'test/test_fx_reinplace_pass.py',
-    'test/test_hub.py',
    'test/test_import_stats.py',
    'test/test_itt.py',
    'test/test_jit.py',
@ -1073,7 +1072,6 @@ exclude_patterns = [
    'test/test_jit_disabled.py',
    'test/test_jit_fuser.py',
    'test/test_jit_fuser_legacy.py',
-    'test/test_jit_fuser_te.py',
    'test/test_jit_legacy.py',
    'test/test_jit_llga_fuser.py',
    'test/test_jit_profiling.py',
@ -1081,9 +1079,7 @@ exclude_patterns = [
    'test/test_jit_string.py',
    'test/test_jiterator.py',
    'test/test_kernel_launch_checks.py',
-    'test/test_license.py',
    'test/test_linalg.py',
-    'test/test_logging.py',
    'test/test_masked.py',
    'test/test_maskedtensor.py',
    'test/test_matmul_cuda.py',
@ -1105,13 +1101,6 @@ exclude_patterns = [
    'test/test_native_mha.py',
    'test/test_nestedtensor.py',
    'test/test_nn.py',
-    'test/test_nnapi.py',
-    'test/test_numba_integration.py',
-    'test/test_numpy_interop.py',
-    'test/test_nvfuser_dynamo.py',
-    'test/test_nvfuser_frontend.py',
-    'test/test_openmp.py',
-    'test/test_optim.py',
    'test/test_out_dtype_op.py',
    'test/test_overrides.py',
    'test/test_prims.py',
@ -1125,9 +1114,6 @@ exclude_patterns = [
    'test/test_segment_reductions.py',
    'test/test_serialization.py',
    'test/test_set_default_mobile_cpu_allocator.py',
-    'test/test_shape_ops.py',
-    'test/test_show_pickle.py',
-    'test/test_sort_and_select.py',
    'test/test_sparse.py',
    'test/test_sparse_csr.py',
    'test/test_sparse_semi_structured.py',
@ -1141,30 +1127,13 @@ exclude_patterns = [
    'test/test_tensorexpr.py',
    'test/test_tensorexpr_pybind.py',
    'test/test_testing.py',
-    'test/test_throughput_benchmark.py',
    'test/test_torch.py',
    'test/test_transformers.py',
-    'test/test_type_hints.py',
-    'test/test_type_info.py',
    'test/test_type_promotion.py',
    'test/test_unary_ufuncs.py',
-    'test/test_utils.py',
    'test/test_vulkan.py',
    'test/test_xnnpack_integration.py',
    'test/torch_np/numpy_test/**/*.py',
-    'test/typing/fail/bitwise_ops.py',
-    'test/typing/fail/creation_ops.py',
-    'test/typing/fail/random.py',
-    'test/typing/pass/creation_ops.py',
-    'test/typing/pass/math_ops.py',
-    'test/typing/reveal/module_list.py',
-    'test/typing/reveal/namedtuple.py',
-    'test/typing/reveal/opt_size.py',
-    'test/typing/reveal/size.py',
-    'test/typing/reveal/tensor_constructors.py',
-    'test/typing/reveal/tensor_copy.py',
-    'test/typing/reveal/tensor_sampling.py',
-    'test/typing/reveal/torch_optim.py',
    'torch/_awaits/__init__.py',
    'torch/_custom_op/__init__.py',
    'torch/_custom_op/autograd.py',
@ -1563,28 +1532,6 @@ exclude_patterns = [
    'torch/distributed/optim/post_localSGD_optimizer.py',
    'torch/distributed/optim/utils.py',
    'torch/distributed/optim/zero_redundancy_optimizer.py',
-    'torch/distributed/pipeline/__init__.py',
-    'torch/distributed/pipeline/sync/__init__.py',
-    'torch/distributed/pipeline/sync/_balance/__init__.py',
-    'torch/distributed/pipeline/sync/_balance/blockpartition.py',
-    'torch/distributed/pipeline/sync/_balance/profile.py',
-    'torch/distributed/pipeline/sync/batchnorm.py',
-    'torch/distributed/pipeline/sync/checkpoint.py',
-    'torch/distributed/pipeline/sync/copy.py',
-    'torch/distributed/pipeline/sync/dependency.py',
-    'torch/distributed/pipeline/sync/microbatch.py',
-    'torch/distributed/pipeline/sync/phony.py',
-    'torch/distributed/pipeline/sync/pipe.py',
-    'torch/distributed/pipeline/sync/pipeline.py',
-    'torch/distributed/pipeline/sync/skip/__init__.py',
-    'torch/distributed/pipeline/sync/skip/layout.py',
-    'torch/distributed/pipeline/sync/skip/namespace.py',
-    'torch/distributed/pipeline/sync/skip/portal.py',
-    'torch/distributed/pipeline/sync/skip/skippable.py',
-    'torch/distributed/pipeline/sync/skip/tracker.py',
-    'torch/distributed/pipeline/sync/stream.py',
-    'torch/distributed/pipeline/sync/utils.py',
-    'torch/distributed/pipeline/sync/worker.py',
    'torch/distributed/remote_device.py',
    'torch/distributed/rendezvous.py',
    'torch/distributed/rpc/__init__.py',
@ -1609,7 +1556,6 @@ exclude_patterns = [
    'torch/distributed/tensor/parallel/input_reshard.py',
    'torch/distributed/tensor/parallel/multihead_attention_tp.py',
    'torch/distributed/tensor/parallel/style.py',
-    'torch/distributed/utils.py',
    'torch/fft/__init__.py',
    'torch/func/__init__.py',
    'torch/functional.py',
@ -1701,18 +1647,6 @@ exclude_patterns = [
    'torch/hub.py',
    'torch/library.py',
    'torch/linalg/__init__.py',
-    # UFMT causes import cycle on masked
-    'torch/masked/__init__.py',
-    'torch/masked/_docs.py',
-    'torch/masked/_ops.py',
-    'torch/masked/maskedtensor/__init__.py',
-    'torch/masked/maskedtensor/_ops_refs.py',
-    'torch/masked/maskedtensor/binary.py',
-    'torch/masked/maskedtensor/core.py',
-    'torch/masked/maskedtensor/creation.py',
-    'torch/masked/maskedtensor/passthrough.py',
-    'torch/masked/maskedtensor/reductions.py',
-    'torch/masked/maskedtensor/unary.py',
    'torch/monitor/__init__.py',
    'torch/nested/__init__.py',
    'torch/nn/__init__.py',
@ -1891,8 +1825,6 @@ exclude_patterns = [
    'torch/testing/_internal/distributed/nn/__init__.py',
    'torch/testing/_internal/distributed/nn/api/__init__.py',
    'torch/testing/_internal/distributed/nn/api/remote_module_test.py',
-    'torch/testing/_internal/distributed/pipe_with_ddp_test.py',
-    'torch/testing/_internal/distributed/pipeline/__init__.py',
    'torch/testing/_internal/distributed/rpc/__init__.py',
    'torch/testing/_internal/distributed/rpc/dist_autograd_test.py',
    'torch/testing/_internal/distributed/rpc/dist_optimizer_test.py',
@ -1935,8 +1867,6 @@ exclude_patterns = [
    'torch/utils/_mode_utils.py',
    'torch/utils/_python_dispatch.py',
    'torch/utils/_stats.py',
-    'torch/utils/_sympy/__init__.py',
-    'torch/utils/_sympy/functions.py',
    'torch/utils/_traceback.py',
    'torch/utils/_zip.py',
    'torch/utils/backcompat/__init__.py',
@ -2149,7 +2079,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
-    'ruff==0.4.1',
+    'ruff==0.4.8',
 ]
 is_formatter = true

--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -125,10 +125,6 @@ filegroup(
    data = [":generate-code"],
 )

-exports_files(
-    srcs = ["aten/src/ATen/cpu/tbb/extra/version_string.ver.in"],
-)
-
 # ATen
 filegroup(
    name = "aten_base_cpp",
@ -275,7 +271,6 @@ header_template_rule(
        "@AT_BUILD_WITH_LAPACK@": "1",
        "@AT_PARALLEL_OPENMP@": "0",
        "@AT_PARALLEL_NATIVE@": "1",
-        "@AT_PARALLEL_NATIVE_TBB@": "0",
        "@AT_BLAS_F2C@": "0",
        "@AT_BLAS_USE_CBLAS_DOT@": "1",
    },
@ -359,6 +354,9 @@ cc_library(
        ":aten_src_ATen_config",
    ] + generated_cpu_cpp + aten_ufunc_generated_cpu_sources("aten/src/ATen/{}"),
    copts = ATEN_COPTS,
+    linkopts = [
+      "-ldl",
+    ],
    data = if_cuda(
        [":libcaffe2_nvrtc.so"],
        [],
@ -456,65 +454,15 @@ CAFFE2_COPTS = COMMON_COPTS + [
 filegroup(
    name = "caffe2_core_srcs",
    srcs = [
-        "caffe2/core/allocator.cc",
-        "caffe2/core/blob_serialization.cc",
-        "caffe2/core/blob_stats.cc",
        "caffe2/core/common.cc",
-        "caffe2/core/context.cc",
-        "caffe2/core/context_base.cc",
-        "caffe2/core/db.cc",
-        "caffe2/core/event.cc",
-        "caffe2/core/export_c10_op_to_caffe2.cc",
-        "caffe2/core/graph.cc",
-        "caffe2/core/init.cc",
-        "caffe2/core/init_denormals.cc",
-        "caffe2/core/init_intrinsics_check.cc",
-        "caffe2/core/init_omp.cc",
-        "caffe2/core/int8_serialization.cc",
-        "caffe2/core/memonger.cc",
-        "caffe2/core/module.cc",
-        "caffe2/core/net.cc",
-        "caffe2/core/net_async_base.cc",
-        "caffe2/core/net_async_scheduling.cc",
-        "caffe2/core/net_async_task.cc",
-        "caffe2/core/net_async_task_future.cc",
-        "caffe2/core/net_async_task_graph.cc",
-        "caffe2/core/net_async_tracing.cc",
-        "caffe2/core/net_dag_utils.cc",
-        "caffe2/core/net_parallel.cc",
-        "caffe2/core/net_simple.cc",
-        "caffe2/core/net_simple_refcount.cc",
-        "caffe2/core/nomnigraph/Representations/NeuralNet.cc",
-        "caffe2/core/nomnigraph/tests/test_util.cc",
-        "caffe2/core/numa.cc",
-        "caffe2/core/operator.cc",
-        "caffe2/core/operator_schema.cc",
-        "caffe2/core/plan_executor.cc",
-        "caffe2/core/prof_dag_counters.cc",
-        "caffe2/core/qtensor.cc",
-        "caffe2/core/qtensor_serialization.cc",
-        "caffe2/core/stats.cc",
-        "caffe2/core/tensor.cc",
-        "caffe2/core/tensor_int8.cc",
-        "caffe2/core/test_utils.cc",
-        "caffe2/core/transform.cc",
-        "caffe2/core/types.cc",
-        "caffe2/core/workspace.cc",
    ],
 )

 filegroup(
    name = "caffe2_perfkernels_srcs",
    srcs = [
-        "caffe2/perfkernels/adagrad.cc",
        "caffe2/perfkernels/embedding_lookup.cc",
        "caffe2/perfkernels/embedding_lookup_idx.cc",
-        "caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.cc",
-        "caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup_idx.cc",
-        "caffe2/perfkernels/fused_nbit_rowwise_conversion.cc",
-        "caffe2/perfkernels/lstm_unit_cpu_common.cc",
-        "caffe2/perfkernels/math_cpu_base.cc",
-        "caffe2/perfkernels/typed_axpy.cc",
    ],
 )

@ -532,19 +480,7 @@ filegroup(
 filegroup(
    name = "caffe2_utils_srcs",
    srcs = [
-        "caffe2/utils/bench_utils.cc",
-        "caffe2/utils/cpuid.cc",
-        "caffe2/utils/math/broadcast.cc",
-        "caffe2/utils/math/elementwise.cc",
-        "caffe2/utils/math/reduce.cc",
-        "caffe2/utils/math/transpose.cc",
-        "caffe2/utils/math/utils.cc",
-        "caffe2/utils/math_cpu.cc",
-        "caffe2/utils/murmur_hash3.cc",
-        "caffe2/utils/proto_utils.cc",
        "caffe2/utils/proto_wrap.cc",
-        "caffe2/utils/signal_handler.cc",
-        "caffe2/utils/smart_tensor_printer.cc",
        "caffe2/utils/string_utils.cc",
        "caffe2/utils/threadpool/ThreadPool.cc",
        "caffe2/utils/threadpool/pthreadpool.cc",
@ -562,12 +498,9 @@ cc_library(
    name = "caffe2_for_aten_headers",
    hdrs = [
        "caffe2/core/common.h",
-        "caffe2/core/logging.h",
-        "caffe2/core/types.h",
        "caffe2/perfkernels/common.h",
        "caffe2/perfkernels/embedding_lookup.h",
        "caffe2/perfkernels/embedding_lookup_idx.h",
-        "caffe2/utils/cpuid.h",
        "caffe2/utils/fixed_divisor.h",
    ] + glob([
        "caffe2/utils/threadpool/*.h",
@ -577,7 +510,6 @@ cc_library(
    deps = [
        ":caffe2_core_macros",
        "//c10",
-        "//caffe2/proto:caffe2_pb",
    ],
 )

@ -585,18 +517,9 @@ cc_library(
    name = "caffe2_headers",
    hdrs = glob(
        [
-            "caffe2/core/*.h",
-            "caffe2/core/nomnigraph/include/nomnigraph/Converters/*.h",
-            "caffe2/core/nomnigraph/include/nomnigraph/Generated/*.h",
-            "caffe2/core/nomnigraph/include/nomnigraph/Graph/*.h",
-            "caffe2/core/nomnigraph/include/nomnigraph/Representations/*.h",
-            "caffe2/core/nomnigraph/include/nomnigraph/Support/*.h",
-            "caffe2/core/nomnigraph/include/nomnigraph/Transformations/*.h",
-            "caffe2/core/nomnigraph/tests/*.h",
            "caffe2/perfkernels/*.h",
            "caffe2/serialize/*.h",
            "caffe2/utils/*.h",
-            "caffe2/utils/math/*.h",
            "caffe2/utils/threadpool/*.h",
            "modules/**/*.h",
        ],
@ -605,18 +528,12 @@ cc_library(
        ],
    ) + if_cuda(glob([
        "caffe2/**/*.cuh",
-        "caffe2/image/*.h",
    ])),
    copts = CAFFE2_COPTS,
-    includes = [
-        "caffe2/core/nomnigraph/include",
-    ],
    visibility = ["//visibility:public"],
    deps = [
        ":caffe2_core_macros",
        ":caffe2_for_aten_headers",
-        "//caffe2/proto:caffe2_pb",
-        "//caffe2/proto:cc_proto",
    ],
 )

@ -637,8 +554,6 @@ cc_library(
        ":caffe2_perfkernels_avx",
        ":caffe2_perfkernels_avx2",
        ":caffe2_perfkernels_avx512",
-        "//caffe2/proto:caffe2_pb",
-        "//caffe2/proto:cc_proto",
        "//third_party/miniz-2.1.0:miniz",
        "@com_google_protobuf//:protobuf",
        "@eigen",
@ -663,6 +578,7 @@ cu_library(
    name = "torch_cuda",
    srcs = [
        "torch/csrc/distributed/c10d/intra_node_comm.cu",
+        "torch/csrc/distributed/c10d/Utils.cu",
        "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
    ],
    copts = torch_cuda_half_options,
@ -771,7 +687,7 @@ cc_library(
        [
            "torch/*.h",
            "torch/csrc/**/*.h",
-            "torch/csrc/distributed/c10d/*.hpp",
+            "torch/csrc/distributed/c10d/**/*.hpp",
            "torch/lib/libshm/*.h",
        ],
        exclude = [
@ -830,10 +746,14 @@ cc_library(
            "torch/csrc/cuda/python_nccl.cpp",
            "torch/csrc/cuda/nccl.cpp",
            "torch/csrc/distributed/c10d/intra_node_comm.cu",
+            "torch/csrc/distributed/c10d/Utils.cu",
            "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
        ],
    )) + torch_sources,
    copts = TORCH_COPTS,
+    linkopts = [
+      "-lrt",
+    ],
    defines = [
        "CAFFE2_NIGHTLY_VERSION=20200115",
    ],
@ -841,8 +761,8 @@ cc_library(
    deps = [
        ":caffe2",
        ":torch_headers",
-        "//caffe2/proto:torch_cc_proto",
        "@kineto",
+        "@cpp-httplib",
    ] + if_cuda([
        "@cuda//:nvToolsExt",
        "@cutlass",
@ -854,6 +774,9 @@ cc_library(
 cc_library(
    name = "shm",
    srcs = glob(["torch/lib/libshm/*.cpp"]),
+    linkopts = [
+      "-lrt",
+    ],
    deps = [
        ":torch",
    ],
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -667,7 +667,6 @@ only interested in a specific component.
 - Working on a test binary? Run `(cd build && ninja bin/test_binary_name)` to
  rebuild only that test binary (without rerunning cmake). (Replace `ninja` with
  `make` if you don't have ninja installed).
- Don't need Caffe2?  Pass `BUILD_CAFFE2=0` to disable Caffe2 build.

 On the initial build, you can also speed things up with the environment
 variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTENTION`, `USE_MEM_EFF_ATTENTION`, `BUILD_TEST`, `USE_FBGEMM`, `USE_NNPACK` and `USE_QNNPACK`.
@ -790,7 +789,7 @@ USE_PRECOMPILED_HEADERS=1 python setup.py develop
 ```

 This adds a build step where the compiler takes `<ATen/ATen.h>` and essentially
-dumps it's internal AST to a file so the compiler can avoid repeating itself for
+dumps its internal AST to a file so the compiler can avoid repeating itself for
 every `.cpp` file.

 One caveat is that when enabled, this header gets included in every file by default.
@ -1196,7 +1195,7 @@ build_with_asan()
  LDFLAGS="-stdlib=libstdc++" \
  CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \
  CXX_FLAGS="-pthread" \
-  USE_CUDA=0 USE_OPENMP=0 BUILD_CAFFE2_OPS=0 USE_DISTRIBUTED=0 DEBUG=1 \
+  USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \
  python setup.py develop
 }

@ -1321,7 +1320,7 @@ There are two possible choices for which commit to use:
 1. Checkout commit `B`, the head of the PR (manually committed by the PR
   author).
 2. Checkout commit `C`, the hypothetical result of what would happen if the PR
-   were merged into it's destination (usually `main`).
+   were merged into its destination (usually `main`).

 For all practical purposes, most people can think of the commit being used as
 commit `B` (choice **1**).
--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-![PyTorch Logo](https://github.com/pytorch/pytorch/blob/main/docs/source/_static/img/pytorch-logo-dark.png)
+![PyTorch Logo](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/pytorch-logo-dark.png)

 --------------------------------------------------------------------------------

@ -24,6 +24,9 @@ Our trunk health (Continuous Integration signals) can be found at [hud.pytorch.o
    - [NVIDIA Jetson Platforms](#nvidia-jetson-platforms)
  - [From Source](#from-source)
    - [Prerequisites](#prerequisites)
+      - [NVIDIA CUDA Support](#nvidia-cuda-support)
+      - [AMD ROCm Support](#amd-rocm-support)
+      - [Intel GPU Support](#intel-gpu-support)
    - [Install Dependencies](#install-dependencies)
    - [Get the PyTorch Source](#get-the-pytorch-source)
    - [Install PyTorch](#install-pytorch)
@ -95,7 +98,7 @@ from several research papers on this topic, as well as current and past work suc
 While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
 You get the best of speed and flexibility for your crazy research.

-![Dynamic graph](https://github.com/pytorch/pytorch/blob/main/docs/source/_static/img/dynamic_graph.gif)
+![Dynamic graph](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/dynamic_graph.gif)

 ### Python First

@ -162,6 +165,7 @@ If you are installing from source, you will need:

 We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.

+##### NVIDIA CUDA Support
 If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following:
 - [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
 - [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above
@ -174,6 +178,7 @@ Other potentially useful environment variables may be found in `setup.py`.

 If you are building for NVIDIA's Jetson platforms (Jetson Nano, TX1, TX2, AGX Xavier), Instructions to install PyTorch for Jetson Nano are [available here](https://devtalk.nvidia.com/default/topic/1049071/jetson-nano/pytorch-for-jetson-nano/)

+##### AMD ROCm Support
 If you want to compile with ROCm support, install
 - [AMD ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) 4.0 and above installation
 - ROCm is currently supported only for Linux systems.
@ -181,6 +186,14 @@ If you want to compile with ROCm support, install
 If you want to disable ROCm support, export the environment variable `USE_ROCM=0`.
 Other potentially useful environment variables may be found in `setup.py`.

+##### Intel GPU Support
+If you want to compile with Intel GPU support, follow these
+- [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) instructions.
+- Intel GPU is supported for Linux and Windows.
+
+If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
+Other potentially useful environment variables may be found in `setup.py`.
+
 #### Install Dependencies

 **Common**
@ -196,10 +209,11 @@ pip install -r requirements.txt
 ```bash
 conda install intel::mkl-static intel::mkl-include
 # CUDA only: Add LAPACK support for the GPU if needed
-conda install -c pytorch magma-cuda110  # or the magma-cuda* that matches your CUDA version from https://anaconda.org/pytorch/repo
+conda install -c pytorch magma-cuda121  # or the magma-cuda* that matches your CUDA version from https://anaconda.org/pytorch/repo

 # (optional) If using torch.compile with inductor/triton, install the matching version of triton
 # Run from the pytorch directory after cloning
+# For Intel GPU support, please explicitly `export USE_XPU=1` before running command.
 make triton
 ```

@ -379,7 +393,7 @@ You can also pass the `CMAKE_VARS="..."` environment variable to specify additio
 See [setup.py](./setup.py) for the list of available variables.

 ```bash
-CMAKE_VARS="BUILD_CAFFE2=ON BUILD_CAFFE2_OPS=ON" make -f docker.Makefile
+make -f docker.Makefile
 ```

 ### Building the Documentation
--- a/RELEASE.md
+++ b/RELEASE.md
@ -37,6 +37,7 @@
    - [TL;DR](#tldr)
  - [Accelerator Software](#accelerator-software)
    - [Special support cases](#special-support-cases)
+  - [Operating Systems](#operating-systems)
 - [Submitting Tutorials](#submitting-tutorials)
 - [Special Topics](#special-topics)
  - [Updating submodules for a release](#updating-submodules-for-a-release)
@ -48,14 +49,14 @@

 Following is the Release Compatibility Matrix for PyTorch releases:

-| PyTorch version | Python | Stable CUDA | Experimental CUDA |
-| --- | --- | --- | --- |
-| 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 |
-| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 |
-| 2.1 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 |
-| 2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 |
-| 1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 |
-| 1.12 | >=3.7, <=3.10 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 |
+| PyTorch version | Python | Stable CUDA | Experimental CUDA | Stable ROCm |
+| --- | --- | --- | --- | --- |
+| 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 6.0 |
+| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.7 |
+| 2.1 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.6 |
+| 2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 | ROCm 5.4 |
+| 1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 |
+| 1.12 | >=3.7, <=3.10 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 |

 ## Release Cadence

@ -262,7 +263,7 @@ requires `pytorchbot`, so it's only available in PyTorch atm.

 ### Cherry Picking Reverts

-If PR that has been cherry-picked into release branch has been reverted, it's cherry-pick must be reverted as well.
+If PR that has been cherry-picked into release branch has been reverted, its cherry-pick must be reverted as well.

 Reverts for changes that was committed into the main branch prior to the branch cut, must be propagated into release branch as well.

@ -426,6 +427,15 @@ the size restrictions for publishing on PyPI so the default version that is publ
 These special support cases will be handled on a case by case basis and support may be continued if current PyTorch maintainers feel as though there may still be a
 need to support these particular versions of software.

+## Operating Systems
+Supported OS flavors are summarized in the table below:
+| Operating System family | Architectrue | Notes |
+| --- | --- | --- |
+| Linux | aarch64, x86_64 | Wheels are manylinux2014 compatible, i.e. they should be runnable on any Linux system with glibc-2.17 or above. |
+| MacOS | arm64 | Builds should be compatible with MacOS 11 (Big Sur) or newer, but are actively tested against MacOS 14 (Sonoma). |
+| MacOS | x86_64 | Requires MacOS Catalina or above, not supported after 2.2, see https://github.com/pytorch/pytorch/issues/114602 |
+| Windows | x86_64 | Buils are compatible with Windows-10 or newer. |
+
 # Submitting Tutorials

 Tutorials in support of a release feature must be submitted to the [pytorch/tutorials](https://github.com/pytorch/tutorials) repo at least two weeks before the release date to allow for editorial and technical review. There is no cherry-pick process for tutorials. All tutorials will be merged around the release day and published at [pytorch.org/tutorials](https://pytorch.org/tutorials/).
--- a/SECURITY.md
+++ b/SECURITY.md
@ -5,6 +5,7 @@
   - [Untrusted models](#untrusted-models)
   - [Untrusted inputs](#untrusted-inputs)
   - [Data privacy](#data-privacy)
+   - [Using distributed features](#using-distributed-features)

 ## Reporting Security Issues

@ -39,7 +40,7 @@ Important Note: The trustworthiness of a model is not binary. You must always de

 ### Untrusted inputs during training and prediction

-If you plan to open your model to untrusted inputs, be aware that inputs can also be used as vectors by malicious agents. To minimize risks, make sure to give your model only the permisisons strictly required, and keep your libraries updated with the lates security patches.
+If you plan to open your model to untrusted inputs, be aware that inputs can also be used as vectors by malicious agents. To minimize risks, make sure to give your model only the permissions strictly required, and keep your libraries updated with the latest security patches.

 If applicable, prepare your model against bad inputs and prompt injections. Some recommendations:
 - Pre-analysis: check how the model performs by default when exposed to prompt injection (e.g. using fuzzing for prompt injection).
@ -54,3 +55,9 @@ If applicable, prepare your model against bad inputs and prompt injections. Some
 **Take special security measures if your model if you train models with sensitive data**. Prioritize [sandboxing](https://developers.google.com/code-sandboxing) your models and:
 - Do not feed sensitive data to untrusted model (even if runs in a sandboxed environment)
 - If you consider publishing a model that was partially trained with sensitive data, be aware that data can potentially be recovered from the trained weights (especially if model overfits).
+
+### Using distributed features
+
+PyTorch can be used for distributed computing, and as such there is a `torch.distributed` package. PyTorch Distributed features are intended for internal communication only. They are not built for use in untrusted environments or networks.
+
+For performance reasons, none of the PyTorch Distributed primitives (including c10d, RPC, and TCPStore) include any authorization protocol and will send messages unencrypted. They accept connections from anywhere, and execute the workload sent without performing any checks. Therefore, if you run a PyTorch Distributed program on your network, anybody with access to the network can execute arbitrary code with the privileges of the user running PyTorch.
--- a/17
+++ b/17
@ -168,14 +168,10 @@ new_local_repository(
    path = "third_party/opentelemetry-cpp",
 )

-new_patched_local_repository(
-    name = "tbb",
-    build_file = "//third_party:tbb.BUILD",
-    patch_strip = 1,
-    patches = [
-        "@//third_party:tbb.patch",
-    ],
-    path = "third_party/tbb",
+new_local_repository(
+    name = "cpp-httplib",
+    build_file = "//third_party:cpp-httplib.BUILD",
+    path = "third_party/cpp-httplib",
 )

 new_local_repository(
@ -355,9 +351,4 @@ local_repository(
    path = "third_party/onnx/third_party/benchmark",
 )

-local_repository(
-    name = "unused_onnx_tensorrt_benchmark",
-    path = "third_party/onnx-tensorrt/third_party/onnx/third_party/benchmark",
-)
-
 ### Unused repos end
--- a/android/test_app/make_assets.py
+++ b/android/test_app/make_assets.py
@ -1,6 +1,7 @@
-import torch
 from torchvision import models

+import torch
+
 print(torch.version.__version__)

 resnet18 = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
--- a/Show More
+++ b/Show More