Previously, setting `garbage_collection_threshold` or `max_split_size_mb` along with `expandable_segments:True` could cause the allocator to hit assert failures when running nearly out of memory. This PR ensures that garbage collection and max_split freeing do not accidentally try to release expandable segments.
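For context, a minimal sketch of the allocator configuration involved (the threshold and split-size values below are illustrative, not taken from this PR):
```python
import os

# Must be set before the CUDA caching allocator is first initialized.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
    "expandable_segments:True,"
    "garbage_collection_threshold:0.8,"  # illustrative value
    "max_split_size_mb:128"              # illustrative value
)

import torch  # noqa: E402

x = torch.randn(1024, 1024, device="cuda")
```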
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134338
Approved by: https://github.com/ezyang
Fixes #133252
In strict mode, we have a routine for mapping traced parameters to their FQNs using tensor ids. Currently we assume there's at least one unique FQN for each traced parameter, but this breaks with parameter reuse when call_module nodes are present. This PR adds a test case where this breaks.
It fixes this by assigning the same FQN to all traced parameters with the same tensor id. This is fine because we return the original state_dict for the EP, and the unflattener has its own routine for handling aliasing: https://github.com/pytorch/pytorch/pull/125758
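A minimal sketch of the parameter-reuse pattern this describes (the module and names are illustrative, not the added test case):
```python
import torch
import torch.nn as nn

class Shared(nn.Module):
    def __init__(self):
        super().__init__()
        self.a = nn.Linear(4, 4)
        self.b = nn.Linear(4, 4)
        # Parameter reuse: one tensor id now maps to two FQNs
        # ("a.weight" and "b.weight").
        self.b.weight = self.a.weight

    def forward(self, x):
        return self.b(self.a(x))

ep = torch.export.export(Shared(), (torch.randn(2, 4),))
print(ep.graph_signature)
```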
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134500
Approved by: https://github.com/angelayi
In `collective()`, `pointToPoint()` and `collectiveCoalesced()`, CUDA guards were created with an unset (default) CUDA device. This is the cause of the IMA (illegal memory access) hit by the NaN checker in issue https://github.com/pytorch/pytorch/issues/134062.
With this fix, `torch.cuda.set_device(device)` is not needed to work around the IMA.
Also refactored a couple places where the guard is created -- preferably we create the guard with a known device, rather than setting the device later.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134357
Approved by: https://github.com/wconstab, https://github.com/shuqiangzhang
ghstack dependencies: #134345
Fix the `test_logs_out` UT on Windows, and make all UTs in `test/dynamo/test_logging.py` pass on Windows.
Changes:
1. Close the `NamedTemporaryFile` to release its file handle and avoid a `PermissionError`.
2. Create the file with `delete=False` so it is not auto-deleted (see the sketch below).
3. Open the log file as "utf-8" to align with Linux.
4. Handle the process-wrapping difference on Windows.
5. Delete the tmp file manually.
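A minimal sketch of the temp-file pattern described by the list above (illustrative, not the actual test code):
```python
import os
import tempfile

# delete=False keeps Windows from raising PermissionError when the log file is
# reopened while the NamedTemporaryFile handle is still held; we close the
# handle first and clean up manually at the end.
tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False, encoding="utf-8")
tmp.close()
try:
    with open(tmp.name, encoding="utf-8") as f:  # utf-8 to align with Linux
        contents = f.read()
finally:
    os.unlink(tmp.name)  # delete the tmp file manually
```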
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134586
Approved by: https://github.com/jansel
Summary:
With the training IR, we cannot rely on trapping `to()` in `FunctionalTensor` because the regular decomposition kicks in first, and that can cause it to be optimized away.
So instead we preserve it until we functionalize, and then replace it explicitly with `_to_copy()`.
Test Plan: expected test failures go away
Differential Revision: D61883878
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134622
Approved by: https://github.com/zhxchen17, https://github.com/tugsbayasgalan
aten.empty is almost always fusible into its consumer, so we never CSE
it. This fixes a bug that looks like the following:
```py
import torch

@torch.library.custom_op("_reinplacing::sin_cos", mutates_args={"out_sin", "out_cos"})
def sin_cos(x: torch.Tensor, out_sin: torch.Tensor, out_cos: torch.Tensor) -> None:
    out_sin.copy_(x.sin())
    out_cos.copy_(x.cos())

@torch.compile
def f(x):
    out0 = torch.empty_like(x)
    out1 = torch.empty_like(x)
    sin_cos(x, out0, out1)
    return x.clone(), out0, out1

x = torch.randn(3, requires_grad=True)
f(x)
```
- cse would de-duplicate the empty nodes
- reinplacing would add an additional clone (because it can't write to
both tensors at the same time)
- the clone lowers into a new buffer + a copy_ kernel
- the copy_ kernel is unnecessary because "empty" is special: all reinplacing needed was an additional buffer; it doesn't matter what the values are.
We could attempt to fix this on the reinplacing side but this seemed
better as a partitioner heuristic and the reinplacing fix is a bit more
tricky (we'd need to identify that the op never reads from the empty
node).
Test Plan:
- new test (the old number was 27, the new number is 21, so this PR
helped).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134703
Approved by: https://github.com/yf225
ghstack dependencies: #134466, #134490, #134491
Fixes [#134212](https://github.com/pytorch/pytorch/issues/134212)
Currently, when we use 2D FSDP with TP, `optimizer.step()` would fail if the model were not fully tensor parallelized. If we don't have the entire model tensor parallelized when doing 2D, we would have both 1D and 2D DTensor parameters. As foreach is turned on by default, `optimizer.step()` would fail as cross mesh op is not allowed. Error as follows:
```
NotImplementedError: aten._foreach_mul_.Scalar: DTensor does not support cross-mesh operation yet!Got meshes: DeviceMesh('cuda', [[0, 1], [2, 3]], mesh_dim_names=('dp', 'tp')) DeviceMesh('cuda', [1, 3], mesh_dim_names=('dp',))
```
In this PR, we extend implicit_replication to replicate DTensor in missing dimensions for foreach ops. If users don't want to fully tensor parallelize the model when using 2D, they have the option of using the `implicit_replication()` context manager for `optimizer.step()`. In this case, we would swap out the 1D DTensorSpec and replace it with 2D DTensorSpec. However, we don't want to turn this on by default yet, as we want the users to be aware that the tp dimension is replicated if a layer is not tp-ed.
With implicit replication turned on, replicating the DTensor spec in the missing dimensions works for most foreach cases, except when the first DTensor in the list is also one that needs to be replicated. This is currently a limitation for which I don't have a good solution yet. With this change, we can handle most cases except the one where the first DTensor's ndim is not the largest.
```
[2D_DTensor, 1D_DTensor...] ---> Implicit_replication() can handle this.
[1D_DTensor, 2D_DTensor...] ---> Implicit_replication() can't handle this.
```
This change doesn't affect the existing default behavior, as `implicit_replication()` is not turned on by default.
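A minimal sketch of the opt-in usage described above, assuming `optimizer` is the optimizer over the mixed 1D/2D DTensor parameters (the experimental import path may change):
```python
from torch.distributed._tensor.experimental import implicit_replication

# Only needed when the model is not fully tensor-parallelized under 2D FSDP+TP:
# 1D DTensor params are treated as replicated on the missing (tp) mesh dimension.
with implicit_replication():
    optimizer.step()
    optimizer.zero_grad()
```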
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134551
Approved by: https://github.com/tianyu-l
So that the tensor's lifetime management is the same as the management built for the NCCL pre and post kernels.
Also so that on visualizers, they show up in the NCCL stream line. Otherwise, if they show up in the compute line, users may get confused ("my code does not have these kernels").
The check is thus moved to after the point where we make the NCCL stream depend on the last compute kernel.
Also moved declaration of `checkForNan` from Utils.hpp to NCCLUtils.hpp, and renamed Utils.cu to NCCLUtils.cu.
Differential Revision: [D61957573](https://our.internmc.facebook.com/intern/diff/D61957573)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134300
Approved by: https://github.com/shuqiangzhang, https://github.com/wconstab
The previous PR https://github.com/pytorch/pytorch/pull/133532 caused stuck compilation issue on internal models. In this 2nd attempt PR, we gate the trace_rules.py changes with `if not torch._dynamo.config.skip_fsdp_hooks:`, so that they don't take effect for current graph-break FSDP2 (which relies on the default config value `skip_fsdp_hooks=True`), and will only take effect when we are using Traceable FSDP2 (in which case the user needs to proactively set `skip_fsdp_hooks=False`).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134539
Approved by: https://github.com/ckluk2, https://github.com/yanboliang
Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):
There are two function variants to get accumulated dtype for a given dtype:
- Func1: `c10::ScalarType toAccumulateType(c10::ScalarType type, c10::DeviceType device)`
- Func2: `c10::ScalarType toAccumulateType(c10::ScalarType type, bool is_cuda)`
Func1 is general enough to support different devices, while Func2 only supports CUDA and CPU. This PR adds the Intel GPU path to Func1. We expect users to invoke Func1 to ensure compatibility across different devices.
* __->__ #134465
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134465
Approved by: https://github.com/Skylion007, https://github.com/atalman
## Semantic
The semantic is
(1) By default `torch.serialization.skip_data(materialize_fake_tensors=False)` will make `torch.save` skip writing storages (but reserve space for them in the checkpoint).
```python
import torch
import torch.nn as nn
sd = nn.Linear(3, 5).state_dict()
with torch.serialization.skip_data():
    torch.save(sd, 'foo.pt')
print(torch.load('foo.pt', weights_only=True))
```
(2) With `torch.serialization.skip_data(materialize_fake_tensors=True)`, if a FakeTensor is passed to `torch.save`, the pickler will treat these FakeTensors as being "materialized": space will be reserved in the checkpoint for the associated storage bytes, and when loading, the type will be Tensor instead of FakeTensor.
```python
import torch
import torch.nn as nn
from torch._subclasses.fake_tensor import FakeTensorMode
with FakeTensorMode():
    m = nn.Linear(3, 5, dtype=torch.float16, device='cuda')
    sd = m.state_dict()

with torch.serialization.skip_data(materialize_fake_tensors=True):
    torch.save(sd, 'bla.pt')
print(torch.load('bla.pt', weights_only=True))
# OrderedDict([('weight', tensor([[0., 0., 0.],
# [0., 0., 0.],
# [0., 0., 0.],
# [0., 0., 0.],
# [0., 0., 0.]], device='cuda:0', dtype=torch.float16)), ('bias', tensor([0., 0., 0., 0., 0.], device='cuda:0', dtype=torch.float16))])
```
## Follow Ups
- [ ] `torch.load` semantic for skip_data context manager
- [ ] Mechanism for getting offsets of storages saved via this method (for writing in a separate pass)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134504
Approved by: https://github.com/albanD
Summary:
Encountered issues related to AMD build when working on https://www.internalfb.com/diff/D60739324?dst_version_fbid=2203158110057105 (see stack trace P1545717562)
Looking at the file history, it seems that the flag is no longer used, so I propose removing it. Alternatively, I could change the `#ifdef` to check both `USE_C10D_NCCL` and `USE_ROCM` and include the corresponding AMD header files.
Let me know which way is preferred.
Test Plan: Sandcastle
Differential Revision: D61762129
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134404
Approved by: https://github.com/malfet
A user wants to use the flop counter with meta devices. This previously caused problems for SDPA+NJT:
1. autocast check: `torch.is_autocast_enabled("meta")` fails because `meta` is not valid for autocasting. If we skip this, we run into the next error
2. math backend: conversion to NST requires getting concrete offsets in a list of python integers, which doesn't work on a meta tensor b2eb0e8c6a/torch/nested/_internal/sdpa.py (L809-L815)
3. (fixed in the previous PR, #134288) - if we force using flash attention backend for flop counting, `_flash_attention_forward` previously didn't support meta tensors.
In this PR, we check specifically for FlopCounterMode, and, if it's enabled and combined with meta tensors, (a) skip autocasting and (b) force it down the flash attention path. This isn't generally safe for tracing (e.g. if you actually care which kernels you are running), but in the absence of actual device information, we have to make some assumptions. By specifically checking for FlopCounterMode, this should reduce the chance of unintended side effects for other meta tensor users.
Note: fake tensor would solve a bunch of these issues, but it's not a viable solution right now for the user.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134289
Approved by: https://github.com/soulitzer
ghstack dependencies: #134288
Fixes #130394
TorchInductor doesn't respect the original strides of outputs, which opens up optimization opportunities such as changing the memory layout. But in some cases, such as the one in https://github.com/pytorch/pytorch/issues/130394, we do need the output to match the exact strides required. Correctness is the first-priority goal. So, this PR adds a new API, `ir.ExternKernel.require_exact_strides(x, exact_strides, allow_padding=False)`, to fix the issue. This PR makes the strides of both dense and non-dense outputs follow the strides required by the semantics.
The comparison between the original code and the code after this fix for the test is shown below.
```python
@triton.jit
def triton_(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 8
    x1 = (xindex // 8)
-   x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (16*x1)), xmask)
    tmp1 = tmp0 + tmp0
-   tl.store(out_ptr0 + (x2), tmp1, xmask)
+   tl.store(out_ptr0 + (x0 + (16*x1)), tmp1, xmask)

def call(args):
    arg0_1, = args
    args.clear()
    assert_size_stride(arg0_1, (16, 8), (16, 1))
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
-       buf1 = empty_strided_cuda((16, 8), (8, 1), torch.float32)
+       buf1 = empty_strided_cuda((16, 8), (16, 1), torch.float32)
        stream0 = get_raw_stream(0)
        triton_poi_fused_add_copy_0.run(arg0_1, buf1, 128, grid=grid(128), stream=stream0)
        del arg0_1
    return (buf1, )
```
The buf1 is created with the exact strides required by users, and its values are written with the same strides as the input.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130956
Approved by: https://github.com/eellison, https://github.com/blaine-rister, https://github.com/desertfire
```
compile time instruction count for iteration 1 is 10732129038
compile time instruction count for iteration 2 is 10719776783
compile time instruction count for iteration 3 is 10729546868
compile time instruction count for iteration 4 is 10737655132
compile time instruction count for iteration 5 is 10732564252
compile time instruction count for iteration 6 is 10728721234
compile time instruction count for iteration 7 is 10733354271
compile time instruction count for iteration 8 is 10719588972
compile time instruction count for iteration 9 is 10706311856
```
1. Add torch.manual_seed(0); inputs were not the same across iterations.
2. Disable gc.
3. Remove the loop (not needed since compilation happens only once). A sketch of the resulting setup is below.
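A minimal sketch of the measurement setup implied by the list above (the benchmarked function is illustrative):
```python
import gc

import torch

torch.manual_seed(0)  # (1) identical inputs across iterations
gc.disable()          # (2) keep GC pauses out of the instruction counts

@torch.compile
def fn(x):
    return x.sin() + x.cos()

# (3) no repeat loop: compilation happens only once per benchmark run
fn(torch.randn(1024))
```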
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134649
Approved by: https://github.com/aorenste
ghstack dependencies: #133834, #134635
Summary: The default c_shim version was switched to 2 for HIP in D60674018. This results in some linking errors where shim function symbols are missing from the compiled .so file (eg. P1551186492) when building lowering benchmark scripts since the required files aren't included. Hipify the shim v2 generated header files as well since they're needed during codegen when the buck binaries are executed.
Reviewed By: frank-wei, zoranzhao, henryoier
Differential Revision: D61865202
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134689
Approved by: https://github.com/zoranzhao
Summary:
This is to fix the PyTorch issue filed at https://github.com/pytorch/pytorch/issues/133010
One way to fix this problem is to enable parallel process start in mp.start_processes.
What else is in the diff:
Refactored the api_test test case, which was repeating a lot of tests due to inheritance.
Added a unit test for forkserver when parallel start is on.
Test Plan: Added unit tests
Differential Revision: D61878552
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134629
Approved by: https://github.com/d4l3k
Hi,
I noticed the `unfold` operator was missing on MaskedTensor.
I tested that my change works when calling unfold and backward on a `MaskedTensor`, but I didn't find the tests for the dispatch of such operations. Where are they?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125262
Approved by: https://github.com/cpuhrsch
- This PR generates a more useful output log for users: P1552399180.
- It also fixes the logic when we check the all-gather size mismatch.
- Add dtype check for collective input/output
- We store more context information for the error match_state so that we can report it in the file.
- Disable the size match for alltoall because we don't log the sizes for all inputs/outputs.
- Correct some types in the func args specification.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134528
Approved by: https://github.com/c-p-i-o
This PR adds a basic Runtime Estimator for single-device models.
It estimates the GPU runtime in milliseconds using various estimation methods under the ``FakeTensorMode``.
It provides a ``TorchDispatchMode`` based context manager that can estimate the eager runtime of PyTorch functions. It supports two estimation modes, benchmarking (`operator-level-benchmark`) and roofline cost modeling (`operator-level-cost-model`).
For modules executed under this context manager, it aggregates the forward and backward operation runtimes and records their execution order.
```
import torch
from torch import nn, optim
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.distributed._tools.runtime_estimator import RuntimeEstimator
from torch.testing._internal.distributed._tensor.common_dtensor import (
    ModelArgs,
    Transformer,
)

if __name__ == "__main__":
    def _train_step(
        model: nn.Module,
        optimizer: optim.Optimizer,
        inp: torch.Tensor,
    ):
        out = model(inp)
        loss = out.sum()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    dev = torch.cuda.current_device()
    vocab_size = 8192
    bsz, seq_len = 32, 1024
    model_args = ModelArgs(
        n_layers=4,
        n_heads=12,
        vocab_size=vocab_size,
        max_seq_len=seq_len,
        dim=768,
        dropout_p=0.1,
    )
    runtime_estimator = RuntimeEstimator()

    with FakeTensorMode():
        with torch.device(dev):
            model = Transformer(model_args)
        optimizer = optim.Adam(model.parameters(), lr=1e-2, foreach=True)
        inp = torch.randint(0, model_args.vocab_size, (bsz, model_args.max_seq_len), device=dev)
        with runtime_estimator("operator-level-benchmark"):
            _train_step(model, optimizer, inp)
        with runtime_estimator("operator-level-cost-model"):
            _train_step(model, optimizer, inp)

    # Actual model runtime
    with torch.device(dev):
        model = Transformer(model_args)
    optimizer = optim.Adam(model.parameters(), lr=1e-2, foreach=True)
    inp = torch.randint(0, model_args.vocab_size, (bsz, model_args.max_seq_len), device=dev)
    warmup_iters, actual_iters = 2, 5
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    for _ in range(warmup_iters):
        _train_step(model, optimizer, inp)
    start_event.record()
    for _ in range(actual_iters):
        _train_step(model, optimizer, inp)
    end_event.record()
    torch.cuda.synchronize()
    measured_time = start_event.elapsed_time(end_event) / actual_iters
    print(f"Actual total_time: {measured_time:.3f} ms")
```
<img width="506" alt="Screenshot 2024-08-26 at 11 27 15 PM" src="https://github.com/user-attachments/assets/04d243c9-21a6-4389-8c20-80958980788c">
@weifengpy @xuanzhang816 @gnadathur
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134243
Approved by: https://github.com/weifengpy
**Summary**
reland of https://github.com/pytorch/pytorch/pull/134294. Fixes #131446, fixes #126852, fixes #126868, fixes #126493.
The PR was reverted due to a CI red signal in https://github.com/pytorch/pytorch/actions/runs/10537099590/job/29201744658. It seems that the `gaussian_nll_loss` test had been flaky before my original PR #134294. Therefore this PR also removes the `xfail` mark on this specific test to make the CI signal green.
See the error message below:
```
2024-08-24T13:42:01.3228990Z ==================================== RERUNS ====================================
2024-08-24T13:42:01.3229530Z _ TestDTensorOpsCPU.test_dtensor_op_db_nn_functional_gaussian_nll_loss_cpu_float32 _
2024-08-24T13:42:01.3229710Z Unexpected success
2024-08-24T13:42:01.3230235Z _ TestDTensorOpsCPU.test_dtensor_op_db_nn_functional_gaussian_nll_loss_cpu_float32 _
2024-08-24T13:42:01.3230407Z Unexpected success
2024-08-24T13:42:01.3230594Z =================================== FAILURES ===================================
2024-08-24T13:42:01.3231128Z _ TestDTensorOpsCPU.test_dtensor_op_db_nn_functional_gaussian_nll_loss_cpu_float32 _
2024-08-24T13:42:01.3231296Z Unexpected success
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134509
Approved by: https://github.com/tianyu-l, https://github.com/wz337
The original DCP doesn't flatten all the containers, which can cause issues; https://github.com/pytorch/pytorch/pull/125335 intends to solve the issue by flattening all the dictionaries.
Unfortunately, it breaks checkpoints that were saved before 2.4. This
also exposes some issues with DCP:
1. DCP should record version in the metadata.
2. DCP should have a nice way to load old state_dict.
3. DCP should unflatten all containers (map, list) not just map.
This PR only addresses issue 2 to unblock users. Issue 1 and issue 3 need to be addressed in the future.
@pradeepfn Please let me know if this summary matches our discussion.
Fixes https://github.com/pytorch/pytorch/issues/133923
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134158
Approved by: https://github.com/wz337, https://github.com/pradeepfn
Summary: benchmarks/dynamo/ci_expected_accuracy/update_expected.py expects a benchmark run config to be named {config}_{benchmark}, and CPU tests should follow the same naming convention.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134639
Approved by: https://github.com/huydhn
Summary: Recently https://github.com/pytorch/pytorch/pull/133620 added support for automatic dynamic shapes, where a new enum, `DIM`, was introduced to provide hints like `AUTO` and `STATIC`. This PR is a nominal change where we expose the hints via the existing public `Dim` API, and remove `DIM` from the public API. The main motivation is to avoid having users need to import too many things.
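A hedged sketch of what the hints look like through `Dim` after this change (the model and shapes are illustrative):
```python
import torch
from torch.export import Dim, export

class M(torch.nn.Module):
    def forward(self, x):
        return x * 2

# Hints are reachable from Dim itself instead of a separate DIM enum.
ep = export(
    M(),
    (torch.randn(4, 8),),
    dynamic_shapes={"x": (Dim.AUTO, Dim.STATIC)},
)
print(ep)
```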
Test Plan: existing
Differential Revision: D61807361
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134484
Approved by: https://github.com/angelayi
Fixes #134391, #124714
The above issues reported that `dist.barrier()` could hang in some cases.
The culprit is that ProcessGroupNCCL inferred a wrong device to perform the dummy all-reduce.
After the PR, the following will be the order of device selection:
- 1st choice: `opts.device_ids`, if provided by user via `barrier(opts)`.
- 2nd choice: bound device id, if provided to `init_process_group` via `device_id` arg.
- 3rd choice: `usedDeviceIdxs_` recorded in current PG. Will have a value from previous collectives.
- 4th choice: `globalRank() % localDeviceCount_`. This can only happen when `dist.barrier()` is the first call of the PG.
What's new:
- Added the 2nd choice.
- In the 4th choice, we use `globalRank()` instead of group-local rank, because the group-local rank can be offset wrt the device id if intra-node GPUs are sharded into multiple dimensions.
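A hedged sketch of the first two choices in the list above (rank handling is illustrative):
```python
import os

import torch
import torch.distributed as dist

local_rank = int(os.environ["LOCAL_RANK"])

# 2nd choice: bind a device id to the process group at init time.
dist.init_process_group("nccl", device_id=torch.device(f"cuda:{local_rank}"))

# 1st choice: tell barrier explicitly which device to use.
dist.barrier(device_ids=[local_rank])

dist.destroy_process_group()
```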
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134617
Approved by: https://github.com/yifuwang, https://github.com/shuqiangzhang
This adds logs if we can't acquire locks in NCCLUtils and ProcessGroupNCCL for 30s.
This is motivated by some deadlocks we're seeing, and it's unclear whether they are in NCCL or on the PyTorch side of things.
This required replacing most `std::mutex` with `std::timed_mutex` and `std::condition_variable_any` as appropriate.
Test plan:
existing CI for regressions
will add unit tests on `C10D_LOCK_GUARD`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134131
Approved by: https://github.com/c-p-i-o, https://github.com/fduwjj
Summary:
There's 2 concepts of unsupported sympy.Functions in symbolic_shapes:
1) unsupported by the export solver, meaning the solver doesn't know how to provide useful fixes for those functions
2) unsupported by the sympy interpreter - meaning we can't reify them into FX nodes because the functions aren't present in PythonReferenceAnalysis
This splits the current call into a call for each version, with the Export solver the only user of 1). For 1), we enumerate the functions in _sympy/functions.py, and subtract the functions we know we can support. For 2) there's only 3 functions we've seen pop up in test cases.
cc jgong5 mingfeima XiaobingSuper sanchitintel ashokei jingxu10
Differential Revision: D61863394
Pulled By: pianpwk
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134598
Approved by: https://github.com/angelayi
PYTHONPATH=$(pwd) python benchmarks/update_hint_benchmark.py out
As of this diff, compile_time_instruction_count counts the number of instructions from within
`convert_frame.compile_inner`.
```
update_hint_regression,compile_time_instruction_count,10522459165
```
will add result from CI once populated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133834
Approved by: https://github.com/aorenste
Summary: apparently `DIM.AUTO` leads to duck sizing, which I didn't catch. This does the least intrusive fix possible by using `torch._dynamo.maybe_mark_dynamic()` under the hood.
Test Plan: added test
Differential Revision: D61809344
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134486
Approved by: https://github.com/avikchaudhuri
See #121528 for additional context.
In #120682, we moved the attention kernels from meta_registrations to fake_impls with the intent of fixing the device handling for seed/offset: these are typically on CPU. We needed to put the registrations in fake_impls to do this because meta_registrations doesn't have a way to specify device, whereas fake_impls does. But when we tried to actually fix the device types (#120839), we had to revert the PR because it broke cudagraph handling (during which seed/offset _are_ on CUDA).
Now, we want to put the registrations back in meta_registrations so that we can call these kernels with meta tensors. The use case is later in this stack - we want to be able to use the flop counter with these kernels.
Also - I specifically skip the `compare_tensor_meta()` check in test_fake / test_fake_autocast tests for the `_efficient_attention_forward` and `_flash_attention_forward` kernels, which fails because of the device mismatch from the seed/offset tensors. Then we can un-skip these opinfos. I verified that the efficient_attention_forward bug (#120842) is now caught by these opinfos if I revert the fix from this PR.
Differential Revision: [D61687369](https://our.internmc.facebook.com/intern/diff/D61687369)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134288
Approved by: https://github.com/drisspg
Maintainers have the links to their GitHub profiles, but the major contributors do not have them.
I added the links to the contributors' GitHub accounts in case anyone wants to follow them.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133787
Approved by: https://github.com/albanD
Seeing failures like this:
```
#49 844.6 //build_scripts/manylinux1-check.py:6: DeprecationWarning: The distutils package is deprecated and slated for removal in Python 3.12. Use setuptools or check PEP 632 for potential alternatives
.....
[python 3/3] RUN bash build_scripts/build.sh && rm -r build_scripts:
846.9 ...it did, yay.
846.9 + for PYTHON in '/opt/python/*/bin/python'
846.9 + /opt/python/cpython-3.12.0/bin/python build_scripts/manylinux1-check.py
847.0 Traceback (most recent call last):
847.0 File "//build_scripts/manylinux1-check.py", line 55, in <module>
847.0 if is_manylinux1_compatible():
847.0 ^^^^^^^^^^^^^^^^^^^^^^^^^^
847.0 File "//build_scripts/manylinux1-check.py", line 6, in is_manylinux1_compatible
847.0 from distutils.util import get_platform
847.0 ModuleNotFoundError: No module named 'distutils'
------
```
PR: https://github.com/pytorch/pytorch/pull/134455
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134595
Approved by: https://github.com/kit1980, https://github.com/seemethere, https://github.com/malfet
**Summary**
This PR is a follow-up of #126924 to address reviewer's comments:
1) add a test case to show the use of `local_map` as a function decorator.
2) simplify the logic of handling different data types of `out_placements`.
3) correct variable naming in test cases to match math formulas.
**Test**
see #126924
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127752
Approved by: https://github.com/wanchaol
This fixes BatchNorm behavior when called with empty tensors on the MPS backend. Removed `expectedFailureMPS` in test_nn.py, deleted the expected failure in `test_mps.py`, and adjusted `skipIfMPS` to `expectedFailureMPS` in the BatchNorm2d OpInfo decorator, restricting it to the memory format tests only.
Test Plan: CI + `python3 -c "import torch; print(torch.nn.BatchNorm2d(3, device='mps')(torch.rand(0, 3, 2, 2, device='mps')))"`
Fixes https://github.com/pytorch/pytorch/issues/134423
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134540
Approved by: https://github.com/Skylion007, https://github.com/albanD
## Context
In some user Triton kernels, we have this set-up for whatever reason.
```
@triton.jit
def mykernel(
    param0,
    param1,
    param2,
    param3: tl.constexpr,  # autotuned
    param4,  # non-constexpr
):
    ...
```
This is an edge case because it's general practice to declare all constexpr params at the end.
And this will be an issue for AOTI because it fails to codegen all 4 params. That will surface as a device-side error: CUDA IMA, invalid argument, ...
```
> void* kernel_args_var_0[] = {&var_0, &var_1, &var_2};
---
< CUdeviceptr var_3;
< AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_data_ptr(buf0, reinterpret_cast<void**>(&var_3)));
< void* kernel_args_var_0[] = {&var_0, &var_1, &var_2, &var_3};
```
## Root-cause
* `kernel.constexpr` from the Kernel side-table contains the indices for all `constexpr` params that includes autotuned params.
* `raw_args`, that gets passed to wrapper codegen, excludes autotuned args.
* In the wrapper codegen, we try to find non-constexpr args using `kernel.constexpr` & `raw_args`. This is okay unless there's a `raw_arg` after an autotuned param in the function signature.
79b7fff188/torch/_inductor/codegen/cpp_wrapper_cuda.py (L118-L126)
## Fix
We fix this by calculating the right constexpr indices with respect to `raw_args`.
An illustration
```
raw_args: [arg0, arg1, arg2, arg4]
kernel.arg_names: [param0, param1, param2, param3, param4]
kernel.constexprs: [3] # param3 is autotuned; this is correct wrt kernel.arg_names
constexpr_indices: [] # this is correct wrt raw_args
```
Differential Revision: [D61831625](https://our.internmc.facebook.com/intern/diff/D61831625)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134520
Approved by: https://github.com/oulgen
This is designed to be a more ergonomic interface on top of justknob_feature (see https://github.com/pytorch/pytorch/pull/134151 for just the PR with the base commits).
The idea is that people stop having to think about this as much, and can just do JustKnobsConfig("//the:thing", "FORCE_THING") and it'll do the right thing.
Primarily sending this to see how people feel about the API, and using it for new config changes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134161
Approved by: https://github.com/ezyang
Clarify that `add_safe_globals` will allow types for these instructions
Some types do not appear as `GLOBAL` and are only caught in `BUILD`; an example from the HF Slack is `numpy.dtypes.UInt32DType`.
```python
import torch
import numpy as np
from tempfile import TemporaryDirectory
from pathlib import Path
from codecs import encode
torch.serialization.add_safe_globals([encode, np.dtype, np.core.multiarray._reconstruct, np.ndarray])
with TemporaryDirectory() as tempdir:
    p = Path(tempdir)
    r2 = np.random.get_state()
    torch.save(r2, p / "r2.pkl")
    torch.load(p / "r2.pkl", weights_only=True)
```
Yields (error comes from BUILD)
```
UnpicklingError: Weights only load failed. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
Please file an issue with the following so that we can make `weights_only=True` compatible with your use case: WeightsUnpickler error: Can only build Tensor, parameter or OrderedDict objects, but got <class 'numpy.dtypes.UInt32DType'>
```
The reasoning is that `numpy.dtypes.UInt32DType` is constructed via `REDUCE` with `func=<class 'numpy.dtype'>` and `args=('u4', False, True)`, so we clarify in the error message that calling `add_safe_globals` on these types will also allow them.
After this PR, the error message becomes
```
_pickle.UnpicklingError: Weights only load failed. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
Please file an issue with the following so that we can make `weights_only=True` compatible with your use case: WeightsUnpickler error: Can only build Tensor, Parameter, OrderedDict or types allowlisted via `add_safe_globals`, but got <class 'numpy.dtypes.UInt32DType'>
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134346
Approved by: https://github.com/albanD
Changes jobs to go back to using the default AMI.
Note: This is only a cleanup PR. It does NOT introduce any behavior changes in CI
Now that the default variant uses the Amazon 2023 AMI and has been shown to be stable for a week, it's time to remove the explicit amz2023 references and go back to using the default variant.
After a week or two, when this is rolled out to most people, we can remove the variants from scale config as well.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134355
Approved by: https://github.com/jeanschmidt
Summary:
Currently the warning is printed when the cat inputs have the same qparams, leading to a flood of warnings.
This diff emits the warning only when the cat inputs don't have the same qparams.
Test Plan: CI
Reviewed By: aprotopopov
Differential Revision: D60638609
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133999
Approved by: https://github.com/tarun292
Fixes #127519
Currently in torchrun rendezvous, only two rendezvous backends are supported out of the box: `C10d` and `Etcd`. The changes in this PR enable distributed elastic users to bring their out-of-tree rendezvous backend implementations as Python packages.
#### AUTHORING NEW PLUGIN
Any new plugin will be a Python package exposing entry points. For example, the structure of the redis plugin is as follows:
```
plugin_root
|_ pyproject.toml
|_ src
|_ redis
|_ __init__.py
|_ redis_store.py
|_ redis_backend.py
```
The contents of the `pyproject.toml` should indicate that the package exposes a torchrun entry point by using the group name `torchrun.plugins`. The `pyproject.toml` for the redis plugin would be as follows:
```
[project]
name = "redis"
version = "0.0.1"
[project.entry-points.'torchrun.plugins']
redis = 'redis'
```
The `src/redis/__init__.py` file would contain functions that return the plugin name and plugin handler. The contents of `__init__.py` for redis would be as follows:
```
def getPluginHandler():
    def _create_redis_handler(params: RendezvousParameters):
        from redis_rendezvous_backend import create_backend
        backend, store = create_backend(params)
        return create_handler(store, backend, params)
    return _create_redis_handler
```
The files `redis_store` and `redis_backend` contain the implementation of [Store](41189b0da4/torch/_C/_distributed_c10d.pyi (L171)) and [RendezvousBackend](e782918b8e/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py (L61)) respectively.
#### USER EXPERIENCE
Before using the plugin for the first time, the user has to install the plugin packages. For example, published packages can be installed using `pip3 install <plugin-name>`, and a plugin in the local file system can be installed using `pip3 install -e <plugin-location>`.
Once installed, the new backend can be used in torchrun as follows:
```
torchrun --rdzv-backend=redis --rdzv-endpoint=redis-container:6379 --nnodes=3 --nproc-per-node=1 --max-restarts=3 --rdzv-id=1 test.py
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132633
Approved by: https://github.com/wconstab
Enable Windows inductor UTs for `test/inductor/test_torchinductor_codegen_dynamic_shapes.py`.
The failure depends on https://github.com/pytorch/pytorch/pull/134429; this needs a rebase after https://github.com/pytorch/pytorch/pull/134429 is merged.
```cmd
2024-08-25T23:57:23.2747794Z Windows CI does not have necessary dependencies for test_torchinductor_dynamic_shapes yet
2024-08-25T23:57:23.2748541Z Traceback (most recent call last):
2024-08-25T23:57:23.2749593Z File "C:\actions-runner\_work\pytorch\pytorch\test\inductor\test_torchinductor_codegen_dynamic_shapes.py", line 30, in <module>
2024-08-25T23:57:23.2750688Z from inductor.test_torchinductor_dynamic_shapes import (
2024-08-25T23:57:23.2751877Z File "C:\actions-runner\_work\pytorch\pytorch\test\inductor\test_torchinductor_dynamic_shapes.py", line 46, in <module>
2024-08-25T23:57:23.2752876Z raise unittest.SkipTest("requires sympy/functorch/filelock")
2024-08-25T23:57:23.2753545Z unittest.case.SkipTest: requires sympy/functorch/filelock
2024-08-25T23:57:23.2754077Z Got exit code 1
2024-08-25T23:57:23.2754874Z No stepcurrent file found. Either pytest didn't get to run (e.g. import error) or file got deleted (contact dev infra)
```
Local test pass:
<img width="1892" alt="image" src="https://github.com/user-attachments/assets/241ab082-6026-4f33-b3ac-7e9ef7da744d">
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134428
Approved by: https://github.com/jansel
Summary:
We want to add compile IDs and frames to each Torch-Compiled Region in order to help users cross-reference the section they are checking with data obtained from tools such as tlparse.
This diff operates on the assumption that each graph section will enter and exit a CompileContext before it is run, either to compile the graph or to look it up in the cache. Based on this assumption, we can save the value of the graph section from the exited CompileContext in eval_frame.c using a Python C API. After this, we can create a new interface in the cpp shim to wrap record_function in order to pass in the new keyword argument for "context".
Test Plan:
Enhance test_profiler_dynamo_compiled_region to look for kwinputs as well as a name to see that the context is now labeled. Also changed test to run graph with more contexts so that we test a wider range of profiling.
Differential Revision: D60803317
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132765
Approved by: https://github.com/anijain2305
This PR increases test coverage by including the tests in `test/test_nn.py` in the test suite of MPS.
Some of the tests are decorated with `@expectedFailureMPS` for various reasons. Either that the op is not implemented, or that the outputs do not align. Those tests that contain differing results should be investigated further to rule out any live bugs.
```bash
$ python test/run_test.py --mps --verbose -k TestNN
Running test batch 'tests to run' cost 84.76 seconds
```
Ref #133520
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134184
Approved by: https://github.com/albanD, https://github.com/malfet
There's 2 concepts of unsupported sympy.Functions in symbolic_shapes:
1) unsupported by the export solver, meaning the solver doesn't know how to provide useful fixes for those functions
2) unsupported by the sympy interpreter - meaning we can't reify them into FX nodes because the functions aren't present in PythonReferenceAnalysis
This splits the current call into a call for each version, with the Export solver the only user of 1). For 1), we enumerate the functions in _sympy/functions.py, and subtract the functions we know we can support. For 2) there's only 3 functions we've seen pop up in test cases.
Differential Revision: D61677956
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134271
Approved by: https://github.com/avikchaudhuri
Enable Windows inductor UTs for `test/inductor/test_binary_folding.py`.
The failed UT depends on https://github.com/pytorch/pytorch/pull/134427.
Needs a rebase after https://github.com/pytorch/pytorch/pull/134427 is merged.
```cmd
2024-08-25T23:32:23.0905727Z Traceback (most recent call last):
2024-08-25T23:32:23.0906516Z File "C:\actions-runner\_work\pytorch\pytorch\test\inductor\test_binary_folding.py", line 18, in <module>
2024-08-25T23:32:23.0908200Z from inductor.test_inductor_freezing import TestCase
2024-08-25T23:32:23.0909883Z File "C:\actions-runner\_work\pytorch\pytorch\test\inductor\test_inductor_freezing.py", line 39, in <module>
2024-08-25T23:32:23.0911128Z raise unittest.SkipTest("requires sympy/functorch/filelock")
2024-08-25T23:32:23.0911801Z unittest.case.SkipTest: requires sympy/functorch/filelock
2024-08-25T23:32:23.0912370Z Got exit code 1
2024-08-25T23:32:23.0913155Z No stepcurrent file found. Either pytest didn't get to run (e.g. import error) or file got deleted (contact dev infra)
```
Local test pass:
<img width="1898" alt="image" src="https://github.com/user-attachments/assets/4a6e3f66-4bbc-4aab-8f0d-2e2318046e53">
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134425
Approved by: https://github.com/ezyang, https://github.com/jansel
Windows file paths use `\` as the separator, which is also an escape character. We need to translate every `\` in paths to `/`, as on Linux.
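A minimal sketch of the kind of normalization described above (the helper name is illustrative, not the actual fix):
```python
def to_forward_slashes(path: str) -> str:
    # "/" survives being embedded in generated Python source, where a "\U..."
    # sequence would otherwise be parsed as a unicode escape.
    return path.replace("\\", "/")

print(to_forward_slashes(r"C:\Users\Xuhan\AppData\Local\Temp\tmpufu9t3pc"))
# C:/Users/Xuhan/AppData/Local/Temp/tmpufu9t3pc
```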
Reproduce UTs:
```cmd
pytest test\dynamo\test_minifier.py -v -k test_after_dynamo_cpu_accuracy_error
```
Error message:
```cmd
____________________________________________________________________________________________________________ MinifierTests.test_after_dynamo_cpu_accuracy_error _____________________________________________________________________________________________________________
Traceback (most recent call last):
File "D:\xu_git\dnnl_cb\pytorch\test\dynamo\test_minifier.py", line 40, in test_after_dynamo_cpu_accuracy_error
self._test_after_dynamo(
File "D:\xu_git\dnnl_cb\pytorch\test\dynamo\test_minifier.py", line 27, in _test_after_dynamo
self._run_full_test(run_code, "dynamo", expected_error, isolate=False)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\_dynamo\test_minifier_common.py", line 235, in _run_full_test
self.assertIn(expected_error, test_proc.stderr.decode("utf-8"))
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\unittest\case.py", line 1112, in assertIn
self.fail(self._formatMessage(msg, standardMsg))
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\unittest\case.py", line 675, in fail
raise self.failureException(msg)
AssertionError: 'AccuracyError' not found in 'Traceback (most recent call last):\n File "C:\\Users\\Xuhan\\.conda\\envs\\win_mkl_static\\lib\\site-packages\\torch\\_dynamo\\test_minifier_common.py", line 114, in _maybe_subprocess_run\n exec(code, {"__name__": "__main__", "__compile_source__": code})\n File "<string>", line 9\n torch._dynamo.config.debug_dir_root = "C:\\Users\\Xuhan\\AppData\\Local\\Temp\\tmpufu9t3pc"\n ^\nSyntaxError: (unicode error) \'unicodeescape\' codec can\'t decode bytes in position 2-3: truncated \\UXXXXXXXX escape\n'
To execute this test, run the following from the base repo dir:
python test\dynamo\test_minifier.py MinifierTests.test_after_dynamo_cpu_accuracy_error
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
--------------------------------------------------------------------------------------------------------------------------- Captured stdout call ----------------------------------------------------------------------------------------------------------------------------
test stdout:
test stderr: Traceback (most recent call last):
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\_dynamo\test_minifier_common.py", line 114, in _maybe_subprocess_run
exec(code, {"__name__": "__main__", "__compile_source__": code})
File "<string>", line 9
torch._dynamo.config.debug_dir_root = "C:\Users\Xuhan\AppData\Local\Temp\tmpufu9t3pc"
^
SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape
--------------------------------------------------------------------------------------------------------------------------- Captured stderr call ----------------------------------------------------------------------------------------------------------------------------
running test
```
Local test passed:
<img width="849" alt="image" src="https://github.com/user-attachments/assets/4a4eecc2-7c08-4de6-9395-546b69803b16">
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134365
Approved by: https://github.com/jansel, https://github.com/jgong5
Optimize memory cost at [PR #129635](https://github.com/pytorch/pytorch/pull/129635).
There are 2 main parts to the optimization here:
1. Optimize the tensor-distributing part by postponing the full_tensor generation, which avoids the memory overlap and saves around 50% peak memory in the 2-param test case.
2. Apply `assign=True` for `load_state_dict`, which saves memory during state dict loading by assigning the input param (around 50% peak memory in the loading part); see the sketch below.
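A minimal sketch of the `assign=True` loading described in item 2 (the module and state dict are illustrative):
```python
import torch
import torch.nn as nn

model = nn.Linear(8, 8)
state_dict = nn.Linear(8, 8).state_dict()

# assign=True reuses the loaded tensors as the module's parameters instead of
# copying them into pre-allocated ones, avoiding a second full copy in memory.
model.load_state_dict(state_dict, assign=True)
```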
Future work:
Memory optimization for the optimizer will be conducted in the next PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134025
Approved by: https://github.com/fegin
Co-authored-by: Rachel Guo <guorachel@meta.com>
Summary: Fixes https://github.com/pytorch/pytorch/issues/134133
Test Plan:
Tested on the small repro in the linked issue with different lengths N (replacing 100), recording N vs. time taken in nanoseconds:
10 127268319
20 220839662
30 325463125
40 429259441
50 553136055
60 670799769
70 999170514
80 899014103
90 997168902
100 1168202035
110 1388556619
120 1457488235
130 1609816470
140 2177889877
150 1917560313
160 2121096113
170 2428502334
180 4117450755
190 4003068224
So N ~ 200 takes ~5s. Previously even smaller N would go for >1 min.
Didn't add a perf test because ezyang is planning to build a benchmark.
Also tested on https://www.internalfb.com/diff/D61560171, which now gets past the stuck point.
Differential Revision: D61619660
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134150
Approved by: https://github.com/ezyang
Because aten.poisson doesn't have a meta function registered, there is one additional eager execution of this op during the compilation phase of torch.compile.
There are more ops without meta registrations. Is there any reason for that?
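The PR itself registers a meta function for aten.poisson in core; the snippet below is only a self-contained sketch of the general mechanism, using a toy custom op rather than the real registration:
```python
import torch

lib = torch.library.Library("demo", "DEF")
lib.define("noisy(Tensor x) -> Tensor")

@torch.library.impl(lib, "noisy", "CompositeExplicitAutograd")
def noisy_impl(x):
    return torch.poisson(x)

@torch.library.impl(lib, "noisy", "Meta")
def noisy_meta(x):
    # Shape/dtype propagation only: with a Meta kernel registered, torch.compile
    # no longer needs an extra eager execution of the op during compilation.
    return torch.empty_like(x)
```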
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134103
Approved by: https://github.com/ezyang
I had a nightmare rewriting tests in test_misc.py specifically:
1. Graphs can have comments that refer to my files ("/lsakka/..."); we really don't care about comments, so add an option to ignore them.
2. Empty lines added when EXPECTTEST_ACCEPT=1 are changed by the linter, causing the tests or the linter to fail! Add a flag to ignore empty lines.
3. EXPECTTEST_ACCEPT fails when the text has some unreadable characters. Those should not affect string comparison, and they cause weird diffs when tests fail. I removed the ANSI escape chars in https://github.com/pytorch/pytorch/pull/133045
this is used in
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134248
Approved by: https://github.com/aorenste
ghstack dependencies: #133639, #134364
This UT's actual code has only a one-empty-line wrapping difference (between `linear` and `add`) between Windows and Linux, and the content is right.
Reproduce UTs:
```cmd
pytest test\dynamo\test_higher_order_ops.py -v -k test_functional_call_sequential_params_and_buffers
```
We can add `empty_line_normalizer` to fix it.
```cmd
______________________________________________________________________________________________ FuncTorchHigherOrderOpTests.test_functional_call_sequential_params_and_buffers _______________________________________________________________________________________________
Traceback (most recent call last):
File "D:\xu_git\dnnl_cb\pytorch\test\dynamo\test_higher_order_ops.py", line 3676, in test_functional_call_sequential_params_and_buffers
self.assertExpectedInline(
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\testing\_internal\common_utils.py", line 2871, in assertExpectedInline
return super().assertExpectedInline(actual if isinstance(actual, str) else str(actual), expect, skip + 1)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\expecttest\__init__.py", line 271, in assertExpectedInline
self.assertMultiLineEqualMaybeCppStack(expect, actual, msg=help_text)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\expecttest\__init__.py", line 292, in assertMultiLineEqualMaybeCppStack
self.assertMultiLineEqual(expect, actual, *args, **kwargs)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\unittest\case.py", line 1226, in assertMultiLineEqual
self.fail(self._formatMessage(msg, standardMsg))
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\unittest\case.py", line 675, in fail
raise self.failureException(msg)
AssertionError: 'clas[509 chars]one\n add: "f32[1, 1]" = linear + l_buf[69 chars],)\n' != 'clas[509 chars]one\n\n add: "f32[1, 1]" = linear + l_b[71 chars],)\n'
class GraphModule(torch.nn.Module):
def forward(self, L_params_l1_weight_: "f32[1, 1]", L_params_l1_bias_: "f32[1]", L_buffers_buffer_: "f32[1]", L_inputs_: "f32[1, 1]"):
l_params_l1_weight_ = L_params_l1_weight_
l_params_l1_bias_ = L_params_l1_bias_
l_buffers_buffer_ = L_buffers_buffer_
l_inputs_ = L_inputs_
linear: "f32[1, 1]" = torch._C._nn.linear(l_inputs_, l_params_l1_weight_, l_params_l1_bias_); l_inputs_ = l_params_l1_weight_ = l_params_l1_bias_ = None
+ <<<< (difference is here )
add: "f32[1, 1]" = linear + l_buffers_buffer_; linear = l_buffers_buffer_ = None
return (add,)
: To accept the new output, re-run test with envvar EXPECTTEST_ACCEPT=1 (we recommend staging/committing your changes before doing this)
To execute this test, run the following from the base repo dir:
python test\dynamo\test_higher_order_ops.py FuncTorchHigherOrderOpTests.test_functional_call_sequential_params_and_buffers
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
========================================================================================================================== short test summary info ==========================================================================================================================
FAILED [0.4275s] test/dynamo/test_higher_order_ops.py::FuncTorchHigherOrderOpTests::test_functional_call_sequential_params_and_buffers - AssertionError: 'clas[509 chars]one\n add: "f32[1, 1]" = linear + l_buf[69 chars],)\n' != 'clas[509 chars]one\n\n add: "f32[1, 1]" = linear + l_b[71 chars],)\n'
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134394
Approved by: https://github.com/jansel
Co-authored-by: Jason Ansel <jansel@jansel.net>
After this I think all `using namespace` will have been eliminated from PyTorch header files. Internally, `-Wheader-hygiene` will prevent more from being added.
Test Plan: Sandcastle
Differential Revision: D61679037
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134336
Approved by: https://github.com/Skylion007
Summary:
This enables patching extern modules to provide compatibility with serialized code depending on different versions of those extern modules.
The main motivation is to enable the NumPy upgrade. In the recent release, many aliases to builtin types were deprecated and removed [1]. This breaks loading pickled modules that reference the removed aliases. While the proper solution is to re-generate the pickled modules, it's not always feasible.
This proposes a way to define a mapping to a new type for a module member. It is only set if the member is not present in the loaded module, which removes the need to check for exact versions.
https://numpy.org/doc/stable/release/1.20.0-notes.html#using-the-aliases-of-builtin-types-like-np-int-is-deprecated
Differential Revision: D61556888
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134376
Approved by: https://github.com/SherlockNoMad
If a mesh_dim_name is given, we will use the given mesh_dim_name to name the new flattened dim.
Otherwise, the default is a string concatenating the mesh_dim_names of the given submesh, with each mesh_dim_name separated by "_".
For example, if we have a 3D mesh DeviceMesh([[[0, 1], [2, 3]], [[4, 5], [6, 7]]], mesh_dim_names=("dp", "cp", "tp")), calling mesh_3d["dp", "cp"]._flatten() will create a 1D submesh DeviceMesh([0, 1, 2, 3], mesh_dim_names=("dp_cp",)) on rank 0, 1, 2, 3 and a 1D submesh DeviceMesh([4, 5, 6, 7], mesh_dim_names=("dp_cp",)) on rank 4, 5, 6, 7.
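A hedged sketch of the example described above, assuming 8 ranks and the private `_flatten` API from this stack:
```python
from torch.distributed.device_mesh import init_device_mesh

# 2 x 2 x 2 mesh over 8 ranks
mesh_3d = init_device_mesh("cuda", (2, 2, 2), mesh_dim_names=("dp", "cp", "tp"))

dp_cp_mesh = mesh_3d["dp", "cp"]._flatten()  # default flattened name: "dp_cp"
# mesh_3d["dp", "cp"]._flatten("my_name") would use the given name instead.
print(dp_cp_mesh.mesh_dim_names)
```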
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134048
Approved by: https://github.com/fegin
ghstack dependencies: #133838, #133839
Sympy's implementation of Min/Max displays asymptotically bad behavior on `TORCH_COMPILE_CPROFILE=1 python torchrec/distributed/tests/test_pt2_multiprocess.py TestPt2Train.test_compile_multiprocess`. Evidence profile:

On this test case, we spend 42% of all time compiling the network on ShapeEnv.replace, which in turn spends all of its time in xreplace.
The problem appears to be the find_localzeros call. By vendoring the implementations of Min/Max, we can potentially reduce the cost of this operation.
The implementation is copy-pasted from sympy/functions/elementary/miscellaneous.py but with some adjustments:
* I deleted logic related to differentiation, evalf and heaviside, as it's not relevant to PyTorch reasoning
* There's some massaging to appease PyTorch's linters, including a lot of noqa and type: ignore (which I could potentially refactor away with substantive changes, but that's better as its own change)
* I deleted the second loop iteration for is_connected, as an attempt at initial optimization (this also simplifies the port, since I can omit some code). I'll comment at that point what the exact difference is.
Before this change, the test in question takes 100s with 40 features; post this change, afterwards, it takes only 69s.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133319
Approved by: https://github.com/Skylion007
Summary:
Today there is no good mechanism to detect progress of non-strict export line-by-line in user code. This caused some pain recently in trying to find the exact line of user code that was triggering a bug where the process appeared stuck because deep down something was calling some symbolic shapes code that was suffering some exponential blowup.
This PR adds an environment variable for extended debugging that will log the line of user code corresponding to every torch function call. It only works in non-strict export for now. Set this environment variable together with `TORCH_LOGS` enabling `export` logs at `DEBUG` level (i.e., with a `+` prefix), e.g.:
```
TORCHEXPORT_EXTENDED_DEBUG_CURRENT_LOC=1 TORCH_LOGS="+export" ...
```
This will show logs with something like:
```
...
prim::device called at .../example.py:4284 in foo
TensorBase.item called at .../example.py:4277 in bar
...
```
We already have an existing place to intercept torch functions where we process data-dependent errors in non-strict, so parking the logging there. An alternative place we could be doing this is where we add `stack_trace` metadata when generating code, but unfortunately at least the example that motivated this gets stuck before generating code, so that would be too late.
Test Plan: ran it on some sample commands
Differential Revision: D61692156
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134298
Approved by: https://github.com/angelayi
Summary: Create a simple test that checks that the FunctionEvent tree is built lazily, by checking that its metrics change before and after the call.
Test Plan: Make sure test passes in CI
Reviewed By: briancoutinho
Differential Revision: D61685429
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134359
Approved by: https://github.com/briancoutinho
Fixes#133338
Test Plan:
```
TORCH_LOGS=dynamic python
import torch

torch._dynamo.config.capture_scalar_outputs = True

@torch.compile()
def f(x):
    y = x.item()
    torch._check_is_size(y)
    r = torch.arange(y, dtype=torch.float32)
    torch._check(r.size(0) == y)
    return r

f(torch.tensor([300]))
```
Run this before and after the diff, and verify that the following line
```
I0813 11:05:44.890000 652898 torch/fx/experimental/symbolic_shapes.py:5198] [0/0] runtime_assert Eq(CeilToInt(IntTrueDiv(u0, 1)), u0) [guard added] at aa.py:10 in f (_dynamo/utils.py:2092 in run_node), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(CeilToInt(IntTrueDiv(u0, 1)), u0)"
```
no longer shows in the logs. Also verify CI passes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134296
Approved by: https://github.com/aorenste
The current temporary directory path is hard-coded. Fixed by getting the temporary directory path via the API.
Reproduce UTs:
```cmd
python test/dynamo/test_dynamic_shapes.py -v -k test_torch_package_working_with_trace_dynamic_shapes
```
Error message:
```cmd
________________________________________________________________________________________________ DynamicShapesMiscTests.test_torch_package_working_with_trace_dynamic_shapes ________________________________________________________________________________________________
Traceback (most recent call last):
File "D:\xu_git\dnnl_cb\pytorch\test\dynamo\test_misc.py", line 7199, in test_torch_package_working_with_trace
with package.PackageExporter(path) as exp:
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\package\package_exporter.py", line 237, in __init__
self.zip_file = torch._C.PyTorchFileWriter(f)
RuntimeError: Parent directory /tmp does not exist.
To execute this test, run the following from the base repo dir:
python test\dynamo\test_dynamic_shapes.py DynamicShapesMiscTests.test_torch_package_working_with_trace_dynamic_shapes
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
========================================================================================================================== short test summary info ==========================================================================================================================
FAILED [0.0080s] test/dynamo/test_dynamic_shapes.py::DynamicShapesMiscTests::test_torch_package_working_with_trace_dynamic_shapes - RuntimeError: Parent directory /tmp does not exist.
==================================================================================================================== 1 failed, 1665 deselected in 4.00s =====================================================================================================================
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134397
Approved by: https://github.com/ezyang
Fixes#130394
TorchInductor doesn't respect the original strides of outputs. This opens up optimization opportunities like changing the memory layout. But in some cases, such as the case in https://github.com/pytorch/pytorch/issues/130394, we do need the output to match the exact stride that is required. Correctness is the first-priority goal, so this PR adds a new API `ir.ExternKernel.require_exact_strides(x, exact_strides, allow_padding=False)` to fix the issue. This PR enables non-dense outputs' strides to follow the strides required by the semantics.
The comparison between the original code and the code after this fix for the test is shown below.
```python
@triton.jit
def triton_(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 128
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex % 8
    x1 = (xindex // 8)
-   x2 = xindex
    tmp0 = tl.load(in_ptr0 + (x0 + (16*x1)), xmask)
    tmp1 = tmp0 + tmp0
-   tl.store(out_ptr0 + (x2), tmp1, xmask)
+   tl.store(out_ptr0 + (x0 + (16*x1)), tmp1, xmask)

def call(args):
    arg0_1, = args
    args.clear()
    assert_size_stride(arg0_1, (16, 8), (16, 1))
    with torch.cuda._DeviceGuard(0):
        torch.cuda.set_device(0)
-       buf1 = empty_strided_cuda((16, 8), (8, 1), torch.float32)
+       buf1 = empty_strided_cuda((16, 8), (16, 1), torch.float32)
        stream0 = get_raw_stream(0)
        triton_poi_fused_add_copy_0.run(arg0_1, buf1, 128, grid=grid(128), stream=stream0)
        del arg0_1
    return (buf1, )
```
buf1 is created with the exact stride required by the user, and its values are written with the same stride as the input.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130956
Approved by: https://github.com/eellison, https://github.com/blaine-rister
torch.cuda.amp.autocast / torch.cpu.amp.autocast are deprecated and spew a ton of warnings when these tests run. This PR updates the tests to just use torch.amp.autocast(device).
Note: this uncovers a bug in the test: when `device` is CUDA, it actually shows up as "cuda:0" - so previously, this test was _always_ using `torch.cpu.amp.autocast` even for the `cuda` device. This PR fixes that, and uncovers additional bugs in `pinverse` and `linalg.pinv`; `linalg.pinv` was already failing before on CPU, but now the test also catches failures on CUDA (and this PR adds those to the skipped-test list).
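A minimal sketch of the change (the tensor shapes here are illustrative): the deprecated per-backend context managers are replaced with the unified torch.amp.autocast API, parameterized by device type.
```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Before: torch.cuda.amp.autocast() / torch.cpu.amp.autocast() (deprecated)
# After: one API, parameterized by device type
with torch.amp.autocast(device_type=device):
    x = torch.randn(8, 8, device=device)
    y = x @ x
```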
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134291
Approved by: https://github.com/YuqingJ
Summary:
# context
* when fixing the graph break in _maybe_compute_kjt_to_jt_dict, we encountered this issue P1539489731:
```
[rank0]: ATTENTION: guard_size_oblivious would fix the error, evaluating expression to False.
[rank0]: Maybe you need to add guard_size_oblivious to framework code, see doc below for more guidance.
[rank0]:
[rank0]: Potential framework code culprit (scroll up for full backtrace):
[rank0]: File "/data/users/hhy/fbsource/buck-out/v2/gen/fbcode/61f992c26f3f2773/aps_models/ads/icvr/__icvr_launcher_live__/icvr_launcher_live#link-tree/torch/_inductor/fx_passes/post_grad.py", line 671, in slice_noop
[rank0]: if start == 0 and end >= 2**63 - 1 and step == 1:
```
* change the condition logic to be compatible with SymInt
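A minimal sketch of the kind of change involved (illustrative only, not the actual post_grad.py patch), using the `guard_size_oblivious` helper that the error output above points to:
```python
from torch.fx.experimental.symbolic_shapes import guard_size_oblivious

def slice_is_noop(start, end, step) -> bool:
    # Wrapping the comparisons keeps the check well-defined when start/end
    # are SymInts, instead of tripping the data-dependent evaluation above.
    return (
        guard_size_oblivious(start == 0)
        and guard_size_oblivious(end >= 2**63 - 1)
        and step == 1
    )
```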
Test Plan:
# commands
* run test
```
TORCH_SHOW_CPP_STACKTRACES=1 TORCHDYNAMO_EXTENDED_DEBUG_CPP=1 TORCH_LOGS="+graph_code,output_code,dynamic,aot,guards,verbose_guards,recompiles,graph_breaks" TORCH_TRACE=/var/tmp/tt buck2 run fbcode//mode/opt fbcode//aps_models/ads/icvr:icvr_launcher_live -- mode=fmc/local_ig_fm_v4_mini training.pipeline_type=pt2 2>&1 | tee -a `date +"%Y.%m.%d.%H.%M"`.`sl whereami`.log
```
* tlparse
```
ls -thl /var/tmp/tt | head -9 && tlparse `ls -t /var/tmp/tt/* | head -1`
```
Differential Revision: D61677207
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134270
Approved by: https://github.com/ezyang
Summary:
This diff decomposes torch.ops._quantized.wrapped_quantized_linear into torch.ops._quantized.wrapped_linear_prepack and torch.ops._quantized.wrapped_quantized_linear_prepacked for AOTI, and adds the corresponding impl into the shim.
The way it works is similar to what we did previously for the fbgemm fp16 dynamic qlinear. We do constant folding for the packed weight during runtime (warm-up) to achieve the speed-up.
Reviewed By: desertfire
Differential Revision: D61396144
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134368
Approved by: https://github.com/houseroad
Windows file paths use `\` as the delimiter, which is also an escape character. We need to translate all `\` characters in paths to `/`, as on Linux.
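A minimal, hypothetical illustration of the idea (not the actual patch): normalize the path separators before the path is used inside a regular expression, so sequences like `\x` are not parsed as regex escapes.
```python
import re

def normalize_path(path: str) -> str:
    # "\" is both the Windows path separator and a regex escape character;
    # translating it to "/" (as on Linux) keeps re.sub(file, ...) valid.
    return path.replace("\\", "/")

win_path = r"D:\xu_git\dnnl_cb\pytorch\test\dynamo\test_higher_order_ops.py"
msg = f"Recompiling function fn in {normalize_path(win_path)}:2699"
print(re.sub(normalize_path(win_path), "test_higher_order_ops.py", msg))
```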
Reproduce UT:
```cmd
pytest test\dynamo\test_higher_order_ops.py -v -k test_vmap_grad_vmap_guard_fail
```
Error msg:
```cmd
________________________________________________________________________________________________________ HigherOrderOpVmapGuardTests.test_vmap_grad_vmap_guard_fail _________________________________________________________________________________________________________
Traceback (most recent call last):
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\testing\_internal\logging_utils.py", line 89, in test_fn
fn(self, records)
File "D:\xu_git\dnnl_cb\pytorch\test\dynamo\test_higher_order_ops.py", line 2714, in test_vmap_grad_vmap_guard_fail
munge_exc(record.getMessage()),
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\testing\_internal\common_utils.py", line 5252, in munge_exc
s = re.sub(file, os.path.basename(file), s)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\re.py", line 209, in sub
return _compile(pattern, flags).sub(repl, string, count)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\re.py", line 303, in _compile
p = sre_compile.compile(pattern, flags)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\sre_compile.py", line 788, in compile
p = sre_parse.parse(p, flags)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\sre_parse.py", line 955, in parse
p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\sre_parse.py", line 444, in _parse_sub
itemsappend(_parse(source, state, verbose, nested + 1,
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\sre_parse.py", line 526, in _parse
code = _escape(source, this, state)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\sre_parse.py", line 370, in _escape
raise source.error("incomplete escape %s" % escape, len(escape))
re.error: incomplete escape \x at position 2
To execute this test, run the following from the base repo dir:
python test\dynamo\test_higher_order_ops.py HigherOrderOpVmapGuardTests.test_vmap_grad_vmap_guard_fail
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
--------------------------------------------------------------------------------------------------------------------------- Captured stdout call ----------------------------------------------------------------------------------------------------------------------------
frames [('total', 2), ('ok', 2)]
inductor []
inline_call []
stats [('calls_captured', 38), ('unique_graphs', 2)]
--------------------------------------------------------------------------------------------------------------------------- Captured stderr call ----------------------------------------------------------------------------------------------------------------------------
V0824 01:29:00.148000 27840 torch\_dynamo\guards.py:2787] [0/1] [__recompiles] Recompiling function fn in D:\xu_git\dnnl_cb\pytorch\test\dynamo\test_higher_order_ops.py:2699
V0824 01:29:00.148000 27840 torch\_dynamo\guards.py:2787] [0/1] [__recompiles] triggered by the following guard failure(s):
V0824 01:29:00.148000 27840 torch\_dynamo\guards.py:2787] [0/1] [__recompiles] - 0/0: torch._functorch.pyfunctorch.compare_functorch_state([('Vmap', 1, 'error')]) # _dynamo\output_graph.py:479 in init_ambient_guards
========================================================================================================================== short test summary info ==========================================================================================================================
FAILED [0.7452s] test/dynamo/test_higher_order_ops.py::HigherOrderOpVmapGuardTests::test_vmap_grad_vmap_guard_fail - re.error: incomplete escape \x at position 2
```
Local test passed:
<img width="860" alt="image" src="https://github.com/user-attachments/assets/90f0d780-0639-4c03-8d7c-6f227c93a3fc">
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134348
Approved by: https://github.com/jansel
Fixes#133499
### The issue
Testing a variety of TP `requires_grad` patterns (validating maximally flexible finetuning) revealed `DTensor` sharding propagation of `aten.native_layer_norm_backward` (default) fails with an `IndexError` for certain `requires_grad` patterns (pattern 1) (e.g. `output_mask` `[True, False, False]`) and an `AssertionError` for others (pattern 2) (e.g. output mask `[False, True, *]`). Please see issue #133499 for a full description of the observed failure patterns along with reproduction.
### Use Cases and Remediation
Failure pattern 1 is potentially problematic for a variety of finetuning scenarios. Though failure pattern 2 is really an xfail right now since it's not fully supported, IMHO there are use cases (e.g. especially wrt to mechanistic interpretability research, but certain finetuning scenarios too potentially) that justify supporting this output mask (especially since supporting it is fairly straightforward I think).
In this PR I propose some modest changes that:
* Address the aforementioned failure modes.
* Add a couple tests that I'm hopeful will help ensure `DTensor` op dispatch (which is so well implemented and such a pleasure working with btw! 🚀🎉) accommodates a wide variety of (potentially unanticipated) `requires_grad` patterns as it evolves.
To address both failure modes, I'm proposing the following changes:
1. To [`torch.distributed._tensor.ops._math_ops.layer_norm_bwd_strategy`](7b269cc484/torch/distributed/_tensor/ops/_math_ops.py (L873)):
- Refactor conditional `output_mask` handling such that the input and output specs in the `PlacementStrategy`s of the returned `output_strategy.strategies` list remain aligned with the `op_schema.args_spec` (whose definition does not change at runtime based upon unused optional args).
2. To [`torch.distributed._tensor._sharding_prop.propagate_op_sharding_non_cached`](7b269cc484/torch/distributed/_tensor/_sharding_prop.py (L256-L262)):
- When iterating through the active `op_schema.args_spec` to build the relevant `expected_input_specs` list, filter any `None` `desired_specs`.
3. To [`torch/distributed/_tensor/_op_schema.OpSchema._inplace_rewrap_schema_suggestion`](7b269cc484/torch/distributed/_tensor/_op_schema.py (L418))
- When inputs need a redistribute, for runtime-unrequired (`None` arguments in the aligned `suggestion_args_schema`), ignore the associated `suggestion_args_spec`
### Implementation considerations:
- Regarding `1`, to avoid changing the op strategy return args ([`op_strategy`](cf81180007/torch/distributed/_tensor/_sharding_prop.py (L234))), the change in `1` allows `None` elements to exist temporarily in `PlacementStrategy.input_specs` (treating it as `Sequence[DTensorSpec | None] | None` when it's `Sequence[DTensorSpec] | None`). This could be addressed in any number of ways but I thought it best to leave that for a subsequent PR since it could have broader ramifications (e.g. allowing op_strategies to return an `output_strategy.input_specs` mask explicitly, explicitly allowing `None`s in `PlacementStrategy.input_specs`, creating a `Null` DTensorSpec etc.). That's why I'm using an ignore arg-type directive there for now.
- Regarding `2` and `3` above, I don't introspect `op_schema.op._schema.arguments` to verify any `None` arguments are `torch.OptionalType`, leaving adherence to the schema contract the responsibility of the given op. Regarding `2`, I assume any `desired_spec` will be either a `DTensorSpec` or `None`, so only `None` can be Falsy in this context.
- I considered altering the active `args_schema`, which could be inspected and aligned with the active `output_strategy.input_specs` in some cases and avoid the changes in `3`, but I think that would rely on one of (among other possibilities):
- all supported op signatures having optional Tensor (`DTensorSpec`) args after required tensors (which isn't a planned requirement as far as I know),
- (somewhat brittle) heuristic-driven arg alignment
- only supporting kwargs etc.
### Added Tests
To facilitate detection of future `requires_grad` pattern op failure modes as `DTensor` evolves, I added the following two tests:
1. `test/distributed/_tensor/test_math_ops.py DistMathOpsTest.test_layer_norm_bwd_req_grad`
- Tests `native_layer_norm_backward` specifically with 20 subtests that sweep valid `output_mask` patterns along in different LayerNorm dimensionality and `elementwise_affine` configurations.
2. `test/distributed/tensor/parallel/test_tp_examples.py DistTensorParallelExampleTest.test_transformer_req_grad`
- Samples a subset of `requires_grad` patterns in a more realistic (relative to the `LayerNorm`-specific test) Transformer usage context with different `dtype` and `is_seq_parallel` configurations. Note since there was substantial overlap with the existing `test_transformer_training` test, I took the opportunity to refactor that test to allow relevant code-sharing. I also added an `ExpCommCounts` `NamedTuple` to facilitate the addition of additional `requires_grad` patterns that we may want to test in the future which may result in different comm counts. I created the separate `requires_grad` test to allow decoupling the multi-iteration `test_transformer_training` test and allow addition of new `requires_grad` scenarios as desired while being mindful of resources.
Thanks again to the PyTorch distributed team for your immensely valuable contributions to the open-source ML community!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133502
Approved by: https://github.com/XilunWu
For `aten.any`, we can use `reduce_op="sum"` as the linear reduction op.
When we do `all_reduce` with `reduce_op="sum"` on bool tensor, if one rank returns `torch.Tensor([True]) `, then the reduction result is `torch.Tensor([True]) `. Only when all ranks return `torch.Tensor([False]) ` would the reduction result be `torch.Tensor([False]) `. This matches with `any`'s behavior.
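An illustration of the reasoning with plain tensors (this is not DTensor internals): summing per-rank boolean flags is truthy iff at least one rank contributed `True`, which is exactly `any`.
```python
import torch

# Pretend each tensor is one rank's local "any" result.
per_rank = [torch.tensor([False]), torch.tensor([True]), torch.tensor([False])]
reduced = torch.stack(per_rank).sum(dim=0)   # what an all_reduce(sum) computes
assert bool(reduced) == any(bool(t) for t in per_rank)
```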
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134206
Approved by: https://github.com/tianyu-l, https://github.com/chuanhaozhuge
Add DeviceMesh slicing support such that we could do the following:
```
mesh_3d = init_device_mesh(
    self.device_type, (2, 2, 2), mesh_dim_names=("replicate", "shard", "cp")
)
shard_cp_mesh = mesh_3d["shard", "cp"]._flatten()
hsdp_mesh = mesh_3d["replicate", "shard_cp"]
# we can get the corresponding group of the flatten mesh through
group = shard_cp_mesh.get_group()
# or
group = mesh_3d["shard_cp"].get_group()
# or
mesh_3d.get_group(mesh_dim="shard_cp")
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133839
Approved by: https://github.com/fegin
ghstack dependencies: #133838
### Description
This PR extends the `VecISA` class to include support for VSX on the `ppc64le` architecture within the Inductor backend. This enhancement enables vectorization support, resulting in performance improvements when using `torch.compile()` on `ppc64le`.
### Fixes
- Resolved the `test_acosh_with_negative_large_input` test case in `test_cpu_repro.py` by implementing `acosh` for VSX.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132746
Approved by: https://github.com/jansel
Summary: Pass process group info into NcclWork
Test Plan: buck2 run mode/dev-nosan kineto/libkineto/fb/integration_tests:pytorch_execution_trace_integration_test
Differential Revision: D61677160
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134269
Approved by: https://github.com/wconstab
The pattern matcher runs DCE and remove_noop_ops on the replacement
graph by default. Previously we had a switch for the DCE. This PR
changes that switch to also control if we run remove_noop_ops.
The context was that there is silent incorrectness with
auto_functionalized. We use the Pattern matcher to decompose
auto_functionalized into a mutable op + clones; remove_noop_ops was
deleting the clones.
Future: can try #134363
Test Plan:
- new test. I wasn't able to produce a silently incorrect example so I
settled for asserting that clones still exist in the post-grad graph.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134364
Approved by: https://github.com/eellison
ghstack dependencies: #133639
This adds logs if we can't acquire locks in NCCLUtils and ProcessGroupNCCL for 30s.
This is motivated by some deadlocks we're seeing, and it's unclear whether they're in NCCL or on the PyTorch side of things.
This required replacing most `std::mutex` with `std::timed_mutex` and `std::condition_variable_any` as appropriate.
Test plan:
existing CI for regressions
will add unit tests on `C10D_LOCK_GUARD`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134131
Approved by: https://github.com/c-p-i-o, https://github.com/fduwjj
Refactors construction of ExportGraphSignature object for export & training IR, explicitly creating AOTAutograd signature for training IR. This will be helpful for upcoming refactors for placeholder naming & runtime asserts prettifying.
Changes:
- dedups `make_argument_spec` call, moved to export/graph_signature.py
- `_sig_to_specs` wrapped into new function `_convert_to_export_graph_signature`, directly converts GraphSignature -> ExportGraphSignature
- `_make_fx_helper` explicitly creates AOTAutograd GraphSignature object
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134059
Approved by: https://github.com/angelayi, https://github.com/ydwu4
**Summary**
When checking the vectorization status across 3 test suites, we found some operators disabled vectorization with the message `Disabled vectorization: op: remainder`. In this PR, we add vectorization support for this op.
**Test Plan**
```
python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_vec_remainder
python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_int_div_vec
```
Differential Revision: [D61147014](https://our.internmc.facebook.com/intern/diff/D61147014)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129849
Approved by: https://github.com/jgong5, https://github.com/lezcano
Starter version of automatic dynamic shapes for export.
Creates enums `DIM.AUTO`, `DIM.STATIC`, allowing user to specify `AUTO` for dims in dynamic_shapes specs, meaning that corresponding dims are treated as dynamic, and relevant guards will do what's necessary (e.g. refine ValueRanges, set replacements based on equality, or even set static) without raising ConstraintViolationErrors. Basically allows the user to say, "a bunch of these dims can be dynamic, let export do model analysis and return the program with maximum possible dynamism, without complaining".
The usage for specifying `dynamic_shapes` is now:
```
AUTO -> dynamic by default, return whatever produce_guards() says, even if it's static
None/int/STATIC -> static
Dim/DerivedDim -> same as before - will complain if the min/max range is invalid, or if dims related to this are unspecified.
```
Caveat 1: specifying `AUTO` for a dim won't guarantee it'll be dynamic:
- specifying `AUTO` for a dim will return the maximum possible dynamism given your program and other specified constraints, but this can still mean you'll get a static program. For example, with the program below, x is specified dynamic, but it's equal to y, which is specified static, and with how we currently do things we won't promote y to dynamic, but will demote(?) x to static. So this can be surprising if you don't fully know your model, and/or missed one of your other inputs when specifying auto-dynamic shapes.
```
class Foo(torch.nn.Module):
    def forward(self, x, y):
        return x + y
inputs = (torch.randn(6), torch.randn(6))
export(Foo(), inputs, dynamic_shapes={"x": (DIM.AUTO,), "y": None})
```
Caveat 2: specifying `AUTO` and Dims in the same spec is still problematic:
- The way Dims/DerivedDims are currently handled is very strict. A Dim represents a symbol, and we require a user to specify the symbol for all dims governed by the symbol - that's why we've seen errors in the past like `The values of x must always be related to y by ...`, asking the user to specify the exact relation as in the program. We also require the specified min/max range to be a subset of the valid range from model analysis. All this doesn't compose well with specifying `AUTO` just yet - for example in the program below, ideal behavior could be to return a dynamic program, where `dx = x.size(0) = y.size(0)` has range (3,6). Unfortunately this crashes, and correct behavior is to specify `dx` for both inputs. So currently we raise a UserError and crash if both Dims + `AUTO` are present in the spec.
```
class Foo(torch.nn.Module):
    def forward(self, x, y):
        return x + y

inputs = (torch.randn(6), torch.randn(6))
export(Foo(), inputs, dynamic_shapes={"x": (DIM.AUTO,), "y": {0: Dim("dx", min=3, max=6)}}) # this doesn't work, because x & y are related
```
Implementation details:
This is done by setting `assume_static_by_default=False` and doing a transform on the `dynamic_shapes` spec to preserve semantics. `assume_static_by_default=False` will treat unspecified dims or Nones as dynamic. This is the opposite of what `export.export()` currently does - unspecified Dims/Nones are treated as static. Historically this static-by-default behavior, where the user deals with fewer guards, has been desirable, and we would like to respect that in this implementation. So an internal spec transformation, `_transform_shapes_for_default_dynamic()`, is added to do the spec conversion necessary to be compatible with dynamic-by-default. Specifically, AUTOs are converted into Nones, and Nones/unspecified dims are filled in with explicitly static constraints.
For example, this would look like, for a 3-d tensor: `{0: DIM.AUTO, 1: None, 2: Dim("dx")} -> {0: None, 1: 32, 2: Dim("dx")}`
This does seem overly complicated, but it's done to preserve dynamic shapes semantics for `torch._dynamo.export()`, which already uses `assume_static_by_default=False`, and follows the same process for generating shape constraints, via `_process_dynamic_shapes`. There the semantics are:
```
None/unspecified: dynamic by default
Dim/DerivedDim: also a strict assertion
```
If we don't care about BC for `_dynamo.export(dynamic_shapes)`, then we can just modify semantics for `_process_dynamic_shapes()` and change all the relevant tests in `test/dynamo/test_export.py`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133620
Approved by: https://github.com/avikchaudhuri
The function expects a Tensor of type LongTensor. It currently throws the following error: "one_hot is only applicable to index tensor.", which, IMO, does not give the user enough information about what the problem is.
This PR simply adds extra information to the error message for this specific scenario.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134209
Approved by: https://github.com/mikaylagawarecki
`nn_module_stack` was previously serialized to a string by adding commas between the module_path and module_type. This is error-prone when the `nn_module_stack` itself contains commas.
This PR fixes this by creating a dictionary to store the `nn_module_stack` and serializing it to a string via `json.dumps()`.
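A hedged sketch of the serialization idea (the entry below is made up for illustration): storing `nn_module_stack` as a dictionary and round-tripping it through `json.dumps()` keeps commas inside module paths or type names from corrupting the encoding.
```python
import json

nn_module_stack = {
    "L__self___blocks_0": ("L['self'].blocks[0]", "MyBlock(a=1, b=2)"),
}
serialized = json.dumps(nn_module_stack)
# Values come back as lists; restore tuples to compare with the original.
restored = {k: tuple(v) for k, v in json.loads(serialized).items()}
assert restored == nn_module_stack
```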
Fixes#131941
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134049
Approved by: https://github.com/angelayi
Summary: Currently, for sequential mode, minimizer search terminates after a node is excluded via the user defined exclusion_fn. However, on some occasions we would like the search to continue past that for the remaining nodes. In this diff I am changing the termination criteria to respect the find_all setting, where we continue sequential search if it is set.
Test Plan: CI
Differential Revision: D61720262
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134339
Approved by: https://github.com/jfix71
Fixes#134050
### The issue
The current `DTensor` sharding propagation caching policy for `aten.scaled_dot_product_efficient_attention` (default) can result in silently incorrect gradients or trigger an IMA after cuda kernel launch in mixed `require_grad` configurations. Please see issue #134050 for a full description of the observed failure patterns along with reproduction. Note `aten.scaled_dot_product_flash_attention` presents a similar concern so this PR addresses both [as discussed here.](https://github.com/pytorch/pytorch/issues/134050#issuecomment-2299887602)
### Remediation
While there are a number of ways this could be addressed, the most straightforward remediation is to modify the sharding propagation caching policy of [`aten._scaled_dot_product_efficient_attention.default`](b03381cac2/torch/distributed/_tensor/ops/_matrix_ops.py (L337-L340)), registering it with `schema_info=RuntimeSchemaInfo(4)` to prevent cache sharing between differing `compute_log_sumexp` values i.e.
```python
@register_op_strategy(aten._scaled_dot_product_efficient_attention.default, schema_info=RuntimeSchemaInfo(4))
def scaled_dot_product_efficient_attention_strategy(
...
```
[As discussed here](https://github.com/pytorch/pytorch/issues/134050#issuecomment-2299887602), since `aten::_scaled_dot_product_flash_attention` could be affected by a similar issue wrt `return_debug_mask`, this PR adjusts the sharding propagation caching policy for that op as well.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134146
Approved by: https://github.com/tianyu-l
Summary:
This PR updated cuSPARSELt to v0.6.2. I think we should land
https://github.com/pytorch/pytorch/pull/128534 first though.
Most of this PR is just enabling tests to run when cuSPARSELt v0.6.2 is
available.
Unfortunately was running into a bug with fp32 support on Hopper, so I
removed fp32 support from the cuSPARSELt backend. I think this should be
fine since almost everybody uses the bfloat/float16/int8 kernels.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134022
Approved by: https://github.com/jerryzh168, https://github.com/malfet
ghstack dependencies: #128534
Summary:
Added support for more custom op input types, now only missing dtype,
layout, memory format as input type, since we need to add some more testing for
mapping the types to their integer values
([previous
comment](https://github.com/pytorch/pytorch/pull/126215#discussion_r1617428066)).
This PR also replaces the `DynamicArg` struct's `serialized_arg_val` with
`list_item_types`, which stores an optional list of strings, where each string
represents the type of the value within this list. This is only used for
parsing lists of optional tensors, where we need to know if a specific value in
the list should be a tensor, or a None. Replacing with a list of strings is
also better than storing the actual json format because then we don't need to
parse the json string during the runtime, and can just loop over a preprocessed
list of strings.
Test Plan: `buck2 run @//mode/dev-nosan //caffe2/test/inductor:test_aot_inductor -- -r "test_custom_"`
Reviewed By: desertfire
Differential Revision: D60295995
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132454
Approved by: https://github.com/desertfire
Summary:
We should always emit an end event in a finally block so that if a unit test or job fails, the stack is still correct.
Also, we use thread local storage for the stack, so that in multithreaded scenarios the stack will still be correctly added.
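A hedged sketch of the pattern described above (names are illustrative, not the internal logger): the end event is popped in a `finally` block, and the stack lives in thread-local storage.
```python
import threading

_local = threading.local()

def _stack():
    if not hasattr(_local, "stack"):
        _local.stack = []
    return _local.stack

def run_phase(name, fn, *args, **kwargs):
    _stack().append(name)          # start event for this thread
    try:
        return fn(*args, **kwargs)
    finally:
        _stack().pop()             # end event, emitted even if fn raises
```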
Test Plan:
Run benchmark and see that everything still works
Run
```
TORCH_LOGS=dynamo buck run test/functorch:test_aotdispatch -- -r test_backward_mutation_on_grad_out
```
With some extra logging to see that start events with the correct stack are emitted, and the end events are also emitted even though the test fails at runtime.
Differential Revision: D61682556
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134279
Approved by: https://github.com/aorenste
Fixes#128084
The approach is option 2 of what Elias suggested in the comment
thread:
- We require tensors to have the correct stride at usage. This may
involve a clone; if there was a clone and then a mutation into it
then we copy_ back the result of the mutation.
The reason why I went this approach was because it was the easiest and
Inductor already works really hard to remove additional clones/copy_.
There are some cases that this doesn't generate efficient code for; for
example, if the tensor is a view, we don't change the base of the view
to have the right stride order, instead we do a clone.
The view case isn't very common so I'm ignoring it for now but we could
improve this in the future.
Test Plan:
- tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133639
Approved by: https://github.com/eellison
Support for effectful operations in backward:
1/ AOTDispatch collects metadata from the forward fn only, so effectful ops can be used in backward even if they were not used in forward => allow token discovery during the joint function.
FunctionalTensorMode holds _tokens; in the joint function, after tracing forward, we memoize _tokens as `_tokens_forward_output`.
2/ Tokens are added as primal inputs (forward) in EffectTokensWrapper.
Tokens that will be used in backward are among the partitioner's saved values. We do not control at which positions they are saved in the forward outputs.
If new tokens are discovered in backward after tracing joint_fn, they are manually appended to the end of the primals in the resulting graph (_aot_autograd/utils.py).
3/ All effectful ops during backward are marked with the 'must_be_in_backward' partitioner_tag, to prevent the partitioner from placing them in forward.
For that, functional_tensor_mode got a new optional state `self._effects_partitioner_tag` for effectful ops, set after tracing forward.
There are additional changes in the partitioner to improve the functionality of 'must_be_in_backward'.
4/ Unlift tokens now runs for both forward and backward.
- Since tokens saved for backward are placed at non-static positions, we identify input and output tokens to erase by the inputs and outputs of the `with_effects` operations.
- In forward we can have input tokens, discovered in backward, that are not used in with_effects ops in forward but are saved for backward. We identify them by their position in the forward inputs.
5/ Add AOT debug logging for graphs before unlifting and before adding the additional primals for backward tokens.
Tests:
```
python test/higher_order_ops/test_with_effects.py
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132638
Approved by: https://github.com/bdhirsh
MSVC doesn't support dynamically sized arrays (VLAs).
Ref: https://stackoverflow.com/questions/56555406/creating-dynamic-sized-array-using-msvc-c-compiler
We tried several solutions:
1. Use std::vector instead, in previous PR https://github.com/pytorch/pytorch/pull/134140, but it changed the variable's type and failed UTs.
2. Use `std::unique_ptr` instead, in PR https://github.com/pytorch/pytorch/pull/134156; @jansel reviewed and commented: https://github.com/pytorch/pytorch/pull/134156#pullrequestreview-2253091693. That makes sense: heap allocation may make the code run slower.
3. Use a fixed-size array instead, in PR https://github.com/pytorch/pytorch/pull/134210; a fixed size is hard to handle when the reserved size is smaller than the CPU count.
> a. Limiting with the min() function failed local tests: https://github.com/pytorch/pytorch/pull/134210#issuecomment-2304447729
> b. Dynamically selecting between a fixed-size array and a dynamic array: https://github.com/pytorch/pytorch/pull/134210#issuecomment-2304128666 . It makes the code too complex to maintain.
After discussing with the original PR (https://github.com/pytorch/pytorch/pull/115620) author @zhuhaozhe, we think:
1. MSVC is the only compiler that does not support VLAs.
2. MSVC has worse performance than the other compilers, so use `std::unique_ptr` for MSVC and make it work.
3. For other compilers, keep using the current `VLA` code.
4. Windows users can use `clang-cl` or `icx` to get better performance than MSVC.
5. As discussed with @jansel, we need to move the compiler check to the Python side and make the output code cleaner.
Reproduce UT:
```cmd
pytest test/inductor/test_cpu_repro.py -v -k test_reduction_with_dynamic_threads
```
Error msg:
```cmd
C:/Users/Xuhan/AppData/Local/Temp/tmpncykej5v/a4/ca4534cazplidnf7vopaaxaifqkjiyhxm3h2gsylgztputbaeybx.cpp(13): error C2131: expression did not evaluate to a constant
C:/Users/Xuhan/AppData/Local/Temp/tmpncykej5v/a4/ca4534cazplidnf7vopaaxaifqkjiyhxm3h2gsylgztputbaeybx.cpp(13): note: failure was caused by a read of a variable outside its lifetime
C:/Users/Xuhan/AppData/Local/Temp/tmpncykej5v/a4/ca4534cazplidnf7vopaaxaifqkjiyhxm3h2gsylgztputbaeybx.cpp(13): note: see usage of 'max_threads'
C:/Users/Xuhan/AppData/Local/Temp/tmpncykej5v/a4/ca4534cazplidnf7vopaaxaifqkjiyhxm3h2gsylgztputbaeybx.cpp(16): error C3863: array type 'float [max_threads]' is not assignable
```
Genarated code:
```c++
#include "C:/Users/Xuhan/AppData/Local/Temp/tmpt6mxcjzi/j2/cj22tgrdgh42wbunl7gdptg2lintcziox2kmr7rdbcc6n2njrhgx.h"
extern "C" __declspec(dllexport) void kernel(const float* in_ptr0,
const float* in_ptr1,
float* out_ptr0,
float* out_ptr1)
{
{
{
float tmp_acc0 = 0;
at::vec::Vectorized<float> tmp_acc0_vec = at::vec::Vectorized<float>(0);
int max_threads = omp_get_max_threads();
float tmp_acc0_arr[max_threads];
for (int tid = 0; tid < max_threads; tid++)
{
tmp_acc0_arr[tid] = 0;
}
at::vec::Vectorized<float> tmp_acc0_vec_arr[max_threads];
for (int tid = 0; tid < max_threads; tid++)
{
tmp_acc0_vec_arr[tid] = at::vec::Vectorized<float>(0);
}
#pragma omp parallel
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134221
Approved by: https://github.com/zhuhaozhe, https://github.com/jansel
Summary:
This diff adds two new operators torch.ops._quantized.wrapped_linear_prepack and torch.ops._quantized.wrapped_quantized_linear_prepacked. It is a decomposition of the op torch.ops._quantized.wrapped_quantized_linear added in the previous diff.
We decompose in this way because the packed weight can be computed early, so we don't need to do it in every forward in AOTI.
Reviewed By: jerryzh168
Differential Revision: D61395887
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134232
Approved by: https://github.com/houseroad
Summary:
As title.
Add a test case in test_aot_inductor to check for codegen (i.e. `aoti_torch_print_tensor_handle` is inserted as expected for debugging printer) for both cpu and cuda based on a simple `addmm` test model.
Test Plan:
```
AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=1 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCH_COMPILE_DEBUG=1 TORCH_LOGS="+graph, inductor, +schedule, output_code" buck2 run -c fbcode.enable_gpu_sections=true -c fbcode.nvcc_arch=h100 @//mode/opt fbcode//caffe2/test/inductor:test_aot_inductor -- -r test_aoti_debug_printer_codegen_abi_compatible_{cuda/cpu}
```
Differential Revision: D61169068
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133326
Approved by: https://github.com/ColinPeppler
Summary: Add tests that check function events for dynamic activity toggling for both GPU and CPU events. Also added comments from previous GH comments
Test Plan: Make sure all tests pass
Differential Revision: D61617514
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134149
Approved by: https://github.com/aaronenyeshi
Summary: Reduce the aarch64 dashboard run to only test the default config, until we solve the timeout issue. Also increase the frequency from nightly to 6 times a day, to see if we can reproduce the perf instability Nikita has observed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134265
Approved by: https://github.com/malfet
Switch installation of the pytorch package to our download.pytorch.org sources, which are better maintained.
Also, switch the miniconda installation to a miniforge installation to ensure backwards compatibility for users expecting the conda package manager to be installed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134274
Approved by: https://github.com/malfet, https://github.com/atalman
Co-authored-by: atalman <atalman@fb.com>
Summary:
Make quantization tests compatible with the new training IR.
With the new batch norm node `torch.ops.aten.batch_norm.default`, we don't need an additional getitem node after the bn node, so tests need to be fixed to not check for the getitem node.
We added a capture_pre_autograd_graph_using_training_ir() function, which returns True when we are using the training ir, and False otherwise. This way, the code supports both training ir and the old ir.
For now, we are just rolling out the training ir for fbcode internal tests.
Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_qat_preserve_source_fn_stack
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_qat_update_shared_qspec
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_conv2d
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_qat_conv_bn_relu_fusion
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_qat_conv_bn_fusion
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_qat_conv_bn_fusion_literal_args
```
Reviewed By: andrewor14, tugsbayasgalan
Differential Revision: D61292102
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134259
Approved by: https://github.com/tugsbayasgalan
This patch makes two changes:
1. Whenever ncclCommSplit accepts groupRanks in its config, we should
populate it. This is independent of using PMI or not. For example,
non-PMI NCCL can also use this information, if it chooses to.
2. Provide a user flag to decide when to do a uniqueId broadcast and
when to skip it. This is a performance optimization, and not a
correctness requirement. If the user forgets to set this, we will
do the uniqueId broadcast, which is wasteful (because it will be
ignored by NCCL), but not incorrect.
@exported-using-ghexport
Differential Revision: [D60966774](https://our.internmc.facebook.com/intern/diff/D60966774/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133960
Approved by: https://github.com/shuqiangzhang
Reland of #128143 but added `alpha` and `bias` initialization to `launchTunableGemmAndBias`
Thus far TunableOp was implemented for gemm, bgemm, and scaled_mm. gemm_and_bias was notably missing. This PR closes that gap.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128919
Approved by: https://github.com/malfet
Summary:
In the new training ir, we produce `torch.ops.aten.batch_norm.default` instead of `torch.ops.aten._native_batch_norm_legit.default` or `torch.ops.aten._native_batch_norm_legit_no_training.default`.
So we need to change the pattern match to accomodate the new op.
- Add `torch.ops.aten.batch_norm.default` to pattern matcher list so it's identified as a batch norm node
- `torch.ops.aten.batch_norm.default` doesn't have a getitem user anymore, so when removing the bn node, we need to do `bn_node.replace_all_uses_with(conv_node)` instead of `getitem_node.replace_all_uses_with(conv_node)` (see the sketch below)
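A hedged fx sketch of the second bullet (the helper name is hypothetical, not the actual pass): with `torch.ops.aten.batch_norm.default` there is no getitem user, so the batch-norm node itself is rewired to its conv input when it is removed.
```python
import torch
import torch.fx

def remove_bn_nodes(graph: torch.fx.Graph) -> None:
    # Rewire users of the bn node to the conv output, then drop the bn node.
    for node in list(graph.nodes):
        if node.target == torch.ops.aten.batch_norm.default:
            conv_node = node.args[0]
            node.replace_all_uses_with(conv_node)
            graph.erase_node(node)
```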
The behavior of capture_pre_autograd_graph is consistent for each run.
If the run is a fbcode test, then capture_pre_autograd_graph uses training IR. This means both _get_aten_graph_module_for_pattern and replace_pattern_with_filters see the same training IR.
If the run is not a fbcode test, then both would see the old IR.
Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_conv2d_binary2
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_conv2d_unary
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_linear_unary
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_dynamic_quant_linear
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_qat_dynamic_quant_linear
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_flatten_recipe
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_linear_unary
```
Reviewed By: andrewor14, tugsbayasgalan
Differential Revision: D61291077
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134157
Approved by: https://github.com/tugsbayasgalan
Part of #134054.
This corresponds to the pytorch mypy changes from D61493706. Updating takes so
long and touches so many files that it's impossible to land as a whole without conflicting with some other intermediate change.
So landing these 'type: ignore' for pytorch in advance of them actually being needed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134202
Approved by: https://github.com/Skylion007
Changes:
1. Move `polyfill.py` -> `polyfills/__init__.py`. It can be used as `polyfill.xxx` -> `polyfills.xxx`.
2. Move submodule loading from `polyfills/__init__.py` to `polyfills/loader.py`.
Merge the `polyfill.py` and `polyfills/` packages. Each polyfill module has its own namespace for better code organization.
The ultimate goal is to make `polyfills/__init__.py` empty and move all polyfill functions into their own namespaces.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133977
Approved by: https://github.com/jansel
Summary: When deepcopy a proxy, we first try the default deepcopy behavior.
Test Plan: buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test:fx -- -r proxy_deepcopy
Differential Revision: D61398418
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133706
Approved by: https://github.com/angelayi
Summary:
This diff implements a bunch of views for internal scuba viewing.
TODOS that I might punt to another diff:
- Saving cache stats via counter is definitely sus here, but there's not really a good way to track "fx graph cache hit for this compile phase" right now. Will think about this more.
- We should definitely log frame id, compile id, etc
- We should definitely be logging configs. That way, we can A/B test based on whether a config is turned on.
- idk what I'm doing with compile_uuid yet, but it's useful when you want to look at samples for a single run. I think if we had mast job info this field is not needed, but it's nice to be able to drill down to a single run and get its chrome trace view or icicle view, so idk
Test Plan:
All of the above views are run with nanogpt benchmark:
```
buck run mode/opt caffe2/benchmarks/dynamo:torchbench -- --training --backend=inductor --only nanogpt --performance
```
Differential Revision: D61603243
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134118
Approved by: https://github.com/oulgen
As per title, this PR adds proper casting to fuse_linear_bn_weights in the same style as the conv case above. This previously caused numerical issues on my end, so that is why I am fixing it.
Also cleans up the docstring.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134105
Approved by: https://github.com/mikaylagawarecki
Update cudnn_frontend submodule to 1.6.1 to patch some minor bugfixes and compiler fixes.
# Bug fix
* Fixed an issue where custom dropout mask was not correctly applied.
* Added -fvisibility=hidden for the pip wheels generated to avoid symbol conflicts with other modules that use cudnn frontend.
* Fixed an issue in sdpa operation which when deserialized will lead to numerical mismatches.
* Fixed an issue in sdpa fp8 fprop operation (in inference mode).
# Samples
* Added a new sample to showcase how a custom dropout mask can be applied to a sdpa operation.
* Added a sample to showcase convolutions on large (c * d * h * w > 2 **31) tensors.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134007
Approved by: https://github.com/eqy
Zero bubble can be expressed through `ScheduleFlexibleInterleaved1F1B` by setting `enable_zero_bubble=True`. But instead of having to include this flag in schedule initialization, we should create a separate ZeroBubbleSchedule and also transition `Interleaved1F1B` to derive from `ScheduleFlexibleInterleaved1F1B`. Then we don't need to expose `ScheduleFlexibleInterleaved1F1B`, since the naming is not obvious.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133467
Approved by: https://github.com/wconstab
ghstack dependencies: #132691
Just something I noticed while implementing a new DeviceInterface: I had to add `# type: ignore[assignment]` because mypy thinks DeviceInterface.get_raw_stream is a `Callable` and is therefore incompatible with a `staticmethod`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134187
Approved by: https://github.com/jansel
CUTLASS automatically skips a stage in the epilogue if we provide a nullptr. Thus, instead of building a special kernel for bias=None, we can reuse one of the other ones.
This also considerably simplifies the code.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134113
Approved by: https://github.com/drisspg
ghstack dependencies: #134110, #134111, #134112
The compute dtype for the bias addition was set to ElementBias. Thus, for a bf16 bias, we would cast the fp32 accum to bf16 and _then_ add the bias. It is however (slightly?) more accurate to first add the bias in fp32 and only cast at the end.
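A small numerical illustration of the claim on synthetic data (this is not CUTLASS code): adding the bias in fp32 and casting once typically loses less precision than casting the accumulator to bf16 before the add.
```python
import torch

accum = torch.randn(4096, dtype=torch.float32)   # stand-in for the fp32 accumulator
bias = torch.randn(4096, dtype=torch.float32)

cast_then_add = accum.to(torch.bfloat16) + bias.to(torch.bfloat16)  # old order
add_then_cast = (accum + bias).to(torch.bfloat16)                   # new order

ref = accum.double() + bias.double()
print("cast-then-add error:", (cast_then_add.double() - ref).abs().mean().item())
print("add-then-cast error:", (add_then_cast.double() - ref).abs().mean().item())
```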
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134112
Approved by: https://github.com/drisspg
ghstack dependencies: #134110, #134111
Bug fixes for PyTorch 2.5:
1. Use the SYCL group algorithm API instead of the old style for sub-group shift utilities.
2. Add preprocessing in the reduction kernel for cases requiring a data type cast.
3. Make the group norm memory format compatible.
4. ZeroTensor: a. Remove unnecessary aten operator registrations, otherwise ZeroTensor processing is bypassed. b. Align preprocessing with the in-tree implementation in aten::copy_.
5. Rebase checkIndexTensorTypes usage.
6. Align with the latest semantics of PyTorch foreach operators: return multiple tensors with offset=0.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133850
Approved by: https://github.com/EikanWang
As you can see, 'privateuse1' appears many times in out-of-tree extension codebases. I think that everything about the device type should be the same as for other in-tree backends after registering the privateuse1 backend.
For example, after registering a privateuse1 backend named "foo", you should allow "foo" to be passed in as a valid device type.
```diff
- instantiate_device_type_tests(TestIndexing, globals(), only_for='privateuse1')
- instantiate_device_type_tests(NumpyTests, globals(), only_for='privateuse1')
+ instantiate_device_type_tests(TestIndexing, globals(), only_for='foo')
+ instantiate_device_type_tests(NumpyTests, globals(), only_for='foo')
```
> https://github.com/Ascend/pytorch/blob/master/test/test_indexing.py#L1654-L1655
The change is to map the privateuse1 backend's registered name to 'privateuse1' when calling `filter_desired_device_types()`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133082
Approved by: https://github.com/albanD
Summary:
Previously, reuse of the same `Dim` was encoded by "sharing" internal constraints among constraint targets. This kind of sharing, implemented using `shared` fields between `_Constraint`s, was originally motivated by `dynamic_dim`, specifically to support `==` between `dynamic_dim`s, but we no longer need to maintain this overcomplicated structure: we can simply use names of `Dims` to directly encode sharing information.
Thus this PR vastly simplifies the structure of `_Constraint` by removing `shared` fields. As a result, both `_Constraint` and its moral subclass, `_DerivedConstraint`, are 1-1 with `Dim` and its moral subclass, `DerivedDim`.
Note that this will break `==` over `dynamic_dim`, so an immediate follow-up will be to remove `dynamic_dim` entirely from our public API. (It's been more than 6 months since the deprecation warning anyway.) I just didn't want to deal with that process in the same PR.
Test Plan: existing
Differential Revision: D61559413
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134045
Approved by: https://github.com/pianpwk
Currently, `fully_shard` will create a new `FSDPMyModuleClass` class for each `MyModuleClass` module **object**, which causes Dynamo to guard-fail on every module object's type check. This PR fixes the issue by caching and reusing the previously created FSDP wrapper class.
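A hedged sketch of the caching idea (names are illustrative, not the real `fully_shard` internals): create the wrapper class once per original module class and reuse it, so Dynamo's type-based guards see a stable type.
```python
_wrapper_cls_cache: dict = {}

def get_fsdp_wrapper_cls(module_cls: type) -> type:
    # Reuse the dynamically created wrapper class across module objects
    # of the same original class.
    wrapper = _wrapper_cls_cache.get(module_cls)
    if wrapper is None:
        wrapper = type(f"FSDP{module_cls.__name__}", (module_cls,), {})
        _wrapper_cls_cache[module_cls] = wrapper
    return wrapper
```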
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134135
Approved by: https://github.com/awgu
Summary:
This PR adds in cuSPARSELt as a backend to PyTorch.
It is now possible to see if cuSPARSELt is available and the version if
it is with
```
torch.backends.cusparselt.is_available()
torch.backends.cusparselt.version()
```
Test Plan:
```
python test/test_sparse_semi_structured.py -k test_cusparselt_backend
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128534
Approved by: https://github.com/cpuhrsch, https://github.com/eqy, https://github.com/syed-ahmed
As in the title. In addition, the PR introduces `_int_bsr_dense_addmm` that is equivalent to `bsr_dense_addmm` except for int8 inputs the operation result is int32 tensor (similar to existing `_int_mm`).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133855
Approved by: https://github.com/cpuhrsch
Fixes#133690
The naming was added in #121170 to allow performance debugging of latency critical threads. However the `pt_main_thread` name gets inherited every time a new process or thread is created from the parent one, which defeats the purpose. We need a better way to name the thread that launches kernels on accelerators but for the time being we can let users name the threads in the application code, using: `torch.multiprocessing._set_thread_name("insert_name")`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134066
Approved by: https://github.com/soulitzer, https://github.com/d4l3k
The functorch partitioners use network flow to split the joint graph into a forward and backward graph. Internally, we've found that upgrading to networkx 2.8.8 (from 2.5) results in some hard-to-debug failures (internal reference: https://fburl.com/workplace/jrqwagdm). And I'm told that there's interest to remove the python dependency.
So this PR introduces a C++ implementation that mirrors the API provided by networkx. We'll need to add python bindings and do some additional testing to verify correctness.
Differential Revision: [D61550977](https://our.internmc.facebook.com/intern/diff/D61550977)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132188
Approved by: https://github.com/Chillee
Add a way of generating a FunctionSchema from example values, because a hop's schema can vary even for the same hop.
We didn't use torch._C.FunctionSchema because we cannot construct that class directly (e.g. `__init__` cannot be used for torch._C.FunctionSchema), and extending the basic types in C++ does not seem easy.
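A hedged sketch of what "generate a schema from example values" can look like; the real helper in this PR is more complete, and the names below are illustrative:
```python
import torch

def schema_str_from_examples(op_name: str, example_args) -> str:
    # Map runtime values to schema type names; this is the gist of deriving
    # a per-call schema when the same hop is invoked with different inputs.
    def type_name(v):
        if isinstance(v, torch.Tensor):
            return "Tensor"
        if isinstance(v, bool):      # check bool before int
            return "bool"
        if isinstance(v, int):
            return "SymInt"
        if isinstance(v, float):
            return "float"
        raise NotImplementedError(type(v))

    args = ", ".join(f"{type_name(v)} arg{i}" for i, v in enumerate(example_args))
    return f"{op_name}({args}) -> Tensor"

print(schema_str_from_examples("my_hop", (torch.randn(2), 3, True)))
# my_hop(Tensor arg0, SymInt arg1, bool arg2) -> Tensor
```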
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133521
Approved by: https://github.com/zou3519
Summary:
In export, we generate many redundant getitem nodes branching from the same source, inserted by runtime assertions or other passes. This causes issues for any downstream system that relies on each value being uniquely defined by a single node.
I don't think it hurts to only remove redundant getitem nodes, so I just added the deduplication to the ctor.
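A minimal sketch of this kind of getitem deduplication, written directly against torch.fx (illustrative; the actual pass in export may differ):
```python
import operator

import torch

def dedupe_getitems(gm: torch.fx.GraphModule) -> None:
    # Collapse multiple getitem nodes that read the same index of the same
    # source node into a single node.
    seen = {}
    for node in list(gm.graph.nodes):
        if node.op == "call_function" and node.target is operator.getitem:
            key = (node.args[0], node.args[1])
            if key in seen:
                node.replace_all_uses_with(seen[key])
                gm.graph.erase_node(node)
            else:
                seen[key] = node
    gm.graph.lint()
    gm.recompile()
```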
Test Plan:
rebase on D61256937
```
buck2 run scripts/bearzx:pt2_export_playground
```
Differential Revision: D61351578
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133618
Approved by: https://github.com/tugsbayasgalan
Add `stage_backward_input` and `stage_backward_weight` functions to perform the weight updates for inputs and weights independently.
We still support the `self.dw_builder` argument for a custom backward, but it has become optional. It takes a separate code path and cannot be used in conjunction with the native zero-bubble backward.
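Conceptually, splitting the backward into an input-gradient step and a weight-gradient step looks something like the sketch below (plain autograd, not the actual pipelining implementation):
```python
import torch

layer = torch.nn.Linear(8, 8)
x = torch.randn(4, 8, requires_grad=True)
loss = layer(x).sum()

# Backward w.r.t. the stage input first (this unblocks the previous stage)...
(dx,) = torch.autograd.grad(loss, (x,), retain_graph=True)

# ...and compute the weight gradients later, filling the pipeline bubble.
dws = torch.autograd.grad(loss, tuple(layer.parameters()))
```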
Added tests:
`python test/distributed/pipelining/test_schedule_multiproc.py -k test_schedule_with_native_zero_bubble`
`python test/distributed/pipelining/test_backward.py`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132691
Approved by: https://github.com/wconstab
**Summary**
Implement the complete vectorization of `index_expr` functionally. We also add heuristics from a performance perspective to resolve the regressions reported in https://github.com/pytorch/pytorch/pull/122961#issuecomment-2041336265 by disabling vectorization of specific (Fused) scheduler Nodes:
- Heuristic 1: when the number of non-contiguous `index_expr/load/store` operations exceeds a threshold, we disable vectorization.
- Heuristic 2: when the total number of elements along the vec dim is less than `tiling_factor/2`, we disable vectorization.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122961
Approved by: https://github.com/jansel
Co-authored-by: leslie-fang-intel <leslie.fang@intel.com>
Summary:
This diff adds a new operator wrapped_quantized_linear (torch.ops._quantized.wrapped_quantized_linear), which takes the following input arguments: input (in fp32), input_scale, input_zero_point, weight (in fp32), weight_scale, weight_zero_point, bias (in fp32), output_scale, output_zero_point, and out_channel. It does the following:
1. Use quantize_per_tensor(input, input_scale, input_zero_point) to quantize the input tensor to int8
2. Use quantized::linear_prepack(weight, weight_scale, weight_zero_point, bias) to pack the weight and bias
3. Use quantized::linear to perform int8 quantized linear
4. Dequantize the result
This new op is essentially a wrapper around multiple ops. We do this because torch.export cannot handle models that use the old quantize APIs.
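Roughly, the wrapper composes the existing quantized ops like the eager-mode sketch below (scales/zero points are made up, and the activation is quantized as quint8 as the eager quantized backends expect; the real op is implemented in C++):
```python
import torch

x = torch.randn(2, 4)
w = torch.randn(3, 4)
b = torch.randn(3)

# 1. quantize the fp32 input
qx = torch.quantize_per_tensor(x, scale=0.1, zero_point=128, dtype=torch.quint8)
# 2. quantize and prepack the fp32 weight together with the bias
qw = torch.quantize_per_tensor(w, scale=0.05, zero_point=0, dtype=torch.qint8)
packed = torch.ops.quantized.linear_prepack(qw, b)
# 3. int8 quantized linear
qy = torch.ops.quantized.linear(qx, packed, 0.2, 0)  # output_scale, output_zero_point
# 4. dequantize back to fp32
y = qy.dequantize()
```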
Reviewed By: jerryzh168
Differential Revision: D61377266
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134024
Approved by: https://github.com/houseroad
Add a decorator `torch.compiler.substitute_in_graph` to register a polyfill for an unsupported C++ function and avoid a graph break. This API provides an official way to add Dynamo support for third-party C extensions. It can also be used to simplify our implementation of `torch._dynamo.polyfill`.
5ee070266f/torch/_dynamo/variables/builtin.py (L97-L107)
Example:
```python
>>> import operator
>>> operator.indexOf([1, 2, 3, 4, 5], 3)
2
>>> torch.compile(operator.indexOf, fullgraph=True)([1, 2, 3, 4, 5], 3)
Unsupported: ...
>>> @torch.compiler.substitute_in_graph(operator.indexOf)
... def indexOf(sequence, x):
... for i, item in enumerate(sequence):
... if item is x or item == x:
... return i
... raise ValueError("sequence.index(x): x not in sequence")
>>> torch.compile(operator.indexOf, fullgraph=True)([1, 2, 3, 4, 5], 3)
2
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133712
Approved by: https://github.com/jansel
Summary:
* TLDR:
`getenv` is not thread safe w.r.t `setenv`. Environment variables are kept as a per-process "dictionary" by libc. `setenv` can essentially realloc the whole thing and move the list to a completely different location. If a concurrent `getenv` happens at the same time, it may end up reading stale memory and segfault.
`getenv` is thread safe w.r.t other `getenv`.
* Details:
Inside PTD init:
```
ProcessGroupNCCL ctor
...
ncclCommWatchdogThread_ =
std::thread(&ProcessGroupNCCL::ncclCommWatchdog, this); (https://fburl.com/code/terf9ai7)
```
Inside ncclCommWatchdog thread:
```
...
ncclHeartbeatMonitorThread_ =
std::thread(&ProcessGroupNCCL::heartbeatMonitor, this); (https://fburl.com/code/fv9camg2)
...
```
Inside heartbeatMonitor thread:
```
...
std::optional<DumpPipe> dumpPipe = std::nullopt; (https://fburl.com/code/qdvahzbu)
dumpPipe.emplace(rank_);
...
```
Inside DumpPipe ctor (https://fburl.com/code/wvixlqcz)
```
getCvarString
getenv <=== SIGSEGV
```
On the main thread:
We go on to initialize NCCL:
Inside getNCCLComm, we call: `getNcclVersion` -> `initEnv` (https://fburl.com/code/j312pccu)
`initEnv` inside NCCL does this: `initEnv` -> `setEnvFile`
It reads the /etc/nccl.conf file and sets env variables with "setenv" (https://fburl.com/code/cq4r0y0h).
This "setenv" can race with "getenv" in the heartbeatMonitor thread.
Ideally, all `setenv` calls should be done by a single thread before launching other threads. This diff moves getNcclVersion before launching the watchdog thread to make sure all setenv calls are done beforehand.
I think we have just been getting lucky that we are not hitting this in production. IIRC we did see getenv segfault once in one of the large-scale runs, but I don't remember the details.
Test Plan: A lot of testing done as part of D61411062 & CI
Differential Revision: D61421292
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133744
Approved by: https://github.com/wconstab, https://github.com/fduwjj
Summary:
Change ReorderConvertTest to work with the new `capture_pre_autograd_graph` implementation using D61175223.
Note that now `ReorderConvertTest` doesn't work with the old `capture_pre_autograd_graph` anymore.
Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//bolt/nn/executorch/passes/tests:optimize_test -- -r ReorderConvertTest
```
Differential Revision: D61507772
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134010
Approved by: https://github.com/tugsbayasgalan
Link the various classes and functions of `torch.optim.swa_utils` so their doc content is accessible from the `torch.optim` doc.
Currently, if you click the link https://pytorch.org/docs/stable/optim.html#module-torch.optim.swa_utils, it goes to a blank section at the bottom of the `torch.optim` page.
Also, the `torch.optim.swa_utils.AveragedModel` and `torch.optim.swa_utils.SWALR` classes, as well as `torch.optim.swa_utils.update_bn()` and `optim.swa_utils.get_ema_multi_avg_fn`, are not linked in the docs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133393
Approved by: https://github.com/janeyx99
https://github.com/pytorch/pytorch/pull/132990 introduced a dependency on `torch.version`, which might not be imported yet and can result in `AttributeError: partially initialized module 'torch' has no attribute 'version' (most likely due to a circular import)` if a user starts their code with `import torch.cuda`.
Fix it by importing `torch.version` explicitly.
Test Plan: CI
Differential Revision: D61549284
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134019
Approved by: https://github.com/seemethere
Summary:
Skip re-exporting modules with the duplicated types to speed up the exportability tests.
In real models, there are many duplicated modules, and mostly have the same export issues.
Test Plan: Existing CI
Differential Revision: D61504630
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133930
Approved by: https://github.com/angelayi
```
# suppose we have a 3d mesh
mesh_3d = init_device_mesh("cuda", (2, 2, 2), mesh_dim_names=("dp", "cp", "tp"))
dp_cp_mesh = mesh_3d["dp", "cp"]._flatten()
"""
then we would have
flatten_name_to_root_dims[mesh_3d]: {
"dp_cp": (0, 1)
}
"""
```
We need this information to validate the ordering of a mesh slice that includes a flattened mesh dim.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133838
Approved by: https://github.com/fegin
Summary:
Skip re-exporting modules with the duplicated types to speed up the exportability tests.
In real models, there are many duplicated modules, and mostly have the same export issues.
Test Plan: Existing CI
Differential Revision: D61504630
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133930
Approved by: https://github.com/angelayi
Co-authored-by: bearzx <bearzx@fb.com>
This is a parallel PR to https://github.com/pytorch/pytorch/pull/133819, and it appends changes for @jansel's comments.
1. For `torch/_inductor/codegen/cpp_wrapper_cpu.py`, revert to the original code that appends LL on MacOS and Windows: bdc14ad89a
2. For `torch/_inductor/codegen/cpp_utils.py`, append LL on MacOS and Windows for large constants, and fix its UTs: 3a56b76ce0
------------------------------
This is another solution for https://github.com/pytorch/pytorch/pull/133615: use `int64_t` as the index type for all platforms.
### Development notes:
The mentioned PR (https://github.com/pytorch/pytorch/pull/133615) fixes the index type not matching the parse_arg argument types. As reviewed with @jansel, Jason thinks we need to unify `INDEX_TYPE` for all platforms.
The current code is cumbersome:
```python
INDEX_TYPE = "int64_t" if _IS_WINDOWS else "long"
```
So, I made some attempts to unify `INDEX_TYPE` as either `long` or `int64_t`:
To use `long` as the index type: https://github.com/pytorch/pytorch/pull/133768
To use `int64_t` as the index type: https://github.com/pytorch/pytorch/pull/133782
After that, we discussed which type to select as the final solution.
The `long` type has different definitions and sizes across OSs and compilers, so @jansel made the decision to select `int64_t` for all platforms, and I continued my work based on https://github.com/pytorch/pytorch/pull/133782.
https://github.com/pytorch/pytorch/pull/133782 still had two issues:
1. std::min/std::max could not match function instances by argument types. This was fixed and validated in PR: https://github.com/pytorch/pytorch/pull/133812
2. A CUDA TestMemoryPlanning::test_cpp_wrapper issue caused by the wrong index type. It is fixed in this PR.
So, this PR contains the final solution.
### Changes:
**1. Use `int64_t` as the index type for all OSs: `Windows`, `Linux` and `MacOS`.**
**2. Use static_cast<int64_t>(`constant`) to convert constants passed to `div_floor_integer` to its argument type (`int64_t`).**
**3. Update the `parse_arg` function signature to `int64_t`, following the index type.**
**4. Append a double L (`LL`) to constants on Windows and MacOS, because their int64_t is long long.**
**5. Fix `std::min/std::max` type mismatches by static_cast to `INDEX_TYPE`.**
**6. Fix UTs, including the CUDA `TestMemoryPlanning::test_cpp_wrapper` and `test_indexing.py`.**
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133892
Approved by: https://github.com/jansel
Another attempt to update NVTX to NVTX3. We now avoid changing the NVTX header inclusion of existing code. The advantage of NVTX3 over NVTX is that it is a header-only library, so linking with NVTX3 can greatly simplify our CMake and other build scripts for finding libraries in user environments. In addition, NVTX is indeed still present in the latest CUDA versions, but it is no longer a compiled library; it is now header-only, which is why there isn't a .lib file anymore.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/109843
Approved by: https://github.com/peterbell10, https://github.com/eqy
Co-authored-by: Ivan Zaitsev <108101595+izaitsevfb@users.noreply.github.com>
Summary:
- exir.capture + to_edge is deprecated. We need to use export + to_edge.
- Fix the quantization pass to be compatible with the new export IR. In the quantization pass, some nodes might have side effects, so they don't have users but still are not removed by the DCE pass. We need to account for this.
- Now export_rle_model works with the default `capture_pre_autograd_graph`; it should also work with the new training IR.
Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//bolt/nn/executorch/export:export_rle_model -- -r export_rle_model
```
Differential Revision: D61485834
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133937
Approved by: https://github.com/tugsbayasgalan
Summary:
The existing tests didn't cover a case where we had multiple autotunes in a single graph. Add a test to demonstrate that case.
Also added a test dependency on redis and removed the "fake redis" from the previous PR (#133579)
Test Plan: unit tests
Reviewed By: oulgen
Differential Revision: D61178861
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133868
Approved by: https://github.com/oulgen
Adds guards checking whether torch function mode is in the all disabled state.
There are three torch function enablement states:
* All torch function disabled (modes + subclasses)
* Torch function subclass disabled
* All enabled
We now have guards checking whether the state is All enabled and whether the state is All disabled.
Each of the three states above is assigned a unique pair of these two flags.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133135
Approved by: https://github.com/anijain2305
ghstack dependencies: #133130, #133729, #133131, #133132, #133133, #133134, #133136
This PR adds a C function to check if all torch function is disabled.
Recall that there are three torch function enablement states:
* All disabled
* Torch Function Subclass disabled
* All enabled
The API before this change provides two functions:
* `_is_torch_function_enabled` - returns True iff the current TF state is All enabled
* `_is_torch_function_mode_enabled` - returns True iff the state is not All disabled and the torch function mode stack is non-empty.
The crux of why a new API is needed is the following: if Dynamo enters a frame with the torch function mode stack empty and `_is_torch_function_enabled` == False, it is impossible to determine whether we should enter a newly pushed mode or not. This is because we don't know whether the enablement state is All disabled or only Subclass disabled. Adding an API that checks whether All disabled is True allows us to disambiguate this case.
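A hedged illustration of the ambiguity (the two query functions are the pre-existing API described above; whether the context managers below map exactly onto the two disabled states is an assumption of this sketch):
```python
import torch

# With the torch function mode stack empty, the two disabled states are
# indistinguishable through the old two-function API:
with torch._C.DisableTorchFunction():             # everything disabled
    print(torch._C._is_torch_function_enabled())       # False
    print(torch._C._is_torch_function_mode_enabled())  # False

with torch._C.DisableTorchFunctionSubclass():     # only subclasses disabled
    print(torch._C._is_torch_function_enabled())       # False
    print(torch._C._is_torch_function_mode_enabled())  # False

# So when Dynamo later sees a mode being pushed, these flags alone cannot
# tell it whether the mode should actually run; the new C function that
# checks the "all disabled" state resolves exactly this ambiguity.
```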
In the next PR, Dynamo InstructionTranslator will have clearer flags than the underlying C API:
* A flag to indicate if subclasses are disabled (ie All disabled or Subclass Disabled is the current state)
* A flag to indicate if modes are disabled (ie if All disabled is the current state)
* A symbolic stack which can be checked if any modes are present
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133136
Approved by: https://github.com/bdhirsh
ghstack dependencies: #133130, #133729, #133131, #133132, #133133, #133134
This PR adds support `torch._C._push_on_torch_function_stack()` by updating `torch.py` to push onto the symbolic torch function mode stack when a push is encountered. The same side effects infra used in the previous PR is used to track the mutation of the torch function mode stack and add bytecode to update it if it is mutated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133132
Approved by: https://github.com/williamwen42
ghstack dependencies: #133130, #133729, #133131
This PR adds support for tracing `torch._C._pop_torch_function_stack()` without graph breaking and in order to verify the state change also adds replay of mutations to the torch function mode stack via side_effects appending supplemental bytecode as we do for other python mutable objects.
Details:
To represent the torch function mode stack symbolically a deque field is added to the instruction translator. When the InstructionTranslator is initialized, all modes are read from the current torch function mode stack, and stashed in a global weak ref for later access (using existing sources) without needing to push/pop the python/cpp torch function mode stack.
During tracing, when `_pop_torch_function_stack` is encountered a value is popped from this deque and the variable tracker representing the mode is returned. To ensure the true torch function mode stack matches this state, `TorchFunctionModeStackVariable`, a singleton, is marked as mutated, this adds it to side effects, where during final codegen, side effects will codegen a call to a python helper which will update the python torch function mode stack.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133131
Approved by: https://github.com/jansel
ghstack dependencies: #133130, #133729
This PR adds a guard on the torch function mode stack state at the beginning of tracing. The way this is implemented is via a new leaf guard which is passed the initial stack state at construction and compares it to the stack state at the time the guard is run.
Details:
The stack state is extracted via popping all modes, appending them to a list, and pushing all modes back. This list is stored on the output graph and read during guard construction to pass to the stack mode guard. There the length and types of the modes are recorded. Next time the guard is run it compares this recorded state to the current mode stack state.
To implement this in python a helper function was added to utils.py and this is used if cpp guards are not enabled.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133130
Approved by: https://github.com/anijain2305
Summary: Default TORCH_NCCL_DUMP_ON_TIMEOUT to "true" and add a killswitch in case we need to disable this feature in production.
Test Plan: Tests pass manually but need further testing before this is rolled out fully everywhere.
Differential Revision: D61136320
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133237
Approved by: https://github.com/c00w
Summary: This diff fixes many lint issues in qlinear_prepack.cpp. I'm fixing them because I want to add more ops/funcs to this file later.
Test Plan: Sandcastle
Differential Revision: D61425436
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133797
Approved by: https://github.com/Skylion007
Summary: `_ConstraintTarget` is an internal data structure that has some redundancy: tensors are identified by their id but also carry a weak reference. The weak reference was probably useful a year back but everything is done with ids right now, and the lifetime of these tensors ensures that using their ids is OK.
Test Plan: existing tests
Differential Revision: D61488816
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133890
Approved by: https://github.com/tugsbayasgalan
Summary: When generating CUDA kernel load and launch code, certain Triton kernel metadata is needed, but that metadata only exists after kernel auto-tuning is done. DeferredCudaKernelLine is a deferred line which can backfill a string template after kernel auto-tuning. This prepares for the one-pass AOTI codegen implementation.
Differential Revision: [D61018114](https://our.internmc.facebook.com/intern/diff/D61018114)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129135
Approved by: https://github.com/angelayi
Summary:
Remove the early exit for padding when padding = [0, 0, 0, 0].
This prevents export from specializing when all padding=0, allowing export when all padding >= 0. Specialization will still happen for negative padding.
This change will be used to export image preprocessing for multimodal models, where images of dynamic shape are padded. Since the images have dynamic shapes, we can't be sure whether padding will be required or not. Padding is guaranteed to be non-negative.
Preprocess code: https://github.com/pytorch/torchtune/pull/1242
Note: the alternative is to wrap padding in a custom op, which isn't ideal given the custom op will contain the same impl as constant_pad_nd.
Test Plan: ci
Differential Revision: D60687727
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132679
Approved by: https://github.com/ezyang
The regex in the script is too restrictive, as it excludes examples with parentheses in args, like the following:
```
triton_poi_fused_add_0.run(arg0_1.item(), arg1_1.item(), buf0, 1, grid=grid(1), stream=streamNone)
^
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130837
Approved by: https://github.com/Chillee
Fixes the observed graph breaks in https://github.com/pytorch/pytorch/issues/121349 and https://github.com/pytorch/pytorch/issues/121350.
But there are still graph breaks since a random output is being used as a seed, e.g.
```python
import random
import torch
def fn(x):
    seed = random.randint(0, 100)
    rand = random.Random(seed)
    return x + rand.randrange(10)
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
opt_fn(torch.ones(1))
```
fails with
```
torch._dynamo.exc.InternalTorchDynamoError: UnspecializedPythonVariable() is not a constant
```
when tracing the line
```
rand = random.Random(seed)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133725
Approved by: https://github.com/jansel
# UPDATE:
This is take 3 of https://github.com/pytorch/pytorch/pull/131863, which was landed via co-dev but did not apply correctly.
# Summary
Changes the stance of SDPA on what to do for fully masked out rows
## Current Behavior
Several PyTorch users have expressed frustration over this issue:
- https://github.com/pytorch/pytorch/issues/41508
- https://github.com/pytorch/pytorch/issues/103749
- https://github.com/pytorch/pytorch/issues/103963
These are significant issues with extensive discussion but no satisfactory resolution. The PyTorch team's consensus, as stated here:
https://github.com/pytorch/pytorch/issues/24816#issuecomment-524415617
Can be paraphrased as follows:
When passing in fully masked out rows, attention becomes ambiguous. We have two main options:
1. Uniformly attend to all values:
```python
scores[masked_out_rows] = 1 / len(row)
out[masked_out_rows] = 1 / len(row) * value
```
2. Decide that attention between no queries (masked) and no keys (masked) is meaningless:
```python
output[fully_masked_rows] = NaN
```
We went with option 2, partially because it was easier to implement, but also because people argued that users can slice the output to remove the NaNs:
``` Python
>fill_value = -float("inf")
>row0 = torch.randn(4)
>row1 = torch.tensor([fill_value for _ in range(4)])
>matrix = torch.stack([row0, row1]).requires_grad_(True)
>out = torch.softmax(matrix, 1)
>out = out[0]
>print(out)
tensor([0.5377, 0.2729, 0.0692, 0.1201])
```
Cool, problem solved. But what happens when you call backward...
```Python
>out.backward(torch.ones_like(out))
>print(matrix.grad)
tensor([[3.0957e-08, 1.4157e-08, 7.7802e-10, 1.3713e-08],
[ nan, nan, nan, nan]])
```
Those pesky NaNs are back!
## Why do we see NaNs today?
The core of the problem revolves around using softmax function in sdpa:
```python
> row = torch.tensor([(-float("inf")) for _ in range(4)])
> torch.softmax(row, 0)
tensor([nan, nan, nan, nan])
```
## Quick Aside: Masking in Attention
Attention itself doesn't have a concept of masking. The `sdpa` function has an argument called `attn_mask`, which would be more accurately named `attn_bias`. This is because we don't actually "mask" entries when computing attention. Instead, due to implementation details ([performance](https://github.com/pytorch/pytorch/issues/25110#issuecomment-524519087)), we add a value to the masked-out query/key pairs.
We use a large negative number (typically -inf) to decrease the attention weight, as softmax assigns more weight to larger values.
## Alternative Approaches
If we use a very large negative number instead of -inf:
```python
> row = torch.tensor([(-1e6) for _ in range(4)])
> torch.softmax(row, 0)
tensor([0.2500, 0.2500, 0.2500, 0.2500])
```
However, if users always remembered to "slice" out their outputs, i.e.:
```Python
>fill_value = -1e6
>...
>out.backward(torch.ones_like(out))
>print(matrix.grad)
tensor([[-0.0563, -0.0564, 0.1613, -0.0486],
[ 0.0000, 0.0000, 0.0000, 0.0000]])
```
This would bring us back into a better state.
## A Third Option
We don't necessarily need to alter the behavior of softmax for -inf or very large negative numbers. The fundamental goal is to exclude certain query/key pairs from attention, regardless of the underlying implementation.
This PR implements the new semantic for masking w/ attention in fully masked-out rows:
```python
out[masked_out_rows] = 0
```
**Important Note**: This idea isn't entirely new. The [MaskedTensor](https://pytorch.org/tutorials/prototype/maskedtensor_overview#safe-softmax) prototype, a tensor subclass, was designed to handle such cases. However, it remains a prototype feature and hasn't gained widespread adoption.
## Details
This PR stack does 3 things:
1. Adds a PRIVATE _safe_softmax op
2. Updates semantic for flash_cpu fused kernel
3. Updates semantic for efficient_cuda fused kernel
_safe_softmax is not supposed to be used generically and is only meant to be used within the context of SDPA. Because of this, instead of decomposing softmax and checking for -inf rows, we "cheat" and use nan_to_num.
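For intuition, the "cheat" described above behaves like the following eager-mode sketch (not the actual kernel code):
```python
import torch

scores = torch.full((2, 4), float("-inf"))
scores[0] = torch.randn(4)  # row 0: normal scores, row 1: fully masked

out = scores.softmax(dim=-1).nan_to_num(0.0)
print(out[0])  # ordinary softmax weights
print(out[1])  # tensor([0., 0., 0., 0.]) instead of NaNs
```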
Why do I think this is okay? (Please find a counterpoint if available.)
There are multiple ways NaNs can emerge. For the fully masked-out rows case, nan_to_num works. But what if there were other NaNs; wouldn't this silently remove them?
The only way that can happen is if the input itself had a NaN or an Inf.
For example:
```Python
a = torch.ones([4], requires_grad=False, dtype=torch.float16)
a[1] = torch.finfo(torch.float16).max
print(a.softmax(-1))
```
Will return
`tensor([0., 1., 0., 0.], dtype=torch.float16)`
Where
```Python
a = torch.ones([4], requires_grad=False, dtype=torch.float16)
a[1] = float("inf")
a.softmax(-1)
```
returns:
`tensor([nan, nan, nan, nan], dtype=torch.float16)`
If we don't want to even allow for the possibility of "inf" or "NaN" attention scores being converted to 0, then we could implement it something like this:
```Python
max = torch.max(a, dim=-1, keepdim=True)
exp = torch.exp(a - max.values)
denom = torch.sum(exp, dim=-1, keepdim=True)
softmax = exp / denom
softmax = torch.where(max.values == float('-inf'), 0.0, softmax)
```
However, we would be paying for this in math performance.
## Why Now
I think one point that has substantially changed where PyTorch should lie on this argument is the fact that we have fused implementations for SDPA now. And these fused implementations allow us to easily and performantly support this new semantic.
Differential Revision: [D61418679](https://our.internmc.facebook.com/intern/diff/D61418679)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133882
Approved by: https://github.com/soulitzer
When exporting a training model for Executorch (which requires all ops to be core aten) with cross entropy loss (`torch.nn.CrossEntropyLoss`), we ran into the following error from the fx verifier in `to_edge`:
```
torch._export.verifier.SpecViolationError: Operator torch._ops.aten.nll_loss2d_forward.default is not Aten Canonical.
```
The aten [implementation](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/LossNLL.cpp#L624) of `torch.nn.CrossEntropyLoss` uses `nll_loss2d_forward` for inference and `nll_loss2d_backward` for training, so we need to add the decompositions for both (which already exist) to the list of core aten decompositions.
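A hedged sketch of the user-facing flow this enables (standard export APIs only; the Executorch `to_edge` call that follows is omitted):
```python
import torch

class TrainStep(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, logits, target):
        return self.loss(logits, target)

logits = torch.randn(2, 3, 4, 4)         # N, C, H, W
target = torch.randint(0, 3, (2, 4, 4))  # N, H, W class indices
ep = torch.export.export(TrainStep(), (logits, target))

# With the nll_loss2d decompositions in the core ATen table, running
# decompositions yields a graph made only of core ATen ops.
ep = ep.run_decompositions()
print(ep.graph_module.code)
```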
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133534
Approved by: https://github.com/JacobSzwejbka
## Description
Create decomposition of _unsafe_index_put (non-core aten) that turns it into index_put (core aten)
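The decomposition itself is essentially a one-liner; a hedged functional sketch of what it does (shown as a plain function here rather than the actual registration):
```python
import torch

aten = torch.ops.aten

def _unsafe_index_put_decomp(x, indices, values, accumulate=False):
    # _unsafe_index_put is index_put without bounds checking, so a
    # core-ATen-only graph can express it directly as index_put.
    return aten.index_put(x, indices, values, accumulate)

x = torch.zeros(4)
out = _unsafe_index_put_decomp(x, (torch.tensor([1, 3]),), torch.ones(2))
print(out)  # tensor([0., 1., 0., 1.])
```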
## Testing
Phi3 mini + LoRA model successfully passed `to_edge` after failing due to a non-core aten `unsafe_index_put` getting introduced in a decomposition during joint graph calculations.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133365
Approved by: https://github.com/pianpwk
Summary: Migrate to aten IR, `reshape` -> `view.default`. This does not cover `flatten`, as there are already optimizations for it in PT2; see the example here: P1506057533
Differential Revision: D60476525
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132183
Approved by: https://github.com/frank-wei
Creates a new runtime that shifts complexity from runtime to
ahead-of-time.
The existing runtime (PipelineScheduleMulti) accepts a
compute-only schedule (forward, backward, weight) actions only are
specified, and it infers the communication operations at runtime.
Compared to that runtime, PipelineScheduleRuntime has less logic that
happens at runtime and relies on lowering passes to transform the
compute-only schedule to add communications.
Advantages include
- easier to verify the correctness by dumping a compute+comm schedule
- possible to manually edit the compute+comm schedule if the lowering
heuristics are insufficient
Functionality included inside the PipelineScheduleRuntime is limited to
- accepting a compute-only schedule and lowering it to add comms
- executing the compute or comm operations specified by the given
schedule
- handling work.wait() automatically by calling it just before the
matching compute operation (for RECV ops) or at the end of step (for
SEND ops)
Follow ups for later PRs
- Some refactoring should be done to replace PipelineScheduleMulti with
this runtime
- Optimizer execution is not considered (e.g. for zero-bubble cases)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130488
Approved by: https://github.com/H-Huang
Summary: Previously we were mocking out FbRemoteFxGraphCacheBackend which meant that we were missing testing a whole bunch of the cache code. Cache at a lower level (CacheClient, LocalAutotuneCacheBackend, ManifoldClient, Redis) so we cover a larger amount of the caching code.
Test Plan: unit tests
Reviewed By: oulgen
Differential Revision: D60937966
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133579
Approved by: https://github.com/oulgen
This is the first step toward having a basically functional analyzer for FR in production.
- We want to use this script to find abnormalities in collectives and report them to users.
- We also fixed some type errors.
- [Ongoing] Also we will add more unit tests to this script and make it modularized so that we can better maintain it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133412
Approved by: https://github.com/c-p-i-o, https://github.com/atalman
This is a bugfix for an issue recently encountered in ROCm/DeepSpeed. Currently, if a library installs pynvml and runs on ROCm, PyTorch will break: _HAS_PYNVML is set to true, and PyTorch will attempt to use the amdsmi library for the device_count call even though it is not installed.
This fix sets _HAS_PYNVML to false on ROCm if amdsmi is not installed.
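A hedged sketch of the guard being described (module-level names approximate; the real change lives inside torch.cuda):
```python
import torch

def _detect_nvml_or_amdsmi() -> bool:
    # On ROCm, only claim NVML-style support when amdsmi is importable;
    # having pynvml installed is not enough there.
    if torch.version.hip is not None:
        try:
            import amdsmi  # noqa: F401
            return True
        except ImportError:
            return False
    try:
        import pynvml  # noqa: F401
        return True
    except ImportError:
        return False

_HAS_PYNVML = _detect_nvml_or_amdsmi()
```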
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132990
Approved by: https://github.com/pruthvistony, https://github.com/eqy, https://github.com/malfet
This fixes an issue on AArch64 cpus supporting BF16, caused when torch.set_float32_matmul_precision("highest") does not disable the bf16 downconversion in mkldnn_matmul.
This was discovered from a unit test failure where the decorator `torch.testing._internal.common_mkldnn.bf32_on_and_off`, which internally switches the float32_matmul_precision between "medium" and "highest", was not having the desired effect.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130919
Approved by: https://github.com/jgong5
Upgrades the LF scale configs to change the default AMI in accordance with the Amazon 2023 rollout plan.
This PR will be merged on Monday Aug 19 in the morning, and over the next 2-3 days as new linux runners are spun up (and old ones spun down) they'll start using this new AMI
This PR will be paired with https://github.com/pytorch/test-infra/pull/5558, which will be merged after this one
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133641
Approved by: https://github.com/jeanschmidt
FIXES https://github.com/pytorch/pytorch/issues/123949 and https://github.com/pytorch/pytorch/issues/124376
torch.cuda.memory_allocated returns the amount of memory allocated in the current process, so if it isn't 0 it means another test didn't properly clean up after itself. I'm keeping the memory check and isolating these tests in subprocesses, as we don't have a good way to test for activation refcount.
e.g. https://github.com/pytorch/pytorch/runs/28838386083
```
_______________ TestCompiledAutograd.test_free_activation_memory _______________
Traceback (most recent call last):
File "/var/lib/jenkins/workspace/test/inductor/test_compiled_autograd.py", line 1892, in test_free_activation_memory
self.assertTrue(torch.cuda.memory_allocated() == 0)
File "/opt/conda/envs/py_3.10/lib/python3.10/unittest/case.py", line 687, in assertTrue
raise self.failureException(msg)
AssertionError: False is not true
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133733
Approved by: https://github.com/jansel
This threads all of the necessary pieces from the FXGraphCache changes into AOT Autograd so that we can run cudagraphs properly on an AOTAutograd cache hit.
Specifics:
- AOTAutograd needs access to the `cudagraphs` boxedbool in order to properly set the backward to not use cudagraphs on a cache hit from the forward.
- We have lots of tests that test this already from the previous PR, so I just added an extra test and made the previous test work with both AOTAutogradCache and FXGraphCache at the same time.
```
TORCH_LOGS=torch._functorch._aot_autograd.autograd_cache,cudagraphs ENABLE_AOT_AUTOGRAD_CACHE=1 TORCHINDUCTOR_FX_GRAPH_CACHE=1 tlp python benchmarks/gpt_fast/benchmark.py --output ~/gpt_fast_benchmark.csv
```
Run it twice, once on a cache miss and once on a cache hit.
Here is the perfetto trace for each(FB only link):
**Cache Miss:**
Logs:
```
Loading model Llama-2-7b-chat-hf
Time to load model: 0.66 seconds
I0813 10:53:34.416000 911030 torch/_functorch/_aot_autograd/autograd_cache.py:479] [0/0] AOTAutograd cache miss for key alqchc7zw6ynsxj2bzktcsngu4cajwcb3tmhvwlyqkuinx3zhmey
I0813 10:53:51.395000 911030 torch/_functorch/_aot_autograd/autograd_cache.py:558] [0/0] Writing AOTAutograd cache entry to /tmp/torchinductor_jjwu/aotautograd/alqchc7zw6ynsxj2bzktcsngu4cajwcb3tmhvwlyqkuinx3zhmey/entry
I0813 10:54:17.579000 911030 torch/_functorch/_aot_autograd/autograd_cache.py:479] [1/0] AOTAutograd cache miss for key a3nq2ywjxku342c6ag7rsqkalnxfshlcgve3tb2bigg7a45uz6pt
I0813 10:54:38.636000 911030 torch/_functorch/_aot_autograd/autograd_cache.py:558] [1/0] Writing AOTAutograd cache entry to /tmp/torchinductor_jjwu/aotautograd/a3nq2ywjxku342c6ag7rsqkalnxfshlcgve3tb2bigg7a45uz6pt/entry
I0813 10:54:39.228000 911030 torch/_inductor/cudagraph_trees.py:385] [__cudagraphs] recording cudagraph tree for graph without symints
V0813 10:54:39.939000 911030 torch/_inductor/cudagraph_trees.py:2160] [__cudagraphs] Running warmup of function 0
V0813 10:55:10.615000 911030 torch/_inductor/cudagraph_trees.py:2119] [__cudagraphs] Recording function 0 of graph recording id 0
Compilation time: 101.24 seconds
Average tokens/sec: 147.96 tokens/sec
Average bandwidth achieved: 1955.22 GB/s
Memory used: 14.51 GB
```
Chromium Event(fb only):
https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html?url=https%3A%2F%2Finterncache-all.fbcdn.net%2Fmanifold%2Ftlparse_reports%2Ftree%2Flogs%2Fjjwu%2Fcustom%2Fchromium_events.json#!/viewer?url=https%3A%2F%2Finterncache-all.fbcdn.net%2Fmanifold%2Ftlparse_reports%2Ftree%2Flogs%2Fjjwu%2Fcustom%2Fchromium_events.json&local_cache_key
**Cache Hit:**
Logs:
```
Loading model Llama-2-7b-chat-hf
Time to load model: 0.67 seconds
I0813 10:55:51.821000 944420 torch/_functorch/_aot_autograd/autograd_cache.py:474] [0/0] AOTAutograd cache hit for key alqchc7zw6ynsxj2bzktcsngu4cajwcb3tmhvwlyqkuinx3zhmey
I0813 10:55:55.465000 944420 torch/_functorch/_aot_autograd/autograd_cache.py:474] [1/0] AOTAutograd cache hit for key a3nq2ywjxku342c6ag7rsqkalnxfshlcgve3tb2bigg7a45uz6pt
I0813 10:55:56.030000 944420 torch/_inductor/cudagraph_trees.py:385] [__cudagraphs] recording cudagraph tree for graph without symints
V0813 10:55:56.192000 944420 torch/_inductor/cudagraph_trees.py:2160] [__cudagraphs] Running warmup of function 0
V0813 10:55:56.426000 944420 torch/_inductor/cudagraph_trees.py:2119] [__cudagraphs] Recording function 0 of graph recording id 0
Compilation time: 9.40 seconds
Average tokens/sec: 147.94 tokens/sec
Average bandwidth achieved: 1954.98 GB/s
Memory used: 14.51 GB
```
Chromium Event(fb only):
https://interncache-all.fbcdn.net/manifold/perfetto-artifacts/tree/ui/index.html?url=https%3A%2F%2Finterncache-all.fbcdn.net%2Fmanifold%2Ftlparse_reports%2Ftree%2Flogs%2Fjjwu%2Fcustom2%2Fchromium_events.json#!/viewer?url=https%3A%2F%2Finterncache-all.fbcdn.net%2Fmanifold%2Ftlparse_reports%2Ftree%2Flogs%2Fjjwu%2Fcustom2%2Fchromium_events.json&local_cache_key
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132294
Approved by: https://github.com/eellison
Summary:
# context
* when running an IG FM training with PT2, we found there are a few graph breaks due to a torch.diff call in [jagged_tensor.py](https://fburl.com/code/cwssxabc):
```
_length: List[int] = (
_length_per_key_from_stride_per_key(torch.diff(offsets), stride_per_key)
if variable_stride_per_key
else torch.sum(torch.diff(offsets).view(-1, stride), dim=1).tolist()
)
```
* looking into the failure, we found that the TORCH_CHECK in diff should be a TORCH_SYM_CHECK
* slice_forward error: df3d7729e, [tlparse](https://interncache-all.fbcdn.net/manifold/tlparse_reports/tree/logs/.tmpxXZ2em/index.html)
```
RestartAnalysis
Tried to use data-dependent value in the subsequent computation. This can happen when we encounter unbounded dynamic value that is unknown during tracing time. You will need to explicitly give hint to the compiler. Please take a look at torch._check OR torch._check_is_size APIs. Could not guard on data-dependent expression ((5*u37 + u38)//(u37 + u38)) < 0 (unhinted: ((5*u37 + u38)//(u37 + u38)) < 0). (Size-like symbols: u38, u37)
ATTENTION: guard_size_oblivious would fix the error, evaluating expression to False.
Maybe you need to add guard_size_oblivious to framework code, see doc below for more guidance.
Potential framework code culprit (scroll up for full backtrace):
File "/data/users/hhy/fbsource/buck-out/v2/gen/fbcode/e99934938a0abe90/aps_models/ads/icvr/__icvr_launcher_live__/icvr_launcher_live#link-tree/torch/_decomp/decompositions.py", line 771, in slice_forward
if end_val < 0:
```
* after this diff: [tlparse](https://interncache-all.fbcdn.net/manifold/tlparse_reports/tree/logs/.tmpAhv2Sh/failures_and_restarts.html)
Test Plan:
# command
* run model
```
TORCH_SHOW_CPP_STACKTRACES=1 TORCHDYNAMO_EXTENDED_DEBUG_CPP=1 TORCH_LOGS="+graph_code,output_code,dynamic,aot,guards,verbose_guards,recompiles,graph_breaks" TORCH_TRACE=/var/tmp/tt buck2 run fbcode//mode/opt fbcode//aps_models/ads/icvr:icvr_launcher_live -- mode=fmc/local_ig_fm_v4_mini training.pipeline_type=pt2
```
* generate tlparse
```
tlparse `ls -t /var/tmp/tt/* | head -1`
```
Reviewed By: ezyang
Differential Revision: D56339251
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133740
Approved by: https://github.com/ezyang
Moving DTensor to be in the public namespace, to formally add the
documentation page that includes all the public APIs. This includes:
* many path renames and path import fixes
* a dedicated doc page without too much content yet (adding in the next
PRs)
* To preserve BC for users still using `torch.distributed._tensor`, I added a shim script to redirect old-path calls to the new module.
The BC preservation is evidenced by the fact that all DTensor tests still pass without changing the public imports, so it's safe to land these changes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133113
Approved by: https://github.com/XilunWu
ghstack dependencies: #133305, #133306
Summary:
These tests aren't running internally because the outer test harness is crashing without listing the tests. To fix we need:
* Add a target for the tools/stats/ folder since this test imports it
* Add a dependence to that target so it's included in the par
* Fix up the relative import syntax, which is somehow different internally vs. fbcode (not sure why this works, but many other tests are doing it)
Test Plan: `buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:cudagraph_trees_expandable_segments -- --run-disabled`
Differential Revision: D61396711
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133698
Approved by: https://github.com/xuzhao9
**Summary**
After enabling more vectorization, we found that vectorization does not always bring performance benefits. For example, a kernel with several non-contiguous index computations or non-contiguous buffer load/store operations can experience performance regression. A typical case is what we observed in the next PR: after fully enabling vectorization of `index_expr`, we saw a performance regression of `hf_BigBird`.
In this PR, we refactor tiling selection into a standalone module to enhance its extensibility for more advanced tiling-selection heuristics. A standalone class `TilingSelect` with a `select_tiling` method has been added. `select_tiling` accepts `fn_list` and `var_sizes_list` as inputs and returns `tiling_factors` and `tiling_indices`.
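Interface-wise, the new class looks roughly like the sketch below (signature paraphrased from this description; the actual heuristics live inside `select_tiling`):
```python
from typing import List, Tuple

class TilingSelect:
    """Standalone home for the tiling-selection heuristics."""

    def select_tiling(
        self,
        fn_list: List,
        var_sizes_list: List,
    ) -> Tuple[List[int], List[int]]:
        # Inspect the kernel bodies and their iteration sizes, then decide
        # which dimensions to tile and by how much. Empty lists mean
        # "do not vectorize".
        tiling_factors: List[int] = []
        tiling_indices: List[int] = []
        # ... contiguity / element-count heuristics would go here ...
        return tiling_factors, tiling_indices
```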
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130892
Approved by: https://github.com/jgong5
Summary:
This diff aims to fix the GPU Test skips in the quantization tests under the `caffe2/test/quantization` directory. The changes made in the `TARGETS` files include adding the `should_use_remote_gpu` flag to enable remote GPU testing. This should help to resolve the skipped tests and improve the overall test coverage.
[This diff] Fixed skip count: 4
[Running total] Fixed skip count: 4
Note: Creating separate diffs for each test-group.
Test Plan:
**281475054644766**: buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/quantization:test_quantization -- --exact 'caffe2/test/quantization:test_quantization - test_compare_per_channel_device_numerics (caffe2.test.quantization.core.test_quantized_tensor.TestQuantizedTensor)'
https://www.internalfb.com/intern/testinfra/testrun/5629499773981783
**281475054644780**: buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/quantization:test_quantization -- --exact 'caffe2/test/quantization:test_quantization - test_compare_per_tensor_device_numerics (caffe2.test.quantization.core.test_quantized_tensor.TestQuantizedTensor)'
https://www.internalfb.com/intern/testinfra/testrun/11540474087422107
**281475054644853**: buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/quantization:test_quantization -- --exact 'caffe2/test/quantization:test_quantization - test_quant_pin_memory (caffe2.test.quantization.core.test_quantized_tensor.TestQuantizedTensor)'
https://www.internalfb.com/intern/testinfra/testrun/11540474087422477
**844425008078016**: buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/quantization:test_quantization -- --exact 'caffe2/test/quantization:test_quantization - test_cuda_quantization_does_not_pin_memory (caffe2.test.quantization.core.test_quantized_tensor.TestQuantizedTensor)'
https://www.internalfb.com/intern/testinfra/testrun/1407375259845199
Differential Revision: D60055277
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133158
Approved by: https://github.com/jovianjaison
Summary: Recently we observed in AI CMF that enabling the decompose_mm pass leads to mixed-dtype errors for aten.mm and aten.addmm. Upon investigation, we figured out that the error comes from torch.sum, which does an implicit type cast to avoid possible overflow (see a similar discussion on GitHub: https://github.com/pytorch/pytorch/issues/115832). Thus we cast the output to avoid the error.
Test Plan:
# unit test
```
buck2 test mode/dev-nosan //caffe2/test/inductor:decompose_mem_bound_mm -- test_decompose_mm_mixed_precision
```
Buck UI: https://www.internalfb.com/buck2/00dc168e-4d65-40f8-b169-f4a58206f641
Test UI: https://www.internalfb.com/intern/testinfra/testrun/17169973624867151
Network: Up: 25KiB Down: 44KiB (reSessionID-b7e2ecc7-16ca-476d-95b2-09ea74645eb0)
Jobs completed: 19. Time elapsed: 1:07.6s.
Cache hits: 0%. Commands: 2 (cached: 0, remote: 0, local: 2)
Tests finished: Pass 6. Fail 0. Fatal 0. Skip 0. Build failure 0
# e2e
ads_dper3:68464f2dc5e849ba2670482079cecaaa
training_platform:2c41d916ad5dd82f196372a8c7bd37a0
### build training_platform
```
buck2 run fbcode//fblearner/flow/projects/training_platform:training_platform
```
### register training_platform
```
buck2 run mode/opt fblearner/flow/projects/training_platform:workflow -- register-workflows --project-name training_platform --flow_version training_platform:2c41d916ad5dd82f196372a8c7bd37a0
```
### build ads_dper 3
```
fbpkg build -E ads_dper3 --yes --expire 14d
```
### register ads_dper 3
```
buck2 run //pyper/core/eval_app_utils:flow_utils_script -- register --pkg-version ads_dper3:68464f2dc5e849ba2670482079cecaaa
```
### extend package (optional)
```
fbpkg expire --extend-only training_platform:2c41d916ad5dd82f196372a8c7bd37a0 30d
```
### before fix
f591360990
### after fix
baseline
f591395056
proposal
Differential Revision: D61351815
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133626
Approved by: https://github.com/jackiexu1992
If the scalar tensor is an output tensor, it shouldn't be unwrapped (i.e. `.item()` called) since `tl.store` requires a pointer type for outputs. This issue only occurs for mutated buffers: the input tensor is also used as an output tensor.
Fixes #ISSUE_NUMBER
@yanboliang @jansel @ngimel
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132859
Approved by: https://github.com/jansel
`torch.cuda.Event` objects are different from `torch.cuda.Stream` in that events are not pooled, meaning we can't look up a previously created CUDA event object by ID. This prevents a CUDA event object created outside of the Dynamo graph from being used within the graph (since Dynamo needs a way to emit a `call_function` line in the graph that retrieves the event object for downstream op use). This PR adds a simple object pool within the Dynamo utilities to support looking up CUDA event objects by ID from within the Dynamo graph.
After this PR, if a user creates a CUDA event object outside of the graph and uses that event within the graph, the behavior will exactly match eager.
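Conceptually, the pool is just an id-to-object mapping that the graph can call back into; a minimal sketch (hypothetical helper names, not the actual Dynamo utility):
```python
import torch

_cuda_object_pool: dict = {}

def _register_cuda_object(obj) -> int:
    # Called at trace time for an event created outside the graph.
    _cuda_object_pool[id(obj)] = obj
    return id(obj)

def _lookup_cuda_object(obj_id: int):
    # Emitted as a call_function node in the graph, so the compiled code
    # retrieves the very same event object the user created outside it.
    return _cuda_object_pool[obj_id]

if torch.cuda.is_available():
    event = torch.cuda.Event()
    key = _register_cuda_object(event)
    assert _lookup_cuda_object(key) is event
```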
Test commands:
- `pytest -rA test/dynamo/test_ctx_manager.py::CtxManagerTests::test_cuda_event_created_outside_of_graph`
- `pytest -rA test/dynamo/test_ctx_manager.py::CtxManagerTests::test_cuda_event_across_graph_break`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133635
Approved by: https://github.com/yifuwang
ghstack dependencies: #133532, #133531, #133636
During Inductor lowering, layout constraints for an op is applied before the op's lowering is called. Currently `add_layout_constraint(aten._scaled_mm.default, constrain_to_fx_strides)` is called inside `aten._scaled_mm.default`'s lowering. This means that if the first `_scaled_mm` to be lowered relies on the layout constraint, it won't be applied and the generated code would fail. The issue won't manifest if the first `_scaled_mm` doesn't rely on the layout constraint.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133669
Approved by: https://github.com/drisspg, https://github.com/yangsiyu007
Updates the cuDNN frontend header-only library to make the most of the newest cuDNN features and decrease the overhead of the library.
Copied from commit:
New API
- Graph Slice Operation: Introduced the graph.slice operation for slicing input tensors. Refer to docs/operations/Slice.md for detailed documentation and samples/cpp/misc/slice.cpp for a C++ sample. Pybinds for this operation have also been added.
- SM Carveout Feature: Added the set_sm_count(int32_t type) graph property to support the SM Carveout feature introduced in Ampere and Hopper GPUs. Engines that do not support SM_COUNT will return NOT_SUPPORTED.
Bug Fixes
- Convolution Mode Attribute: Added the missing set_convolution_mode attribute to convolution attributes in forward propagation (fprop), data gradient (dgrad), and weight gradient (wgrad). Previously, this was hardcoded to CUDNN_CROSS_CORRELATION in the 1.x API.
- SDPA FP8 Backward Node: Fixed an issue with the deserialization of the sdpa_fp8_backward node.
Enhancements
- Graph Execution Overhead: Reduced the overhead of graph.execute() by optimizing sub-node tree traversal, collected UIDs, workspace modifications, and workspace size.
- Graph Validation Performance: Significantly improved (~10x) the performance of graph.validate() by deferring graph expansion to a later stage (build_operation_graph).
- Optional Running Stats for BatchNorm: Made the running statistics for the batch normalization operation optional, supported by cuDNN backend version 9.3.0 and later.
- Shape and Stride Inferencing: Enhanced shape and stride inferencing to preserve the stride order of the input.
- Diagnostic Error Message: Added a diagnostic error message to create_execution_plans if called without the preceding build_operation_graph.
- JSON Schema and Deserialization: Improved the JSON schema and deserialization logic with additional checks.
- Logging Overhead: Reduced logging overhead, resulting in faster graph.build() calls.
- CMake Integration: Replaced CMAKE_SOURCE_DIR with PROJECT_SOURCE_DIR in CMake files for better integration. See the relevant pull request for more details.
Samples
- Jupyter Notebooks: Added Jupyter notebooks for RMSNorm, InstanceNorm, and LayerNorm. Refer to the samples/python folder for more information.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133687
Approved by: https://github.com/eqy, https://github.com/malfet
During distributed training if all ranks except one hit the cache, the rank that did not hit the cache will cause a NCCL timeout since rest of the ranks will enter the collective and start the timer. This PR uses the new PTD API to increase timeout for the ranks that hit the cache by the amount of time the cache would save.
Differential Revision: [D61363722](https://our.internmc.facebook.com/intern/diff/D61363722)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133374
Approved by: https://github.com/ezyang
This is a low-risk short-term fix for
https://github.com/pytorch/pytorch/issues/128084, for the purposes of
2.4.1. The actual fix for that issue is more risky and we'll target 2.5.
needs_fixed_stride_order is silently incorrect with args that are
mutable because it creates clones of those args, writes into them, and
doesn't update the original args.
This PR makes it so that needs_fixed_stride_order doesn't apply to
inputs that are being mutated.
This PR doesn't completely fix the problem, but it makes it less
incorrect: most of the time the input already has the correct strides
but inductor fails to recognize it, and in those cases writing directly
to the input is fine.
Test Plan:
- new test
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133452
Approved by: https://github.com/eellison
Fix https://github.com/pytorch/pytorch/issues/132716
The triton template for convolution does not work when the stride or padding contains dynamic shapes. Use the hint and add guards to handle that. An alternative is to fall back to eager, but since I've seen the lowering rule for convolution use the hint in other cases, I'll just follow that convention.
I don't really know how to add a unit test here since I need to create symbolic strides (not strides of a tensor, but the stride parameter for convolution) and paddings. I can try harder if reviewers want me to add unit tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132938
Approved by: https://github.com/jansel, https://github.com/eellison
ghstack dependencies: #132952
Setting `torch._dynamo.config.skip_fsdp_hooks = True` is required for graph-break compiled FSDP2, so making it the default will make this adoption easier. If users want to use Traceable FSDP2, they can set it to False manually (which will allow FSDP2 hooks to be traced through).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133531
Approved by: https://github.com/awgu
ghstack dependencies: #133532
Fixes #128059
I'm not sure if this is the right way, since Inductor doesn't always respect the device id set by users, so probably we should just wrap it in a null context manager and print a warning. cc @voznesenskym @penguinwu @EikanWang @jgong5 @Guobing-Chen @XiaobingSuper @zhuhaozhe @blzheng @wenzhe-nrv @jiayisunx @chenyang78 @kadeng @chauhang @amjames @jansel @anijain2305 @mlazos @williamwen42
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133385
Approved by: https://github.com/jansel
Summary:
We saw ncclCommAbort being called and hanging during NCCLComm::create.
If the NCCL comm is not properly initialized, ncclCommAbort behavior is
'undefined'; avoiding the call allows the process to properly throw an
exception.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133630
Approved by: https://github.com/wconstab
This PR fixes the accuracy issues when template_buffer has users other than the epilogue nodes. This will fix the accuracy failure of the below models using max-autotune:
- MobileBertForMaskedLM
- MobileBertForQuestionAnswering
- convnext_base
- swin_base_patch4_window7_224
## Issue 1:
Previously we always added `template_buffer` as an alias of `Y`. In case the `template_buffer` has users other than the epilogue nodes, we shouldn't set it as an alias of `Y`. This PR adds a check for that case.
Wrong code before the fix where `tmp4` and `tmp9` are both stored to `Y` while we need 2 different buffers for them since `tmp4` will be used by nodes other than the epilogue node:
```cpp
Y[static_cast<long>(n_start + x1 + (32L*m_start) + (32L*x0))] = tmp4; // tmp4 is the output of the template
Y[static_cast<long>(n_start + x1 + (32L*m_start) + (32L*x0))] = tmp9; // tmp9 is the output of the epilogue node
```
Correct code after the fix:
```cpp
out_ptr2[static_cast<long>(n_start + x1 + (32L*m_start) + (32L*x0))] = tmp4;
Y[static_cast<long>(n_start + x1 + (32L*m_start) + (32L*x0))] = tmp9;
```
## Issue 2:
When fixing the above issue, we found that there's a correctness issue when `bias` is `False`. The root cause is that in the case where `bias` is `False`, the `template_buffer` has users other than the epilogue nodes, and the GEMM output buffer is localized, so we need to add an extra copy epilogue to ensure that the GEMM output (a local buffer) is stored to the `template_buffer` that will be used later by other nodes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133073
Approved by: https://github.com/jgong5
ghstack dependencies: #133070
Summary: Some symbols (unbacked symints?) can have an upper bound of `sys.maxsize - 1`, but our code for runtime assertions assumes that such upper bounds would come in as `sympy.oo` (like backed symints?) in order to drop them. So we weren't dropping them, which this PR fixes.
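A rough sketch of the dropping rule described above (the helper name is hypothetical; the real logic lives in the runtime-assertion code):
```python
import sys
import sympy

def should_drop_upper_bound(upper_bound) -> bool:
    # Treat a finite bound of sys.maxsize - 1 (seen for unbacked symints) the same as the
    # sympy.oo bound that backed symints report, so neither emits a runtime assertion.
    return upper_bound == sympy.oo or upper_bound >= sys.maxsize - 1
```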
Test Plan: added test
Differential Revision: D61352056
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133627
Approved by: https://github.com/SherlockNoMad
Updating the source matcher to also accept pattern matching on the torch_fn metadata, which exists in both strict and non-strict export. We want to replace the use of source_fn_stack with torch_fn, as it's not possible for us to get source_fn_stack in non-strict export.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133642
Approved by: https://github.com/ydwu4
This PR enables dynamic shapes for the CK backend for gemm max autotune (see #125453).
This is achieved via unhardcoding the problem sizes from the template body and passing them as parameters instead.
We handle passing the problem sizes for the kernel call as well as for the benchmark call.
# Testing
`pytest test/inductor/test_ck_backend.py [-k dynamic]`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133285
Approved by: https://github.com/ColinPeppler
Summary: Recently we observed more missing example values in nodes introduced in Optimus, which causes problems for further optimization when this node info needs to be used. Thus we add the meta for these nodes in this diff.
Test Plan:
# unit test
```
CUDA_VISIBLE_DEVICES=3 OC_CAUSE=1 buck2 test //caffe2/test/inductor:split_cat_fx_passes
```
Buck UI: https://www.internalfb.com/buck2/c0ad506f-ce9d-4b80-947a-cb79074b72f0
Test UI: https://www.internalfb.com/intern/testinfra/testrun/2251800058834808
Network: Up: 1.4GiB Down: 2.0GiB (reSessionID-fb781425-f29b-44b5-8a5b-daffe7274f86)
Jobs completed: 300289. Time elapsed: 13:19.5s.
Cache hits: 99%. Commands: 119360 (cached: 118494, remote: 824, local: 42)
Tests finished: Pass 9. Fail 0. Fatal 0. Skip 1. Build failure 0
# benchmark
```
CUDA_VISIBLE_DEVICES=3 OC_CAUSE=1 buck2 run mode/opt //scripts/jackiexu0313/pt2:local_model_with_pt2 -- --test_mode batch-split --model_type "cmf_shrink" --flow_id 587303213
```
P1520691492
Differential Revision: D61039772
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133414
Approved by: https://github.com/jackiexu1992
This is the first step toward having a basically functional analyzer for FR (Flight Recorder) in production.
- We want to use this script to find out abnormalities in collectives and report it to users.
- We also fixed some type errors.
- [Ongoing] Also we will add more unit tests to this script and make it modularized so that we can better maintain it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133412
Approved by: https://github.com/c-p-i-o
Summary: Switch to set_proxy_slot instead of setting the proxy directly on the Tensor. We do not want to add Proxy to tensor objects, because Proxy cannot be deepcopied or pickled and can cause problems when users want to deepcopy or pickle models.
Test Plan: CI
Differential Revision: D61277650
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133470
Approved by: https://github.com/zou3519
This PR adds support in train_decision for learning a heuristic for ranking. The main idea is that the user has to provide the number of choices the heuristic should return. I added a way to prune the learned decision tree such that it always returns the number of choices provided by the user.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131705
Approved by: https://github.com/eellison
Improve the cache blocking by reducing Mc_blocks to make A reside in L2 and be reused by B as much as possible. This improves large batch-size perf for both scenarios: 1) N is large and K is of medium size; 2) K is large. Different strategies are used to handle these scenarios. Check the notes in `get_cache_blocking` in the changes.
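A simplified sketch of the blocking idea (illustrative only; the actual heuristics live in `get_cache_blocking` and distinguish the two scenarios):
```python
def choose_mc_blocks(Mr, Kc, dtype_bytes=2, l2_bytes=2 * 1024 * 1024, l2_fraction=0.5):
    # Shrink the number of M blocks so the A tile (Mc * Kc elements, Mc = Mc_blocks * Mr)
    # stays within a fraction of L2 and can be reused across the B tiles it multiplies.
    budget = l2_bytes * l2_fraction
    return max(1, int(budget // (Mr * Kc * dtype_bytes)))
```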
Measured with 56-core Intel (R) Xeon (R) CPU Max 9480, jemalloc 5.1 and intel omp, bf16. Run with code cache of B matrix (weights).
Model Shapes | Before Optimization | After Optimization | Speedup | onednn linear | Speedup over onednn
-- | -- | -- | -- | -- | --
M=1024, N=12288, K=4096 (Llama2-8b) | 5.69 ms | 3.71 ms | 1.53 | 4.53 ms | 1.22
M=1024, N=4096, K=4096 (Llama2-8b) | 1.69 ms | 1.63 ms | 1.04 | 2.05 ms | 1.26
M=1024, N=22016, K=4096 (Llama2-8b) | 10.32 ms | 6.57 ms | 1.57 | 8.46 ms | 1.29
M=1024, N=4096, K=11008 (Llama2-8b) | 5.21 ms | 3.26 ms | 1.60 | 4.65 ms | 1.43
M=1024, N=5120, K=4096 (Llama3-8b) | 1.99 ms | 1.78 ms | 1.12 | 2.31 ms | 1.30
M=1024, N=28672, K=4096 (Llama3-8b) | 13.41 ms | 8.56 ms | 1.57 | 10.96 ms | 1.28
M=1024, N=4096, K=14336 (Llama3-8b) | 6.93 ms | 4.31 ms | 1.61 | 6.24 ms | 1.45
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132729
Approved by: https://github.com/leslie-fang-intel, https://github.com/chunyuan-w, https://github.com/jansel
Summary:
Fix quantization pass to be compatible with the new export IR.
Some nodes might have side-effects, so they don't have users, but still are not removed by the DCE pass.
Test Plan:
CI
buck2 run 'fbcode//mode/dev-nosan' fbcode//bolt/nn/executorch/export:export_rle_model -- -r export_rle_model
Differential Revision: D61223356
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133587
Approved by: https://github.com/tugsbayasgalan
Summary: With acc_tracer disabled, the generated nodes use `args` instead of `kwargs` like before. The current passes mix usage of `args` and `kwargs`, and normalizing nodes to switch between them can cause the following passes to work or not work. In this diff we create a pass that normalizes all the nodes to use `kwargs` at the beginning, and change all the passes to follow the same convention.
Reviewed By: frank-wei
Differential Revision: D61049898
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133518
Approved by: https://github.com/frank-wei
Some recommendation models have a high number of `nn.Parameter`s. This exacerbates per-tensor CPU overheads in FSDP2 compared to FSDP1.
This PR adds a fast path for the common bf16/fp32 mixed-precision case for casting the parameters from fp32 to bf16, to reduce CPU overhead and possibly get a more efficient copy (see the sketch after the list below).
- Old: `for` loop + `.to(torch.bfloat16)`, incurring dispatcher overhead per parameter
- New: `torch.empty` + `torch.split` + `torch._foreach_copy_`, incurring three dispatches
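A minimal sketch of the new fast path (simplified; the real FSDP2 code operates on its own parameter data structures):
```python
import torch

def cast_fp32_params_to_bf16(fp32_params):
    # One flat bf16 allocation + one split + one foreach copy, instead of a
    # per-parameter .to(torch.bfloat16) that dispatches once per tensor.
    numels = [p.numel() for p in fp32_params]
    flat = torch.empty(sum(numels), dtype=torch.bfloat16, device=fp32_params[0].device)
    chunks = torch.split(flat, numels)
    bf16_params = [c.view(p.shape) for c, p in zip(chunks, fp32_params)]
    torch._foreach_copy_(bf16_params, fp32_params)
    return bf16_params
```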
---
Example on Llama3-8B which does not have many `nn.Parameter`s (compared to recommendation models):
(Old) on Llama3-8B (0.46 ms CPU overhead for all-gather):

(New) on Llama3-8B (0.37 ms CPU overhead for all-gather):

---
Same example as above but now with float8 all-gather:
(Old) on Llama3-8B with float8 (0.996 ms CPU overhead for all-gather):

(New) on Llama3-8B with float8 (1.014 ms CPU overhead for all-gather):

The times are relatively comparable for float8 with the new one possibly slightly slower, but this is mainly because for Llama's transformer blocks, there are only two norm weights that need to cast to bf16. These screenshots are mainly to show that the optimization still works in the mixed case.
Differential Revision: [D61236983](https://our.internmc.facebook.com/intern/diff/D61236983)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133369
Approved by: https://github.com/weifengpy
ghstack dependencies: #133498
Summary: Some elements of a tensor list output don't have a user. In such a case, create a name of the form `{node_name}_unused_{index}` for them.
Test Plan: OSS CI
Differential Revision: D61309011
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133539
Approved by: https://github.com/zhxchen17
These tests keep failing on the Linux Amazon 2023 AMI. The distributed team is looking into them, but until then, disabling the tests in order to unblock the AMI upgrade
Examples of the failures:
Failure 1: https://github.com/pytorch/pytorch/actions/runs/10047579686/job/27770963175
```
FAILED [90.0880s] distributed/test_c10d_nccl.py::NCCLTraceTestDumpOnTimeout::test_timeout_dumps_timing_enabled_False - AssertionError: None mismatch: None is not -6
```
Failure 2: https://github.com/pytorch/pytorch/actions/runs/10047579686/job/27770963494
```
____ NCCLTraceTestTimeoutDumpOnStuckRanks.test_timeout_dumps_on_stuck_ranks ____
Traceback (most recent call last):
File "/var/lib/jenkins/workspace/test/distributed/test_c10d_nccl.py", line 4214, in test_timeout_dumps_on_stuck_ranks
self.assertEqual(self._wait_process(0, timeout=90), -6)
File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3721, in assertEqual
raise error_metas.pop()[0].to_error(
AssertionError: None mismatch: None is not -6
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133355
Approved by: https://github.com/kit1980, https://github.com/wconstab
It is possible to write to Meta's internal in-memory database Scuba via the Scribe Graph API: https://www.internalfb.com/intern/wiki/Scribe/users/Knowledge_Base/Interacting_with_Scribe_categories/Graph_API/ This is currently being used by pytorch/benchmark repo to upload torchbench performance results.
I want to make this API generally available to all jobs running on CI in a semi-trusted context. To talk to Scribe, you need a secret access token. I have initially configured an environment prod-branch-main which contains `SCRIBE_GRAPHQL_ACCESS_TOKEN`, and switched a single class of jobs (linux-test) to use this environment when they are running on the main branch. Because we require approvals for running CI on untrusted contributions, we could potentially allow all jobs to run in this environment, including jobs on PRs, but I don't need this for my use case (per-PR benchmark result reporting, and miscellaneous statistics on main.)
If this works, I'll push out this environment to the rest of our test jobs.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133536
Approved by: https://github.com/xuzhao9, https://github.com/malfet, https://github.com/albanD
Summary:
Logging C++ stack traces occasionally races with shutdown processes on exception. It isn't safe and we've seen SIGSEGVs in the field.
These crashes prevent flight recorder dumps from completing.
For now, default this dumping to `true` and provide a knob if we need to control things in production.
Test Plan:
Tested locally on a job named `torchx-chirag_test_run` to make sure that the JK was honored by the code.
It was correctly disabled on my test job.
see (TORCH_NCCL_LOG_CPP_STACK_ON_EXCEPTION: 0) below.
```
] [trainer2]:I0814 11:21:20.152419 3708 ProcessGroupNCCL.cpp:874] [PG ID 0PG GUID 0 Rank 10] ProcessGroupNCCL environments: NCCL version: 2.20.3, TORCH_NCCL_ASYNC_ERROR_HANDLING: 1, TORCH_NCCL_DUMP_ON_TIMEOUT: 1, TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: 60000, TORCH_NCCL_DESYNC_DEBUG: 0, TORCH_NCCL_ENABLE_TIMING: 0, TORCH_NCCL_BLOCKING_WAIT: 0, TORCH_DISTRIBUTED_DEBUG: OFF, TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK: 0, TORCH_NCCL_ENABLE_MONITORING: 0, TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: 480, TORCH_NCCL_TRACE_BUFFER_SIZE: 2000, TORCH_NCCL_COORD_CHECK_MILSEC: 1000, TORCH_NCCL_NAN_CHECK: 0, TORCH_NCCL_LOG_CPP_STACK_ON_EXCEPTION: 0
```
Differential Revision: D61283335
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133490
Approved by: https://github.com/fduwjj
Summary:
It seems we have multiple places deserializing torchbind objects. Moving the code around so that every load essentially shares the same implementation.
Also added a test case "package_reader_testing" which loads the archive file back in Python and eagerly validates the numerical result.
Test Plan: buck test mode/opt sigmoid/inference/test:e2e_test_cpu
Reviewed By: SherlockNoMad
Differential Revision: D61235770
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133463
Approved by: https://github.com/ydwu4
Fixes #124550
Also moves `graph.eliminate_dead_code()` call to a few lines after
`_inline_module(...)` in `const_fold.py`
* Test plan:
Add a new test on `test_eager_transforms.py` to ensure the reported
issue was indeed fixed
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133364
Approved by: https://github.com/zou3519
This PR adds the foreach impl for Adafactor knowing that there are many ways to improve its runtime perf today (by adding more foreach support). After this PR:
- we have a foreach flag for Adafactor
- It is NOT the default. Why not? It is only slightly faster + uses O(n) more memory where n is the number of params in your max param group. People tend to use Adafactor for memory efficiency.
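A usage sketch of the new flag, assuming it is exposed as a keyword argument like other foreach-capable optimizers:
```python
import torch

model = torch.nn.Linear(8, 8)
# foreach=True opts into the fused implementation; it is not the default because
# Adafactor users typically prioritize memory over the small speedup.
optimizer = torch.optim.Adafactor(model.parameters(), foreach=True)
```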
Next steps:
- make torch.compile possible on it
- make it faster (by adding more foreach apis)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132336
Approved by: https://github.com/albanD
ghstack dependencies: #133360
**What does this PR achieve**
1. This PR rewrites the ring attention backward algorithm to fuse the alltoall and overlap the gradient communication with computation.
2. Enables memory-efficient attention with CP by templating the ring attention backward; verifying the accuracy in fp32 gives us higher confidence in the implementation's correctness.
3. Provides some experimental APIs to enable context parallelism.
4. Ensures CP works with torch.compile. The combination of causal masking and torch.compile does not
yet work.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131351
Approved by: https://github.com/wanchaol
Sorryyyyy for another refactor. This splits `_process_dynamic_shapes` into 3 parts:
1. `_combine_args` - mostly the same thing
2. `_check_dynamic_shapes`, which is responsible for raising 99% of UserErrors if the dynamic shapes spec is invalid (minus 1 UserError with DerivedDims)
3. `_process_dynamic_shapes`, which for now, is the same thing, minus the stuff in 2.
This refactor is helpful for incoming automatic dynamic shapes work, because, we're switching to `assume_static_by_default=False`, which is what `_dynamo.export` currently does. This means any unspecified dims are allocated a symbol, in contrast to export today which keeps unspecified dims static. Historically this has been desirable - export users don't want too much dynamism. So we want to change how the spec is translated into constraints.
This means when we switch over to automatic dynamic shapes, we want to plug in something in between steps 2. and 3. which patches up the spec for `assume_static_by_default=False`, filling in static shapes for any unspecified dims, and potentially clearing out the auto-dynamic dims (since they're no-ops). We would do this in-between 2. and 3. to keep `_process_dynamic_shapes` semantically the same, since it's used with `_dynamo.export`.
We could do this without a refactor, plugging in this transform before `_process_dynamic_shapes`, but since that function's responsible for both spec checking + constraint production, moving spec checking to before we transform the specs helps guarantee we're raising errors on what the user's specified, and not an internal export bug.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133391
Approved by: https://github.com/avikchaudhuri
Counting `elapsed_time` immediately after `start_time` does not reflect the real execution time of `test_batch`.
Move `elapsed_time` and the print call to after the `run_tests` method call to fix it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133199
Approved by: https://github.com/clee2000
This PR introduces scripts that make it easier to use autoheuristic:
- `collect_data.sh`: The user can specify things like the number of GPUs to be used and the number of training samples to collect. This script will open one tmux pane per GPU and collect num_training_samples/num_gpus samples per GPU.
- `merge_data.py`: This script can be used to merge multiple training data files into a single file.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133409
Approved by: https://github.com/Chillee
The function argument is A, not V.
Remaining inconsistency is the matrix $A$ with columns $v_i$.
It seems a better solution would be to rename the argument $A \rightarrow V$, but this might lead to backward compatibility issues.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/124279
Approved by: https://github.com/lezcano
When I did profiling using the "TORCHINDUCTOR_PROFILE" option, some kernels showed less bandwidth than expected. So I added an option to exclude the CPU overheads from the profiling time:
```
# With the option:
(pytorch-3.10) [shuqiyangdevgpu001.lla3 ~/local/pytorch (gh/shunting314/144/head)]$ TORCHINDUCTOR_PROFILE=1 TORCHINDUCTOR_PROFILE_WITH_DO_BENCH_USING_PROFILING=1 TORCHINDUCTOR_PROFILE_OUTPUT=/tmp/profile.txt python ../test_pt/a.py
0.038ms 0.067 GB 1777.11GB/s triton_poi_fused__to_copy_clamp_clone_mul_0
SUMMARY (/tmp/torchinductor_shuqiyang/tmp03wdg8e4/m6/cm6vdqp62ofwsone3u3fmb42vs3fti5omseo3qn4ddh2bhalsvbn.py)
0.04ms 0.07 GB 1777.11GB/s
# Without the option:
(pytorch-3.10) [shuqiyangdevgpu001.lla3 ~/local/pytorch (gh/shunting314/144/head)]$ TORCHINDUCTOR_PROFILE=1 TORCHINDUCTOR_PROFILE_OUTPUT=/tmp/profile.txt python ../test_pt/a.py
0.040ms 0.067 GB 1663.09GB/s triton_poi_fused__to_copy_clamp_clone_mul_0
SUMMARY (/tmp/torchinductor_shuqiyang/tmpwr6rraao/s4/cs4npkh77myatwpcmsizyduyfm6ne6o4pg4n3eodejdvvg2j3xzd.py)
0.04ms 0.07 GB 1663.09GB/s
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133523
Approved by: https://github.com/nmacchioni
The functorch partitioners use network flow to split the joint graph into a forward and backward graph. Internally, we've found that upgrading to networkx 2.8.8 (from 2.5) results in some hard-to-debug failures (internal reference: https://fburl.com/workplace/jrqwagdm). And I'm told that there's interest in removing the Python dependency.
So this PR introduces a C++ implementation that mirrors the API provided by networkx. We'll need to add python bindings and do some additional testing to verify correctness.
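For context, a tiny example of the networkx-style min-cut API being mirrored (the graph here is illustrative, not the partitioner's actual joint graph):
```python
import networkx as nx

g = nx.DiGraph()
g.add_edge("source", "node_a", capacity=1.0)
g.add_edge("node_a", "sink", capacity=2.0)

# minimum_cut returns the cut value plus the (reachable, non_reachable) partition,
# which is the kind of interface the partitioner uses to split the joint graph.
cut_value, (reachable, non_reachable) = nx.minimum_cut(g, "source", "sink")
print(cut_value, reachable, non_reachable)
```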
Differential Revision: [D61284135](https://our.internmc.facebook.com/intern/diff/D61284135)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132188
Approved by: https://github.com/Chillee
**Summary**
When checking the vectorization status across the 3 test suites, we found that some operators disabled vectorization with the message `Disabled vectorization: op: remainder`. In this PR, we add vectorization support for this op.
**Test Plan**
```
python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_vec_remainder
python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_int_div_vec
```
Differential Revision: [D61147014](https://our.internmc.facebook.com/intern/diff/D61147014)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129849
Approved by: https://github.com/jgong5, https://github.com/lezcano
During distributed training, if all ranks except one hit the cache, the rank that did not hit the cache will cause an NCCL timeout since the rest of the ranks will enter the collective and start the timer. This PR uses the new PTD API to increase the timeout for the ranks that hit the cache by the amount of time the cache would save.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133374
Approved by: https://github.com/ezyang
ghstack dependencies: #133362, #133363
For the next phase of the Amazon 2023 migration we'll be bulk migrating the remaining jobs over to the new AMI by changing the default AMI that we use.
In preparation for that, we're adding the old Linux Amazon 2 AMI as a fixed variant for runners, so that if any of the less frequently run jobs breaks on the Amazon 2023 AMI, it can shift to explicitly using the Amazon 2 AMI temporarily while the underlying problem is debugged and fixed.
This PR is part 1, and there's a corresponding scale config PR in test-infra: https://github.com/pytorch/test-infra/pull/5551
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133469
Approved by: https://github.com/clee2000
We realized the fix for loading the learning rate in place (https://github.com/pytorch/pytorch/pull/129683) actually broke meta tensor initialization. After PR #129683, the learning rate loads correctly, but the params with meta tensors are still uninitialized.
We cannot use `tree_map_only_` to iterate over the state_dict for in-place initialization, as `empty_like` and `to("cuda")` are both not in-place operations. More context in https://github.com/pytorch/pytorch/issues/130709. Therefore, with the changes in (https://github.com/pytorch/pytorch/pull/129683), the tensors after loading are still meta tensors. We previously did not catch that since `self.assertEqual()` does not distinguish a DTensor from a meta DTensor.
In this PR, we added an `_iterate_state_dict()` function to implement in-place updates for the state_dict and updated the test to make sure that the params are no longer meta tensors after loading.
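A hypothetical sketch of the in-place iteration idea (names and structure are illustrative, not the PR's actual `_iterate_state_dict()`):
```python
import torch

def materialize_meta_tensors(state_dict, device="cuda"):
    # empty_like / .to() are not in-place, so rebuild each meta tensor and write the
    # result back into the container while walking the state_dict.
    for key, value in state_dict.items():
        if isinstance(value, dict):
            materialize_meta_tensors(value, device)
        elif isinstance(value, torch.Tensor) and value.is_meta:
            state_dict[key] = torch.empty_like(value, device=device)
```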
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133256
Approved by: https://github.com/fegin
Before, having arbitrary depth nested configs like
```
class Foo:
    foo: List[int] = [1, 2, 3]

    class Bar:
        bar: str = "1"

        class Baz:
            baz: int = 1
```
would cause problems beyond the first layer. For example, if we tried
```
from torch._inductor import config as inductor_config
print(inductor_config.Foo)
print(repr(inductor_config.Foo.foo))
print(inductor_config.Foo.Bar)
print(repr(inductor_config.Foo.Bar.bar))
print(inductor_config.Foo.Bar.Baz)
print(repr(inductor_config.Foo.Bar.Baz.baz))
```
we would get some output like
```
<torch.utils._config_module.SubConfigProxy object at 0x7fac65de00a0>
[1, 2, 3]
...
AttributeError: torch._inductor.config.Foo.Bar does not exist
```
Obviously, this is not what we want. With these changes, we get the right values
```
<torch.utils._config_module.SubConfigProxy object at 0x7f840d05bf40>
[1, 2, 3]
<torch.utils._config_module.SubConfigProxy object at 0x7f840cedc940>
'1'
<torch.utils._config_module.SubConfigProxy object at 0x7f840cedc100>
1
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133418
Approved by: https://github.com/oulgen
Fixes a segmentation fault during model load via the C++ API.
An `Assign` statement (`TK_ASSIGN` type) has 3 fields: `lhs`, `rhs` and `type`. Field `type` is of type `Maybe`, which means it may not be present. During model load in `import_source.cpp`, field `type` is dereferenced without validation.
It is an error similar to the one fixed in #106041.
Fixes #127877
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127878
Approved by: https://github.com/malfet
This PR fixes the accuracy of jx_nest_base and part of the accuracy issue of convnext_base of the max-autotune path. Another fix (https://github.com/pytorch/pytorch/pull/133073 in this ghstack) is needed to make convnext_base fully pass the accuracy check.
The index calculated via the reindexer was wrong before this PR. Both the shape of the reshape reindexer and the stride order of the stride reindexer need to be fixed.
Index calculated before this PR:
```
# in_ptr4 points to arg4_1: size = (1, 32, 18, 18), stride = (10368, 1, 576, 32))
auto tmp7 = in_ptr4[static_cast<long>((32L*(static_cast<long>((n_start + x1 + (32L*m_start) + (32L*x0))) % static_cast<long>(18L))) + (576L*(static_cast<long>(c10::div_floor_integer((n_start + x1 + (32L*m_start) + (32L*x0)), 324L)) % static_cast<long>(32L))))];
```
The correct one after the fix is:
```
auto tmp7 = in_ptr4[static_cast<long>(n_start + x1 + (32L*(static_cast<long>((m_start + x0)) % static_cast<long>(324L))))];
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133070
Approved by: https://github.com/jgong5
Summary:
Fixes https://github.com/pytorch/pytorch/issues/133336
When we fail to suggest fixes for a data dependent error because some symbols couldn't be mapped to sources, we print out those symbols but there was a silly bug in the printing code.
New error:
```
...
raise self._make_data_dependent_error(
torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode: Could not guard on data-dependent expression Eq(u0 + 1, CeilToInt(IntTrueDiv(u0 + 1, 1))) (unhinted: Eq(u0 + 1, CeilToInt(IntTrueDiv(u0 + 1, 1)))). (Size-like symbols: u0)
Potential framework code culprit (scroll up for full backtrace):
File "/data/users/avik/fbsource/buck-out/v2/gen/fbcode/6ef5f323b6193f0f/pyspeech/fb/tools/__export_speech_llama__/export_speech_llama#link-tree/torch/_refs/__init__.py", line 2972, in expand
guard_size_oblivious(requested_length == x)
For more information, run with TORCH_LOGS="dynamic"
For extended logs when we create symbols, also add TORCHDYNAMO_EXTENDED_DEBUG_CREATE_SYMBOL="u0"
If you suspect the guard was triggered from C++, add TORCHDYNAMO_EXTENDED_DEBUG_CPP=1
For more debugging help, see https://docs.google.com/document/d/1HSuTTVvYH1pTew89Rtpeu84Ht3nQEFTYhAX3Ypa_xJs/edit?usp=sharing
For C++ stack trace, run with TORCHDYNAMO_EXTENDED_DEBUG_CPP=1
The following call raised this error:
File "/data/users/avik/fbsource/buck-out/v2/gen/fbcode/6ef5f323b6193f0f/pyspeech/fb/tools/__export_speech_llama__/export_speech_llama#link-tree/pyspeech/nn/utils.py", line 271, in lengths_to_padding_mask
).expand(batch_size, max_length)
```
Test Plan: Repro gets past reported error, hits new error
Differential Revision: D61221994
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133345
Approved by: https://github.com/ezyang
Original issue:
https://github.com/pytorch/pytorch/issues/129486
Previously, subclass_wrapper() got inputs containing additional effect tokens and failed, as this did not match the SubclassMeta indexes.
This happened because functionalization was responsible for adding/removing those tokens.
Functionalization can not be run above Subclasses, as args/outs are duplicated in case of mutations.
The main design idea is for the EffectTokens, Subclasses, and Functionalization logic to know as little as possible about each other's transformations.
To that end, the EffectTokens manipulation is extracted into a separate wrapper, which is processed above SubclassWrapper, while functionalization happens below SubclassWrapper as before.
In that case, the subclass wrap/unwrap works without any knowledge of the additional arguments.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131672
Approved by: https://github.com/bdhirsh, https://github.com/zou3519
Summary: Model owners can set lower_settings with max_acc_splits=2, so that lowering fails during model iteration and alerts them of possible performance degradation from increased fragmentation.
Test Plan: Added unit tests
Differential Revision: D60133589
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133041
Approved by: https://github.com/hl475
## Summary
As part of #125683, this PR modifies existing CPU GEMM cpp template & micro-kernel template to enable int8 WoQ GEMM auto-tuning with AVX2, AVX512 & AMX ISAs (the latter is only available on Xeon 4th generation & beyond).
WoQ GEMM takes FP16/BF16 activations, int8 weights, and scale of the same dtype as activations.
The operation is equivalent to `torch.nn.functional.linear(x, w.to(x.dtype)) * scale`, which is essentially what the ATen op `torch.ops.aten._weight_int8pack_mm` currently does (except that weights are not cached by it). Weights will be considered constant & cached, so this implementation is suitable for inference, and not QAT. `scale` is supported as a `mul` epilogue.
Only BF16 activations have been supported in this PR because for FP16 & FP32, weight is dequantized during constant-folding pass of freezing, and then after auto-tuning, performance with a large `M` dimension may be better than either torch.ops.aten._weight_int8pack_mm, or the WoQ micro-kernel support introduced in this PR, which dequantizes `w` within the micro-kernel.
While even BF16 activations with a large `M` dimension may benefit from dequantizing `w` beforehand, for now, they would use WoQ support in GEMM templates for auto-tuning, and then a subsequent PR would add logic for deciding whether or not to dequantize weights beforehand.
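A small eager-mode reference of the computation being auto-tuned, following the equivalence stated above (shapes are illustrative):
```python
import torch

M, K, N = 32, 64, 128
x = torch.randn(M, K, dtype=torch.bfloat16)               # BF16 activations
w_int8 = torch.randint(-8, 8, (N, K), dtype=torch.int8)   # int8 weights
scale = torch.rand(N, dtype=torch.bfloat16)               # scale, same dtype as activations

# Dequantize the weights to the activation dtype, then apply the scale as a `mul`
# epilogue -- the same math the WoQ GEMM template implements.
ref = torch.nn.functional.linear(x, w_int8.to(x.dtype)) * scale
```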
### Performance
#### AMX
Op-level speedup due to AMX micro-kernel (selected during auto-tuning) on 32 physical cores of Intel(R) Xeon(R) Platinum 8468H (of Xeon 4th generation series, codenamed Sapphire Rapids) vs. ATen kernel `torch.ops.aten._weight_int8pack_mm`. Intel OpenMP & tcmalloc were preloaded.
In a few cases with an odd `K`, the implementation being added in this PR may not perform as well as the ATen kernel, which is unrelated to this PR, though, since `test_linear_amx` also exhibits similar datapoints. In those cases, the AMX micro-kernel might be slower than AVX512 micro-kernel, so if such sets of shapes are used for auto-tuning, either the AVX512 micro-kernel implementation, or the ATen kernel would be chosen instead.
Benchmarked with unit-tests.
Tabular data at https://gist.github.com/sanchitintel/294811a86c8ff6b867c668ae2107c405?permalink_comment_id=5142442#gistcomment-5142442
The AVX512 micro-kernel was disabled to collect data for AMX micro-kernel.
#### AVX2/AVX512 micro-kernels
Tabular data at https://gist.github.com/sanchitintel/52b5fa9c66f791be19e48e2aa6423dc4?permalink_comment_id=5142437#gistcomment-5142437
### Follow-up
1. int4 WoQ GEMM micro-kernel will also be added in a separate PR.
2. A subsequent PR would add logic for deciding whether or not to dequantize weights beforehand.
E2E perf measurement should be done with #131310.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131887
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/jansel
Summary:
Add a special field in Graph and Node level metadata called "custom" which should be mapped to a json-serializable object, and we guarantee this field is always preserved across the following transformations:
1. copy/deepcopy
2. run_decompositions()
3. serialization
4. re-exporting
Test Plan: :test_export -- -r custom_tag
Reviewed By: angelayi
Differential Revision: D60291839
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131912
Approved by: https://github.com/angelayi
Forward fix after #132464 because TuningContext had been created during static library init, which creates the TuningResultsValidator, which tries to query HIP device properties before the HIP runtime has initialized.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133347
Approved by: https://github.com/zixi-qi
Summary:
Follow-up small diff to fix a couple of issues:
- add condition for cuda/gpu case to only print kernel name list in the second pass i.e. when we do the cpp wrapper codegen
- other minor fixes around `AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT` option
Test Plan:
```
AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT="triton_poi_fused_0" AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=1 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCH_COMPILE_DEBUG=1 TORCH_LOGS="+graph, inductor, +schedule, output_code" buck2 run -c fbcode.enable_gpu_sections=true -c fbcode.nvcc_arch=h100 @//mode/opt fbcode//caffe2/test/inductor:test_aot_inductor -- -r test_addmm_abi_compatible_cuda
```
Differential Revision: D60954888
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133016
Approved by: https://github.com/ColinPeppler
This feature is not yet supported on ROCm.
Skipping:
distributed/test_symmetric_memory.py::SymmetricMemoryTest::test_low_contention_all_gather_symm_mem_input_False
With the errors:
RuntimeError: CUDASymmetricMemory requires PYTORCH_C10_DRIVER_API_SUPPORTED
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133241
Approved by: https://github.com/pruthvistony, https://github.com/malfet
When we call benchmarker.benchmark(fn, (), {}), it attempts to infer the device from the args and kwargs, which are both empty. In this case the default behavior is to assume CPU, since `is_cpu_device` is implemented as `all([x.device == "cpu" for x in ... if x is Tensor])`, and `all([]) == True`. I've added a PR that makes this raise an error, but we should just fix this one callsite first.
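The pitfall spelled out (a paraphrase of the `is_cpu_device` check quoted above):
```python
import torch

def is_cpu_device(args) -> bool:
    # With no tensor arguments the comprehension is empty, and all([]) is True,
    # so an empty (args, kwargs) call silently infers "cpu".
    return all(x.device.type == "cpu" for x in args if isinstance(x, torch.Tensor))

print(is_cpu_device(()))  # True, even though there is no device information at all
```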
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133290
Approved by: https://github.com/eellison
To avoid the high overhead of constructing data structures in Python when the user is simply saving a trace to a file, we only process things lazily.
## Details
1. Delay function event parsing; add a flag to denote when it is needed.
2. Make profiler.function_events a computed property so code using `prof.function_events` does not need to change (see the sketch below).
3. Fix coverage for `str(prof)` in profiler tests.
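A minimal sketch of the lazy computed-property pattern from item 2 (simplified; the real profiler class carries more state):
```python
class LazyProfile:
    def __init__(self):
        self._function_events = None  # not parsed until first access

    def _parse_function_events(self):
        print("parsing function events lazily")
        return []  # stand-in for the expensive parsing step

    @property
    def function_events(self):
        # Existing code using prof.function_events keeps working unchanged.
        if self._function_events is None:
            self._function_events = self._parse_function_events()
        return self._function_events
```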
## Test run
Test program
```
import torch
from torch.profiler import profile, record_function, ProfilerActivity
def payload(use_cuda=False):
    x = torch.randn(10, 10)
    if use_cuda:
        x = x.cuda()
    y = torch.randn(10, 10)
    if use_cuda:
        y = y.cuda()
    z = torch.mm(x, y)
    z = z + y
    if use_cuda:
        z = z.cpu()

with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    with record_function("model_inference"):
        payload()

prof.export_chrome_trace("/tmp/test_trace.json")
#print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
```
The print "this is computing events" will happen lazily.
```
>]$ python3 profiler_test.py
Brian: this is computing function events
---------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
---------------------- ------------ ------------ ------------ ------------ ------------ ------------
model_inference 6.77% 441.628us 100.00% 6.523ms 6.523ms 1
aten::randn 1.86% 121.108us 46.93% 3.061ms 1.530ms 2
aten::mm 45.36% 2.959ms 45.44% 2.964ms 2.964ms 1
aten::normal_ 44.72% 2.917ms 44.72% 2.917ms 1.458ms 2
aten::add 0.87% 56.646us 0.87% 56.646us 56.646us 1
aten::empty 0.35% 22.808us 0.35% 22.808us 11.404us 2
aten::resolve_conj 0.08% 5.173us 0.08% 5.173us 1.724us 3
---------------------- ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 6.523ms
$> python3 profiler_test.py
(pytorch) [bcoutinho@devgpu038.ftw6 /data/users/bcoutinho/pytorch (profiler_optimize_parsing)]$
$>ls -a profiler_test.py
$> ls -l /tmp/test_trace.json
-rw-r--r-- 1 bcoutinho users 16471 Aug 5 16:10 /tmp/test_trace.json
```
## Unit test
Updates some tests and they all pass now.
`pytest test/profiler/test_profiler.py`
Also
`python test/test_autograd.py TestAutogradWithCompiledAutograd.test_record_function`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132713
Approved by: https://github.com/sraikund16
Summary: This test is flaky internally, but it's not a great test in the first place since it relies on the max-autotune step to bump a related counter. Instead of doing that, directly install a mock that bumps a counter specifically for this test. Additionally, test that the caching logic correctly accommodates an arbitrary counter delta (previously the relevant counter was only bumped by +1).
Differential Revision: [D61141164](https://our.internmc.facebook.com/intern/diff/D61141164)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133244
Approved by: https://github.com/eellison
Reland by reverting commit 844103197d3e8cf6b4b59176e473365113f4f962. #131675 failed a few internal tests because it imported a diff version which wasn't rebased on the proper dependent diffs. Reland from OSS only to avoid the out-of-sync issue.
Original description from #131675
Summary:
A ComboKernel combines independent Inductor Triton kernels into a single one.
This is the part 2 pull request, which 1) adds automatic horizontal fusion at the end of the inductor operator fusion process, and 2) adds type annotations for triton_combo_kernel.py.
ComboKernel is used in two cases: 1) for existing foreach kernels, combo kernels are used as the backend kernel. the front-end kernel generation logic remains the same. 2) Added an extra optimization phase to the end of the scheduler to generate extra combo kernels if combo_kernels is True in config.py
This is part 2 pull request which deals with the 2nd case above:
The combo kernel generation in the added optimization phase is done in two steps: 1) in the front end inside the scheduler, it topologically sort the schedule nodes to find all the nodes with no data dependency and create a frond end schedule node for them. We currently limit the maximal number of sub-nodes for each combo kernel to 8 (but we still need to find what is the optimal number). 2) then, these sub-nodes are combined in the codegen phase to generate the combo kernel code for them based on a few rules. For example, 1d and 2d kernels are separated into different combo kernels, as mixing them is not supported yet. Note these algorithms we provide are very basic, and the users can register their customized combo kernel generation algorithms for both steps.
Performance-wise, combining small kernels almost always yields a performance gain. However, combining very large kernels may not see any perf gain, and sometimes even a regression, possibly due to improper block sizes. Thus, a benchmark function is implemented to avoid such perf regressions, and it is recommended to turn it on by setting benchmark_combo_kernels to True whenever combo_kernels is True.
Please refer to part 1 pull request https://github.com/pytorch/pytorch/pull/124969 for more details.
Test Plan: buck2 test mode/dev-nosan caffe2/test/inductor:combo_kernels
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133291
Approved by: https://github.com/wdvr
Summary: Should skip C++ warmup `unwind::unwind();` if there is no context set. This call is sometimes causing hanging issues since C++ stack collection is not robust.
Test Plan: CI
Differential Revision: D60965985
Pulled By: aaronenyeshi
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133038
Approved by: https://github.com/eqy
Optimize the `compile_only` logic. The original code only applied to `CppTorchCudaOptions`; this PR makes it apply to all build option classes.
Changes:
1. Remove `libraries_dirs` and `libraries` settings when `compile_only` is set.
2. Remove compile_only from CppTorchCudaOptions.
3. Make `compile_only` apply to all classes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129975
Approved by: https://github.com/henrylhtsang
The `LRScheduler` class provides methods to adjust the learning rate during optimization (as updated in this PR). Also, as a note, all the lr_scheduler classes are already listed in the `How to adjust learning rate` section.
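For illustration, a standard usage sketch of one such scheduler adjusting the learning rate:
```python
import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

for epoch in range(30):
    optimizer.step()   # training step elided
    scheduler.step()   # adjust the learning rate according to the schedule
print(scheduler.get_last_lr())  # 0.1 * 0.5**3 after three decays
```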
Fixes #127884
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133243
Approved by: https://github.com/janeyx99
This PR refactors process_inputs so that it occurs earlier outside of create_aot_dispatcher_function for the purpose of calculating a cache key with the inputs after they have been processed.
This way, if tensors have symint sizes/strides, we successfully factor that into the cache key instead of specializing on every possible size and stride. Test that utilizes this incoming.
# Guard behavior
Note that it's technically possible for tensors with symint arguments to introduce guards in aot_dispatch, if they trace through decompositions that branch on tensor size/stride. This can result in multiple graph modules with differing guards having the same key in the cache.
FXGraphCache has this same issue, and the remote FXGraphCache intentionally does not handle this: instead it only saves the first result in the cache, and cache misses if guards miss. The local FXGraphCache does handle this by storing multiple files and iterating through them, but we opt not to introduce that complexity just yet for AOTAutogradCache until we deem it necessary (i.e., models appear where saving multiple cache results with different guards but the same cache key becomes important). Instead, AOTAutogradCache will save a single entry per result, overriding it if it cache misses due to guards.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130962
Approved by: https://github.com/bdhirsh
In joint-graph export we have a `copy.deepcopy(ep.graph_module)` call. This turns out to be an imperfect deepcopy, because deepcopy allows objects to overwrite their `__deepcopy__` methods. For fx.Graph, this ends up deferring to `Graph.create_node()`, which checks the graph namespace, and can avoid copying the exact name in niche examples, like where the name is a Python keyword (e.g. `input` gets renamed to `input_1`).
Names like `input` happen because export's placeholder naming pass overwrites what the namespace creates, based on the model's `forward()` signature. So we can either 1) avoid overwriting such cases, which requires rewriting the naming pass logic, or 2) force another overwrite after deepcopying. This goes with 2).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133269
Approved by: https://github.com/zhxchen17, https://github.com/dvorjackz, https://github.com/ydwu4
The process of inlining HOO subgraphs (e.g. set_grad_enabled) seems to break node.users when a node is present in multiple subgraphs, for example:
```
class SetGradCase(torch.nn.Module):
    def forward(self, x):
        _x = x.shape[0] + 2
        _xx = _x + 2
        with torch.no_grad():
            y = _x * 4
        return _xx, y
```
The `_x` node should contain 2 users (_xx and y) after being inlined, but on inspection it only contains y as a user.
Previously we were completely clearing node.users for output nodes in HOO subgraphs before inlining them - we should just be deleting the subgraph output nodes
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133144
Approved by: https://github.com/larryliu0820, https://github.com/ydwu4
Fixes #133163
Debugged in collaboration with @hariveliki
The `bytes` type demands the global `_codecs.encode`. That means the following currently works:
```python
import _codecs
import torch

torch.save(b'hello', '/tmp/dummy.pth')
torch.serialization.add_safe_globals([_codecs.encode])
torch.load('/tmp/dummy.pth', weights_only=True)
```
Similarly, `bytearray` needs `builtins.bytearray`.
Following the `torch.load` docs' promise, both types should be supported without `add_safe_globals`, as they are both primitive types:
> weights_only: Indicates whether unpickler should be restricted to
> loading only tensors, primitive types, dictionaries
> and any types added via :func:`torch.serialization.add_safe_globals`.
This PR adds both `_codecs.encode` and `builtins.bytearray` to `_get_allowed_globals` and tests saving and loading of both types with and without `weights_only`.
Co-authored-by: hariveliki <98284163+hariveliki@users.noreply.github.com>
Co-authored-by: mikaylagawarecki <mikaylagawarecki@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133189
Approved by: https://github.com/mikaylagawarecki
Makes it possible to run `test/profiler/test_profiler.py#test_profiler_pattern_matcher_json_report` on CI environments where the test runner doesn't have write permissions to the current-working-directory.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133009
Approved by: https://github.com/zou3519
We promise the user that these custom ops (and their kernels) are black
boxes w.r.t. torch.compile. Unfortunately Dynamo can turn itself back
on in the implementation of the custom operator, so we force it off by
disabling Dynamo
Test Plan:
- new tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133125
Approved by: https://github.com/ezyang
For workloads that only exercised scaled_mm, the csv result file would not contain the same set of validators as a gemm workload. Trying to reuse the same csv file between workloads would discard the file.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132464
Approved by: https://github.com/zixi-qi
Summary:
The Quantizer subclass can return a new model from `transform_for_annotation`,
and this is common if it uses any ExportPass subclass which does not mutate in-place.
Use the returned model instead of assuming it's the same.
Differential Revision: D60869676
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132893
Approved by: https://github.com/jerryzh168
Summary:
Add back the change in 19897a1647.
The change was lost in refactoring due to a bad rebase.
Test Plan:
CI
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//torchrec/distributed/tests:test_pt2 -- --filter-text test_sharded_quant_fpebc_non_strict_export
```
Differential Revision: D61052687
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133142
Approved by: https://github.com/ydwu4
This PR only adds the execution of the benchmarks on this PR and prints results; following diffs will add checking out head~1, running it, and comparing.
To access results, go to the pr_time_benchmarks test and inspect the logs:
you should see
```
+ echo 'benchmark results on current PR: '
benchmark results on current PR:
+ cat /var/lib/jenkins/workspace/test/test-reports/pr_time_benchmarks_before.txt
update_hint_regression,instruction_count,27971461254
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131475
Approved by: https://github.com/ezyang
Summary:
Add back the change in 19897a1647.
The change was lost in refactoring due to a bad rebase.
Test Plan:
CI
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//torchrec/distributed/tests:test_pt2 -- --filter-text test_sharded_quant_fpebc_non_strict_export
```
Differential Revision: D61052687
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133142
Approved by: https://github.com/ydwu4
- Reduced number of skipped test cases
- Merged redundant test cases
**Benchmark:**
| | Original | New |
| ----- | ----- | ----- |
| Run time | 60 mins | 35 mins |
| Total tests | 75k | 18k |
| Skipped tests | 20k | 4k |
_These are approximate numbers from running test_transformers.py on a single H100, and can change based on the device._
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133049
Approved by: https://github.com/drisspg
I have worked with @henrylhtsang to switch the cpp_builder to the new one. We have removed the dependency on the old implementation.
So it is time to remove the old implementation now; this PR makes that change.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133161
Approved by: https://github.com/ezyang
Partially addresses https://github.com/pytorch/pytorch/issues/130170 for float scalars saved from forward pass of a custom c++ autograd function. With this PR, compiled autograd no longer recaptures when the float value changes, but downstream support isn't there yet: 4bdb4bbd86/torch/_dynamo/config.py (L58-L61)
Currently, any non-tensors passed in ctx->saved_data are specialized on by compiled autograd. To stop specializing on float values, we lift the float. We also require user code to use IValue::toSymFloat instead of IValue::toDouble in order to swap the SymFloat to a proxy during compiled autograd tracing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133048
Approved by: https://github.com/jansel
ghstack dependencies: #132771
Addresses https://github.com/pytorch/pytorch/issues/130170 for int scalars saved from forward pass of a custom c++ autograd function
Currently, any non-tensors passed in ctx->saved_data are specialized on by compiled autograd. To stop specializing on int values, we lift the ints. We also require user code to use IValue::toSymInt instead of IValue::toInt in order to swap the SymInt to a proxy during compiled autograd tracing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132771
Approved by: https://github.com/jansel
Summary: Pessimistically assume that things are being torn down if TCPStore is not available and do not attempt to dump stack traces.
Test Plan:
Seeing crashes in production when Flight Recorder is enabled.
Here's the relevant mast link: https://fburl.com/mlhub/qia257xh
Reviewed By: fduwjj
Differential Revision: D61055124
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133150
Approved by: https://github.com/fduwjj
## Summary
As part of #125683, this PR modifies existing CPU GEMM cpp template & micro-kernel template to enable int8 WoQ GEMM auto-tuning with AVX2, AVX512 & AMX ISAs (the latter is only available on Xeon 4th generation & beyond).
WoQ GEMM takes FP16/BF16 activations, int8 weights, and scale of the same dtype as activations.
The operation is equivalent to `torch.nn.functional.linear(x, w.to(x.dtype)) * scale`, which is essentially what the ATen op `torch.ops.aten._weight_int8pack_mm` currently does (except that weights are not cached by it). Weights will be considered constant & cached, so this implementation is suitable for inference, and not QAT. `scale` is supported as a `mul` epilogue.
Only BF16 activations have been supported in this PR because for FP16 & FP32, weight is dequantized during constant-folding pass of freezing, and then after auto-tuning, performance with a large `M` dimension may be better than either torch.ops.aten._weight_int8pack_mm, or the WoQ micro-kernel support introduced in this PR, which dequantizes `w` within the micro-kernel.
While even BF16 activations with a large `M` dimension may benefit from dequantizing `w` beforehand, for now, they would use WoQ support in GEMM templates for auto-tuning, and then a subsequent PR would add logic for deciding whether or not to dequantize weights beforehand.
### Performance
#### AMX
Op-level speedup due to AMX micro-kernel (selected during auto-tuning) on 32 physical cores of Intel(R) Xeon(R) Platinum 8468H (of Xeon 4th generation series, codenamed Sapphire Rapids) vs. ATen kernel `torch.ops.aten._weight_int8pack_mm`. Intel OpenMP & tcmalloc were preloaded.
In a few cases with an odd `K`, the implementation being added in this PR may not perform as well as the ATen kernel, which is unrelated to this PR, though, since `test_linear_amx` also exhibits similar datapoints. In those cases, the AMX micro-kernel might be slower than AVX512 micro-kernel, so if such sets of shapes are used for auto-tuning, either the AVX512 micro-kernel implementation, or the ATen kernel would be chosen instead.
Benchmarked with unit-tests.
Tabular data at https://gist.github.com/sanchitintel/294811a86c8ff6b867c668ae2107c405?permalink_comment_id=5142442#gistcomment-5142442
The AVX512 micro-kernel was disabled to collect data for AMX micro-kernel.
#### AVX2/AVX512 micro-kernels
Tabular data at https://gist.github.com/sanchitintel/52b5fa9c66f791be19e48e2aa6423dc4?permalink_comment_id=5142437#gistcomment-5142437
### Follow-up
1. int4 WoQ GEMM micro-kernel will also be added in a separate PR.
2. A subsequent PR would add logic for deciding whether or not to dequantize weights beforehand.
E2E perf measurement should be done with #131310.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131887
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/jansel
Summary:
# context
* use FakeProcessGroup to mimic the multi-process tests
* can use `_test_compile_fake_pg_fn` as the single-process VB compile test
```
from torchrec.distributed.tests.test_pt2_multiprocess import _test_compile_fake_pg_fn
_test_compile_fake_pg_fn(
rank=0,
world_size=2,
)
```
reference: D59637444
Test Plan:
# run test
* run command and results: P1519228952, [tlparse](https://interncache-all.fbcdn.net/manifold/tlparse_reports/tree/logs/.tmpwMCK1E/index.html)
```
TORCH_TRACE=/var/tmp/tt TORCH_SHOW_CPP_STACKTRACES=1 TORCH_LOGS="+all" buck2 run fbcode//mode/opt fbcode//torchrec/distributed/tests:test_pt2_multiprocess
```
Differential Revision: D56124045
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133039
Approved by: https://github.com/ezyang
This PR adds back 10 configs for tuned_mm that were previously removed in https://github.com/pytorch/pytorch/pull/126570. The main idea is that we use 30 configs to autotune only when data is collected with AutoHeuristic. The learned heuristic will prune these 30 configs down to 10 configs, which reduces compilation time and at the same time might improve performance.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131616
Approved by: https://github.com/eellison
ghstack dependencies: #131615
Optionally detect missing ranks (which can be mapped to host info via the `rank_tracing_decoder` lambda argument) in the store barrier operation.
This approach has been used in some form already, moving it to collectives API.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132818
Approved by: https://github.com/d4l3k
### tl;dr
This PR adds GQA support to higher order op `flex_attention`.
## Details
When `enable_gqa` is set to True, the HOP `flex_attention(score_mod, query, key, value, block_mask, enable_gqa)` runs Group Query Attention (GQA), where the number of query heads (Hq) is a multiple of the number of key/value heads (Hkv). Each group of query heads (`Hq//Hkv` heads) attends to a shared kv head.
Otherwise, `flex_attention` assumes Multi Head Attention (MHA), where the number of query heads is equal to the number of kv heads.
The `score_mod` and `mask_mod` APIs are adapted accordingly to take `q_head` as the head index.
```
def score_mod(score: torch.Tensor, batch: torch.Tensor, q_head: torch.Tensor, token_q: torch.Tensor, token_kv: torch.Tensor) -> torch.Tensor
def mask_mod(batch: torch.Tensor, q_head: torch.Tensor, token_q: torch.Tensor, token_kv: torch.Tensor) -> torch.Tensor
```
## Example
```python
import torch
from torch.nn.attention.flex_attention import flex_attention
from torch.nn.attention.flex_attention import create_block_mask
torch.manual_seed(0)
def query_key_value_clones(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
dtype: torch.dtype = None,
):
"""Clones the query, key, and value tensors and moves them to the specified dtype."""
if dtype is None:
dtype = query.dtype
query_ref = query.clone().detach().to(dtype).requires_grad_(query.requires_grad)
key_ref = key.clone().detach().to(dtype).requires_grad_(key.requires_grad)
value_ref = value.clone().detach().to(dtype).requires_grad_(value.requires_grad)
return query_ref, key_ref, value_ref
# Lets create some input tensors
# The input tensor has shape (batch_size, num_heads, seq_len, head_dim).
# query and key/value can have different num_heads and seq_len
# Here 8 query heads share one KV head.
query = torch.randn(2, 8, 2048, 64, device="cuda", dtype=torch.float32, requires_grad=True)
key = torch.randn(2, 2, 2048, 64, device="cuda", dtype=torch.float32, requires_grad=True)
value = torch.randn(2, 2, 2048, 64, device="cuda", dtype=torch.float32, requires_grad=True)
query1, key1, value1 = query_key_value_clones(query, key, value)
# Lets create a score_modification. We take alibi_bias as an example.
# score_mod takes batch index, query head index, query index, and key/value index.
def _generate_alibi_bias(num_kv_heads: int, num_q_heads: int):
def _alibi_bias(
score: torch.Tensor,
b: torch.Tensor,
hq: torch.Tensor,
token_q: torch.Tensor,
token_kv: torch.Tensor,
) -> torch.Tensor:
# Let's calculate kv head from query head index
group = num_q_heads // num_kv_heads
hkv = hq // group
scale = torch.exp2(-((hkv + 1) * 8.0 / num_kv_heads))
return score + (token_kv - token_q) * scale
return _alibi_bias
# Let's apply a causal mask on top of it
def causal_mask(b, h, q, kv):
return q >= kv
# Generate a block mask for our new mask_mod function.
# The mask is broadcast along the head & batch dimensions.
block_mask = create_block_mask(causal_mask, B=1, H=1, Q_LEN=2048, KV_LEN=2048)
# Let's call flex_attention with our new score modification and block mask in eager mode.
output = flex_attention(query, key, value, score_mod=_generate_alibi_bias(2, 8), block_mask=block_mask, enable_gqa=True)
# Now let's compile flex_attention and run the compiled kernel.
compiled_flex_attention = torch.compile(flex_attention)
out_compiled = compiled_flex_attention(query1, key1, value1, score_mod=_generate_alibi_bias(2, 8), block_mask=block_mask, enable_gqa=True)
torch.testing.assert_close(output, out_compiled, atol=5e-2, rtol=2e-2)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131559
Approved by: https://github.com/drisspg
Summary: D56956245 added the ability to accumulate FunctionEvents across multiple cycles in order to perform statistical analysis on them all together. Although this can be useful, it uses too many CPU resources, especially for long-running jobs. For this reason, this PR adds a flag to the profiler that turns off this behavior by default but still allows users to turn it on if they wish.
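A hedged usage sketch, assuming the flag is exposed as an `acc_events` keyword on `torch.profiler.profile` (the flag name comes from the test plan below; the exact entry point and keyword placement are assumptions, not confirmed by this description):
```python
import torch
from torch.profiler import ProfilerActivity, profile

# acc_events=True opts back into cross-cycle FunctionEvent accumulation (off by default).
with profile(activities=[ProfilerActivity.CPU], acc_events=True) as prof:
    torch.randn(64, 64) @ torch.randn(64, 64)
print(len(prof.events()))
```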
Test Plan: Changed the function count test to pass in acc_events and to check the number of function events based on whether the flag is true or not.
Differential Revision: D61021490
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133095
Approved by: https://github.com/briancoutinho, https://github.com/LucasLLC, https://github.com/aaronenyeshi
Summary:
Fixes T198245910.
In the previous diff D60532628, which causes the test failure, we fixed the inconsistency caused by constant tensors being accidentally registered as buffers, by deleting the buffers and re-assigning them as constants.
However, this broke several existing tests in pyspeech: when the exported program is re-traced with torch.jit.trace (which is an anti-pattern we should probably align on), the jit tracer finds this constant tensor requiring grad and errors out.
This PR forces constant attributes to not require grad, which is the correct behavior. A better fix would be finding out where the constants are created in user code and why they require grad, but that has low ROI, so we warn the user about it instead.
Test Plan: See failures in T198245910.
Differential Revision: D60974869
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133031
Approved by: https://github.com/angelayi
motivated by FSDP2 + DoRA https://github.com/pytorch/pytorch/issues/132721
After meta init, we need a user-defined function to move DoRALinear.magnitude from device=meta to device=cuda.
The problem is how to trigger reset_sharded_param or _apply to update FSDPParam; otherwise lazy_init complains that DoRALinear.magnitude is still on device=meta.
credit to @awgu for chasing after DDP lazy_init to unblock the PR
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132954
Approved by: https://github.com/awgu
ghstack dependencies: #133059
**Description**
**_[BUG FIX]_**
This PR fixes a bug which happens during compilation with the GCC 11.4 compiler in the FlashAttentionKernel.cpp file. This issue doesn't appear with the PyTorch main branch alone, but gets introduced with our SVE PR changes (https://github.com/pytorch/pytorch/pull/119571) + PyTorch main.
See the CI Pipeline failing in our PR:
https://github.com/pytorch/pytorch/actions/runs/9895714768/job/27336251795?pr=119571
```
/var/lib/jenkins/workspace/build/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp.SVE256.cpp
during RTL pass: expand
In file included from /var/lib/jenkins/workspace/build/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp.SVE256.cpp:1:
/var/lib/jenkins/workspace/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp: In lambda function:
/var/lib/jenkins/workspace/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp:290:57: internal compiler error: in emit_move_insn, at expr.c:3821
290 | at::parallel_for(0, batchSize * num_head * qSlice, 1, [&](int64_t begin, int64_t end) {
| ^
0xffffb03f73fb __libc_start_call_main
../sysdeps/nptl/libc_start_call_main.h:58
0xffffb03f74cb __libc_start_main_impl
../csu/libc-start.c:392
Please submit a full bug report,
with preprocessed source if appropriate.
Please include the complete backtrace with any bug report.
See <file:///usr/share/doc/gcc-11/README.Bugs> for instructions.
[5731/6839] Building CXX object caffe2/CMakeFiles/torch_cpu.dir/__/aten/src/ATen/native/cpu/CatKernel.cpp.SVE256.cpp.o
[5732/6839] Building CXX object caffe2/CMakeFiles/torch_cpu.dir/__/aten/src/ATen/native/cpu/GridSamplerKernel.cpp.SVE256.cpp.o
```
This compilation issue only happens with GCC 11.4; the build works with the latest GCC 12.3 compiler and also with the Clang compiler. The issue is related to the check for `is_b_stride_zero` introduced as a template parameter (a compile-time check) in commit 5da428d9eb, which was recently added to the FlashAttentionKernel.cpp file.
This PR fixes the above compilation failure with GCC 11.4 compiler.
cc : @Valentine233 @yanbing-j @mingfeima @malfet @jgong5 @r-barnes
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132434
Approved by: https://github.com/jgong5
When checking c10d logs, I found they showed "[PG 7 rank 7]" when they actually meant "[PG 1 rank 7]". So we should use pg_id (aka uid_) rather than pg_name_: when creating sub-PGs we currently call the creation routine multiple times, which makes PG names be based on bumped-up numbers (e.g., 7 rather than 1). Using pg_id is more accurate and consistent with other logging tools.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132058
Approved by: https://github.com/shengbao-zheng, https://github.com/shuqiangzhang
Summary:
TunableOp logging improvements:
1. PYTORCH_TUNABLEOP_VERBOSE=1: print out the expected value vs actual value for TunableOp validators, so that if validation fails, we know exactly how to fix it
2. PYTORCH_TUNABLEOP_VERBOSE=3: print out the exact kernel signature for both successful and failure cases in kernel lookup
Test Plan:
> PYTORCH_TUNABLEOP_VERBOSE=3 buck
2 run mode/{opt,amd-gpu} -c fbcode.enable_gpu_sections=true //scripts/xdwang/example:fc_llama -- --enab
le-tuning
```
reading tuning results from hipblas_tuning_pt_llama0.csv
Validator PT_VERSION=2.5.0
Validator ROCBLAS_VERSION=4.0.0-72e57364-dirty
Validator HIPBLASLT_VERSION=800-a15e4178
Validator ROCM_VERSION=6.0.0.0-12969-1544e39
Validator GCN_ARCH_NAME=gfx942:sramecc+:xnack-
GCN_ARCH_NAME validation: expect gfx942:sramecc+:xnack- to match gfx942:sramecc+:xnack-
ROCM_VERSION validation: expect 6.0.0.0-12969-1544e39 to match 6.0.0.0-12969-1544e39
HIPBLASLT_VERSION validation: expect 800-a15e4178 to match 800-a15e4178
ROCBLAS_VERSION validation: expect 4.0.0-72e57364-dirty to match 4.0.0-72e57364-dirty
PT_VERSION validation: expect 2.5.0 to match 2.5.0
Loading results
GemmTunableOp_BFloat16_TN(tn_8192_2_1024) -> Gemm_Hipblaslt_TN_61169,0.0171694
GemmTunableOp_BFloat16_TN(tn_7168_2_8192) -> Gemm_Hipblaslt_TN_61089,0.036138
GemmTunableOp_BFloat16_TN(tn_8192_2_3584) -> Gemm_Hipblaslt_TN_61169,0.0240673
missing params_signature, returning null ResultEntry for GemmTunableOp_BFloat16_TN,tn_1280_2_8192
finding fastest for GemmTunableOp_BFloat16_TN(tn_1280_2_8192) out of 2818 candidates
Rotating buffer 4 MiB. Needed Size: 20 MiB. Needed number of param copies: 1
├──tuning using warmup iters 0 [0 ms] and tuning iters 1 [0.208254 ms] instance id=0, GemmTunableOp_BFloat16_TN(tn_1280_2_8192) Default
├──offset at 3
......
ResultEntry found for GemmTunableOp_BFloat16_TN,tn_8192_2_3584
ResultEntry found for GemmTunableOp_BFloat16_TN,tn_8192_2_3584
ResultEntry found for GemmTunableOp_BFloat16_TN,tn_8192_2_3584
Avg time: 16.42832040786743 us, Achieved 7.15 TFLOPS, 3578.07 GB/s
2x1280x8192-torch.bfloat16,16.260499954223633,2.5794434438103107,1294.0669757533708
2x8192x1024-torch.bfloat16,16.15394949913025,2.0771658350056508,1041.11852032876
2x7168x8192-torch.bfloat16,25.691540241241455,9.14234887416194,4574.841325057144
2x8192x3584-torch.bfloat16,16.42832040786743,7.1486621324818085,3578.0709494714856
```
Differential Revision: D60468273
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132173
Approved by: https://github.com/mxz297, https://github.com/jeffdaily, https://github.com/eqy
What we found recently is that:
1. Monitoring detects a watchdog hang (no heartbeat) at the same time as an NCCL timeout. This race leads to less useful debug info being dumped to the logs (such as CudaEventDestroy and the GIL checker).
2. We don't kill the program when the monitoring thread is not enabled but is somehow still silently running. Also, users who feel the timeout is too short should configure TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC themselves.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133028
Approved by: https://github.com/shuqiangzhang, https://github.com/wconstab
Addresses a common misconception about the safety of using multiple NCCL
process groups from PyTorch.
Notably, it IS safe to use multiple process groups, so long as
communication operations from different groups are not allowed to
overlap. (Overlap of communication operations from one group with
compute operations IS ok).
TODO: after getting feedback on the text, update other copies of the warning on other APIs
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131895
Approved by: https://github.com/fduwjj
Currently, if storage_offset is an unbacked symbol and is_align cannot be computed at compile time, it hard-fails.
Doing the best we can: add guard_size_oblivious and fall back to False if the check cannot be evaluated at compile time.
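A minimal sketch of the described fallback pattern, assuming a hypothetical helper name; the real change lives in the alignment check, and the exception-based fallback here is only illustrative:
```python
from torch.fx.experimental.symbolic_shapes import guard_size_oblivious

def storage_offset_is_aligned(storage_offset, alignment: int = 16) -> bool:
    try:
        # evaluates for plain ints and backed symbols
        return guard_size_oblivious(storage_offset % alignment == 0)
    except Exception:
        # unbacked symbol: cannot be decided at compile time, so fall back to False
        return False
```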
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132423
Approved by: https://github.com/ezyang
Summary: Otherwise it will break FSDP code paths
Test Plan:
unit test
see next diff for print message
```
sh ./scripts/lufang/amd/small_repro.sh
ROCM_GET_SCALAR_ITEM_SYNC=1 sh ./scripts/lufang/amd/small_repro.sh
```
It will log "====== Async mode ======" or "====== Sync mode ======" correspondingly
Differential Revision: D60995134
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133054
Approved by: https://github.com/houseroad
Summary:
A ComboKernel combines independent Inductor Triton kernels into a single one.
This is the part-2 pull request, which 1) adds automatic horizontal fusion at the end of the inductor operator fusion process, and 2) adds type annotations for triton_combo_kernel.py.
ComboKernel is used in two cases: 1) for existing foreach kernels, combo kernels are used as the backend kernel, and the front-end kernel generation logic remains the same; 2) an extra optimization phase is added at the end of the scheduler to generate extra combo kernels if combo_kernels is True in config.py.
This part-2 pull request deals with the 2nd case above:
- The combo kernel generation in the added optimization phase is done in two steps: 1) in the front end, inside the scheduler, we topologically sort the schedule nodes to find all the nodes with no data dependency and create a front-end schedule node for them. We currently limit the maximum number of sub-nodes for each combo kernel to 8 (but we still need to find the optimal number). 2) These sub-nodes are then combined in the codegen phase to generate the combo kernel code for them based on a few rules. For example, 1d and 2d kernels are separated into different combo kernels, as mixing them is not supported yet. Note that the algorithms we provide are very basic, and users can register their customized combo kernel generation algorithms for both steps.
- Performance-wise, combining small kernels almost always yields a performance gain. However, combining very large kernels may not show any perf gain, and can sometimes even regress, possibly due to improper block sizes. Thus, a benchmark function is implemented to avoid such perf regressions, and it is recommended to turn it on by setting benchmark_combo_kernels to True whenever combo_kernels is True.
Please refer to part 1 pull request https://github.com/pytorch/pytorch/pull/124969 for more details.
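A hedged usage sketch of the config flags named above (`combo_kernels`, `benchmark_combo_kernels`); the toy function is illustrative, not from the PR:
```python
import torch
import torch._inductor.config as inductor_config

inductor_config.combo_kernels = True
inductor_config.benchmark_combo_kernels = True  # benchmark to avoid perf regressions

@torch.compile
def f(a, b):
    # two independent pointwise ops with no data dependency: combo-kernel candidates
    return a.sin(), b.cos()

f(torch.randn(1024, device="cuda"), torch.randn(1024, device="cuda"))
```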
Test Plan: buck2 test mode/dev-nosan caffe2/test/inductor:combo_kernels
Differential Revision: D60067757
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131675
Approved by: https://github.com/mlazos
Summary: When PyTree detects a structural mismatch between inputs and dynamic shapes, the error messages are quite horrible. This PR fixes these error messages by adding, for each kind of error, the path to the point where the error happens and an actionable reason for the error.
Test Plan: added test with several cases
Differential Revision: D60956976
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132982
Approved by: https://github.com/yushangdi
#### Description
Transform quantized operation properly. Add de/quantization before and after the quantized operation.
#### Test Plan
`pytest test/export/test_converter.py -s -k test_ts2ep_convert_quantized_model`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133026
Approved by: https://github.com/angelayi
# Summary
Changes the stance of SDPA on what to do for fully masked out rows
## Current Behavior
Several PyTorch users have expressed frustration over this issue:
- https://github.com/pytorch/pytorch/issues/41508
- https://github.com/pytorch/pytorch/issues/103749
- https://github.com/pytorch/pytorch/issues/103963
These are significant issues with extensive discussion but no satisfactory resolution. The PyTorch team's consensus, as stated here:
https://github.com/pytorch/pytorch/issues/24816#issuecomment-524415617
Can be paraphrased as follows:
When passing in fully masked out rows, attention becomes ambiguous. We have two main options:
1. Uniformly attend to all values:
```python
scores[masked_out_rows] = 1 / len(row)
out[masked_out_rows] = 1 / len(row) * value
```
2. Decide that attention between no queries (masked) and no keys (masked) is meaningless:
```python
output[fully_masked_rows] = NaN
```
We went with option 2, partly because it was easier to implement, but also because people argued that users can slice the output to remove the NaNs:
``` Python
>fill_value = -float("inf")
>row0 = torch.randn(4)
>row1 = torch.tensor([fill_value for _ in range(4)])
>matrix = torch.stack([row0, row1]).requires_grad_(True)
>out = torch.softmax(matrix, 1)
>out = out[0]
>print(out)
tensor([0.5377, 0.2729, 0.0692, 0.1201])
```
Cool, problem solved. But what happens when you call backward...
```Python
>out.backward(torch.ones_like(out))
>print(matrix.grad)
tensor([[3.0957e-08, 1.4157e-08, 7.7802e-10, 1.3713e-08],
[ nan, nan, nan, nan]])
```
Those pesky NaNs are back!
## Why do we see NaNs today?
The core of the problem revolves around using softmax function in sdpa:
```python
> row = torch.tensor([(-float("inf")) for _ in range(4)])
> torch.softmax(row, 0)
tensor([nan, nan, nan, nan])
```
## Quick Aside: Masking in Attention
Attention itself doesn't have a concept of masking. The `sdpa` function has an argument called `attn_mask`, which would be more accurately named `attn_bias`. This is because we don't actually "mask" entries when computing attention. Instead, due to implementation details([performance](https://github.com/pytorch/pytorch/issues/25110#issuecomment-524519087)), we add a value to the masked-out query/key pairs.
We use a large negative number (typically -inf) to decrease the attention weight, as softmax assigns more weight to larger values.
## Alternative Approaches
If we use a very large negative number instead of -inf:
```python
> row = torch.tensor([(-1e6) for _ in range(4)])
> torch.softmax(row, 0)
tensor([0.2500, 0.2500, 0.2500, 0.2500])
```
However, if users always remembered to "slice" out their outputs, i.e.:
```Python
>fill_value = -1e6
>...
>out.backward(torch.ones_like(out))
>print(matrix.grad)
tensor([[-0.0563, -0.0564, 0.1613, -0.0486],
[ 0.0000, 0.0000, 0.0000, 0.0000]])
```
This would bring us back into a better state.
## A Third Option
We don't necessarily need to alter the behavior of softmax for -inf or very large negative numbers. The fundamental goal is to exclude certain query/key pairs from attention, regardless of the underlying implementation.
This PR implements the new semantic for masking w/ attention in fully masked-out rows:
```python
out[masked_out_rows] = 0
```
**Important Note**: This idea isn't entirely new. The [MaskedTensor](https://pytorch.org/tutorials/prototype/maskedtensor_overview#safe-softmax) prototype, a tensor subclass, was designed to handle such cases. However, it remains a prototype feature and hasn't gained widespread adoption.
## Details
This PR stack does 3 things:
1. Adds a PRIVATE _safe_softmax op
2. Updates semantic for flash_cpu fused kernel
3. Updates semantic for efficient_cuda fused kernel
_safe_softmax is not supposed to be used generically and is only meant to be used within the context of SDPA. Due to this fact, instead of decomposing softmax and checking for -inf rows, we "cheat" and use nan_to_num.
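A minimal sketch of that "cheat" (an illustration, not the actual `_safe_softmax` implementation): rows that are entirely -inf produce NaN under softmax, and `nan_to_num` maps them to 0.
```python
import torch

def safe_softmax_sketch(scores: torch.Tensor, dim: int = -1) -> torch.Tensor:
    # softmax of an all -inf row is all NaN; nan_to_num turns those rows into zeros
    return torch.softmax(scores, dim=dim).nan_to_num(0.0)

row = torch.full((4,), -float("inf"))
print(safe_softmax_sketch(row))  # tensor([0., 0., 0., 0.])
```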
Why do I think this is okay? (Please find a counterpoint if available.)
There are multiple ways NaNs can emerge. For the fully-masked-out-rows case, nan_to_num works. But what if there were other NaNs; wouldn't this silently remove them?
The only case where this can happen is if the input itself had a NaN or an Inf.
For example:
```Python
a = torch.ones([4], requires_grad=False, dtype=torch.float16)
a[1] = torch.finfo(torch.float16).max
print(a.softmax(-1))
```
Will return
`tensor([0., 1., 0., 0.], dtype=torch.float16)`
Where
```Python
a = torch.ones([4], requires_grad=False, dtype=torch.float16)
a[1] = float("inf")
a.softmax(-1)
```
returns:
`tensor([nan, nan, nan, nan], dtype=torch.float16)`
If we don't want to even allow for the possibility of "inf" or "NaN" attention scores being converted to 0, then we could implement it something like this:
```Python
max = torch.max(a, dim=-1, keepdim=True)
exp = torch.exp(a - max.values)
denom = torch.sum(exp, dim=-1, keepdim=True)
softmax = exp / denom
softmax = torch.where(max.values == float('-inf'), 0.0, softmax)
```
However, we would be paying for this in math performance.
## Why Now
I think one point that has substantially changed where PyTorch should lie on this argument is the fact that we have fused implementations for SDPA now. And these fused implementations allow us to easily and performantly support this new semantic.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131060
Approved by: https://github.com/jbschlosser
Fixes#125077
**Feature**
This PR creates a new Inductor config, `config.triton.prefer_nd_tiling`, which is disabled by default. When enabled, this encourages the Triton code to use as many tiling dimensions as possible. This simplifies indexing expressions for discontiguous tensors, resulting in expressions like `5 * x + 8 * y` as opposed to `5 * (x // 7) + 8 * (y % 9)`. This allows us to find more block pointers than we normally would. We should now see simplified indexing expressions as long as:
1. All discontiguous reads/writes have the same shape.
2. The number of discontiguous dimensions is less than `config.triton.max_tiles`.
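A hedged usage sketch: enabling the config knob named above before compiling a kernel that reads discontiguous views (roughly mirroring the elementwise-add-of-views example below); the toy function is illustrative.
```python
import torch
import torch._inductor.config as inductor_config

inductor_config.triton.prefer_nd_tiling = True

@torch.compile
def add_views(a, b):
    # (3, 7) views of (3, 9) tensors: discontiguous reads, contiguous write
    return a[:, :7] + b[:, :7]

add_views(torch.randn(3, 9, device="cuda"), torch.randn(3, 9, device="cuda"))
```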
Here's an example kernel (elementwise add of views) with ND tiling disabled:
```
@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 21
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex % 7
x1 = (xindex // 7)
x2 = xindex
tmp0 = tl.load(in_ptr0 + (x0 + (9*x1)), xmask)
tmp1 = tl.load(in_ptr1 + (x0 + (9*x1)), xmask)
tmp2 = tmp0 + tmp1
tl.store(tl.make_block_ptr(out_ptr0, shape=[21], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), tl.broadcast_to(tmp2, [XBLOCK]).to(tl.float32), boundary_check=[0])
''', device_str='cuda')
```
And here's the version with it enabled:
```
@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
ynumel = 3
xnumel = 7
yoffset = tl.program_id(1) * YBLOCK
yindex = yoffset + tl.arange(0, YBLOCK)[None, :]
ymask = yindex < ynumel
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
x1 = xindex
y0 = yindex
tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[7, 3], strides=[1, 9], block_shape=[XBLOCK, YBLOCK], order=[1, 0], offsets=[xoffset, yoffset]), boundary_check=[0, 1], eviction_policy='evict_last')
tmp1 = tl.load(tl.make_block_ptr(in_ptr1, shape=[7, 3], strides=[1, 9], block_shape=[XBLOCK, YBLOCK], order=[1, 0], offsets=[xoffset, yoffset]), boundary_check=[0, 1], eviction_policy='evict_last')
tmp2 = tmp0 + tmp1
tl.store(tl.make_block_ptr(out_ptr0, shape=[7, 3], strides=[1, 7], block_shape=[XBLOCK, YBLOCK], order=[1, 0], offsets=[xoffset, yoffset]), tl.broadcast_to(tmp2, [XBLOCK, YBLOCK]).to(tl.float32), boundary_check=[0, 1])
''', device_str='cuda')
```
With this feature enabled, we get a discontiguous strided block pointer. Previously, this would only have worked for specific shapes, like powers of 2 or multiples of the maximum block size. With this PR, we can support arbitrary shapes so long as we have enough tiles to cover all discontiguous dimensions.
**Test plan**
This PR adds some tests for pointwise ops with discontiguous tensors.
- Test that we can generate block pointers for views with odd shapes like `(5,7)`, `(9,3,5)`, etc.
- Test that we can generate block pointers for a single discontiguous dim in 3D and 4D tensors.
- Test that we generate a 2D tiling for a 5D tensor with two discontiguous dims. This case doesn't generate a block pointer, but it checks that the output code is at least correct.
This PR also parametrizes some existing tests to run with and without `triton.prefer_nd_tiling`. That way, we ensure this feature doesn't break existing usage.
Since this setting isn't enabled on most tests, I also created https://github.com/pytorch/pytorch/pull/132935 to test what happens when `triton.prefer_nd_tiling=True` by default. None of the failures seem related to invalid tiling, so I think this feature is safe to merge.
**Limitations and follow-ups**
I can see two main improvements which would expand the usefulness of this feature:
1. This feature currently only works for pointwise kernels, since reductions are never tiled. As a follow-up, we could enable tiled reductions to extend these benefits to reduction kernels.
2. The usefulness of this feature depends on `config.triton.max_tiles`. This is currently restricted to 2 by default, although it can be increased to 3 in certain cases. To support more discontiguous dims, we might consider expanding support for 3D tiling, or even supporting ND tiling, by mapping an ND "virtual" launch grid onto Triton's 3D launch grid.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132937
Approved by: https://github.com/jansel, https://github.com/eellison
Summary:
TunableOp logging improvements:
1. PYTORCH_TUNABLEOP_VERBOSE=1: print out the expected value vs actual value for TunableOp validators, so that if validation fails, we know exactly how to fix it
2. PYTORCH_TUNABLEOP_VERBOSE=3: print out the exact kernel signature for both successful and failure cases in kernel lookup
Test Plan:
> PYTORCH_TUNABLEOP_VERBOSE=3 buck
2 run mode/{opt,amd-gpu} -c fbcode.enable_gpu_sections=true //scripts/xdwang/example:fc_llama -- --enab
le-tuning
```
reading tuning results from hipblas_tuning_pt_llama0.csv
Validator PT_VERSION=2.5.0
Validator ROCBLAS_VERSION=4.0.0-72e57364-dirty
Validator HIPBLASLT_VERSION=800-a15e4178
Validator ROCM_VERSION=6.0.0.0-12969-1544e39
Validator GCN_ARCH_NAME=gfx942:sramecc+:xnack-
GCN_ARCH_NAME validation: expect gfx942:sramecc+:xnack- to match gfx942:sramecc+:xnack-
ROCM_VERSION validation: expect 6.0.0.0-12969-1544e39 to match 6.0.0.0-12969-1544e39
HIPBLASLT_VERSION validation: expect 800-a15e4178 to match 800-a15e4178
ROCBLAS_VERSION validation: expect 4.0.0-72e57364-dirty to match 4.0.0-72e57364-dirty
PT_VERSION validation: expect 2.5.0 to match 2.5.0
Loading results
GemmTunableOp_BFloat16_TN(tn_8192_2_1024) -> Gemm_Hipblaslt_TN_61169,0.0171694
GemmTunableOp_BFloat16_TN(tn_7168_2_8192) -> Gemm_Hipblaslt_TN_61089,0.036138
GemmTunableOp_BFloat16_TN(tn_8192_2_3584) -> Gemm_Hipblaslt_TN_61169,0.0240673
missing params_signature, returning null ResultEntry for GemmTunableOp_BFloat16_TN,tn_1280_2_8192
finding fastest for GemmTunableOp_BFloat16_TN(tn_1280_2_8192) out of 2818 candidates
Rotating buffer 4 MiB. Needed Size: 20 MiB. Needed number of param copies: 1
├──tuning using warmup iters 0 [0 ms] and tuning iters 1 [0.208254 ms] instance id=0, GemmTunableOp_BFloat16_TN(tn_1280_2_8192) Default
├──offset at 3
......
ResultEntry found for GemmTunableOp_BFloat16_TN,tn_8192_2_3584
ResultEntry found for GemmTunableOp_BFloat16_TN,tn_8192_2_3584
ResultEntry found for GemmTunableOp_BFloat16_TN,tn_8192_2_3584
Avg time: 16.42832040786743 us, Achieved 7.15 TFLOPS, 3578.07 GB/s
2x1280x8192-torch.bfloat16,16.260499954223633,2.5794434438103107,1294.0669757533708
2x8192x1024-torch.bfloat16,16.15394949913025,2.0771658350056508,1041.11852032876
2x7168x8192-torch.bfloat16,25.691540241241455,9.14234887416194,4574.841325057144
2x8192x3584-torch.bfloat16,16.42832040786743,7.1486621324818085,3578.0709494714856
```
Differential Revision: D60468273
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132173
Approved by: https://github.com/mxz297, https://github.com/jeffdaily
Summary: We found that recent CMF and IGCTR models have more complicated patterns to optimize in order to remove as many stack/cat nodes as possible; we thus designed patterns for them.
Test Plan:
# unit test
```
CUDA_VISIBLE_DEVICES=3 OC_CAUSE=1 buck2 test //caffe2/test/inductor:split_cat_fx_passes
```
Test UI: https://www.internalfb.com/intern/testinfra/testrun/3659174939423652
Network: Up: 113KiB Down: 112KiB (reSessionID-11c9b598-af3a-4727-8f02-ccb1471d092b)
Jobs completed: 27. Time elapsed: 5:45.8s.
Cache hits: 0%. Commands: 2 (cached: 0, remote: 0, local: 2)
Tests finished: Pass 9. Fail 0. Fatal 0. Skip 1. Build failure 0
# benchmark
### cmf
```
CUDA_VISIBLE_DEVICES=3 OC_CAUSE=1 buck2 run mode/opt //scripts/jackiexu0313/pt2:local_model_with_pt2 -- --test_mode batch-split --model_type "cmf_shrink" --flow_id 587303213 -n
```
P1515072258
Counter({'pattern_matcher_nodes': 2170, 'pattern_matcher_count': 1766, 'normalization_pass': 402, 'remove_split_with_size_one_pass': 269, 'extern_calls': 193, 'merge_splits_pass': 74, 'normalization_aten_pass': 51, 'fxgraph_cache_miss': 9, 'batch_aten_mul': 6, 'scmerge_split_sections_removed': 5, 'scmerge_split_removed': 3, 'scmerge_cat_removed': 3, 'unbind_stack_pass': 3, 'batch_sigmoid': 2, 'batch_linear': 2, 'batch_aten_sub': 2, 'batch_layernorm': 1, 'scmerge_split_added': 1, 'scmerge_cat_added': 1, 'split_stack_to_cats_pass': 1, 'split_cat_to_slices_pass': 1, 'batch_aten_add': 1, 'batch_relu': 1})
### ig_ctr
```
CUDA_VISIBLE_DEVICES=3 OC_CAUSE=1 buck2 run mode/opt //scripts/jackiexu0313/pt2:local_model_with_pt2 -- --test_mode batch-split --model_type "ig_ctr" --flow_id 584880697 -n
```
P1515087739
Counter({'pattern_matcher_nodes': 1832, 'pattern_matcher_count': 1564, 'extern_calls': 378, 'normalization_pass': 345, 'normalization_aten_pass': 49, 'fxgraph_cache_miss': 18, 'batch_aten_mul': 6, 'scmerge_cat_removed': 5, 'scmerge_cat_added': 4, 'batch_linear_post_grad': 4, 'scmerge_split_removed': 3, 'unbind_stack_pass': 3, 'unbind_cat_to_view_pass': 3, 'batch_tanh': 2, 'scmerge_split_sections_removed': 2, 'scmerge_split_added': 2, 'split_stack_to_cats_pass': 2, 'split_cat_to_slices_pass': 1})
# e2e
testing the following new patterns
```
"split_stack_to_cats_pass": {},
"split_cat_to_slices_pass": {},
"unbind_cat_to_view_pass": {},
```
Note that you can tune the hyper-parameter "threshold_to_cat" for these patterns; the minimum value you give should be at least 2. The larger the value, the less aggressively the pass slices nodes and the more it keeps the cat; the default value is 10. You can tune the parameter by setting threshold_to_cat, for example:
```
"split_stack_to_cats_pass": {"threshold_to_cat": 10},
"split_cat_to_slices_pass": {"threshold_to_cat": 10},
"unbind_cat_to_view_pass": {"threshold_to_cat": 10},
```
Note that the default value may not be optimal; it is based on my experiments on CMF and IGCTR, and you are welcome to tune the value to find the best threshold for you. For example, in the CMF local run:
- when "threshold_to_cat" is 2
P1515072258
=============Print full analysis for cmf_shrink================
| Metric | Value |
|:-------------------|:----------------|
| Batch size | 10 |
| Latency | 156.07 ms |
| Model size | 844357184 bytes |
| Flops/example | 583.53 G |
| TFLOPS | 37.39 |
| MFU | 4.67% |
| Activation/example | 1707.49 MB |
- when "threshold_to_cat" is 10
P1515912635
=============Print full analysis for cmf_shrink================
| Metric | Value |
|:-------------------|:----------------|
| Batch size | 10 |
| Latency | 155.09 ms |
| Model size | 844357184 bytes |
| Flops/example | 583.53 G |
| TFLOPS | 37.63 |
| MFU | 4.70% |
| Activation/example | 1707.49 MB |
ads_dper3:164562cbe29f6c5aea4546cf3d463b87
training_platform:5e455c643c52940bb4567017f4c7ba83
## cmf
baseline
f588717948
proposal
f588719502
### QPS and NE results
{F1793304642}
{F1793304664}
{F1793304689}
{F1793304683}
### Compilation time reduction
zoomer link: https://www.internalfb.com/intern/zoomer/?profiling_run_fbid=1045728747213538&tab=pt2_metrics
Compile time for that frame is reduced to 1 min from 9 min.
### trace analysis
baseline trace link
https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree%2Ftraces%2Fdynocli%2Ff588722004-TrainingApplication%2F0%2Frank-1.Aug_06_00_03_46.3617.pt.trace.json.gz&bucket=pyper_traces
proposal trace link
https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree%2Ftraces%2Fdynocli%2Ff588723545-TrainingApplication%2F0%2Frank-1.Aug_05_23_54_56.3647.pt.trace.json.gz&bucket=pyper_traces
{F1793312804} {F1793312867}
From the trace, we can see that the green part (introduced by split cat) has been reduced significantly with our new patterns.
Differential Revision: D60750275
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132831
Approved by: https://github.com/jackiexu1992
Summary:
Re-enable testHelperPrefix test that was erroneously disabled in CI.
Fixes#50701
Test Plan:
Test passes locally:
```
❯ ./TCPStoreTest --gtest_filter=TCPStoreTest.testHelperPrefix
Running main() from
/data/users/cpio/pytorch/third_party/googletest/googletest/src/gtest_main.cc
Note: Google Test filter = TCPStoreTest.testHelperPrefix
[==========] Running 1 test from 1 test suite.
[----------] Global test environment set-up.
[----------] 1 test from TCPStoreTest
[ RUN ] TCPStoreTest.testHelperPrefix
[W807 12:01:31.531576727 socket.cpp:462] [c10d] waitForInput: poll for
socket SocketImpl(fd=6, addr=[localhost]:37984,
remote=[localhost]:37171) returned 0, likely a timeout
[W807 12:01:31.531663710 socket.cpp:487] [c10d] waitForInput: socket
SocketImpl(fd=6, addr=[localhost]:37984, remote=[localhost]:37171) timed
out after 100ms
[ OK ] TCPStoreTest.testHelperPrefix (314 ms)
[----------] 1 test from TCPStoreTest (314 ms total)
[----------] Global test environment tear-down
[==========] 1 test from 1 test suite ran. (314 ms total)
[ PASSED ] 1 test.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132916
Approved by: https://github.com/Skylion007
This PR fixes flaky internal tests:
- The AutoHeuristic test was sometimes failing because it required autotuning to happen for mixed_mm, which didn't end up happening when there was an fx graph cache hit.
- The tests inside pattern_matcher failed because in some cases pad_mm decided to pad, which made the mixed_mm pattern not match anymore (instead of cast -> mm, it was cast -> pad -> mm); the tests also fail when is_big_gpu is false (which I haven't found an explanation for).
Differential Revision: [D60972176](https://our.internmc.facebook.com/intern/diff/D60972176)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133015
Approved by: https://github.com/Chillee, https://github.com/eellison
Partially fixes#122980
- change cpp type mapping for complex64 to std::complex<float>
- add `aoti_torch_item_complex64` and `aoti_torch_scalar_to_tensor_complex64`.
- add `expensiveCopyToTensor()` to convert `ArrayRefTensor<T>` type to `AtenTensorHandle` type.
- if we want to fully fix#122980, we still need to let ArrayRef and MiniArrayRef consider the underlying storage's number of elements. See more details in https://github.com/pytorch/pytorch/pull/132347 (#132347 broke some internal tests, so we need more work before landing it).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132810
Approved by: https://github.com/desertfire
It's possible to construct an NJT with "holes" by specifying both `offsets` and `lengths` metadata. When `nt.clone(memory_format=torch.contiguous_format)` is called on such an NJT, the result should be an NJT without holes.
This PR fixes this in a simplistic way using `unbind()`, which isn't really supported in `torch.compile`. The longer-term solution involves writing a proper kernel to support this.
NB: Another limitation is that the returned NJT does not have the same ragged structure as the input. While we could manually hack the nested int registry (or update the union find when that lands), this is the first instance where a NJT with holes and an NJT without holes could have the same ragged structure, and getting those to play nicely together requires some fairly involved updates. For now, this PR punts on these updates until we can clean this up.
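A hedged sketch of the case being fixed, assuming the jagged-layout constructor below is available in your PyTorch version: specifying both `offsets` and `lengths` yields an NJT with holes, and a contiguous clone should drop them.
```python
import torch

values = torch.randn(10, 3)
offsets = torch.tensor([0, 4, 10])  # component i occupies values[offsets[i]:offsets[i+1]]
lengths = torch.tensor([2, 3])      # only the first 2 and 3 rows are live -> "holes"
nt = torch.nested.nested_tensor_from_jagged(values, offsets=offsets, lengths=lengths)
nt_contig = nt.clone(memory_format=torch.contiguous_format)  # should have no holes
```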
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132776
Approved by: https://github.com/ani300, https://github.com/soulitzer
ghstack dependencies: #131898, #131704, #131937
Summary: These tests are failing stress tests internally because of remote caching. Most already have local cache disabled; disable remote cache as well
Test Plan: Ran stress tests locally for each of the affected tests
Differential Revision: D60940081
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132955
Approved by: https://github.com/leslie-fang-intel
Summary: When HOPs live out of tree, it makes it impossible to make breaking changes to the HOP API. But HOP implementations are deeply entwined with PyTorch internals. Move the HOP into PyTorch tree so that changes are possible.
Test Plan: sandcastle, ossci
Differential Revision: D60674615
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132525
Approved by: https://github.com/zou3519, https://github.com/Skylion007
The capture_triton decorator returns a function that goes through the
triton kernel wrapper HOP. This is useful for make_fx tracing and
non-strict export. However, the HOP dispatch is slow (~1ms) and not
necessary in certain situations.
This PR skips going through the HOP dispatch for any
capture_triton-wrapped triton kernels that are registered as
implementations to a `@triton_op` custom operator. We do this by
creating a new thread-local flag that controls whether the
capture_triton-wrapped triton kernel goes through HOP dispatch or not.
Test Plan:
- new test and existing tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132822
Approved by: https://github.com/SherlockNoMad
Summary:
Modify `softmax` on the ragged dimension, where `ragged_idx == 1`, to allow for 2D nested tensors. This diff now enables a `softmax` operation on tensors of shape `(B, *)`, where `*` is the ragged dimension.
Extend existing `softmax` unit tests to include 2D nested tensors using the `include_2d_tensor=True` keyword argument.
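A hedged sketch of the newly allowed case (the jagged-layout constructor and dim choice below are usage assumptions, not taken from this diff): softmax over the ragged dimension of a 2D nested tensor of shape `(B, *)`.
```python
import torch

values = torch.randn(9)               # total length across components
offsets = torch.tensor([0, 3, 5, 9])  # B = 3 components of lengths 3, 2, 4
nt = torch.nested.nested_tensor_from_jagged(values, offsets=offsets)  # shape (3, j)
out = torch.nn.functional.softmax(nt, dim=-1)  # reduce over the ragged dim (ragged_idx == 1)
```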
Test Plan:
Verify that existing and modified unit tests pass using the following commands:
```
buck2 run mode/{opt,inplace} //caffe2/test:nested -- --regex test_softmax
```
```
buck2 run mode/{opt,inplace} //caffe2/test:nested -- --regex test_jagged_op
```
Reviewed By: davidberard98
Differential Revision: D60780975
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132812
Approved by: https://github.com/davidberard98
The goal of this PR is to avoid stack overflow when we create extremely long chains of thunks, and then evaluate them (e.g., as occurs if you sum(long list of symint)). The basic idea behind this PR is to only thunkify proxies if they're being created in places where they may or may not be used--crucially, symint operations that occur in user code we are tracing are eagerly placed into the graph, even if they may eventually be dead.
I annotated the PR with explanation of changes.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132421
Approved by: https://github.com/Skylion007, https://github.com/zou3519
ghstack dependencies: #132674, #132675
Instead of having a separate context variable for SymDispatchMode, we
now simply delegate to the current active proxy tensor mode when we
need to trace a SymInt. We maintain a separate `__sym_dispatch__` magic
method as the calling convention is different than `__torch_dispatch__`.
Consolidating the modes in this ways means that we can consistently
disable both of these modes in tandem simply by removing the mode
from the proxy mode infra slot.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132674
Approved by: https://github.com/zou3519, https://github.com/bdhirsh
Introduces an enhancement to SortingKernel.cpp for cases where both the values and indices tensors have stride 1, indicating contiguous memory layouts.
The changes include:
1. A new function `sort_kernel_impl`, encapsulating the core sorting logic for distinct types of tensor accessors.
2. Modifications to the `sort_kernel` function to utilize `sort_kernel_impl`. It now checks for tensor strides and optimally handles contiguous and non-contiguous tensor scenarios.
3. The optimization aims to improve cache locality and efficiency in memory access for contiguous tensor sorts.
4. Enhanced Code Readability and Structure: The restructuring of the sorting process improves clarity and maintenance by clearly defining how different stride scenarios are handled, making the code more transparent and easier to understand.
Tests have been conducted across various tensor sizes and shapes to ensure stability and reliability of the change.
The result of running the `test/test_sort_and_select.py` test suite is consistent between the main branch and this modified branch.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132236
Approved by: https://github.com/jgong5
As titled, this PR rewrites the current redistribute algorithm to make
the multi-mesh-dim redistribute logic more sound. The previous algorithm
works numerically, but it could incur additional unnecessary steps when
transforming shardings in a multi-dimension device mesh, i.e.
let's say we want to transform from (S(1), S(1)) -> (S(1), S(2)). The
previous algorithm yields the following steps:
* mesh_dim 1: S(1) -> R, mesh_dim 0: S(1) -> R
* mesh_dim 0: R -> S(1), mesh_dim 1: R -> S(2)
Although it works semantically, it incurs two allgather
transformations, where it should really only incur an S(1) -> S(2) on
mesh dim 1.
The rewritten algorithm takes a more principled approach:
1. We check if src_spec has sharding; if not, we don't need to worry about the nested-sharding case, as shardings would always be in order, so we just go from left to right in the placements and add the transform steps.
2. If src_spec has sharding, there could be either nested or mis-aligned shardings, so we first traverse from right to left to check whether there is a mis-aligned sharding as in the example above; if there is, we replicate that mesh dimension so that it unshards the nested sharding.
3. We traverse again from left to right to generate the transformation after we unshard the nested sharding.
This should also fix https://github.com/pytorch/pytorch/issues/132751
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131210
Approved by: https://github.com/tianyu-l
#### Description
Transform quantized operation properly. Add de/quantization before and after the quantized operation.
#### Test Plan
`pytest test/export/test_converter.py -s -k test_ts2ep_convert_quantized_model`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131915
Approved by: https://github.com/angelayi
Summary:
When fixing https://github.com/pytorch/pytorch/issues/130810, we suspected that FSDP1 optimizer state_dict cannot handle foreach optimizers, which is not correct. For FSDP1, whether the optimizer uses foreach or not does not matter. Since we already have tests for the non-foreach optimizer, this PR changes the distributed state_dict tests for FSDP1 to use a foreach optimizer.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132933
Approved by: https://github.com/c-p-i-o
ghstack dependencies: #132908
There are still some differences between CUDA and non-CUDA custom devices when
constructing FSDP, because CUDA is selected as the default device. For example,
when constructing FSDP from a CPU model and device_id is not passed, device_handle
will choose CUDA as the default device. This PR auto-selects the real device
as the default device.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127609
Approved by: https://github.com/awgu
Summary: Dynamo doesn't trace through sparse tensors in fbcode. So we should disable tests that run sparse tensors in export. We should do this to make the CI green internally.
Test Plan:
Before:
Tests finished: Pass 1409. Fail 71. Fatal 0. Skip 90. Build failure 0
After:
Tests finished: Pass 1408. Fail 0. Fatal 0. Skip 162. Build failure 0
Differential Revision: D60870543
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132824
Approved by: https://github.com/BoyuanFeng
Summary:
**Context:**
Currently we have a helper to print out AtenTensor in [shim_common.cpp](https://github.com/pytorch/pytorch/blob/v2.4.0-rc4/torch/csrc/inductor/aoti_torch/shim_common.cpp#L866)
The way we were using this function was a "manual" process: we would inject this function into the generated output.cpp file, then recompile and reload the file. This diff automates the value-printing process.
**Changes:**
1. Added a simple initial debug printer helper to print out tensor values
2. Added a filter option to selectively dump tensor values.
**Usage:**
Sample cmd :
```
AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=1 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCH_COMPILE_DEBUG=1 TORCH_LOGS="+inductor, +schedule, output_code" python test/inductor/test_aot_inductor.py -k test_addmm_abi_compatible_cuda
```
Sample outputs :
```
[ before_launch - triton_poi_fused_0 - buf0 ]:
0.6331
1.6358
-0.3459
1.0196
-0.4122
1.4279
[ CUDAFloatType{6} ]
Min value: -0.412198
Max value: 1.63582
Device: cuda:0
Size: [6]
Stride: [1]
Dtype: float
Layout: Strided
Number of elements: 6
Is contiguous: 1
Requires grad: 0
[ after_launch - triton_poi_fused_0 - buf0 ]:
0.6331
1.6358
-0.3459
1.0196
-0.4122
1.4279
[ CUDAFloatType{6} ]
Min value: -0.412198
Max value: 1.63582
Device: cuda:0
Size: [6]
Stride: [1]
Dtype: float
Layout: Strided
Number of elements: 6
Is contiguous: 1
Requires grad: 0
[ before_launch - aoti_torch_cuda_addmm_out - buf1 ]:
Min value: -2.25655
Max value: 2.32996
Device: cuda:0
Size: [16, 6]
Stride: [6, 1]
Dtype: float
Layout: Strided
Number of elements: 96
Is contiguous: 1
Requires grad: 0
[ before_launch - aoti_torch_cuda_addmm_out - buf0 ]:
0.6331
1.6358
-0.3459
1.0196
-0.4122
1.4279
[ CUDAFloatType{6} ]
Min value: -0.412198
Max value: 1.63582
Device: cuda:0
Size: [6]
Stride: [1]
Dtype: float
Layout: Strided
Number of elements: 6
Is contiguous: 1
Requires grad: 0
[ after_launch - aoti_torch_cuda_addmm_out - buf1 ]:
Min value: -12.0839
Max value: 11.6878
Device: cuda:0
Size: [16, 6]
Stride: [6, 1]
Dtype: float
Layout: Strided
Number of elements: 96
Is contiguous: 1
Requires grad: 0
[ after_launch - aoti_torch_cuda_addmm_out - buf0 ]:
0.6331
1.6358
-0.3459
1.0196
-0.4122
1.4279
[ CUDAFloatType{6} ]
Min value: -0.412198
Max value: 1.63582
Device: cuda:0
Size: [6]
Stride: [1]
Dtype: float
Layout: Strided
Number of elements: 6
Is contiguous: 1
Requires grad: 0
stats [('calls_captured', 1), ('unique_graphs', 1)]
inductor [('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('extern_calls', 2)]
.
----------------------------------------------------------------------
Ran 1 test in 10.867s
OK
```
The user can filter which kernels' values get printed by specifying the env var `AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT`, and can see the available kernel names in a log message like the one below:
```
torch/_inductor/graph.py:1642] Finished codegen for all nodes. The list of kernel names available: ['triton_poi_fused_0', 'aoti_torch_cuda_addmm_out']
```
In the follow-up diff, we will add `torch.save()` to dump/save the intermediate tensors into individual `.pt` files that can later be loaded with `torch.load()`.
Test Plan:
Run Unit Tests in OSS: (similar cmd as mentioned above in the usage part)
`AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=1 TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCH_COMPILE_DEBUG=1 TORCH_LOGS="+inductor, output_code" python test/inductor/test_aot_inductor.py -k test_addmm_abi_compatible_cuda`
Differential Revision: D60538496
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132323
Approved by: https://github.com/ColinPeppler
When autocasting is turned on, right now SDPA w/ NJT won't be autocasted. This PR adds manual "autocasting" logic in sdpa.py - at the beginning, it just checks if autocasting is enabled, and if so, it casts the inputs in the way you would expect if autocasting was actually running.
Why normal autocasting won't work:
* NJT intercepts the `__torch_function__` call for scaled_dot_product_attention, which, AFAIK, happens before we get to any dispatcher logic, and then calls efficient attention or flash attention. So autocasting the scaled_dot_product_attention op won't work; we never call the aten op for scaled_dot_product_attention, so we won't ever run autocasting for it.
* If we try to add autocasting handling for `_flash_attention_forward` or `_efficient_attention_forward`, then autocasting will _run_, but it will have the wrong semantics: sdpa.py's handling will run first, and it will do backend selection based on the uncasted inputs to SDPA. This also means that if the inputs to the SDPA call don't have uniform types, the sdpa.py implementation will fail checks (this is the specific issue we're targeting).
Alternative: "just change the backend selection logic for NJT to be autocast aware, but don't actually do the autocast; then, add `_(flash|efficient)_attention_forward` to autocasting rules". I think this would work too. But it's arguably better to make the backend-selection logic and actual-autocast-behavior use the same implementation, in case the implementations are different.
Differential Revision: [D60879916](https://our.internmc.facebook.com/intern/diff/D60879916)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132835
Approved by: https://github.com/soulitzer
Summary:
A re-land of D60006710.
Fixed TrainingIRToRunDecomp failures for test_tensor_attribute_zero_args and also a few re-traceability failures, because run_decomposition does a retracing.
edit: also remove the eliminate_dead_code() in _unlift because of one onnx test failure:
a constant tensor attr was lifted as constant_tensor input but it's not used in the graph after aot_autograd due to a short cut in its decomposition. This causes the setattr to be removed by eliminate_dead_code but the graph signature still contains the name of that buffer, which causes an inconsitency between the transformed graph and ep's original signature after _unlift. And it seems that this has happened a few times where some nodes are accidentally removed and we're in an inconsistent state.
The alternative of removing it would be: every time we call elimiate_dead_code, we verify the consistency of the graph with 1. the graph before transformation and 2. all the meta datas but i think this deserves a complete design
edit 2: Also fix the inconsistency of graph signatures when param_constant is marked as lifted_tensor_constants but it's registered as parameters in the output of ep.module().
Differential Revision: D60532628
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132307
Approved by: https://github.com/zhxchen17
move benchmarking out of `torch._inductor.runtime.runtime_utils` and into `torch._inductor.runtime.benchmarking`, and prefer this path over directly accessing Triton's benchmarking
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132827
Approved by: https://github.com/eellison
Debugged with @leslie-fang-intel, we found that https://github.com/pytorch/pytorch/issues/132561 and https://github.com/pytorch/pytorch/issues/132569 both fail because `capture_pre_autograd_graph` does not work well on Windows.
So we added some code to raise a message and let end users know that.
Details:
For https://github.com/pytorch/pytorch/issues/132561
```cmd
Traceback (most recent call last):
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\unittest\case.py", line 59, in testPartExecutor
yield
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\unittest\case.py", line 591, in run
self._callTestMethod(testMethod)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\unittest\case.py", line 549, in _callTestMethod
method()
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\testing\_internal\common_utils.py", line 2918, in wrapper
method(*args, **kwargs)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\testing\_internal\common_utils.py", line 1515, in wrapper
fn(*args, **kwargs)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\testing\_internal\common_quantization.py", line 399, in wrapper
fn(*args, **kwargs)
File "D:\xu_git\dnnl_cb\pytorch\test\quantization\pt2e\test_x86inductor_quantizer.py", line 1737, in test_qat_conv2d
self._test_quantizer(
File "D:\xu_git\dnnl_cb\pytorch\test\quantization\pt2e\test_x86inductor_quantizer.py", line 553, in _test_quantizer
m = capture_pre_autograd_graph(
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\_export\__init__.py", line 121, in capture_pre_autograd_graph
raise RuntimeError("capture_pre_autograd_graph not yet supported on Windows")
RuntimeError: capture_pre_autograd_graph not yet supported on Windows
To execute this test, run the following from the base repo dir:
python test\quantization\pt2e\test_x86inductor_quantizer.py -k TestQuantizePT2EX86Inductor.test_qat_conv2d
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
```
For https://github.com/pytorch/pytorch/issues/132569
```cmd
Traceback (most recent call last):
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\unittest\case.py", line 59, in testPartExecutor
yield
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\unittest\case.py", line 591, in run
self._callTestMethod(testMethod)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\unittest\case.py", line 549, in _callTestMethod
method()
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\testing\_internal\common_utils.py", line 2918, in wrapper
method(*args, **kwargs)
File "D:\xu_git\dnnl_cb\pytorch\test\inductor\test_torchinductor.py", line 11218, in new_test
return value(self)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\_dynamo\testing.py", line 312, in _fn
return fn(*args, **kwargs)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\contextlib.py", line 79, in inner
return func(*args, **kwds)
File "D:\xu_git\dnnl_cb\pytorch\test\inductor\test_cpu_cpp_wrapper.py", line 155, in fn
_, code = test_torchinductor.run_and_get_cpp_code(
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\_inductor\utils.py", line 1863, in run_and_get_cpp_code
result = fn(*args, **kwargs)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\testing\_internal\common_quantization.py", line 415, in wrapper
fn(*args, **kwargs)
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\testing\_internal\common_quantization.py", line 367, in wrapper
fn(*args, **kwargs)
File "D:\xu_git\dnnl_cb\pytorch\test\inductor\test_mkldnn_pattern_matcher.py", line 1668, in test_qlinear_gelu_cpu
self._qlinear_unary_cpu_test_helper((torch.randn((2, 4)),), gelu)
File "D:\xu_git\dnnl_cb\pytorch\test\inductor\test_mkldnn_pattern_matcher.py", line 1615, in _qlinear_unary_cpu_test_helper
self._test_common(
File "D:\xu_git\dnnl_cb\pytorch\test\inductor\test_mkldnn_pattern_matcher.py", line 165, in _test_common
convert_model = _generate_qdq_quantized_model(
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\testing\_internal\common_quantization.py", line 2949, in _generate_qdq_quantized_model
export_model = capture_pre_autograd_graph(
File "C:\Users\Xuhan\.conda\envs\win_mkl_static\lib\site-packages\torch\_export\__init__.py", line 121, in capture_pre_autograd_graph
raise RuntimeError("capture_pre_autograd_graph not yet supported on Windows")
RuntimeError: capture_pre_autograd_graph not yet supported on Windows
To execute this test, run the following from the base repo dir:
python test\inductor\test_cpu_cpp_wrapper.py -k DynamicShapesCppWrapperCpuTests.test_qlinear_gelu_cpu_dynamic_shapes_cpp_wrapper
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
--------------------------------------------------------------------------------------------------------------------------- Captured stderr call ----------------------------------------------------------------------------------------------------------------------------
W0807 13:24:34.291000 11228 torch\_export\__init__.py:64] +============================+
W0807 13:24:34.291000 11228 torch\_export\__init__.py:65] | !!! WARNING !!! |
W0807 13:24:34.291000 11228 torch\_export\__init__.py:66] +============================+
W0807 13:24:34.291000 11228 torch\_export\__init__.py:67] capture_pre_autograd_graph() is deprecated and doesn't provide any function guarantee moving forward.
W0807 13:24:34.291000 11228 torch\_export\__init__.py:68] Please switch to use torch.export instead.
```
Co-authored-by: Jiong Gong <jiong.gong@intel.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132841
Approved by: https://github.com/jgong5, https://github.com/ezyang
See title. Until now, calling `torch.as_tensor` on a CuPy array would return a CPU tensor, when not providing a device. This is most likely not desired.
Fixes#132553
```python3
import torch
import cupy as cp
cupy_arr = cp.asarray([1, 2, 3])
# Default case
t = torch.as_tensor(cupy_arr)
# New behavior, same device as cupy_arr now, was cpu before
print(t.device) # cuda:0
# Explicitly set device
t = torch.as_tensor(cupy_arr, device='cpu')
print(t.device) # cpu
# Implicit default device
torch.set_default_device('cpu')
t = torch.as_tensor(cupy_arr)
print(t.device) # cpu
# Default device via context manager
torch.set_default_device('cuda')
with torch.device('cpu'):
t = torch.as_tensor(cupy_arr)
print(t.device) # cpu
# Unset default device
torch.set_default_device(None)
t = torch.as_tensor(cupy_arr)
# New behavior, same device as cupy_arr now, was cpu before
print(t.device) # cuda:0
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132595
Approved by: https://github.com/ezyang
Summary:
By default, performance tests (speedup experiments) will run the baseline and test backend alternately.
However, this does not work for the torchao backend, which changes the model in-place; therefore the baseline run will also run with the torchao backend, since the model has already been quantized.
Add a new experiment "latency_experiment" to run performance tests non-alternately (first run baseline for a few iterations, then run the test backend).
Other changes:
- Add torch.compiler.cudagraph_mark_step_begin() to avoid the slowdown from "Unable to hit fast path of CUDAGraphs because of pending, uninvoked backwards" (a sketch follows below).
- Update the torchao APIs to the current versions.
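A hedged sketch of that hint (illustrative model and loop, not from the benchmark code): marking each iteration boundary keeps CUDAGraphs on its fast path when there are pending, uninvoked backwards between runs.
```python
import torch

compiled = torch.compile(lambda x: (x * x).sum(), mode="reduce-overhead")
x = torch.randn(128, device="cuda", requires_grad=True)
for _ in range(3):
    torch.compiler.cudagraph_mark_step_begin()  # mark the iteration boundary
    compiled(x)
```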
X-link: https://github.com/pytorch/benchmark/pull/2394
Test Plan:
python run_benchmark.py torchao --only AlbertForMaskedLM --quantization noquant --performance --inference --bfloat16 --inductor-compile-mode max-autotune
python run_benchmark.py torchao --only BartForCausalLM --quantization noquant --performance --inference --bfloat16 --inductor-compile-mode max-autotune
python run_benchmark.py torchao --only timm_efficientnet --quantization noquant --performance --inference --bfloat16 --inductor-compile-mode max-autotune
(should all be ~1.0)
0.997x
1.006x
0.994x
Reviewed By: xuzhao9
Differential Revision: D60252821
Pulled By: HDCharles
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131935
Approved by: https://github.com/xuzhao9
Creates a new runtime that shifts complexity from runtime to
ahead-of-time.
The existing runtime (PipelineScheduleMulti) accepts a compute-only schedule (only forward, backward, and weight actions are specified) and infers the communication operations at runtime.
Compared to that runtime, PipelineScheduleRuntime has less logic that
happens at runtime and relies on lowering passes to transform the
compute-only schedule to add communications.
Advantages include
- easier to verify the correctness by dumping a compute+comm schedule
- possible to manually edit the compute+comm schedule if the lowering
heuristics are insufficient
Functionality included inside the PipelineScheduleRuntime is limited to
- accepting a compute-only schedule and lowering it to add comms (see the sketch after this list)
- executing the compute or comm operations specified by the given
schedule
- handling work.wait() automatically by calling it just before the
matching compute operation (for RECV ops) or at the end of step (for
SEND ops)
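As a rough illustration of the lowering idea (hypothetical names only; this is not the actual PipelineScheduleRuntime API), a compute-only action list could be expanded into a compute+comm list like this:
```python
from dataclasses import dataclass

@dataclass
class Action:
    kind: str        # "FORWARD", "BACKWARD", "SEND", or "RECV"
    microbatch: int

def lower_to_comms(compute_schedule, has_prev_stage, has_next_stage):
    """Insert the SEND/RECV actions implied by a compute-only schedule."""
    lowered = []
    for act in compute_schedule:
        if act.kind == "FORWARD":
            if has_prev_stage:
                lowered.append(Action("RECV", act.microbatch))  # activations from previous stage
            lowered.append(act)
            if has_next_stage:
                lowered.append(Action("SEND", act.microbatch))  # activations to next stage
        elif act.kind == "BACKWARD":
            if has_next_stage:
                lowered.append(Action("RECV", act.microbatch))  # gradients from next stage
            lowered.append(act)
            if has_prev_stage:
                lowered.append(Action("SEND", act.microbatch))  # gradients to previous stage
        else:
            lowered.append(act)  # e.g. weight-update actions pass through unchanged
    return lowered
```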
Follow ups for later PRs
- Some refactoring should be done to replace PipelineScheduleMulti with
this runtime
- Optimizer execution is not considered (e.g. for zero-bubble cases)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130488
Approved by: https://github.com/H-Huang
Summary:
PDB supports conditional breakpoints, but that ability doesn't work in a distributed environment. We can still get a conditional breakpoint by doing the following:
```
counter = 0  # module-level hit counter

# inside the code being debugged:
global counter
counter += 1
if counter > 100:
    dist.breakpoint()
```
This PR makes dist.breakpoint() support this feature as a syntax sugar.
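Hypothetical usage of the new syntax sugar (the argument name below is assumed, not confirmed by this summary):
```python
import torch.distributed as dist

# Only actually drop into the debugger after this call site has been hit 100 times.
dist.breakpoint(skip=100)
```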
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129511
Approved by: https://github.com/wconstab, https://github.com/c-p-i-o
Summary:
A couple of improvements to the generated comments in inductor kernels:
1. Makes the nodes in the comment topologically sorted, I think having them
alphabetically sorted is a gotcha. I was always confused on why the
sorting in the comments did not match the code.
2. Adds a printout of the aten graph fragment corresponding to the
current inductor kernel, to make it easier to map from aten
code to inductor code
Example float8-overhead-related inductor kernel comment after this PR:
```
# kernel path: /tmp/torchinductor_vasiliy/27/c27ts3rdw56ns7od5j6ovdnhxphished2lcu3adclzzixoo7khg5.py
# Source Nodes: [weight_fp8], Original ATen: [aten.mul, aten.clamp, aten._to_copy]
# Source node to ATen node mapping:
# weight_fp8 => clamp_max_1, clamp_min_3, convert_element_type_10, convert_element_type_11, convert_element_type_9, mul_3
# Graph fragment:
# %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%primals_2, %convert_element_type_8), kwargs = {})
# %convert_element_type_9 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_3, torch.float32), kwargs = {})
# %clamp_min_3 : [num_users=1] = call_function[target=torch.ops.aten.clamp_min.default](args = (%convert_element_type_9, -448.0), kwargs = {})
# %clamp_max_1 : [num_users=1] = call_function[target=torch.ops.aten.clamp_max.default](args = (%clamp_min_3, 448.0), kwargs = {})
# %convert_element_type_10 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%clamp_max_1, torch.bfloat16), kwargs = {})
# %convert_element_type_11 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%convert_element_type_10, torch.float8_e4m3fn), kwargs = {})
triton_poi_fused__to_copy_clamp_mul_5 = async_compile.triton('triton_', '''
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126698
Approved by: https://github.com/ezyang
ghstack dependencies: #126573
Summary:
Uses the `seq_nr` field (introduced to aot_autograd nodes in
https://github.com/pytorch/pytorch/pull/103129) to map the aot_autograd
fx bw nodes to the corresponding fw nodes, and copy the metadata over.
I am trusting the `seq_nr` mapping in the linked PR here. I did
some validation with a toy LLaMa 3 8b training run and the mapping seemed
correct.
I am also trusting that the forward is single threaded, since `seq_nr` is thread local. If this isn't always true, we'll need to also plumb `thread_id` through the same machinery which is populating `seq_nr`.
I'd like to use this data in a future PR to make inductor kernels easily
attributable to the nn.Module path in modeling land, to make it easier
to do performance debugging.
Test Plan:
```
// 1. unit test
python test/dynamo/test_aot_autograd.py -k test_aot_sequence_nr
// 2. manual test
// run LLaMa 3 8B fw + bw with torch.compile, print out the inductor graphs
// seen in `torch/_inductor/utils.py::get_kernel_metadata`, they seemed
// right to me.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126573
Approved by: https://github.com/ezyang, https://github.com/bdhirsh
I find myself occasionally trying to modify this to get additional debug info. Recompiling takes forever after modifying these lines, because the .h file is depended on by a huge number of files.
If we move this logic into a helper function and put it in the .cpp file, recompilation will be a lot faster when adding debug here.
Tested with a local DEBUG=1 build (which is needed to use `TORCH_SHOW_DISPATCH_TRACE=1`) and verified basic sanity - i.e. it still prints `[call]`, etc.
Differential Revision: [D60804331](https://our.internmc.facebook.com/intern/diff/D60804331)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132717
Approved by: https://github.com/soulitzer, https://github.com/bdhirsh
Overloads so that you can get more specific type info based on how you are indexing.
```python
from torch import nn
module_list = nn.ModuleList(32 * [nn.Linear(2, 2)])
# before:
reveal_type(module_list[0]) # Type of "module_list[0]" is "Module | ModuleList"
reveal_type(module_list[:1]) # Type of "module_list[: 1]" is "Module | ModuleList"
# now:
reveal_type(module_list[0]) # Type of "module_list[0]" is "Module"
reveal_type(module_list[:1]) # Type of "module_list[: 1]" is "ModuleList"
```
Co-authored-by: Skylion007 <Skylion007@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132834
Approved by: https://github.com/Skylion007, https://github.com/albanD
Move the slow test json into the pytorch/pytorch repo and add a job that updates it weekly. The job uses the same environment as the commit-hash update job and reuses similar code, but since much of that code is specific to hash updates, I only pulled out the parts that are relevant here.
Remove references to the old file and set up testing to read from the new file instead
The old update cadence was every day, the new one is every week
The auto slow test infra plus the lack of pinning between pytorch and test-infra make it really hard to tell whether a test started failing because of a change or because the slow test json changed. While this can have benefits, like disable test issues being effective everywhere immediately, it can also be very confusing, especially since we don't have the same insight into slow tests as we do for disable issues.
Example PR made: https://github.com/pytorch/pytorch/pull/132383 (with all the changes from this PR because it was working on top of this)
We should just get rid of this at some point in favor of the slowTest decorator, but there are some tests that take 5+ minutes to run and I don't want to track them down right now
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132379
Approved by: https://github.com/huydhn
As XPU became a PyTorch built-in device, the profiler support is indispensable part of functionality completeness. This PR is associated with the PR to introduce XPU profiler plugin into the kineto. When USE_XPU is enabled, the LIBKINETO_NOXPUPTI option will be suppressed accordingly, which allows kineto to build with XPU profiler plugin.
Associated PR to introduce kineto-based XPU profiler into kineto:
https://github.com/pytorch/kineto/pull/961
Also updates the Kineto Submodule to include XPU changes.
Co-authored-by: Aaron Enye Shi <enye.shi@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130811
Approved by: https://github.com/aaronenyeshi
**Summary**
1. change `compute_local_shape_and_global_offset` to correctly compute shape and offset for strided sharding placement (currently it only handles 2D and some 3D+ sharding).
2. Add a new property `num_shards_map` to `DTensorSpec` denoting how many shards each tensor dimension has. This is necessary for constructing `_StridedShard` placement when we call `distribute_tensor(dtensor_tp, dp_device_mesh, [Shard(0)])` and the `split_factor` argument will just be the number of shards on that sharding tensor dim.
**Test**
`test/distributed/_tensor/test_utils.py`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132391
Approved by: https://github.com/wanchaol
ghstack dependencies: #126697, #130239
**Summary**
This PR adds a new private placement type `_StridedShard` for FSDP2 + TP style tensor sharding. The previously used `Shard` placement type cannot produce correct `full_tensor()` result because it assumes the tensor to be first sharded over `dp` mesh dimension then `tp` mesh dimension which does not hold true in FSDP2 + TP case.
**Test**
`pytest test/distributed/_tensor/test_utils.py -s -k strided_sharding`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126697
Approved by: https://github.com/wanchaol
This tries to fix https://github.com/pytorch/pytorch/issues/120961.
This is a similar situation as https://github.com/pytorch/pytorch/pull/132116. The overlap tests were written strictly based on a precise calculation of what compute/communication should be non-overlapped vs. overlapped. This is done via `torch.cuda._sleep()`, which takes inputs in cycles, so we must convert from milliseconds to cycles via `get_cycles_per_ms()`, which is computed once and cached. Variation in CI can cause this `get_cycles_per_ms()` value to be inaccurate when the FSDP overlap tests run. Thus, we decide to relax the overlap tests to just make sure the overlapped runs are faster than a baseline without overlap.
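For reference, a small sketch of that conversion (assuming the internal helper keeps its current location in `torch.testing._internal.common_utils`):
```python
import torch
from torch.testing._internal.common_utils import get_cycles_per_ms  # computed once and cached

# Emulate roughly 25 ms of GPU work: torch.cuda._sleep() takes cycles, not time,
# so the tests convert milliseconds to cycles first.
torch.cuda._sleep(int(25 * get_cycles_per_ms()))
```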
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132869
Approved by: https://github.com/weifengpy
More context in [#132471](https://github.com/pytorch/pytorch/issues/132471) and https://github.com/pytorch/pytorch/issues/132366.
TLDR:
When cuda is available and users move tensors to cuda, we cannot really reuse the default pg if the default pg is gloo, as lots of collectives are not supported on gloo for cuda tensors. For example, `dtensor.full_tensor()` results in a mysterious SIGTERM when all_gather-ing a cuda tensor using gloo. Without the change in this PR, users would have to know this context and explicitly move the cuda tensor to cpu before invoking most collectives, which is not ideal UX.
Therefore, given most collectives are not supported on gloo for cuda tensors, we should init a new pg if the default pg is gloo when torch.cuda.is_available() and device_type is cuda.
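A hedged sketch of the resulting device-aware process-group selection (illustrative helper name; not the exact DTensor code):
```python
import torch
import torch.distributed as dist

def _get_collective_pg(device_type: str):
    default_pg = dist.group.WORLD
    if (
        device_type == "cuda"
        and torch.cuda.is_available()
        and dist.get_backend(default_pg) == "gloo"
    ):
        # gloo cannot run most collectives on CUDA tensors, so fall back to NCCL.
        return dist.new_group(backend="nccl")
    return default_pg
```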
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132709
Approved by: https://github.com/awgu, https://github.com/wanchaol
Fixes#10536
Reattempt of #61467. Thank you so much to @mskoh52 for your excellent work!
As I was trying to create a more efficient LLM data collator, I realized that `pad_sequence` only supports right padding, even though left padding is a very common format for LLMs, like Llama and Mistral.
The proposed alternative implementation was to use multiple flips, which tends to be 1.5x-2x slower. Instead we can add a [`padding_side` parameter, as there is for Hugging Face tokenizers](https://github.com/huggingface/transformers/blob/9d6c0641c4/src/transformers/tokenization_utils_base.py#L1565), which requires only a very small change in the C++ code.
Here are the benchmarks of the new implementation!
Benchmark plots comparing the new native left padding to the flip-based approach for `float32` and `bool` inputs (images omitted); the raw timings appear in the comments at the end of the script below.
Code:
```python
from __future__ import annotations
import random
import time
from typing import Literal
import numpy as np
import torch
def pad_sequence_with_flips(
    sequences: list[torch.Tensor],
    batch_first: bool = False,
    padding_value: int | float | bool = 0.0,
    padding_side: Literal["left", "right"] | str = "left",
) -> torch.Tensor:
    if padding_side == 'right':
        padded_sequence = torch._C._nn.pad_sequence([t.flatten() for t in sequences], batch_first=batch_first, padding_value=padding_value)
    elif padding_side == 'left':
        padded_sequence = torch._C._nn.pad_sequence([t.flatten().flip(0) for t in sequences], batch_first=batch_first, padding_value=padding_value)  # pyright: ignore[reportArgumentType]
        padded_sequence = padded_sequence.flip(int(batch_first))
    else:
        raise ValueError(f"padding_side should be either 'right' or 'left', but got {padding_side}")
    return padded_sequence
sequence_lengths: list[int] = []
flip_left_pad_times: list[float] = []
flip_left_pad_times_std: list[float] = []
left_pad_times: list[float] = []
left_pad_times_std: list[float] = []
RUNS_PER_LOOP: int = 100
for i in range(1, 7):
    sequence_length = i * int(1e6) // 6
    sequence_lengths.append(sequence_length)
    sequences = [torch.randint(0, 2, (random.randint(1, sequence_length),), dtype=torch.bool) for _ in range(64)]
    inner_left_pad_times: list[float] = []
    inner_right_pad_times: list[float] = []
    inner_flip_left_pad_times: list[float] = []
    inner_flip_right_pad_times: list[float] = []
    for _ in range(RUNS_PER_LOOP):
        start = time.perf_counter()
        torch._C._nn.pad_sequence(sequences, batch_first=True, padding_value=False, padding_side="left")
        end = time.perf_counter()
        inner_left_pad_times.append(end - start)
        start = time.perf_counter()
        pad_sequence_with_flips(sequences, batch_first=True, padding_value=False, padding_side="left")
        end = time.perf_counter()
        inner_flip_left_pad_times.append(end - start)
    left_pad_times.append(sum(inner_left_pad_times) / len(inner_left_pad_times))
    left_pad_times_std.append(np.std(inner_left_pad_times))
    flip_left_pad_times.append(sum(inner_flip_left_pad_times) / len(inner_flip_left_pad_times))
    flip_left_pad_times_std.append(np.std(inner_flip_left_pad_times))
    print(f"Sequence Length: {sequence_length}, Left Pad Time: {left_pad_times[-1]}, Left with Flips Pad Time: {flip_left_pad_times[-1]}")
import matplotlib.pyplot as plt
plt.plot(sequence_lengths, left_pad_times, label="new pad_sequence left")
plt.scatter(sequence_lengths, left_pad_times)
plt.errorbar(sequence_lengths, left_pad_times, yerr=left_pad_times_std, linestyle='None', marker='^')
plt.plot(sequence_lengths, flip_left_pad_times, label="old pad_sequence left (2 flips)")
plt.scatter(sequence_lengths, flip_left_pad_times)
plt.errorbar(sequence_lengths, flip_left_pad_times, yerr=flip_left_pad_times_std, linestyle='None', marker='^')
plt.xlabel("Sequence Length")
plt.ylabel("Time (s)")
plt.legend(loc="upper right")
# Sequence Length: 166666, Left Pad Time: 0.06147645162009212, Left with Flips Pad Time: 0.09842291727001794
# Sequence Length: 333333, Left Pad Time: 0.08933195920990329, Left with Flips Pad Time: 0.15597836187991562
# Sequence Length: 500000, Left Pad Time: 0.08863158334006585, Left with Flips Pad Time: 0.15224887342999863
# Sequence Length: 666666, Left Pad Time: 0.10524682551997103, Left with Flips Pad Time: 0.18177212480995877
# Sequence Length: 833333, Left Pad Time: 0.11801802741003485, Left with Flips Pad Time: 0.20821274195001024
# Sequence Length: 1000000, Left Pad Time: 0.131894061660023, Left with Flips Pad Time: 0.23223503091008751
```
Co-authored-by: mskoh52 <mskoh52@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131884
Approved by: https://github.com/ezyang
This PR does 3 things:
1. Adds a copy-free strided->jagged layout conversion for NT
2. Adds a copy-free jagged->strided layout conversion for NT
3. Modifies and expands the .to() API to support the layout argument for the specific case of NT layout conversion (illustrated in the sketch below).
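As a hedged illustration of item 3 (the exact call shape follows this PR's description and may not match a stable public API):
```python
import torch

nt_strided = torch.nested.nested_tensor(
    [torch.randn(2, 4), torch.randn(3, 4)], layout=torch.strided
)
nt_jagged = nt_strided.to(layout=torch.jagged)   # strided -> jagged, copy-free per this PR
back = nt_jagged.to(layout=torch.strided)        # jagged -> strided, copy-free per this PR
```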
Pull Request resolved: https://github.com/pytorch/pytorch/pull/115749
Approved by: https://github.com/jbschlosser
Fixes#132290
This PR attempts a more invasive / complete solution than the one from #132338, which removes immediate tensor fields from the `tensor_dict` copy stored in node meta. The approach taken here is to store only those fields of the `tensor_dict` which are absolutely utilized somewhere else.
So far, this appears to be limited to:
* `_dynamo_static_input_type`
* `tag` (at least in the tests). Discussion at #94080 appears to indicate this is depended on for export
(CI may point out more)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132805
Approved by: https://github.com/mlazos
The goal of this PR is to avoid stack overflow when we create extremely long chains of thunks, and then evaluate them (e.g., as occurs if you sum(long list of symint)). The basic idea behind this PR is to only thunkify proxies if they're being created in places where they may or may not be used--crucially, symint operations that occur in user code we are tracing are eagerly placed into the graph, even if they may eventually be dead.
I annotated the PR with explanation of changes.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132421
Approved by: https://github.com/Skylion007, https://github.com/zou3519
ghstack dependencies: #132674, #132675
Previously, when we slice out a submesh from a mesh, we assign the mesh as the parent mesh of the submesh. In this case, when we have a 3D mesh topology, the parent mesh of a 1D mesh sliced out from the 3D mesh is different from the parent mesh of the same 1D mesh sliced out from the 2D submesh of the 3D mesh. For example:
```
mesh_3d = init_device_mesh("cuda", (2,2,2), ("dim0", "dim1", "dim2"))
mesh_dim0 = mesh_3d["dim0"]
mesh_2d = mesh_3d["dim0", "dim1"]
mesh_dim0_2 = mesh_2d["dim0"]
# This would evaluate to be True
print(_mesh_resources.get_parent_mesh(mesh_dim0) != _mesh_resources.get_parent_mesh(mesh_dim0_2))
```
We can always reconstruct the mesh needed from the mesh dim names, as long as two dims come from the same root. For simplicity, we do not see the necessity of building a tree structure to represent child-parent relationship. Therefore, we are replacing the parent mesh concept with a root mesh concept in `_MeshEnv` so we would have:
```
mesh_3d = init_device_mesh("cuda", (2,2,2), ("dim0", "dim1", "dim2"))
mesh_dim0 = mesh_3d["dim0"]
mesh_2d = mesh_3d["dim0", "dim1"]
mesh_dim0_2 = mesh_2d["dim0"]
# This would evaluate to be True
print(_mesh_resources.get_root_mesh(mesh_dim0) == _mesh_resources.get_root_mesh(mesh_dim0_2))
```
With this change, we will have two types of meshes in an environment.
1. `device_mesh != _mesh_resources.get_root_mesh(device_mesh)` means that the device_mesh is created by slicing.
2. `device_mesh == _mesh_resources.get_root_mesh(device_mesh)` means that the device_mesh is a root mesh not created through slicing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132339
Approved by: https://github.com/wanchaol
ghstack dependencies: #132310, #132311
Summary: We observe that a stack node can be transformed into a cat node to eliminate split nodes, which can further enable the unbind-cat optimization, so we add a more advanced pattern to do this graph transformation
Test Plan:
# unit test
```
CUDA_VISIBLE_DEVICES=3 OC_CAUSE=1 buck2 test //caffe2/test/inductor:split_cat_fx_passes
```
Buck UI: https://www.internalfb.com/buck2/de6c1cda-3d74-4a30-8980-7b209b6fe5dc
Test UI: https://www.internalfb.com/intern/testinfra/testrun/12103424042268125
Network: Up: 485KiB Down: 728KiB (reSessionID-2f2c01c3-79bb-4e37-b5be-fb77ec09b264)
Jobs completed: 29. Time elapsed: 5:19.8s.
Cache hits: 0%. Commands: 4 (cached: 0, remote: 0, local: 4)
Tests finished: Pass 9. Fail 0. Fatal 0. Skip 1. Build failure 0
# benchmark
```
CUDA_VISIBLE_DEVICES=3 OC_CAUSE=1 buck2 run mode/opt //scripts/jackiexu0313/pt2:local_model_with_pt2 -- --test_mode batch-split --model_type "ig_ctr" --flow_id 584880697
```
P1503698962
before and after graph transformation
https://www.internalfb.com/intern/diffing/?paste_number=1504050718
Differential Revision: D60411560
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132542
Approved by: https://github.com/jackiexu1992
Summary:
- We add Inductor logs for what tensors we tried to reinplace, what
tensors we were unable to reinplace, and of those tensors, which of
those might be bugs (the "missed reinplacing opportunities"). You can
tell this by reading the Inductor output graph but the logs make it
easier to figure out.
- Add a dynamo_compile counter for missed reinplacing opportunities. The
goal is to see how widespread existing problems (if any) are. We've had
trouble getting all of the edge cases for the reinplacing pass; the
counter will help us hunt down issues.
Test Plan:
- tested locally
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132758
Approved by: https://github.com/eellison
Summary:
- make default DCE pass check schema,
- need to rebase onto https://github.com/pytorch/pytorch/pull/131651 after it's in phabricator (for now the change is manually added).
- mark Proxy dump as NotImplemented for better error msg
- Remove Proxy from tensors when dumping models, as Proxy cannot be dumped.
More details in https://docs.google.com/document/d/1G5vmTXjzxoyVGRI2kpA1gQukK_Glyg2NrE0Oh6Nlg9A/edit?usp=sharing.
Test Plan:
CI
```
- buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r qat_conv2d
- test_export.py
- buck2 run 'fbcode//mode/dev-nosan' fbcode//modai/test:test_modai -- -r test_qat_stinson_htp_export
- buck2 run 'fbcode//mode/dev-nosan' fbcode//vizard_projects/ml_depth/tests:test_model -- -r test_qat_model_et
- buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test:fx -- -r dce
- buck2 run 'fbcode//mode/dev-nosan' fbcode//bolt/nn/executorch/backends/tests:qnn_test -- -r test_qat_bias=False,use_3d_input=False
- buck2 run 'fbcode//mode/dev-nosan' fbcode//bolt/nn/executorch/backends/tests:qnn_test -- -r test_qat_bias=True,use_3d_input=False
- buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_fold_bn_erases_bn_node
```
Reviewed By: angelayi
Differential Revision: D60319175
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132764
Approved by: https://github.com/angelayi
This PR makes sure all current tests in the sparsity export test suite pass. Note that there will probably be anecdotal cases that need fixing after this, but the general idea of preserving sparsity metadata has been completed.
Fixes: https://github.com/pytorch/pytorch/issues/117188
```
$ PYTORCH_TEST_WITH_DYNAMO=0 python test/export/test_sparse.py
........................................................................................................................................................
----------------------------------------------------------------------
Ran 152 tests
OK
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132690
Approved by: https://github.com/ezyang
Bumps [rexml](https://github.com/ruby/rexml) from 3.2.8 to 3.3.3.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/ruby/rexml/releases">rexml's releases</a>.</em></p>
<blockquote>
<h2>REXML 3.3.3 - 2024-08-01</h2>
<h3>Improvements</h3>
<ul>
<li>
<p>Added support for detecting invalid XML that has unsupported
content before root element</p>
<ul>
<li><a href="https://redirect.github.com/ruby/rexml/issues/184">GH-184</a></li>
<li>Patch by NAITOH Jun.</li>
</ul>
</li>
<li>
<p>Added support for <code>REXML::Security.entity_expansion_limit=</code> and
<code>REXML::Security.entity_expansion_text_limit=</code> in SAX2 and pull
parsers</p>
<ul>
<li><a href="https://redirect.github.com/ruby/rexml/issues/187">GH-187</a></li>
<li>Patch by NAITOH Jun.</li>
</ul>
</li>
<li>
<p>Added more tests for invalid XMLs.</p>
<ul>
<li><a href="https://redirect.github.com/ruby/rexml/issues/183">GH-183</a></li>
<li>Patch by Watson.</li>
</ul>
</li>
<li>
<p>Added more performance tests.</p>
<ul>
<li>Patch by Watson.</li>
</ul>
</li>
<li>
<p>Improved parse performance.</p>
<ul>
<li><a href="https://redirect.github.com/ruby/rexml/issues/186">GH-186</a></li>
<li>Patch by tomoya ishida.</li>
</ul>
</li>
</ul>
<h3>Thanks</h3>
<ul>
<li>
<p>NAITOH Jun</p>
</li>
<li>
<p>Watson</p>
</li>
<li>
<p>tomoya ishida</p>
</li>
</ul>
<h2>REXML 3.3.2 - 2024-07-16</h2>
<h3>Improvements</h3>
<ul>
<li>
<p>Improved parse performance.</p>
<ul>
<li><a href="https://redirect.github.com/ruby/rexml/issues/160">GH-160</a></li>
<li>Patch by NAITOH Jun.</li>
</ul>
</li>
<li>
<p>Improved parse performance.</p>
<ul>
<li><a href="https://redirect.github.com/ruby/rexml/issues/169">GH-169</a></li>
<li><a href="https://redirect.github.com/ruby/rexml/issues/170">GH-170</a></li>
<li><a href="https://redirect.github.com/ruby/rexml/issues/171">GH-171</a></li>
<li><a href="https://redirect.github.com/ruby/rexml/issues/172">GH-172</a></li>
<li><a href="https://redirect.github.com/ruby/rexml/issues/173">GH-173</a></li>
<li><a href="https://redirect.github.com/ruby/rexml/issues/174">GH-174</a></li>
<li><a href="https://redirect.github.com/ruby/rexml/issues/175">GH-175</a></li>
<li><a href="https://redirect.github.com/ruby/rexml/issues/176">GH-176</a></li>
</ul>
</li>
</ul>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a href="https://github.com/ruby/rexml/blob/master/NEWS.md">rexml's changelog</a>.</em></p>
<blockquote>
<h2>3.3.3 - 2024-08-01 {#version-3-3-3}</h2>
<h3>Improvements</h3>
<ul>
<li>
<p>Added support for detecting invalid XML that has unsupported
content before root element</p>
<ul>
<li><a href="https://redirect.github.com/ruby/rexml/issues/184">GH-184</a></li>
<li>Patch by NAITOH Jun.</li>
</ul>
</li>
<li>
<p>Added support for <code>REXML::Security.entity_expansion_limit=</code> and
<code>REXML::Security.entity_expansion_text_limit=</code> in SAX2 and pull
parsers</p>
<ul>
<li><a href="https://redirect.github.com/ruby/rexml/issues/187">GH-187</a></li>
<li>Patch by NAITOH Jun.</li>
</ul>
</li>
<li>
<p>Added more tests for invalid XMLs.</p>
<ul>
<li><a href="https://redirect.github.com/ruby/rexml/issues/183">GH-183</a></li>
<li>Patch by Watson.</li>
</ul>
</li>
<li>
<p>Added more performance tests.</p>
<ul>
<li>Patch by Watson.</li>
</ul>
</li>
<li>
<p>Improved parse performance.</p>
<ul>
<li><a href="https://redirect.github.com/ruby/rexml/issues/186">GH-186</a></li>
<li>Patch by tomoya ishida.</li>
</ul>
</li>
</ul>
<h3>Thanks</h3>
<ul>
<li>
<p>NAITOH Jun</p>
</li>
<li>
<p>Watson</p>
</li>
<li>
<p>tomoya ishida</p>
</li>
</ul>
<h2>3.3.2 - 2024-07-16 {#version-3-3-2}</h2>
<h3>Improvements</h3>
<ul>
<li>
<p>Improved parse performance.</p>
<ul>
<li><a href="https://redirect.github.com/ruby/rexml/issues/160">GH-160</a></li>
<li>Patch by NAITOH Jun.</li>
</ul>
</li>
<li>
<p>Improved parse performance.</p>
<ul>
<li><a href="https://redirect.github.com/ruby/rexml/issues/169">GH-169</a></li>
<li><a href="https://redirect.github.com/ruby/rexml/issues/170">GH-170</a></li>
<li><a href="https://redirect.github.com/ruby/rexml/issues/171">GH-171</a></li>
<li><a href="https://redirect.github.com/ruby/rexml/issues/172">GH-172</a></li>
<li><a href="https://redirect.github.com/ruby/rexml/issues/173">GH-173</a></li>
<li><a href="https://redirect.github.com/ruby/rexml/issues/174">GH-174</a></li>
<li><a href="https://redirect.github.com/ruby/rexml/issues/175">GH-175</a></li>
</ul>
</li>
</ul>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="e4a067e112"><code>e4a067e</code></a> Add 3.3.3 entry</li>
<li><a href="17ff3e7874"><code>17ff3e7</code></a> test: add a performance test for attribute list declaration</li>
<li><a href="be86b3de0a"><code>be86b3d</code></a> test: fix wrong test name</li>
<li><a href="b93d790b36"><code>b93d790</code></a> test: use double quote for string literal</li>
<li><a href="0fbe7d5a0e"><code>0fbe7d5</code></a> test: don't use abbreviated name</li>
<li><a href="1599e8785f"><code>1599e87</code></a> test: add a performance test for PI with many tabs</li>
<li><a href="e2546e6eca"><code>e2546e6</code></a> parse pi: improve invalid case detection</li>
<li><a href="73661ef281"><code>73661ef</code></a> test: fix a typo</li>
<li><a href="850488abf2"><code>850488a</code></a> test: use double quote for string literal</li>
<li><a href="46c6397d5c"><code>46c6397</code></a> test: add performance tests for entity declaration</li>
<li>Additional commits viewable in <a href="https://github.com/ruby/rexml/compare/v3.2.8...v3.3.3">compare view</a></li>
</ul>
</details>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132469
Approved by: https://github.com/ezyang
Summary:
## Why
utils.checkpoint doesn't support meta device:
```
File "/Users/lyu1/torchdev/lib/python3.9/site-packages/torch/utils/checkpoint.py", line 490, in checkpoint
next(gen)
File "/Users/lyu1/torchdev/lib/python3.9/site-packages/torch/utils/checkpoint.py", line 1359, in _checkpoint_without_reentrant_generator
device_module = _get_device_module(device)
File "/Users/lyu1/torchdev/lib/python3.9/site-packages/torch/utils/checkpoint.py", line 98, in _get_device_module
device_module = getattr(torch, device)
File "/Users/lyu1/torchdev/lib/python3.9/site-packages/torch/__init__.py", line 1938, in __getattr__
raise AttributeError(f"module '{__name__}' has no attribute '{name}'")
AttributeError: module 'torch' has no attribute 'meta'
```
This blocks us from running model with checkpoint enabled in meta mode.
## What
This diff handles the case of meta device in checkpoint.py.
(In checkpoint.py, the device module is mainly used when preserve_rng_state=True, which doesn't apply to the meta case. So a more elegant fix might be to set preserve_rng_state=False when detecting that the args are on a meta device, but I didn't find where to do this check in a minimal way. Let me know if you have ideas.)
Test Plan: Tested with toy model which has checkpoint on its module: P1513716944
Differential Revision: D60749427
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132684
Approved by: https://github.com/kit1980
This extends the runner determinator to support opt-in keywords that provide additional options when determining which systems to run jobs on. This enables us to support users opting in to Amazon Linux 2023.
This change creates a generic get_optin_feature() which hopefully will
be useful to handle additional future features that we might want to
experiment with.
This change has kept backwards compatibility with the existing issue
userlist format and adds support for the comma-separated list of users
in a backwards compatible way.
The user list has the following rules:
- Users are GitHub usernames with the @ prefix
- If the first line is a "*" then all users will use the new runners
- If the first line is a "!" then all users will use the old runners
- Each user is also a comma-separated list of features/experiments to enable
- A "#" prefix indicates the user is opted out of the new runners but is opting
into features/experiments.
Example user list:
```
@User1
@User2,amz2023
#@UserOptOutOfNewRunner,amz2023
```
This closes pytorch/ci-infra#249.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131792
Approved by: https://github.com/jeanschmidt, https://github.com/ZainRizvi
The regression from https://github.com/pytorch/pytorch/issues/132281 pinpoints e4ace1a396 as the cause. The main delta that commit introduces is that we now manually check `is_inference()` and call `increment_version()` (a pybind call) on every mutated input tensor to the graph.
This PR attempts to reduce overhead a bit by bundling up all of those checks into a single pybind call, by:
(1) updating `torch.autograd.graph.increment_version()` to accept a `Union[Tensor, List[Tensor]]`
(2) updating its semantics to no-op if you pass in a tensor with no version counter, instead of erroring (example below)
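A small sketch of the updated API (based on points (1) and (2) above):
```python
import torch
from torch.autograd.graph import increment_version

t1, t2 = torch.randn(3), torch.randn(3)
# One bundled pybind call instead of one per tensor; per this PR, tensors without
# a version counter (e.g. inference tensors) are skipped instead of raising.
increment_version([t1, t2])
```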
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132652
Approved by: https://github.com/albanD
Summary:
Fix exportdb test for tensor_setattr.
copy.deepcopy(inputs) can fail if tensor inputs have attributes (i.e. a non-empty `__dict__`). We remove them before the deepcopy (see the sketch after the traceback below).
Before the fix, we have
```
inputs[0].__dict__
{'attr': FakeTensor(..., size=(3, 2))}
```
the test errors out with
```
======================================================================
ERROR: test_exportdb_supported_case_tensor_setattr (caffe2.test.export.test_serialize.TestDeserialize)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/a915c8ae5cba5b70/caffe2/test/__test_export__/test_export#link-tree/torch/testing/_internal/common_utils.py", line 529, in instantiated_test
test(self, **param_kwargs)
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/a915c8ae5cba5b70/caffe2/test/__test_export__/test_export#link-tree/caffe2/test/export/test_serialize.py", line 878, in test_exportdb_supported
self.check_graph(model, case.example_args, _check_meta=_check_meta)
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/a915c8ae5cba5b70/caffe2/test/__test_export__/test_export#link-tree/caffe2/test/export/test_serialize.py", line 548, in check_graph
_check_graph(pre_dispatch=True)
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/a915c8ae5cba5b70/caffe2/test/__test_export__/test_export#link-tree/caffe2/test/export/test_serialize.py", line 506, in _check_graph
copy.deepcopy(inputs),
File "/usr/local/fbcode/platform010/lib/python3.10/copy.py", line 146, in deepcopy
y = copier(x, memo)
File "/usr/local/fbcode/platform010/lib/python3.10/copy.py", line 211, in _deepcopy_tuple
y = [deepcopy(a, memo) for a in x]
File "/usr/local/fbcode/platform010/lib/python3.10/copy.py", line 211, in <listcomp>
y = [deepcopy(a, memo) for a in x]
File "/usr/local/fbcode/platform010/lib/python3.10/copy.py", line 153, in deepcopy
y = copier(memo)
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/a915c8ae5cba5b70/caffe2/test/__test_export__/test_export#link-tree/torch/_tensor.py", line 206, in __deepcopy__
new_tensor.__dict__ = deepcopy(self.__dict__, memo)
File "/usr/local/fbcode/platform010/lib/python3.10/copy.py", line 146, in deepcopy
y = copier(x, memo)
File "/usr/local/fbcode/platform010/lib/python3.10/copy.py", line 231, in _deepcopy_dict
y[deepcopy(key, memo)] = deepcopy(value, memo)
File "/usr/local/fbcode/platform010/lib/python3.10/copy.py", line 153, in deepcopy
y = copier(memo)
File "/data/users/shangdiy/fbsource/buck-out/v2/gen/fbcode/a915c8ae5cba5b70/caffe2/test/__test_export__/test_export#link-tree/torch/_tensor.py", line 108, in __deepcopy__
or (type(self) is not Tensor and self.data_ptr() == 0)
RuntimeError: Cannot access data pointer of Tensor (e.g. FakeTensor, FunctionalTensor). If you're using torch.compile/export/fx, it is likely that we are erroneously tracing into a custom kernel. To fix this, please wrap the custom kernel into an opaque custom op. Please see the following for details: https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html
```
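A hedged sketch of the fix (illustrative helper; not the exact test code):
```python
import copy
import torch

def deepcopy_inputs(inputs):
    # Drop tensor attributes (the tensor __dict__) before deepcopy, since deepcopying
    # a FakeTensor stored there trips the data_ptr check shown in the traceback above.
    for inp in inputs:
        if isinstance(inp, torch.Tensor):
            inp.__dict__.clear()
    return copy.deepcopy(inputs)
```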
Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test:test_export -- -r test_exportdb_supported_case_tensor_setattr
```
Differential Revision: D60610860
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132678
Approved by: https://github.com/zhxchen17
Combines contributions from https://github.com/pytorch/pytorch/pull/130505
Some context can be found in this large comment block:
a5b64d39fd/test/dynamo/test_subclasses.py (L1667-L1681)
Changes in this PR
- For each tensor fakified, check the nested int registry in eager, and eagerly symbolicize if that tensor has already been associated with nested int in eager.
- Adds a separate counter stored on FakeTensorMode as a fake analog to _tensor_id_counter (which keeps track of unique tensors). This counter is initialized to the global eager tensor id counter upon creation of the FakeTensorMode, and needs to be reset when the same FakeTensorMode is reused to trace again (in this PR, we piggyback on the epoch incrementing logic).
- (refactor) Today, we store FakeTensor -> symbolic nested int in the global registry. With this PR, symbolic nested int is stored directly on the FakeTensor. (Eager still caches nested int in the registry, though we should avoid this at some point.)
Basically unchanged, but worth noting:
- `__tensor_unflatten__` is still responsible for determining whether we should cache for now. The logic is somewhat simplified.
- to_copy is still using the trick of updating two different tensors in the registry to point to the same nested int. This is kind of broken, but we try to leave it as is, and plan a better fix with the UnionFind stack.
Differential Revision: [D60406772](https://our.internmc.facebook.com/intern/diff/D60406772)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130292
Approved by: https://github.com/bdhirsh
ghstack dependencies: #131916, #131803
Rewrite of original PR in https://github.com/pytorch/pytorch/pull/130291
To answer review comments from https://github.com/pytorch/pytorch/pull/130291#pullrequestreview-2166671953:
> At a higher level, do we need this?
Today, this should not change the behavior of anything. But an invariant of "same tensor always corresponds to the same FakeTensor" is nice (from discussion with @bdhirsh).
> Why does this happen?
Today, both dynamo and meta_utils do some recursion when it comes to FakeTensors. So whenever we fakify a subclass, the process would roughly like:
```
wrap_to_fake (subclass)
meta_utils (subclass)
meta_utils (values) -> not cached because we use callback
meta_utils(offsets) -> not cached because we use callback
wrap_to_fake (values)
wrap_to_fake (offsets) -> cached because we rely on top-level meta_utils
```
However, we know that:
- Caching only occurs at the top-level of meta_utils.
- The return value of the top-level wrap_to_fake is returned.
This means that after all of this:
- The fakified subclass holds inner FakeTensors that are NOT part of the cache
- values/offsets are Fakified a second time, and those instances are cached.
Differential Revision: [D60406773](https://our.internmc.facebook.com/intern/diff/D60406773)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131803
Approved by: https://github.com/ezyang
ghstack dependencies: #131916
Instead of having a separate context variable for SymDispatchMode, we
now simply delegate to the current active proxy tensor mode when we
need to trace a SymInt. We maintain a separate `__sym_dispatch__` magic
method as the calling convention is different than `__torch_dispatch__`.
Consolidating the modes in this ways means that we can consistently
disable both of these modes in tandem simply by removing the mode
from the proxy mode infra slot.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132674
Approved by: https://github.com/zou3519, https://github.com/bdhirsh
https://github.com/pytorch/pytorch/pull/130775 recently killed forced specializations for export on complex guards, so the only way we now get a specialized value is if we're able to solve for it. For example, if we have guards `s0 * 2 = s1`, `s0 + 6 = s1`, we specialize `s0 = 6; s1 = 12`.
That might look like this:
```
class Foo(torch.nn.Module):
    def forward(self, x, y):
        return x.reshape([-1]) + y

dy = Dim("dy", min=6)
x, y = torch.randn(6, 2), torch.randn(12)
dynamic_shapes = {
    "x": (dy - 6, 2),
    "y": (dy,),
}
```
Our current error message is:
`{symbol} must be specialized to {value} because the guards generated for it are too complex`
This is now misleading, so we change it to:
`solving the guards generated for {symbol} resulted in a specialized value of {value}`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132698
Approved by: https://github.com/avikchaudhuri
I found that when using TorchDynamo (torch.compile) with dynamic shapes on H100, some extra guards are added to check that the sequence length of the inputs to `scaled_dot_product_attention` is divisible by 64. These guards cause unwanted recompilations when the input shape changes.
In fact, these guards are not necessary if the cuDNN version is high enough, so I reorder those checks to take advantage of short-circuit evaluation, skipping them and avoiding the unnecessary guards.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132384
Approved by: https://github.com/eqy, https://github.com/Skylion007
Summary: Previously, when folding BN into conv, we rely on DCE
to clean up the unused BN node from the graph. This works if
the model is already in eval mode, but fails if the model is
still in train mode because DCE doesn't remove nodes with
potential side effects (in this case `_native_batch_norm_legit`).
This required users to move the model to eval mode before calling
convert in order to get a properly DCE'd graph.
To solve this, we manually erase the BN node after folding
instead of relying on DCE. This relaxes the ordering constraints
between `move_exported_model_to_eval` and `convert_pt2e`.
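A minimal FX-style sketch of the manual erasure described above (illustrative node handling; not the exact convert_pt2e code):
```python
import torch.fx

def erase_folded_bn(graph: torch.fx.Graph, bn_node: torch.fx.Node, conv_node: torch.fx.Node) -> None:
    # _native_batch_norm_legit returns a tuple, so its users are typically getitem nodes;
    # rewire them to the (already folded) conv output, then drop the BN node explicitly
    # rather than waiting for DCE, which skips nodes with potential side effects.
    for user in list(bn_node.users):
        user.replace_all_uses_with(conv_node)
        graph.erase_node(user)
    graph.erase_node(bn_node)
```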
Test Plan:
python test/test_quantization.py TestQuantizePT2EQAT_ConvBn1d.test_fold_bn_erases_bn_node
python test/test_quantization.py TestQuantizePT2EQAT_ConvBn2d.test_fold_bn_erases_bn_node
Reviewers: jerryzh168, yushangdi
Subscribers: jerryzh168, yushangdi, supriyar
Differential Revision: [D60520149](https://our.internmc.facebook.com/intern/diff/D60520149)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131651
Approved by: https://github.com/yushangdi, https://github.com/leslie-fang-intel
`torch.cuda.memory.mem_get_info` allows device strings given the current type hints. However, `device = torch.device('cuda')` leads to `device.index = None`, which results in downstream problems. Setting `optional=True` will insert the default device index in such cases.
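A small example of the now-working call (sketch; behavior per this fix):
```python
import torch

# With optional=True on the device argument, an index-less device now resolves to
# the current device instead of propagating index=None downstream.
free_bytes, total_bytes = torch.cuda.mem_get_info(torch.device("cuda"))
```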
Fixes#132583
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132616
Approved by: https://github.com/soulitzer
Summary: When the preprocessor check is not satisfied, we leave an unused constexpr around, so when `-Wunused-const-variable` is enabled we get an error. Let's inline these values, since they're not used anywhere else, in order to avoid this.
Test Plan: CI
Differential Revision: D60723823
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132582
Approved by: https://github.com/houseroad
Preventative fix for a test failure with the oneDNN v3.5 upgrade, where the order of float32 arithmetic may change in torch.addmm (the bias term can be at the start or end of the arithmetic), resulting in slightly different output due to float32 precision loss.
Replaced occurrences of torch.allclose with ~~torch._dynamo.testing.same~~ torch.testing.assert_close, which is the recommended approach as per https://github.com/pytorch/pytorch/issues/56544; its default tolerance is more relaxed than torch.allclose, which lets the test pass with the upcoming oneDNN change.
This should fix aarch64 ci failures in #129932
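For illustration, the shape of the replacement (standard PyTorch testing API):
```python
import torch

a = torch.randn(8, 8, dtype=torch.float32)
b = (a + 1.0) - 1.0  # same values up to float32 rounding / operation ordering

# Previously the tests asserted torch.allclose(a, b); the recommended helper below
# uses dtype-aware default tolerances and reports a descriptive error on mismatch.
torch.testing.assert_close(a, b)
```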
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130618
Approved by: https://github.com/jgong5, https://github.com/malfet
We provide an API for user to add ephemeral timeout across all PGs within one rank and the timeout will reset when the first collective issued after the timeout added finishes.
Each extension only covers collectives issued after the timeout is added and before the first such collective finishes. The diagram below shows how the timeout changes:
<img width="1174" alt="image" src="https://github.com/user-attachments/assets/354923b7-581c-40de-ae0f-1cd3da273ccc">
While this feature provides flexibility in specific scenarios, it introduces statefulness to timeout setting. Therefore, it is advisable to use this API sparingly and consider alternative approaches, such as directly setting the timeout or utilizing a barrier collective (one can set any timeout to the barrier), whenever feasible.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130905
Approved by: https://github.com/ezyang
The current AOTI model runner supports CUDA and CPU. However, for a particular out-of-tree backend, it is not easy to add support for this feature.
This PR intends to provide a registration mechanism to support this case by providing two functions: `RegisterAOTIModelRunner` and `getAOTIModelRunnerRegistry`.
- `RegisterAOTIModelRunner` is used to register a function (`AOTIModelRunnerABC`) to create an `AOTIModelContainerRunner`. The function signature is as follows.
```C++
using AOTIModelRunnerABC = std::shared_ptr<AOTIModelContainerRunner> (*)(
    const std::string& model_so_path,
    size_t num_models,
    const std::string& device_str,
    const std::string& bin_dir);
```
- `getAOTIModelRunnerRegistry` is used to get all the registered backends.
A new backend needs to define its own `AOTIModelContainerRunner` class and then register an `AOTIModelRunnerABC` function with `aoti` to create its `AOTIModelContainerRunner`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131638
Approved by: https://github.com/desertfire, https://github.com/jansel
`return_and_correct_aliasing` is used by FunctionalTensor today to ensure that when we call view/inplace ops, the input and output `FunctionalTensors` share the same storage.
This was previously done with a dispatcher call to `aten.set_`. In this PR I swap it out with a util that just manually does the storage swap. Benefits:
(1) we know this is safe in the specific way it is used by FunctionalTensor: avoiding the extra assertions in `aten.set_` is necessary to avoid some unbacked symint errors
(2) this should improve compile times a bit
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132524
Approved by: https://github.com/ezyang
ghstack dependencies: #132243, #132337, #132322