As the title says: when testing an internal case, we found that we produce very similar error output when certain ranks do not join a collective. This is because we didn't put all ranks into `candidate_ranks`, so they didn't get wiped out from entries and get checked again.
Ideally, for the given case, we should report this as an out-of-order case, because ranks 0 and 1 call all-to-all while all the other ranks call all-gather-base. But when we select entries to compare, we don't have a global view of the entries.
In this specific case, ranks 0 and 1 have a collective of PG 7 on entry 1130 with seq ID = 1130, while the other ranks have a collective of PG 0 on entry 1130 with seq ID = 2. It's hard to use the entry index to do the match, because that assumption will collapse once we later consider P2P, so for now we defer it to users or further down the debugging stream to figure out. To make the message clearer, I also include both the seq ID and the record_id (aka the entry index) in the message. (That does not mean this is impossible to implement in code; for example, we could subtract from every record_id the maximum P2P seq ID before it. But users will easily see the wrong order, so we don't think that logic is necessary for now.)
P1626755348
Differential Revision: [D63815335](https://our.internmc.facebook.com/intern/diff/D63815335/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137256
Approved by: https://github.com/c-p-i-o
Summary: We had a report of crashes in parallel compile subprocesses linked to reading justknobs. See https://fburl.com/workplace/14a4mcbh internally. This is a known issue with justknobs. It looks like we don't have a lot of control over evaluating knobs. Some are read in inductor (`pytorch/remote_cache:autotune_memcache_version`), but many are read by the triton compiler. According to this advice https://fburl.com/workplace/imx9lsx3, we can import thread_safe_fork, which installs some functionality to destroy some singletons before forking and re-enable them after. This approach works for the failing workload.
Test Plan: See D63719673 where the reporting user was kind enough to provide us with a local repro. Without the relevant import, we can reproduce the crash. With the import, the training runs successfully to completion.
Differential Revision: D63736829
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137155
Approved by: https://github.com/xmfan, https://github.com/eellison
When we populate the unlifted graph module, we actually only "unlift" constant tensor inputs, which is problematic because export de-duplicates aliasing constants. As a result, we only register one constant instead of two. This PR fixes that by querying the ep.constants table instead of ep.graph_signature.lifted_tensor_constants.
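A minimal repro sketch of the aliasing-constant situation (the module and attribute names here are illustrative, not from the PR):
```
import torch

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        c = torch.randn(2)
        # Two attributes aliasing the same constant tensor; export de-duplicates them.
        self.a = c
        self.b = c

    def forward(self, x):
        return x + self.a + self.b

ep = torch.export.export(M(), (torch.randn(2),))
# ep.constants should carry an entry per attribute path, which is why unlifting
# consults ep.constants rather than only the lifted constants in the graph signature.
print(list(ep.constants.keys()))
```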
Differential Revision: [D63743111](https://our.internmc.facebook.com/intern/diff/D63743111)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137162
Approved by: https://github.com/pianpwk
Summary: This just adds a config option and JK for turning on remote AOTAutogradCache. It does not implement anything with the new options being passed in; that will come in the next diff.
This PR also changes the knob for turning on the local AOTAutogradCache to be more consistent with that of FXGraphCache: TORCHINDUCTOR_AUTOGRAD_CACHE
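A hedged usage sketch of the renamed knob (the accepted value is an assumption):
```
import os

# Enable the local AOTAutogradCache, mirroring the FXGraphCache-style spelling.
# Set this before importing/compiling so the config picks it up.
os.environ["TORCHINDUCTOR_AUTOGRAD_CACHE"] = "1"
```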
Test Plan: Existing tests should pass and should build
Reviewed By: oulgen
Differential Revision: D63321965
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137011
Approved by: https://github.com/oulgen
This greatly reduces compile time; TorchBench models that were previously 50-100x slower (vs the cpp backend) are now ~20x slower. More work needs to be done on the Triton side, but smaller block sizes will still be helpful.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136612
Approved by: https://github.com/desertfire
ghstack dependencies: #135342
Function `_get_pg_default_device` is being used outside of `distributed_c10d.py`.
A concern is that people may not be aware of what it actually does, due to bad naming of this function:
`Return the device to use with ``group`` for control flow usage (object collectives, barrier).`
The remediation is as follows (a sketch follows the list):
- Added a deprecation warning to `_get_pg_default_device`;
- Added a private function `_get_object_coll_device` to undertake what it does;
- Added a `_device_capability` function for users who want to query the device support of a PG -- it returns a plain list, no more "default" choice.
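A minimal sketch of the deprecation shim described above (only the function names come from this description; the exact signature and warning category are assumptions):
```
import warnings

def _get_pg_default_device(group=None):
    # Deprecation shim: forward to the new, more descriptively named helper.
    warnings.warn(
        "_get_pg_default_device is deprecated; use _get_object_coll_device for "
        "object collectives/barrier, or _device_capability to query a PG's devices.",
        FutureWarning,
        stacklevel=2,
    )
    return _get_object_coll_device(group)
```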
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136790
Approved by: https://github.com/H-Huang
Previously we were making a fairly restrictive assumption when unflattening an exported program: for any submodule, we would assert that the graph of every call to that submodule must be the same. This assertion is load-bearing, i.e., if we simply remove the assertion then we can get incorrect results, as shown by the following example.
```
class N(torch.nn.Module):
    def forward(self, x, b):
        if b:
            return x + 1
        else:
            return x + 2

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.n = N()

    def forward(self, x):
        x0 = x + 3
        x1 = self.n(x0, True)
        x2 = x1 + 4
        x3 = self.n(x2, False)
        return x3 + 5

m = M()
inp = (torch.ones(1),)
print(m(*inp))  # tensor([16.])

ep = torch.export.export(m, inp)
print(ep.module()(*inp))  # tensor([16.])

unflattened = torch.export.unflatten(ep)
print(unflattened(*inp))  # tensor([15.])
```
However, this goes against the spirit of specializing graphs when exporting: we should *expect* that for every call to a submodule we *might* generate a different graph. The goal of this PR is to fix unflattening to handle multiple specialized graphs corresponding to multiple calls to the same submodule.
The idea is simple: for every call to a child module `foo`, we will create potentially different child modules `foo`, `foo@1`, `foo@2`, etc. and use those names as targets in `call_module` instructions in the parent graph. An immediate consequence of this is that the list of fqns in an unflattened module may not be the same as in the exported module. Note that all these variants share the same parameters / buffers, so that multiple calls to the same submodule can share state as expected.
However, as described so far this scheme may end up with needlessly too many submodules. Thus, between calls to the same submodule, if graphs are equal then we optimize away the extra submodules and reuse call names as much as possible. Moreover, when submodules are shared across fqns, we also try to de-duplicate graphs corresponding to their calls as much as possible. Note that no matter what, information about which submodule was called is still preserved, so that if a submodule has to be swapped with another, one can still find all calls to the former submodule and replace them with calls to the latter.
A note on the choice of naming scheme for call names: instead of generating "sibling" modules `foo@1`, `foo@2`, etc. for `foo`, we had considered generating "children" modules `foo._1`, `foo._2`, etc. of `foo`. However this can cause spurious cycles when de-duplicating graphs. E.g., suppose that `foo` is an alias for `bar._1` and `foo._1` is an alias for `bar`, then we must either introduce a cycle or drop the opportunity to optimize. Another idea would be to make `foo` a dummy module that contains `foo._0` corresponding to the first call, but this necessitates too many changes to existing tests and hurts the common case.
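Continuing the earlier snippet, here is a hedged illustration of what unflattening might produce after this PR (the exact attribute names are an assumption based on the naming scheme described above):
```
unflattened = torch.export.unflatten(ep)
print(unflattened(*inp))  # tensor([16.]) -- now matches eager, since each call keeps its own graph
# The unflattened module may expose both specializations, e.g. `n` for the first
# call (b=True) and `n@1` for the second call (b=False), sharing parameters/buffers.
```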
Differential Revision: D63642479
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137013
Approved by: https://github.com/pianpwk
* Changes the upload_metrics function to upload to the ossci-raw-job-status bucket instead of DynamoDB
* Moves all added metrics to a field called "info" so that ingesting into a database table with a strict schema is easier
* Removes the dynamo_key field since it is no longer needed
* Removes the concept of reserved metrics, since they cannot be overwritten by user added metrics anymore
* Moves s3 resource initialization behind a function so import is faster
---
Tested by emitting a metric during run_test and seeing that documents got added to s3
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136799
Approved by: https://github.com/ZainRizvi
Fixes #129366
Since NJT has custom serialization logic, we need an NJT-specific fix to clear out cached sizes / strides PyCapsules. Eventually, we should switch NJT to use the default serialization logic, but this depends on #125622 being addressed.
This PR also makes serialization more complete by explicitly handling `lengths`, `ragged_idx`, and the `metadata_cache`, ensuring correct operation for both contiguous and non-contiguous NJTs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137031
Approved by: https://github.com/soulitzer
ghstack dependencies: #137030
Summary:
Given an op and a pair (output buffer, input buffer) from that op, we consider marking the output buffer as inline. However, if the parent of the input buffer and the current op are going to be fused, then we don't want to mark the output buffer as inline. This change checks that criterion and skips inlining when it holds.
Test Plan:
New unit test "layer_norm_should_not_inplace" runs LayerNorm and checks for no "in_out" pointers.
Fixes #120217
Here's a diagram of the issue:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/137042
Approved by: https://github.com/eellison
Fixes #130154
This PR takes the strategy outlined in the above issue and clears out any cached sizes / strides PyCapsules before serialization. This affects the default subclass serialization logic.
The PyCapsule issue also affects `deepcopy`, so that's fixed here as well.
Note: I originally tried utilizing a context manager to remove / restore cached PyCapsules after serialization, but in practice the state returned from `_reduce_ex_internal()` references the actual `tensor.__dict__`, so the problem persists once the cached values are restored. Instead, we have to be careful to remove the cached values in the right place so they're not re-cached when pulling out size / stride information for serialization.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137030
Approved by: https://github.com/albanD
This PR unblocks a unit test with a single Float8Linear module. It fixes the following error:
```
torch._foreach_copy_(foreach_copy_dsts, all_gather_inputs)
[rank0]:E0913 13:44:29.829000 2179476 torch/testing/_internal/common_distributed.py:671] RuntimeError: "foreach_tensor_copy" not implemented for 'Float8_e4m3fn'
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135955
Approved by: https://github.com/vkuzo, https://github.com/eqy
Summary:
Tests in test_mkldnn_pattern_matcher.py can take too long to finish, so split them into smaller tests using `parametrize`.
This test file probably has some refactoring opportunities as well; a next step would be to parametrize the add functions.
Differential Revision: D63723925
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137153
Approved by: https://github.com/desertfire
For the reusable action checkout-pytorch, this skips cleaning the workspace when running from a container environment.
The motivation for this change is twofold:
* There is no need for cleanup when running in ephemeral containers, as any changes will be discarded when the docker container is terminated;
* In the specific case of GITHUB_WORKSPACE, to enable sharing it between multiple containers, it needs to be mounted with `-v`. This prevents the possibility of running `rm -r` and deleting this mount path.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137168
Approved by: https://github.com/huydhn
Previously if we had a graph like:
```
triton_kernel_wrapper_functional_proxy = triton_kernel_wrapper_functional(...)
getitem: "f32[3][1]cuda:0" = triton_kernel_wrapper_functional_proxy['out_ptr']
getitem_1: "f32[3][1]cuda:0" = triton_kernel_wrapper_functional_proxy['out2_ptr']
sigmoid: "f32[3][1]cuda:0" = torch.ops.aten.sigmoid.default(getitem_1)
mul: "f32[3][1]cuda:0" = torch.ops.aten.mul.Tensor(tangents_1, sigmoid)
```
The partitioner would assume that the `sigmoid()` could be fused into either its user (the pointwise mul), or its producer (the user triton kernel). This could lead to a bad partitioning:
(1) If the partitioner thinks we can fuse the sigmoid with its producer triton kernel, we would keep the sigmoid compute in the forward, and have to generate two separate kernels in the forward (user triton kernel, dedicated sigmoid kernel)
(2) If the partitioner puts the sigmoid in the backward instead, we could fuse it with an existing backward kernel (the mul with a tangent)
Reviewed By: embg
Differential Revision: D63551393
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136878
Approved by: https://github.com/zou3519
Summary:
# Context
Goal: Enable CK for Inductor in FBCode
We split this stack into three diffs to help with review & in case we need to revert anything.
# This Diff
* Gets us to have CK kernels as an option for GEMM autotuning in Inductor.
Reviewed By: zjing14
Differential Revision: D62662705
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136234
Approved by: https://github.com/tenpercent, https://github.com/chenyang78
The 'set_requires_grad' dict appears to always be full of "False" values,
and we always set requires_grad based on the value of 'has_backward'.
Setting the requires_grad field was being done repeatedly during
get_fwd_recv_ops, but it should be done just once, so move it to the
function that creates the recv buffers in the first place.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136804
Approved by: https://github.com/kwen2501
Summary:
We skip save_gpu_kernel if the kernel has already been saved.
This would give us a more accurate Triton profiling result. The
following trace shows before/after the change for a benchmarking of a
trivial addmm:
Before:
<img width="1255" alt="Screenshot 2024-09-23 at 10 26 53 AM" src="https://github.com/user-attachments/assets/5aea05ef-6ef0-464c-8da9-17b31c97b43a">
After:
<img width="910" alt="Screenshot 2024-09-23 at 10 27 03 AM" src="https://github.com/user-attachments/assets/488b7d4f-268f-41cf-8553-cb16ceeae118">
We can see that before the change, the benchmarking includes two parts:
(1) The overhead of our triton_heuristic call, which includes the
save/get and the (expensive) hash computation.
(2) The exact computation of the Triton kernel.
We see that (1) accounts for >50% of the time, which makes kernel selection
for profiling choose aten kernels over Triton kernels.
Test Plan:
Existing OSS CI
python test/inductor/test_cuda_cpp_wrapper.py
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137073
Approved by: https://github.com/desertfire
For Traceable FSDP2, the most common use case is to have `fullgraph=False` for forward pass (to allow user-level graph breaks), and `fullgraph=True` for compiled autograd backward pass (required for queue_callback support).
With `torch._dynamo.compiled_autograd=True`, previously we were not able to set a different `fullgraph` config value for the forward vs. backward pass, since `rebuild_ctx` just reuses the forward compile config as-is. This PR adds the `torch._dynamo.config.compiled_autograd_kwargs_override` config to allow forcing `fullgraph=True` for CA Dynamo tracing.
With this PR, we can remove standalone compiled autograd ctx manager usage in Traceable FSDP2 unit tests, and consolidate on using `torch._dynamo.compiled_autograd=True`.
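A hedged usage sketch (the exact shape of the override dict and the `compiled_autograd` config location are assumptions based on the description above):
```
import torch

class Tiny(torch.nn.Module):
    def forward(self, x):
        return x.sin().sum()

torch._dynamo.config.compiled_autograd = True
# Force fullgraph=True only for the compiled-autograd backward trace,
# while the forward compile below keeps fullgraph=False.
torch._dynamo.config.compiled_autograd_kwargs_override = {"fullgraph": True}

model = torch.compile(Tiny(), fullgraph=False)
model(torch.randn(4, requires_grad=True)).backward()
```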
Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor_fullgraph_True`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136967
Approved by: https://github.com/xmfan
Fixes #127920
This commit addresses a build failure occurring with GCC 12 and above due to the -Werror=nonnull flag. The error manifests in the test_api target.
**Issue:**
When building with GCC 12+, the following error occurs:
```
error: argument 1 null where non-null expected [-Werror=nonnull]
431 | __builtin_memmove(__result, __first, sizeof(_Tp) * _Num);
| ~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
```
This change ensures that:
1. The flag is only added for GCC 12 or higher
2. The flag is only added if it's supported by the compiler
3. The flag is added specifically to the test_api target, not globally
By disabling this specific error, we allow the build to proceed while maintaining other compiler warnings.
**Test Plan:**
- Verified successful build with GCC 12 and above
- Ensured no regression in builds with earlier GCC versions and other compilers
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137092
Approved by: https://github.com/malfet
`json.dumps(float("inf"))` returns `Infinity`, which is technically invalid JSON.
This is fine if you `json.load` it back, but ClickHouse cannot handle it.
The solution here: cast inf and nan to strings (which ClickHouse is able to cast back to float).
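A minimal sketch of the sanitization idea (the helper name is illustrative, not the actual script):
```
import json
import math

def json_safe(value):
    # Cast non-finite floats to strings so the emitted JSON stays valid and
    # ClickHouse can cast them back to float on ingestion.
    if isinstance(value, float) and not math.isfinite(value):
        return str(value)
    return value

print(json.dumps({"loss": json_safe(float("inf"))}))  # {"loss": "inf"}
```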
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136877
Approved by: https://github.com/huydhn
Summary:
# Why
We want this to run internally
# What
- fix python path issue on the test
- reenable the test
# Background
(copied from similar issue resolved earlier)
It appears that the parent process does not pass the entire path down to the child process. Namely, if there is some setup that makes the sys.path effectively look different than, say, PYTHONPATH or something like this, the child will not inherit this setup. To avoid needing to keep track of specific setups, we pass the effective `sys.path` from the parent to the child through the PYTHONPATH env variable
Test Plan: buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:kernel_benchmark
Differential Revision: D63498897
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136876
Approved by: https://github.com/henrylhtsang
By even further reducing the precision expectations for imprecise FP16 ops, introducing a new BF16_LOW_PRECISION_OPS category, and marking BF16 tests as xfail for `divfloor_rounding`, `floor_divide` and `remainder`.
I guess the nature of the low-precision results is that MPSGraph, unlike the rest of PyTorch, does not do accumulation over fp32 for reduction operations.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136987
Approved by: https://github.com/albanD
ghstack dependencies: #137070
Related to #107302.
When built and tested with NumPy 2 the following unit tests failed.
```
=========================================================== short test summary info ============================================================
FAILED [0.0026s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_complex128 - TypeError: expected np.ndarray (got Tensor)
FAILED [0.0024s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_complex64 - TypeError: expected np.ndarray (got Tensor)
FAILED [0.0025s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_float32 - TypeError: expected np.ndarray (got Tensor)
FAILED [0.0024s] test/test_linalg.py::TestLinalgCPU::test_householder_product_cpu_float64 - TypeError: expected np.ndarray (got Tensor)
FAILED [0.0016s] test/test_linalg.py::TestLinalgCPU::test_nuclear_norm_axes_small_brute_force_old_cpu - ValueError: Unable to avoid copy while creating an array as requested.
FAILED [0.0054s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_complex128 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]).
FAILED [0.0055s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_complex64 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]).
FAILED [0.0048s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_float32 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]).
FAILED [0.0054s] test/test_linalg.py::TestLinalgCPU::test_solve_cpu_float64 - AssertionError: The values for attribute 'shape' do not match: torch.Size([0, 0]) != torch.Size([0, 0, 0]).
=========================================== 9 failed, 1051 passed, 118 skipped in 152.51s (0:02:32) ============================================
```
This PR fixes them. The test is now compatible with both NumPy 1 & 2.
Some more details:
1. `np.linalg.solve` has changed its behavior, so I added an adapter function in the unit test to keep its behavior the same whether it is NumPy 1 or NumPy 2.
2. The cause of the failure is that when passing a `torch.Tensor` to `np.linalg.qr`, the return type in NumPy 1 is `(np.ndarray, np.ndarray)`, while it is `(torch.Tensor, torch.Tensor)` in NumPy 2.
3. NumPy 2 does not allow `np.array(obj, copy=False)`; it recommends using `np.asarray(obj)` instead.
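A short illustration of point 3 (behavior as described above; the exact error text may differ):
```
import numpy as np
import torch

t = torch.ones(3)
arr = np.asarray(t)  # works under both NumPy 1 and NumPy 2
# Under NumPy 2, copy=False means "never copy" and raises if a copy is required:
# np.array(t, copy=False)  ->  ValueError: Unable to avoid copy while creating an array as requested.
```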
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136800
Approved by: https://github.com/lezcano
# Motivation
This PR intends to make device-specific Events inherit from the generic torch.Event. The benefit is providing a generic abstract class `torch.Event` for different devices, like `torch.Stream`. This makes it easier for Dynamo to capture Events from different devices, like torch.cuda.Event and torch.xpu.Event.
A follow-up PR will remove the now-useless base classes `_StreamBase` and `_EventBase` to avoid multiple inheritance.
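An illustrative sketch of the resulting relationship (assuming a CUDA build; the check mirrors what the description says Dynamo relies on):
```
import torch

if torch.cuda.is_available():
    ev = torch.cuda.Event()
    # Device-specific events now subclass the generic abstract class.
    assert isinstance(ev, torch.Event)
```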
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134845
Approved by: https://github.com/albanD, https://github.com/EikanWang
Inductor mutates the AOT backward graph. A solution could be to copy the graph, but since we don't know whether compiled autograd is applied or not, it would be expensive to always clone it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136741
Approved by: https://github.com/jansel
ghstack dependencies: #135663
Summary: The problem is that when the GPU is not big enough, we omit the test cases in the test class. We expect the test to be skipped, but due to fbcode CI it can throw an error instead. This causes the test to be flaky.
Test Plan: ci
Differential Revision: D62037908
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137055
Approved by: https://github.com/masnesral
Fixes #136440
**Issue:**
When building PyTorch in debug mode on aarch64 architecture using GCC, we encounter relocation errors due to the R_AARCH64_CALL26 relocation limit. This occurs because debug builds with -O0 optimization generate larger code sizes, potentially exceeding the range limit for these relocations.
**Fix:**
Apply -Og optimization instead of -O0 for aarch64 GCC debug builds. This slightly reduces code size while maintaining debuggability, bringing function calls back within the range of R_AARCH64_CALL26 relocations.
The fix is implemented by conditionally setting compiler and linker flags in CMakeLists.txt:
- For aarch64 GCC debug builds: use -Og
- For all other debug builds: retain -O0
This change affects only debug builds on aarch64 with GCC, leaving other configurations unchanged.
**Testing:**
Verified that the build succeeds without relocation errors on aarch64 systems with GCC in debug mode. Ensured that debugging information is still available and useful for debugging purposes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136990
Approved by: https://github.com/malfet
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Since our implementation currently assumes contiguous strides, let us add an explicit check and raise an error at construction time if the parameter is not contiguous.
We can try to support this in the future. Mainly, I want to first learn more about how DTensor support for non-contiguous memory formats works.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/137000
Approved by: https://github.com/weifengpy
## The problem.
[A commit from three weeks ago](82d00acfee) appears to have broken five tests but was not caught by CI.
[A later commit](https://github.com/pytorch/pytorch/commit/e05ea2b1797) which added a decomposition of `transpose_copy` added another broken test, also seemingly not detected, making six total (listed below).
They came to my attention when I updated some pending decomposition pull requests which passed CI, and started getting failures like [this](https://hud.pytorch.org/pr/134319) for a test unrelated to any of these pull requests, `TestCommonCPU.test_out__refs_transpose_copy_cpu_float32`
Running `python test/test_ops.py -k _copy` on `viable/strict` found failures for six `_refs` ops: `copysign`, `expand_copy`, `index_copy`, `t_copy`, `transpose_copy`, `view_copy`
## The solution
The original commit did actually cause breakage by slightly changing user-visible behavior (in a special case involving scalar tensors being copied between different devices).
This pull request fixes that breakage in a reasonable way, but I don't understand why this error didn't appear in CI until I made later changes in the same area.
## To reproduce
To reproduce the six cases in your own client:
```
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=5 python test/test_ops.py TestCommonCPU.test_out__refs_view_copy_cpu_float32
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=2 python test/test_ops.py TestCommonCPU.test_out__refs_t_copy_cpu_float32
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=0 python test/test_ops.py TestCommonCPU.test_out__refs_index_copy_cpu_float32
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=7 python test/test_ops.py TestCommonCPU.test_out__refs_expand_copy_cpu_float32
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=0 python test/test_ops.py TestCommonCPU.test_out__refs_copysign_cpu_float32
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=4 python test/test_ops.py TestCommonCPU.test_out__refs_transpose_copy_cpu_float32
```
@amjames
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136653
Approved by: https://github.com/zou3519
This PR is for supporting calling `parallelize_module` from within a model definition, making the model a parallel one.
Calling `parallelize_module` is an alternative to maintaining a set of `ColumnWiseLinear`, `RowWiseLinear`, etc, while still being able to directly author a parallel model.
(The motivation for authoring a parallel model is that there may be other distributed operations, which may not be easily captured by any module, see the forward function below. Alternatively speaking, the purpose is to exploit the expressiveness of DTensor -- we need to first create DTensors before calling ops on them. Having parallelized modules in model is one way of creating DTensors.)
For example:
```
class FeedForward(nn.Module):
    def __init__(self, config: TransformerArgs) -> None:
        super().__init__()
        w1 = nn.Linear(config.dim, config.hidden_dim, bias=False)
        w2 = nn.Linear(config.hidden_dim, config.dim, bias=False)
        w3 = nn.Linear(config.dim, config.hidden_dim, bias=False)
        self.w1 = parallelize_module(w1, Colwise)
        self.w2 = parallelize_module(w2, Rowwise)
        self.w3 = parallelize_module(w3, Colwise)

    def forward(self, x: Tensor) -> Tensor:
        y: DTensor = self.w2(F.silu(self.w1(x)) * self.w3(x))
        # y is a DTensor with Partial placement; we can return it as is.
        return y
        # Or we can convert it to Replicate -- there is modeling flexibility here.
        return y.redistribute(Replicate())

with device_mesh:
    model = FeedForward(config)

# Now model is a model parallelized onto device_mesh
y = model(x)
```
The `device_mesh` actually used for `parallelize_module` would be retrieved from the ambient context.
Calling `parallelize_module` from within the model hierarchy also avoids the use of *FQNs* needed in the out-of-model annotation case.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134247
Approved by: https://github.com/tianyu-l
Compile time benchmark for the min-cut partitioner. I'm hoping that this is a reasonable benchmark because:
(1) it consists of a single input + many weights that are used sequentially
(2) contains a mix of recompute vs non-recomputed ops (matmul + sin)
(3) it is relatively simple
from running locally:
```
collecting compile time instruction count for aotdispatcher_partitioner_cpu
compile time instruction count for iteration 0 is 21764219181
compile time instruction count for iteration 1 is 12475020009
compile time instruction count for iteration 2 is 12463710140
compile time instruction count for iteration 3 is 12455676489
compile time instruction count for iteration 4 is 12451344330
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136760
Approved by: https://github.com/ezyang
ghstack dependencies: #136670, #136759
This adds a few compile time benchmarks for some disjoint paths in AOTDispatcher:
(1) inference vs training code paths
(2) "subclasses" vs "no subclasses" codepaths
Also see https://github.com/pytorch/pytorch/pull/136760 for a partitioner benchmark (I'm not sure why ghstack didn't display the stack nicely)
I ran locally, and got these numbers on the 4 paths:
```
collecting compile time instruction count for aotdispatcher_inference_nosubclass_cpu
compile time instruction count for iteration 0 is 11692348671
compile time instruction count for iteration 1 is 3026287204
compile time instruction count for iteration 2 is 3011467318
compile time instruction count for iteration 3 is 3004485935
compile time instruction count for iteration 4 is 3003087410
collecting compile time instruction count for aotdispatcher_training_nosubclass_cpu
compile time instruction count for iteration 0 is 6068003223
compile time instruction count for iteration 1 is 5585418102
compile time instruction count for iteration 2 is 5581856618
compile time instruction count for iteration 3 is 5581651794
compile time instruction count for iteration 4 is 5578742619
collecting compile time instruction count for aotdispatcher_inference_subclass_cpu
compile time instruction count for iteration 0 is 8634984264
compile time instruction count for iteration 1 is 8633467573
compile time instruction count for iteration 2 is 8632182092
compile time instruction count for iteration 3 is 8632056925
compile time instruction count for iteration 4 is 8632543871
collecting compile time instruction count for aotdispatcher_training_subclass_cpu
compile time instruction count for iteration 0 is 14737239311
compile time instruction count for iteration 1 is 14734346427
compile time instruction count for iteration 2 is 14736493730
compile time instruction count for iteration 3 is 14734121272
compile time instruction count for iteration 4 is 14733852882
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136759
Approved by: https://github.com/laithsakka
ghstack dependencies: #136670
Fixes https://github.com/pytorch/pytorch/issues/136640
Today, inductor has some logic to figure out when it needs to do broadcasting during lowering, which just checks if any of the input shapes have sizes equal to 1.
In particular: we should already have this information by the time we get to inductor, because our FakeTensor compute will have branched/guarded on whether any ops performed broadcasting, appropriately.
In particular, if we have a tensor with a size value of `(64//((2048//(s3*((s2//s3)))))))`, and it happens to be equal to one (and it is used in an op that requires this dim to be broadcasted), FakeTensorProp will have generated a guard:
```
Eq((64//((2048//(s3*((s2//s3))))))), 1)
```
I chose the simplest possible way to beef up inductor's checks to know when a given size is equal to 1: loop over the existing shape env guards, and if our current size is a sympy expression on the LHS of one of our `Eq(LHS, 1)` guards, then return True.
I'm hoping for feedback on whether or not this approach is reasonable. One better option I could imagine is that our symbolic reasoning should have automatically simplified the size of our tensor down to a constant as part of evaluating that guard. I was originally going to try to do this directly in the shape env, but I ran into a few issues:
(1) I wanted to call some version of `set_replacement(expr, 1)`. But `set_replacement()` only accepts plain symbols on the LHS, not expressions
(2) in theory I could get this to work if I could rework the above expression to move everything that is not a free variable to the RHS, e.g. `Eq(s2, 32)`. It looks like our existing `try_solve()` logic is... [not quite able](https://github.com/pytorch/pytorch/blob/main/torch/utils/_sympy/solve.py#L27) to do this generally though.
Checking the guards feels pretty simple-and-easy. Are we worried that it is too slow to iterate over all the guards? I could also cache the lookup so we only need to iterate over guards that are of the form `Eq(LHS, 1)`
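A minimal sketch of the guard-scanning idea described above (the helper name and exact ShapeEnv internals are assumptions, not the actual inductor code):
```
import sympy

def statically_known_equal_to_one(size_expr, shape_env) -> bool:
    # Plain integers / constant sympy expressions are easy.
    if isinstance(size_expr, int) or size_expr.is_number:
        return int(size_expr) == 1
    # Otherwise, see if an existing guard already pins this expression to 1.
    for guard in shape_env.guards:
        e = guard.expr
        if isinstance(e, sympy.Eq) and e.lhs == size_expr and e.rhs == 1:
            return True
    return False
```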
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136670
Approved by: https://github.com/ezyang
To see the payoff, look at test/dynamo/test_logging.py
The general idea is to refactor produce_guards into produce_guards_verbose which also returns verbose code parts, which have our annotations.
The rest of the logic is plumbing around SLocs to the places they need to be so we can print them. Guards are easy; value ranges and duck sizing take more care.
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136917
Approved by: https://github.com/anijain2305
1. Example of a failing diff:
https://github.com/pytorch/pytorch/pull/136740
2. Test this by running:
python check_results.py test_check_result/expected_test.csv test_check_result/result_test.csv
Results:
```
WIN: benchmark ('a', ' instruction count') failed, actual result 90 is 18.18% lower than expected 110 ±1.00% please update the expected results.
REGRESSION: benchmark ('b', ' memory') failed, actual result 200 is 100.00% higher than expected 100 ±10.00% if this is an expected regression, please update the expected results.
MISSING REGRESSION TEST: benchmark ('d', ' missing-test') does not have a regression test enabled for it
```
MISSING REGRESSION TEST does not fail, but it's logged.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136551
Approved by: https://github.com/ezyang
ghstack dependencies: #136383
* This fixes a major CMake/Bazel configuration bug where we were leaving CUTLASS performance on the table, especially with FlashAttention. This now enables using MMA instructions on SM90+, which should close the gap between SDPA and the external FA2. Note these operations only affect H100 and newer GPUs. Thankfully, this seems to have been updated recently into being a noop on the CUTLASS side; still, it is better to set the CMake variable properly.
* Also enables additional new shape kernels added in the recent CUTLASS 3.5.1+ update. This was the original motivation of the PR before I realized the basic MMA kernels were accidentally disabled because we didn't go through the submodule's CMake/Bazel files.
* Adds a bit to compile time and code size, but it is well worth it considering it speeds up our internal flash attention significantly on H100s.
* These kernels and settings will be needed for Flash Attention 3 whenever we add that too.
Fixes#133695
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133686
Approved by: https://github.com/ezyang
Summary: In D60803317, we added CompileContext (trace_id) information to Kineto traces using caching when a CompileContext exits. As some users pointed out, this gives inaccurate IDs because we are not getting the context that is being looked up within the eval_frame. For this reason, we decided to revert that change and go with an approach that gets the trace_id associated with a given CacheEntry. To do this, we add a trace_id to the GuardedCode so that it can be passed onto a CacheEntry. Then, we change the lookup function to return said trace_id alongside the code so that we can pass both into our eval function. Once we get to a Torch-Compiled Region, we can just append the context information to the name of the annotation, thus bypassing any need for kwargs.
Test Plan: Added more comprehensive unit test. Saw that all the trace_ids appeared within the graph.
Differential Revision: D63138786
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136460
Approved by: https://github.com/ezyang
This is to avoid cache confusion between normal vs pydebug vs nogil builds in cpp extensions which can lead to catastrophic ABI issues.
It is rare today for people to run both normal and pydebug builds on the same machine, but we expect quite a few people will run normal and nogil builds on the same machine going forward.
This is tested locally by running each version alternatively.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136890
Approved by: https://github.com/colesbury
In #136512, we fixed handling for tl.constexpr and dynamic shapes: if a symint is passed to tl.constexpr, you should specialize on it, because tl.constexpr implies needing to know the concrete value at compile time.
However, when using triton_op, capture_triton, or non-strict export, the regression remains (and #136512 might technically regress some specific export scenarios) - see [Richard's comment](https://github.com/pytorch/pytorch/pull/136512/files#r1775999871).
This PR fixes those scenarios by implementing the handling differently depending on whether we're expecting a SymNodeVariable or a SymInt (/SymBool/SymFloat).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136686
Approved by: https://github.com/zou3519
DeviceIndex is an int8_t, which cout interprets as a char; it then shows up as a control character (e.g. ^A) in logs.
Explicitly cast it to int so the numbers are printed correctly.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135405
Approved by: https://github.com/wconstab
Fixes #134714 (or attempts to, idk how to test yet)
For posterity, how one can test:
1. make sure you have USE_PTHREADPOOL=1 or pull a packaged binary
2. run gdb --args python, with `r` to enter, `Ctrl-C` to pause, and `c` to get back into Python
3. import torch
4. torch.set_num_threads(1), make sure this does not trigger any additional threads getting created.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136793
Approved by: https://github.com/albanD
Summary:
This PR adds `torch.float8_e4m3fn` support to cuSPARSELt and `to_sparse_semi_structured`.
This will let users run fp8 + 2:4 sparse matmuls on Hopper GPUs with
cusparselt >= 0.6.2, via the `scaled_mm` API.
```
A = rand_sparse_semi_structured_mask(256, 128, dtype=torch.float16)
B = torch.rand(dense_input_shape, device=device).to(torch.float16).t()
A_fp8, A_scale = to_float8(A)
B_fp8, B_scale = to_float8(B)
dense_result = torch._scaled_mm(
    A_fp8, B_fp8,
    scale_a=A_scale, scale_b=B_scale,
    out_dtype=out_dtype
)
A_fp8_sparse = to_sparse_semi_structured(A_fp8)
sparse_result = torch._scaled_mm(
    A_fp8_sparse, B_fp8,
    scale_a=A_scale, scale_b=B_scale,
    out_dtype=out_dtype
)
```
Note that to keep this consistent with normal torch behavior, calling
`torch.mm(A_fp8_sparse, B_fp8)` will raise a NotImplementedError.
I also turned on cuSPARSELt by default and added CUSPARSELT_MAX_ID to the
backend to make the tests a bit cleaner
Test Plan:
```
python test/test_sparse_semi_structured -k scaled_mm
python test/test_sparse_semi_structured -k fp8
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136397
Approved by: https://github.com/drisspg
Summary: Previously is_fbcode just checked whether the checkout was git or not. This is extremely error-prone. Let's make it fool-proof.
Test Plan: unit tests
Reviewed By: masnesral
Differential Revision: D63545169
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136871
Approved by: https://github.com/masnesral
Before this change, an attempt to run something like
```
% python -c "import torch;dev,dt='mps',torch.int; print(torch.normal(mean=torch.arange(1., 11., device=dev, dtype=dt), std=torch.arange(10, 0, -1, device=dev, dtype=dt)))"
```
resulted in a hard error:
```
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: error: 'mps.multiply' op requires the same element type for all operands and results
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: note: see current operation: %5 = "mps.multiply"(%2, %arg1) : (tensor<10xf32>, tensor<10xsi32>) -> tensor<*xf32>
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: error: 'mps.multiply' op requires the same element type for all operands and results
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: note: see current operation: %5 = "mps.multiply"(%2, %arg1) : (tensor<10xf32>, tensor<10xsi32>) -> tensor<*xf32>
/AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphExecutable.mm:953: failed assertion `original module failed verification'
```
After the change, it raises a nice type error
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136863
Approved by: https://github.com/Skylion007
ghstack dependencies: #136754, #136755, #136821, #136822
Fixes https://github.com/pytorch/pytorch/issues/136494
Currently, CUDASymmetricMemory::rendezvous() initializes a multicast address if multicast support is present. However, if we believe multicast support is present but cuMulticastCreate still fails for some reason, we do not fallback gracefully.
- In addition to CUDART and driver version check, query CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED to determine multicast support for a rank/device.
- Before initializing multicast for a block, ensure all ranks/devices have multicast support.
- This is unlikely, but if cuMulticastCreate still fails on rank 0, print the corresponding driver error message as a warning, and gracefully skip multicast initialization for the block.
- Introduced an environment variable (TORCH_SYMM_MEM_DISABLE_MULTICAST) to allow users to explicitly disable multicast support as a workaround.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136577
Approved by: https://github.com/Chillee, https://github.com/eqy
- Also makes scales and zero-point dtypes reconcile with the meta impl as well as with other
quantized ops' representation of scales and zero points
- Makes sure quantize_per_token's output_dtype is respected
There are a few places where we need to reconcile scale and zero-point dtypes,
but that will come later. These fixes are mainly being done to enable the quantized
KV cache through the ET stack.
Differential Revision: [D62301840](https://our.internmc.facebook.com/intern/diff/D62301840/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136807
Approved by: https://github.com/jerryzh168
Removing `_transform_shapes_for_default_dynamic` and `assume_static_by_default=False` as added in https://github.com/pytorch/pytorch/pull/133620.
This reverts back to `assume_static_by_default=True`, using dynamo decorators (e.g. `maybe_mark_dynamic`, `mark_static`) for handling Dim.AUTO & Dim.STATIC instead. This is easier to maintain, as it doesn't require reasoning about "inverting" the dynamic_shapes specs, and it also opens up usage of other decorators (`mark_dynamic`, `mark_unbacked`).
On the user side this change has no effect, but internally it means dynamic behavior is determined only by the `dynamic_shapes` specs (ignoring user-side input decorators, following https://github.com/pytorch/pytorch/pull/135536), while transferring this information for _DimHints via decorators, so that Dynamo/non-strict can create symbolic_contexts accordingly, e.g. 7c6d543a5b/torch/_dynamo/variables/builder.py (L2646-L2666)
One caveat is we don't raise errors for dynamic decorators on the user side, since we don't know if they're from user markings, or from re-exporting with inputs we've previously marked.
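A hedged illustration of how the _DimHint information might be transferred via decorators internally (roughly; this is not the exact export code):
```
import torch

x = torch.randn(4, 8)
# Dim.AUTO on dim 0: let the tracer make it dynamic if it can, without erroring on specialization.
torch._dynamo.maybe_mark_dynamic(x, 0)
# Dim.STATIC on dim 1: keep the dimension static under assume_static_by_default=True.
torch._dynamo.mark_static(x, 1)
```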
Differential Revision: D63358628
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136591
Approved by: https://github.com/avikchaudhuri
Summary:
This diff logs the time_taken_ns for the forward and backward graphs in AOTAutogradCache, saving it into the cache entry.
This information is helpful later when I remotify the cache, and also is just useful to have in tlparse and chromium events.
Test Plan: Run benchmark, see that the times are in the chromium events.
Reviewed By: aorenste
Differential Revision: D62590077
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136529
Approved by: https://github.com/oulgen
I think this could help many teams, especially compile/export teams (/cc @ezyang), by letting end users / bug reporters quickly test a WIP PR when reporting a related bug.
This could quickly be run in an official nightly Docker container or in a nightly venv/conda env.
Let me know what you think.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136535
Approved by: https://github.com/ezyang
This was a stupid cast error that caused MPSGraph to crash with the following exception
```
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: error: 'mps.multiply' op requires the same element type for all operands and results
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: note: see current operation: %3 = "mps.multiply"(%2, %arg1) : (tensor<1x3x9x9xf16>, tensor<1xf32>) -> tensor<*xf32>
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: error: 'mps.multiply' op requires the same element type for all operands and results
(mpsFileLoc): /AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm:233:0: note: see current operation: %3 = "mps.multiply"(%2, %arg1) : (tensor<1x3x9x9xf16>, tensor<1xf32>) -> tensor<*xf32>
/AppleInternal/Library/BuildRoots/e0873e53-5185-11ef-9a51-9ab6d782fe32/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphExecutable.mm:953: failed assertion `original module failed verification'
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136822
Approved by: https://github.com/Skylion007
ghstack dependencies: #136754, #136755, #136821
This is a retry of https://github.com/pytorch/pytorch/pull/136594, which is having trouble landing.
Summary: We have an internal report of a Triton compiler error `ValueError: Cannot broadcast, rank mismatch: [1], [1, 2048]` coming from a line like this:
`tmp25 = tl.broadcast_to(((tl.full([1], 1.00000000000000, tl.float64)) + ((ks0 // 3278).to(tl.float64))) / (((tl.full([1], 0.500000000000000, tl.float64))*(libdevice.sqrt((1 + ((ks0 // 3278)*(ks0 // 3278)) + ((-2)*(ks0 // 3278))).to(tl.float64).to(tl.float32)))) + ((tl.full([1], 0.500000000000000, tl.float64))*((1 + (ks0 // 3278)).to(tl.float64)))), [XBLOCK, RBLOCK])`
https://github.com/pytorch/pytorch/pull/135260 is the cause, presumably because we turn a constant into a 1-element tensor with: `(tl.full([1], const, tl.float64))`. It looks like changing the syntax to `(tl.full([], const, tl.float64))` gives us what we want?
Differential Revision: [D63540693](https://our.internmc.facebook.com/intern/diff/D63540693)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136858
Approved by: https://github.com/atalman
Original Issue: https://github.com/pytorch/pytorch/issues/134644
We assume trace_tangents have the same memory_format as the corresponding inputs, outputs, and intermediates during the first tracing. As a result:
Tracing time:
- Store trace_tangents_memory_formats in metadata
- Coerce tangents to deduced memory_format
Runtime:
- Coerce tangents to tracing memory format from metadata
Subclasses logic:
- Previously, the tangent-coercion logic did not handle the nested-subclasses case; this is fixed here.
For Subclasses we deduce memory format for subclass_tensor first, then for each element of subclass:
[subclass_tensor_memory_format, subclass_tensor_elem0_memory_format, ... ]
If a subclass element (one of the `__tensor_flatten__()[0]` tensors) is itself a subclass, then in its place we will have a nested list of the same structure.
The recursive traversal of subclass tree is expensive. So we do memory format deduction and coercing at the same time, to keep only one traverse for this. With this approach there is no regression in comparison with previous logic which also does one traversal. (`coerce_tangent_and_suggest_memory_format` method).
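A minimal sketch of the per-tensor step (the real `coerce_tangent_and_suggest_memory_format` also recurses into subclass components in the same traversal):
```
import torch
from torch._prims_common import suggest_memory_format

def coerce_tangent(t: torch.Tensor):
    # Deduce the memory format once at trace time, coerce the tangent to it,
    # and return the format so it can be stored in the metadata for runtime coercion.
    fmt = suggest_memory_format(t)
    return t.contiguous(memory_format=fmt), fmt
```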
Other small change:
Remove a duplicated, unrelated comment.
Testing
```
python test/functorch/test_aotdispatch.py -k test_channels_last_grads_no_force_contiguous
```
Benchmarking:
After change:
```
└─ $ PYTORCH_AOTD_DEBUG_PROFILE=1 python test/functorch/test_aotdispatch.py -k test_benchmark_grads_no_force_contiguous
Benchmark SUBCLASS avg_bwd_duration:4.059906005859375 ms
Benchmark NO_SUBCLASS avg_bwd_duration:3.1563830375671387 ms
```
Before change:
```
BEFORE_CHANGE SUBCLASS 4.1194
```
No significant changes in processing time.
(We do single traverse of subclass tree for collecting memory_formats and coercing during tracing.)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135225
Approved by: https://github.com/bdhirsh
Fixes the max-autotune failure of `soft_actor_critic` of Torchbench in FP32 single thread dynamic shape case:
```log
File "/home/user/inductor/pytorch/torch/_inductor/codegen/cpp_micro_gemm.py", line 136, in codegen_call
C_ptr = f"&({kernel.index(C, [0, 0])})"
File "/home/user/inductor/pytorch/torch/_inductor/codegen/cpp_template_kernel.py", line 135, in index
else self.args.input(node.get_name())
File "/home/user/inductor/pytorch/torch/_inductor/codegen/common.py", line 1251, in input
assert name not in V.graph.removed_buffers, name
AssertionError: buf_GemmOut
```
The 1st and 2nd linears do not need to use a local buffer, while the 3rd linear does.
The 3rd linear, which uses a local buffer, adds its global buffer (named `buf_GemmOut`) into `V.graph.removed_buffers`.
When scheduling the nodes, the 1st linear (which won't use a local buffer) gets its output buffer (also named `buf_GemmOut`) from the input, finds that it's in `V.graph.removed_buffers`, and raises an AssertionError. The issue is that the output buffers of all these linears are named `buf_GemmOut`, which conflicts.
Rename these buffers by adding the name of the `template_buffer` as a prefix.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136419
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5
ghstack dependencies: #136418, #136518
Cleanup:
1/ We do not need to unwrap_subclasses() in the freezing wrapper, as it will be wrapped by AOTD wrappers, which include SubclassesWrapper.
2/ No need to use weak references for the unwrapped list; dynamo optimizers need to clean the unwrapped list along with the original params_flat.
Verified fbcode tests compiled_optimizers.
Differential Revision: [D63393651](https://our.internmc.facebook.com/intern/diff/D63393651)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136549
Approved by: https://github.com/bdhirsh
Fixes #135439
This PR adds support for the `is_inference` method on torch tensors which successfully compiles the following example fn without graph breaks:
```python
def fn_simple(x):
if x.is_inference():
return x.sum()
else:
return x.min()
```
I've also tried to add guards on the tensor to guard against `is_inference`. I wasn't 100% sure where these should go so please don't hesitate to correct me.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136450
Approved by: https://github.com/ezyang
Related issue: #125077
### Feature
Inductor tries to remove dimensions with stride 0 from block pointers. Rather than loading with stride 0, it's more efficient to load a smaller block pointer, then use `tl.broadcast_to` to broadcast it up to the desired size. This already worked for simpler block pointers, but it was disabled for more complex block pointers which used `tl.reshape` to change the dimensionality after loading.
This PR generalizes the approach to work for all block pointers. The idea is to first reshape, adding singleton dimensions, then broadcast those singletons up to something larger, then reshape again to the final output shape. For readability, we emit this code only if it actually does something. Simpler loads will just have `tl.load`.
Here's an example of a complicated kernel that uses `reshape` -> `load` -> `reshape`. (The first reshape is actually the slice `[None,None,:]`).
```
@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 8)
tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[64], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), boundary_check=[0])
tmp1 = tl.reshape(tl.broadcast_to(tl.load(tl.make_block_ptr(in_ptr1, shape=[8], strides=[8], block_shape=[((7 + XBLOCK) // 8)], order=[0], offsets=[(xoffset // 8)]), boundary_check=[0], eviction_policy='evict_last')[:, None, None], [((7 + XBLOCK) // 8), ((1) * ((1) <= (((7 + XBLOCK) // 8))) + (((7 + XBLOCK) // 8)) * ((((7 + XBLOCK) // 8)) < (1))), ((8) * ((8) <= (XBLOCK)) + (XBLOCK) * ((XBLOCK) < (8)))]), [XBLOCK])
tmp2 = tmp0 + tmp1
tl.store(tl.make_block_ptr(out_ptr0, shape=[64], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), tmp2.to(tl.float32), boundary_check=[0])
''', device_str='cuda')
```
Before this PR, we would have stride-0 dimensions:
```
@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 64
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x2 = xindex
x1 = (xindex // 8)
tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[64], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), boundary_check=[0])
tmp1 = tl.reshape(tl.load(tl.make_block_ptr(in_ptr1, shape=[8, 1, 8], strides=[8, 0, 0], block_shape=[((7 + XBLOCK) // 8), ((1) * ((1) <= (((7 + XBLOCK) // 8))) + (((7 + XBLOCK) // 8)) * ((((7 + XBLOCK) // 8)) < (1))), ((8) * ((8) <= (XBLOCK)) + (XBLOCK) * ((XBLOCK) < (8)))], order=[2, 1, 0], offsets=[(xoffset // 8), 0, xoffset % 8]), boundary_check=[0], eviction_policy='evict_last'), [XBLOCK])
tmp2 = tmp0 + tmp1
tl.store(tl.make_block_ptr(out_ptr0, shape=[64], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), tl.broadcast_to(tmp2, [XBLOCK]).to(tl.float32), boundary_check=[0])
''', device_str='cuda')
```
Here's a simpler example where we use 2D tiling. In this case we don't actually need the broadcast. The broadcast is implied via a slice adding a new singleton dimension. This code is not changed by this PR, but it's important to know that we don't accidentally insert unnecessary broadcasts.
```
@triton.jit
def triton_(in_ptr0, in_ptr1, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
ynumel = 8
xnumel = 8
yoffset = tl.program_id(1) * YBLOCK
yindex = yoffset + tl.arange(0, YBLOCK)[None, :]
ymask = yindex < ynumel
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
xmask = xindex < xnumel
x1 = xindex
y0 = yindex
tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[8, 8], strides=[1, 8], block_shape=[XBLOCK, YBLOCK], order=[1, 0], offsets=[xoffset, yoffset]), boundary_check=[0, 1])
tmp1 = tl.load(tl.make_block_ptr(in_ptr1, shape=[8], strides=[8], block_shape=[YBLOCK], order=[0], offsets=[yoffset]), boundary_check=[0], eviction_policy='evict_last')[None, :]
tmp2 = tmp0 + tmp1
tl.store(tl.make_block_ptr(out_ptr0, shape=[8, 8], strides=[1, 8], block_shape=[XBLOCK, YBLOCK], order=[1, 0], offsets=[xoffset, yoffset]), tmp2.to(tl.float32), boundary_check=[0, 1])
''', device_str='cuda')
```
### Test Plan
Added a new expecttest to check the emitted code for broadcast addition. Looking at the test, we can see that stride 0 dimensions are removed. (This test generated the example kernels in the previous section.)
This change also removed a stride-0 dimension in an existing block pointer test. I updated the expected code accordingly.
Bonus: I noticed that the test parametrization for `config.prefer_nd_tiling` wasn't working as intended. It ended up always setting this option to `True`. Fixed it so we get the intended test coverage.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135557
Approved by: https://github.com/shunting314, https://github.com/jansel
Co-authored-by: Yueming Hao <yhao@meta.com>
Summary: We have an internal report of a Triton compiler error `ValueError: Cannot broadcast, rank mismatch: [1], [1, 2048]` coming from a line like this:
`tmp25 = tl.broadcast_to(((tl.full([1], 1.00000000000000, tl.float64)) + ((ks0 // 3278).to(tl.float64))) / (((tl.full([1], 0.500000000000000, tl.float64))*(libdevice.sqrt((1 + ((ks0 // 3278)*(ks0 // 3278)) + ((-2)*(ks0 // 3278))).to(tl.float64).to(tl.float32)))) + ((tl.full([1], 0.500000000000000, tl.float64))*((1 + (ks0 // 3278)).to(tl.float64)))), [XBLOCK, RBLOCK])`
https://github.com/pytorch/pytorch/pull/135260 is the cause, presumably because we turn a constant into a 1-element tensor with: `(tl.full([1], const, tl.float64))`. It looks like changing the syntax to `(tl.full([], const, tl.float64))` gives us what we want?
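A hedged illustration of why the spelling matters (the kernel below is illustrative, not Inductor-generated code): a rank-0 `tl.full([], ...)` value broadcasts to any shape, while the rank-1 `tl.full([1], ...)` value trips the rank check in `tl.broadcast_to`, producing exactly the error quoted above.
```python
import triton
import triton.language as tl

@triton.jit
def _broadcast_demo(out_ptr, XBLOCK: tl.constexpr, RBLOCK: tl.constexpr):
    idx = tl.arange(0, XBLOCK)[:, None] * RBLOCK + tl.arange(0, RBLOCK)[None, :]
    # Rank-0 constant: broadcasts cleanly to [XBLOCK, RBLOCK].
    ok = tl.broadcast_to(tl.full([], 1.0, tl.float32), [XBLOCK, RBLOCK])
    # Rank-1 constant: raises "Cannot broadcast, rank mismatch: [1], [XBLOCK, RBLOCK]".
    # bad = tl.broadcast_to(tl.full([1], 1.0, tl.float32), [XBLOCK, RBLOCK])
    tl.store(out_ptr + idx, ok)
```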
Differential Revision: [D63465169](https://our.internmc.facebook.com/intern/diff/D63465169)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136594
Approved by: https://github.com/mengluy0125, https://github.com/jansel
Summary:
We skip `save_gpu_kernel` if the kernel has already been saved.
This gives us a more accurate Triton profiling result. The following traces show before/after the change for benchmarking a trivial addmm:
Before:
<img width="1255" alt="Screenshot 2024-09-23 at 10 26 53 AM" src="https://github.com/user-attachments/assets/5aea05ef-6ef0-464c-8da9-17b31c97b43a">
After:
<img width="910" alt="Screenshot 2024-09-23 at 10 27 03 AM" src="https://github.com/user-attachments/assets/488b7d4f-268f-41cf-8553-cb16ceeae118">
We can see that before the change, the benchmarking includes two parts:
(1) The overhead of our triton_heuristic call, which includes the save/get and the (expensive) hash computation.
(2) The actual computation of the Triton kernel.
We see that (1) accounts for >50% of the time, which makes kernel selection during profiling often choose aten kernels over Triton kernels.
Test Plan:
Existing OSS CI
[Redacted, Some internal model results in D63441430]
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136389
Approved by: https://github.com/desertfire
Summary:
Re-enable the `test_triton_wrapper.py` test
# Why
We want this to run internally
# What
- fix python path issue on the test
- reenable the test
# Background
It appears that the parent process does not pass its entire module search path down to the child process. Namely, if some setup makes the parent's effective `sys.path` differ from, say, PYTHONPATH, the child will not inherit that setup. To avoid needing to keep track of specific setups, we pass the parent's effective `sys.path` to the child through the PYTHONPATH environment variable.
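Roughly, the approach amounts to something like the following sketch (the actual test harness code may differ):
```python
import os
import subprocess
import sys

# Forward the parent's effective sys.path to the child via PYTHONPATH so the
# child sees the same import setup, regardless of how the parent was launched.
env = dict(os.environ)
env["PYTHONPATH"] = os.pathsep.join(sys.path)
subprocess.run(
    [sys.executable, "-m", "pytest", "test/inductor/test_triton_wrapper.py"],
    env=env,
    check=True,
)
```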
Test Plan: buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:triton_wrapper
Differential Revision: D63438186
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136721
Approved by: https://github.com/henrylhtsang
## Motivation
The FSDP common code for FSDP UT execution is mostly written with the CUDA device in mind. However, other devices such as the Intel Gaudi support most of the functionality. We are generalizing the base content so that the UT content can be used for non-CUDA device execution.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133209
Approved by: https://github.com/kwen2501
Move `get-job-id` steps before running the tests and copy-n-paste environment variables from `_mac-test.yml` added in https://github.com/pytorch/pytorch/pull/113099
Should fix the following warning during MPS test run:
```
/Users/ec2-user/runner/_work/pytorch/pytorch/tools/stats/upload_metrics.py:147: UserWarning: Not emitting metrics for td_test_failure_stats_v2. Missing job_id. Please set the JOB_ID environment variable to pass in this value.
warn(f"Not emitting metrics for {metric_name}. {e}")
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136791
Approved by: https://github.com/albanD, https://github.com/izaitsevfb
PyTorch community members have reported issues with building PyTorch from source for ROCm in an environment that doesn't have aotriton pre-installed, because aotriton is only installed in the [CI](a8ed873ba2/.ci/docker/manywheel/Dockerfile (L197)) docker images. Building aotriton from source can take ~45 minutes.
This PR fixes the issue by downloading the aotriton tarball in such scenarios, *unless the user explicitly wants to build aotriton from source using the AOTRITON_INSTALL_FROM_SOURCE=1 env var*
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136603
Approved by: https://github.com/atalman
Co-authored-by: Xinya Zhang <Xinya.Zhang@amd.com>
Summary:
With empty graphs, the `graph.inserting_before(first_user_input = None)` call turns into a `graph.inserting_after(root)` call, inverting the order of constant input nodes being inserted.
This fixes the issue by initializing the insertion point to the first node in the graph (still valid if it is not a user input, since it is only used for insertion).
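A hedged sketch of the fix (variable and attribute names are illustrative, not the exact unflattening code):
```python
import torch

def insertion_point(gm: torch.fx.GraphModule, user_input_names: set):
    first_user_input = None
    for node in gm.graph.nodes:
        if node.op == "placeholder" and node.name in user_input_names:
            first_user_input = node
            break
    if first_user_input is None:
        # Previously this fell through to graph.inserting_after(root), which
        # reversed the order of the inserted constant-input nodes. Fall back
        # to the first node instead; it is only used as an insertion anchor,
        # so it does not need to be a user input.
        first_user_input = next(iter(gm.graph.nodes))
    return gm.graph.inserting_before(first_user_input)
```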
Test Plan: test_export
Differential Revision: D63403514
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136658
Approved by: https://github.com/avikchaudhuri
This file hasn't had an overhaul in a few years, so this is long overdue. Most of the credit goes to @orionr for gathering all of this info.
The main rules we followed:
- No code contributor is removed, they're all placed as emeritus
- Break down categories that are too big, to make this document useful for knowing who to ping
- No category whose code is still in the codebase is removed
- We did not rework the categories (for example to be closer to module: labels), leaving that for later
- All non-emeritus names are ordered by their number of comments on issues related to their topic
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136672
Approved by: https://github.com/eqy, https://github.com/ezyang, https://github.com/seemethere, https://github.com/malfet
Not sure why `isinf` is a composite op, but it needs to be implemented by hand.
Implementation is a trivial call to
```objc
[mpsGraph equalWithPrimaryTensor:input
secondaryTensor:[mpsGraph constantWithScalar:std::numeric_limits<T>::infinity()
dataType:input.dataType]]
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136689
Approved by: https://github.com/Skylion007
Prior to this PR, calling `reshape()` under `inference_mode()` would throw a `NotImplementedError`. This is because `inference_mode()` disables autograd key dispatch, incidentally preventing the decomposition of reshape for NJT.
This PR fixes this by redispatching on the `CompositeImplicitAutogradNestedTensor` key whenever a composite implicit op is encountered in `NJT.__torch_dispatch__()`. This fixes reshape and any other composite implicit ops underneath `inference_mode()`.
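A hedged usage sketch of what now works (the exact reshape arguments for the ragged dimension may differ):
```python
import torch

njt = torch.nested.nested_tensor(
    [torch.randn(3, 4), torch.randn(5, 4)], layout=torch.jagged
)
with torch.inference_mode():
    # Previously raised NotImplementedError; now decomposes via the
    # CompositeImplicitAutogradNestedTensor key.
    out = njt.reshape(njt.size(0), -1, 2, 2)
```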
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134683
Approved by: https://github.com/soulitzer, https://github.com/albanD
ghstack dependencies: #136566
Fixes #136565
This PR makes the python fallback robust to the case where there are no active modes & no tensors with the Python key. In this case, simply redispatch with the Python key disabled.
This was found when trying to use reentrant dispatch for NJT to get decompositions under `inference_mode()` when the autograd key is disabled.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136566
Approved by: https://github.com/bdhirsh
**Summary**
Optimize the WOQ int8 AMX performance by changing the int8 -> bf16 conversion.
Earlier, 16 int8 elements were being loaded at a time & converted to 16 BF16 elements.
With this change, 32 int8 elements will be loaded at a time, and converted to a cache-line of 32 BF16 elements more efficiently.
Performance before
```
AUTOTUNE _weight_int8pack_mm(4096x4096, 4096x4096, 4096)
cpp_packed_gemm_0 38.0439 ms 100.0%
_weight_int8pack_mm 50.2524 ms 75.7%
SingleProcess AUTOTUNE benchmarking takes 1.1087 seconds and 1.9791 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x4096, 11008x4096, 11008)
cpp_packed_gemm_4 78.2038 ms 100.0%
_weight_int8pack_mm 119.1962 ms 65.6%
SingleProcess AUTOTUNE benchmarking takes 1.9274 seconds and 1.9949 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x11008, 4096x11008, 4096)
cpp_packed_gemm_6 79.2368 ms 100.0%
_weight_int8pack_mm 118.3212 ms 67.0%
SingleProcess AUTOTUNE benchmarking takes 1.9200 seconds and 2.0015 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x4096, 32000x4096, 32000)
cpp_packed_gemm_224 225.7201 ms 100.0%
_weight_int8pack_mm 388.5588 ms 58.1%
```
Performance after this PR
```
AUTOTUNE _weight_int8pack_mm(4096x4096, 4096x4096, 4096)
cpp_packed_gemm_0 11.0086 ms 100.0%
_weight_int8pack_mm 50.2918 ms 21.9%
SingleProcess AUTOTUNE benchmarking takes 1.0837 seconds and 2.0301 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x4096, 11008x4096, 11008)
cpp_packed_gemm_4 24.3528 ms 100.0%
_weight_int8pack_mm 119.8492 ms 20.3%
SingleProcess AUTOTUNE benchmarking takes 1.8303 seconds and 1.8195 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x11008, 4096x11008, 4096)
cpp_packed_gemm_6 24.6148 ms 100.0%
_weight_int8pack_mm 119.1908 ms 20.7%
SingleProcess AUTOTUNE benchmarking takes 1.8315 seconds and 1.8352 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x4096, 32000x4096, 32000)
cpp_packed_gemm_224 78.1369 ms 100.0%
_weight_int8pack_mm 387.6289 ms 20.2%
SingleProcess AUTOTUNE benchmarking takes 4.5059 seconds and 1.8010 seconds precompiling
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136630
Approved by: https://github.com/jgong5
ghstack dependencies: #136353
Summary:
We have a user report on the BA model that it raised "AttributeError: 'SymFloat' object has no attribute 'shape'", so we add a type check for the meta node.
See more context in the post
https://fb.workplace.com/groups/1075192433118967/permalink/1510477489590457/
Test Plan:
# local reproduce
```
CUDA_VISIBLE_DEVICES=3 OC_CAUSE=1 buck2 run mode/opt //scripts/jackiexu0313/pt2:local_model_with_pt2 -- --test_mode split-batch-decompose --flow_id 646303196
```
P1609807876
# E2E
before fix
f646303196
after fix
Differential Revision: D63399959
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136650
Approved by: https://github.com/ezyang
Fixes #133683, #133684, #133688
This PR introduces a new base class `_ArglessActivation` and refactors five existing activation functions to inherit from it. This change aims to improve documentation consistency and also API consistency with other activation functions that do have parameters and explicitly call `super().__init__()`
Key changes and considerations:
1. Added a new class `_ArglessActivation` (a minimal sketch is shown after this list)
2. Refactored the following classes to inherit from `_ArglessActivation`:
- Sigmoid
- Tanh
- Softsign
- Tanhshrink
- Softmax2d
3. Performance consideration:
- This change introduces a slight overhead for creating a new stack frame and handling an additional function call on every instance creation
- The impact is expected to be minimal in most use cases
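A minimal sketch of the idea (the real classes live in torch.nn; details such as docstrings are omitted here):
```python
import torch
from torch import nn
import torch.nn.functional as F

class _ArglessActivation(nn.Module):
    """Base class for activation modules whose constructor takes no arguments."""

    def __init__(self) -> None:
        super().__init__()

class Softsign(_ArglessActivation):
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return F.softsign(input)
```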
Docs view before:
<img width="425" alt="Screen Shot 2024-09-18 at 3 00 22 PM" src="https://github.com/user-attachments/assets/ca0d1000-44c5-4c52-b344-68f7e170bafe">
Docs view after:
<img width="431" alt="Screen Shot 2024-09-18 at 3 00 52 PM" src="https://github.com/user-attachments/assets/f7ceb8f3-a2a2-4fd6-a2b8-39105a02bcbd">
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136296
Approved by: https://github.com/mikaylagawarecki
Fixes https://github.com/pytorch/pytorch/issues/136177
The motivation is that torch::deploy doesn't handle this well. The
workaround for users is to use C++ custom ops.
All torch.library APIs ultimately go through the torch.library.Library
object, so we add checks there to make them no-ops under torch::deploy.
Test Plan:
- new test
- going to test this internally and hope nothing breaks.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136645
Approved by: https://github.com/ezyang
Fix two more leaks of the same variety as #136507 (see that PR desc and attached gdoc for debug details).
This time, also add a test-time check that helped discover new leaks and ensures we won't accidentally regress.
Adds a `check_tensor_leak` util which internally asserts that no tensors are being kept alive by other objects involved in Python reference cycles.
Uses objgraph as a nice debug utility when a leak is found.
Credit to @H-Huang for pointing out objgraph and helping debug the `param_group["intermediates"]` leak.
I manually confirmed that all 3 of the leaks identified/fixed so far are caught by the unit test and checker.
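For reference, a minimal sketch of the kind of check described above (assuming objgraph is installed; the real `check_tensor_leak` util in common_utils.py may differ):
```python
import gc
import warnings

import torch

def check_tensor_leak(max_graphs: int = 1) -> None:
    gc.set_debug(gc.DEBUG_SAVEALL)  # keep collected cycles around in gc.garbage
    try:
        gc.collect()
        leaked = [o for o in gc.garbage if isinstance(o, torch.Tensor)]
        if leaked:
            warnings.warn(
                f"{len(leaked)} tensors were found in the garbage. "
                "Did you introduce a reference cycle?"
            )
            try:
                import objgraph
                for t in leaked[:max_graphs]:
                    objgraph.show_backrefs([t], max_depth=4)  # renders a .dot/.png
            except ImportError:
                pass
    finally:
        gc.set_debug(0)
```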
Sample output, if I re-introduce a leak by commenting out `del param_group["intermediates"]` in _backward.py,
and run `python test/distributed/pipelining/test_schedule_multiproc.py -k test_schedule_with_native_zero_bubble`:
```
warnings.warn(
/data/users/whc/pytorch/torch/testing/_internal/common_utils.py:5341: UserWarning: 34 tensors were found in the garbage. Did you introduce a reference cycle?
warnings.warn(
/data/users/whc/pytorch/torch/testing/_internal/common_utils.py:5347: UserWarning: Dumping first 1 objgraphs of leaked tensors rendered to png
Graph written to /tmp/objgraph-ztz642h3.dot (19 nodes)
Graph viewer (xdot) not found, generating a png instead
Image generated as /tmp/objgraph-ztz642h3.png
```
rendering of ` /tmp/objgraph-ztz642h3.png`:
<img width="1671" alt="image" src="https://github.com/user-attachments/assets/9098ff29-224c-4533-935b-83c210ac2e22">
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136584
Approved by: https://github.com/kwen2501, https://github.com/H-Huang
ghstack dependencies: #136507
Co-authored-by: Howard Huang <howardhuang@fb.com>
Fixes#131701
Use CMake imported targets more consistently to eliminate hardcoded paths.
Here is the new relevant sections of Caffe2Targets.cmake:
```
set_target_properties(c10_hip PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
INTERFACE_LINK_LIBRARIES "c10;hip::amdhip64"
)
```
```
set_target_properties(torch_hip PROPERTIES
INTERFACE_COMPILE_DEFINITIONS "USE_C10D_NCCL"
INTERFACE_COMPILE_OPTIONS "-fPIC;-D__HIP_PLATFORM_AMD__=1;-DCUDA_HAS_FP16=1;-DUSE_ROCM;-D__HIP_NO_HALF_OPERATORS__=1;-D__HIP_NO_HALF_CONVERSIONS__=1;-DTORCH_HIP_VERSION=602;-Wno-shift-count-negative;-Wno-shift-count-overflow;-Wno-duplicate-decl-specifier;-DCAFFE2_USE_MIOPEN;-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP;-std=c++17;-DHIPBLAS_V2;-DHIP_NEW_TYPE_ENUMS"
INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
INTERFACE_LINK_LIBRARIES "c10_hip;torch_cpu_library;hip::amdhip64;MIOpen;hiprtc::hiprtc;roc::hipblaslt;roc::hipblas;hip::hipfft;hip::hiprand;roc::hipsparse;roc::hipsolver"
)
```
The HIPCUB dependency was not actually used, which is why it is removed here; the imported target had undesirable side effects.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136283
Approved by: https://github.com/jeffdaily, https://github.com/Skylion007, https://github.com/jithunnair-amd, https://github.com/atalman
This reverts commit 7743149b2be4a9eba7e0997ccdc6abe552bec266.
Reverts
* https://github.com/pytorch/pytorch/pull/135503
* https://github.com/pytorch/pytorch/pull/135502
* https://github.com/pytorch/pytorch/pull/135422
This makes the test below pass. Earlier, the getitem would stay a getitem in the FX graph, but now fake tensor propagation fails, saying that `.item` is called. It seems that the torch function is not getting triggered during fake tensor propagation.
```
import torch
from torch.nn.attention.flex_attention import BlockMask, _mask_mod_signature, _score_mod_signature, flex_attention
from torch._inductor.lowering import make_pointwise, register_lowering
from torch._inductor.virtualized import ops
from torch.nn.attention.flex_attention import create_block_mask
torch.set_default_device('cuda')
flex_attention = torch.compile(flex_attention, dynamic=False)
prefix_lengths = torch.arange(8)
def prefix_lm(b, h, q, kv):
return prefix_lengths[b] >= kv
mask = create_block_mask(prefix_lm, 8, None, 512, 512, _compile=True)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136590
Approved by: https://github.com/Chillee
Summary: If you actually import the module, you might end up with some import cycle situation where a module is imported too early and accesses things that are not initialized yet.
Test Plan:
sandcastle and ossci
```
TORCH_LOGS=+torch._inductor.codecache buck run mode/opt caffe2/benchmarks/dynamo:torchbench
```
Differential Revision: D63330224
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136548
Approved by: https://github.com/Skylion007
Summary: Previously the `_inline_module` helper function only worked with submodules that had args specified. This diff updates the util function to look up input arguments from submodule kwargs first, using placeholder node names, then fall back to the list of args if the node name is not found.
Test Plan:
```
buck2 run @//mode/{opt,mtia,inplace} //glow/fb/fx/fba/tests:test_fba_inductor -- -r test_connected_fusions
```
Differential Revision: D63347675
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136631
Approved by: https://github.com/jfix71
AMD devices have 64 elements per warp; this PR makes the handling of "ELEMENTS_PER_WARP_32" generic and uses DeviceProperties.warp_size to determine the warp size instead of hard-coding it as 32. It also renames the enum value. Added a unit test for this.
Note: I left the old enum option (ELEMENTS_PER_WARP_32) as is instead of renaming it. I'm not sure whether we expect caches to get invalidated here; if this concern is valid, then there's a risk that the enum would get updated while some model still uses cached inductor code that references "ELEMENTS_PER_WARP_32", which would no longer exist.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136472
Approved by: https://github.com/jansel
Summary: Title
Test Plan: CI
This fixes some breaking tests in executorch. I think the root cause is that when we have aten::matmul, which we are not preserving, we register the meta implementation from the C++ side. It seems like the C++ kernel doesn't work well with a mix of FakeTensor and real tensors. This PR sidesteps the problem by always preferring the Python CIA decomp over the C++ CIA decomp.
Differential Revision: D63297050
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136492
Approved by: https://github.com/bdhirsh
Summary: Previously we had a very bad bug where we didn't allow any decomp on CIA. This never mattered before because we never had to actually push CIA decomps to the Python key level in export.
Test Plan: CI
Differential Revision: D63363749
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136600
Approved by: https://github.com/bdhirsh
Fixes#136504
If you have a tl.constexpr parameter to a triton kernel, and you pass in a SymNode, then, right now, you run into failures (see under 'constants'):
```
File "/tmp/torchinductor_dberard/na/cnax67r5zmslz7bvdfizteaepj7fajpjallb3bu2gyetjcdqtbzj.py", line 14, in <module>
triton_meta={'signature': {0: '*fp32', 1: '*fp32'}, 'device': DeviceProperties(type='cuda', index=0, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=132, warp_size=32), 'constants': {2: s0, 3: 256}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1), equal_to_1=())]},
torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised:
NameError: name 's0' is not defined
```
To fix this, we specialize on the value during dynamo tracing, so that we have a real integer when we do codegen.
Alternatives: specialize somewhere else (e.g. inductor); or figure out how to actually pass the value dynamically into the user-written kernel. However, if we try to pass a dynamic value, then we wouldn't be able to precompile the triton kernels in inductor or use AOTI.
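For context, a hedged repro sketch of the scenario (kernel and function names are illustrative, not the original report):
```python
import torch
import triton
import triton.language as tl

@triton.jit
def fill_kernel(out_ptr, VALUE: tl.constexpr, BLOCK: tl.constexpr):
    offsets = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    tl.store(out_ptr + offsets, tl.full([BLOCK], VALUE, tl.float32))

@torch.compile(dynamic=True)
def f(x):
    out = torch.empty_like(x)
    # x.size(0) is a SymInt during tracing; previously it leaked into the
    # generated triton_meta 'constants' as an undefined symbol (e.g. s0).
    fill_kernel[(x.numel() // 256,)](out, VALUE=x.size(0), BLOCK=256)
    return out

f(torch.zeros(1024, device="cuda"))
```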
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136512
Approved by: https://github.com/oulgen, https://github.com/jansel, https://github.com/eellison
The test is failing in trunk atm with the following error:
```
test_serialization.py::TestSerialization::test_skip_data_serialization_materialize_fake_False - AssertionError: "Can't pickle local object 'WeakValueDictionary.__init__.<locals>.remove'" does not match "Can't get local object 'WeakValueDictionary.__init__.<locals>.remove'"
```
for example, 36f0e61166
This comes from this cpython commit a3076c734d, and manifests in python 3.12.5 currently used in CI. The failure doesn't happen when I try it out with 3.12.3 and 3.12.4. Looking at the commit logs of https://github.com/python/cpython/commits/main/Lib/pickle.py, it looks like the exception message is changing back and forth, so I guess a regex match would capture both.
Fixes the compilation error of max-autotune for `maml_omniglot` (AMP and FP32) and `soft_actor_critic` (AMP) in Torchbench for single-thread dynamic shapes case:
```
/tmp/torchinductor_user/uv/cuvq6wenwp7us423onuvntkfx4cspmagha5beiknob7tiebzhupa.cpp: In function ‘void kernel(const bfloat16*, const bfloat16*, const bfloat16*, bfloat16*, int64_t)’:
/tmp/torchinductor_user/uv/cuvq6wenwp7us423onuvntkfx4cspmagha5beiknob7tiebzhupa.cpp:279:41: error: the value of ‘Mr_blocks’ is not usable in a constant expression
279 | constexpr int64_t m_block_end = Mr_blocks;
| ^~~~~~~~~
/tmp/torchinductor_user/uv/cuvq6wenwp7us423onuvntkfx4cspmagha5beiknob7tiebzhupa.cpp:237:19: note: ‘Mr_blocks’ was not initialized with a constant expression
237 | const int64_t Mr_blocks = (M + Mr - 1) / Mr;
| ^~~~~~~~~
```
The PR also updates the UT to add a test for `BS`=512 in single thread.
The previous case has `BS`=1024, equal to the `K` and `N` values. The generated code does not have symbolic shapes and thus fails to capture the above issue.
By adding a case of `BS`=512, the generated code will have symbolic shape for the M dim and is able to reproduce the issue that this PR is addressing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136418
Approved by: https://github.com/jgong5
https://github.com/pytorch/pytorch/pull/136087 updated pybind11 to 2.13.6, and that new release adds [a new function](https://pybind11.readthedocs.io/en/latest/changelog.html#version-2-13-6-september-13-2024) `_pybind11_conduit_v1_`. The presence of this function breaks the serialization mechanisms used by Triton and by PyTorch itself.
Possible errors that have been noticed due to this change:
<details>
<summary> the first error </summary>
```bash
_________ KernelTests.test_layout_constraint_needs_fixed_stride_order __________
Traceback (most recent call last):
File "/runner/_work/intel-xpu-backend-for-triton/intel-xpu-backend-for-triton/pytorch/test/inductor/test_triton_kernels.py", line 1072, in test_layout_constraint_needs_fixed_stride_order
eager_out = f(x)
File "/runner/_work/intel-xpu-backend-for-triton/intel-xpu-backend-for-triton/pytorch/test/inductor/test_triton_kernels.py", line 1068, in f
arange_out(x, y)
File "/runner/_work/intel-xpu-backend-for-triton/intel-xpu-backend-for-triton/pytorch/test/inductor/test_triton_kernels.py", line 1059, in arange_out
kernel[grid](x, out, n_elements, BLOCK_SIZE=4)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/triton/runtime/jit.py", line 330, in <lambda>
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/triton/runtime/jit.py", line 657, in run
kernel = self.compile(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/triton/compiler/compiler.py", line 315, in compile
metadata_group[metadata_filename] = fn_cache_manager.put(json.dumps(metadata, default=vars), metadata_filename,
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/json/__init__.py", line 234, in dumps
return cls(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/json/encoder.py", line 199, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/json/encoder.py", line 257, in iterencode
return _iterencode(o, 0)
TypeError: vars() argument must have __dict__ attribute
```
</details>
<details>
<summary> the second error </summary>
```bash
________________ TestTritonWrapper.test_wrapper_using_gpu_seed _________________
Traceback (most recent call last):
File "/cache/pytorch-c5e9d03a2da4b93481737594cbe2f5931fa569aa833f206a638189cad2c36d3c-11/test/inductor/test_triton_wrapper.py", line 40, in test_wrapper_using_gpu_seed
out = f(x, y)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/eval_frame.py", line 465, in _fn
return fn(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 1292, in __call__
return self._torchdynamo_orig_callable(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 1087, in __call__
result = self._inner_convert(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 530, in __call__
return _compile(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 933, in _compile
guarded_code = compile_inner(code, one_graph, hooks, transform)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 675, in compile_inner
return _compile_inner(code, one_graph, hooks, transform)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_utils_internal.py", line 87, in wrapper_function
return function(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 708, in _compile_inner
out_code = transform_code_object(code, transform)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/bytecode_transformation.py", line 1322, in transform_code_object
transformations(instructions, code_options)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 220, in _fn
return fn(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 643, in transform
tracer.run()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 2776, in run
super().run()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 979, in run
while self.step():
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 891, in step
self.dispatch_table[inst.opcode](self, inst)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 2967, in RETURN_VALUE
self._return(inst)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 2952, in _return
self.output.compile_subgraph(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1117, in compile_subgraph
self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1369, in compile_and_call_fx_graph
compiled_fn = self.call_user_compiler(gm)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1416, in call_user_compiler
return self._call_user_compiler(gm)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1465, in _call_user_compiler
raise BackendCompilerFailed(self.compiler_fn, e).with_traceback(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1446, in _call_user_compiler
compiled_fn = compiler_fn(gm, self.example_inputs())
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/repro/after_dynamo.py", line 130, in __call__
compiled_gm = compiler_fn(gm, example_inputs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/__init__.py", line 2235, in __call__
return compile_fx(model_, inputs_, config_patches=self.config)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 1528, in compile_fx
return aot_autograd(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/backends/common.py", line 72, in __call__
cg = aot_module_simplified(gm, example_inputs, **self.kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 1071, in aot_module_simplified
compiled_fn = dispatch_and_compile()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 1056, in dispatch_and_compile
compiled_fn, _ = create_aot_dispatcher_function(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 522, in create_aot_dispatcher_function
return _create_aot_dispatcher_function(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 759, in _create_aot_dispatcher_function
compiled_fn, fw_metadata = compiler_fn(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py", line 179, in aot_dispatch_base
compiled_fw = compiler(fw_module, updated_flat_args)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 1357, in fw_compiler_base
return _fw_compiler_base(model, example_inputs, is_inference)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 1428, in _fw_compiler_base
return inner_compile(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 479, in compile_fx_inner
return wrap_compiler_debug(_compile_fx_inner, compiler_name="inductor")(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/repro/after_aot.py", line 85, in debug_wrapper
inner_compiled_fn = compiler_fn(gm, example_inputs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 665, in _compile_fx_inner
compiled_graph = FxGraphCache.load(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 1341, in load
compiled_graph = compile_fx_fn(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 574, in codegen_and_compile
compiled_graph = fx_codegen_and_compile(gm, example_inputs, **fx_kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 882, in fx_codegen_and_compile
compiled_fn = graph.compile_to_fn()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/graph.py", line 1952, in compile_to_fn
return self.compile_to_module().call
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/graph.py", line 1878, in compile_to_module
return self._compile_to_module()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/graph.py", line 1906, in _compile_to_module
mod = PyCodeCache.load_by_key_path(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 2866, in load_by_key_path
mod = _reload_python_module(key, path)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/runtime/compile_tasks.py", line 45, in _reload_python_module
exec(code, mod.__dict__, mod.__dict__)
File "/tmp/tmps59zkbew/kg/ckgkb4gt5fs5pll4o7fqawppsmdezu5h52cq6nmrvi3yy6j7ddq4.py", line 45, in <module>
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/async_compile.py", line 198, in triton
kernel = TritonCodeCache.load(kernel_name, source_code)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 2916, in load
return _module_to_triton_kernel(PyCodeCache.load(source_code), kernel_name)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 2853, in load
return cls.load_by_key_path(key, path, linemap, attrs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 2866, in load_by_key_path
mod = _reload_python_module(key, path)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/runtime/compile_tasks.py", line 39, in _reload_python_module
raise RuntimeError(
torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Failed to import /tmp/tmps59zkbew/g3/cg3zgxsidsjhdlz2lzvajvubdq6kg2x2hzd2kznfj43qwvlv33du.py
SyntaxError: invalid syntax (cg3zgxsidsjhdlz2lzvajvubdq6kg2x2hzd2kznfj43qwvlv33du.py, line 14)
```
</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136280
Approved by: https://github.com/etaf, https://github.com/jansel, https://github.com/EikanWang
Co-authored-by: Henry Schreiner <HenrySchreinerIII@gmail.com>
Fix the correctness issue of https://github.com/pytorch/ao/pull/884/. The current implementation for converting between `Half/BFloat16` and `int8/uint8` incorrectly assumes that 1/4 of the int8/uint8 vector lanes map to 1/2 of the Half/BFloat16 vector lanes. This assumption leads to accuracy issues after the full bit-width vectorization of the Half data type was introduced. When converting between int8 weights and the half data type, the generated code is as follows:
```
#include "/tmp/torchinductor_leslie/xw/cxww3s7wxrujoyxna7mlcjktid2uu6nntixqwm542xfkd756gl3x.h"
extern "C" void kernel(const int8_t* in_ptr0,
half* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2048L); x0+=static_cast<int64_t>(32L))
{
auto tmp0 = at::vec::Vectorized<int8_t>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(32));
auto tmp1 = at::vec::convert<half>(tmp0);
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(32));
}
}
}
```
In this PR, we address the issue by changing the implementation to convert 1/2 of the int8/uint8 vector lane into a full vector lane of Half/BFloat16.
**TestPlan**
* AO: `python test/integration/test_integration.py -k test_int8_weight_only_quant_subclass_api`
* `python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_convert_int8_to_half_vec`
* Due to the CPP backend legalization pass, we are unable to create a unit test to simulate the conversion from `Half` to `int8`. Instead, we rely on a C++ test case.
* `./build/bin/vec_test_all_types_AVX512 --gtest_filter="VecConvertTestsReducedFloat/*.ConvertReduced"`
* `./build/bin/vec_test_all_types_AVX2 --gtest_filter="VecConvertTestsReducedFloat/*.ConvertReduced"`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136353
Approved by: https://github.com/jgong5, https://github.com/jerryzh168
Seems like some other tests are holding onto memory that is not gc'able (e.g., cuBLAS workspaces), so these tests, while working in isolation, fail when run as, e.g., `python test/test_cuda.py -k able`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136496
Approved by: https://github.com/ezyang
TLDR; found forward activation tensors were being kept alive "forever"
(or until GC ran), and tracked it down to a cycle involving
`stage_backward.<locals>.extract_tensors_with_grads`.
The reference cycle in question is below. (constructed using gc.get_referrers after doing a gc.collect in gc debug mode)
tensor is kept alive by
`[(<class 'cell'>, '0x7f7360234400')]`
tuple of cell objects
`(<cell at 0x7f73602343d0: function object at 0x7f734fff0ee0>, <cell at 0x7f7360234400: list object at 0x7f734e4d9a80>, <cell at 0x7f73602a4190: list object at 0x7f734eff8b00>)`
is kept alive by
`[(<class 'function'>, '0x7f734fff0ee0')]`
`<function stage_backward.<locals>.extract_tensors_with_grads at 0x7f734fff0ee0>`
is kept alive by
`[(<class 'cell'>, '0x7f73602343d0')]`
Put into more plain terms,
```
def stage_backward(...):
...
stage_output_tensors = []
# a cell object will exist that contains the variables defined in stage_backward and used by
# both stage_backward and nested functions
# in this case, the cell object contains 'stage_output_tensors' but
# this function object will hold a reference to a 'cell' that contains any vars from
# the parent scope not explicitly passed into the function as args.
def extract_tensors_with_grads(...):
...
# extract_tensors_with_grads refers to stage_output_tensors, so stage_output_tensors
# is in the cell
stage_output_tensors.append(output_val)
...
# but extract_tensors_with_grads ALSO refers to itself (extract_tensors_with_grads),
# so `extract_tensors_with_grads` will be in the cell
extract_tensors_with_grads(...)
```
More debug details:
https://docs.google.com/document/d/1QPH1Lz0tnieIFPM2tyHrjVB-bjlnHuDgjx1p2am3cmE/edit?usp=sharing
In pdb:
```
gc.collect()
g = gc.garbage
g[-1]
[rank0]:(Pdb) [rank0]:<function
stage_backward.<locals>.extract_tensors_with_grads at 0x7fee5c3392d0>
g[-2]
[rank0]:(Pdb) [rank0]:(<cell at 0x7fee7abbcf40: function object at
0x7fee5c3392d0>, <cell at 0x7fee7abbcf70: list object at
0x7fee7ab68940>, <cell at 0x7fee5c3210c0: list object at 0x7fee5e1
d6340>)
g[-3]
[rank0]:(Pdb) [rank0]:[tensor([[[-4.1127e-06, -3.3826e-06, 2.6226e-06,
..., 6.4969e-06,
[rank0]: -4.4405e-06, -4.7684e-06],
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136507
Approved by: https://github.com/awgu, https://github.com/kwen2501
Related: #132695
This PR uses padded dense <-> jagged conversions to handle binary pointwise broadcasting of (NT, T) and (T, NT). This includes:
* `(B, j0, D) + (1, 1, 1)`
* `(B, j0, D) + (B, 1, 1)`
* `(B, j0, D) + (B, 1, D)`
* etc.
This PR also adds (hacky) support for bool inputs to the jagged <-> padded dense conversions. The underlying CUDA kernels do not support integer / bool inputs; so the following workaround is employed: `convert input -> half, run conversion kernel, convert output -> bool`. Note that this bool support is needed specifically for the backward formula of `fmax`, and likely others.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133021
Approved by: https://github.com/cpuhrsch
More or less literal copy-n-paste of c33b0580e6/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu (L24)
and
c33b0580e6/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu (L99)
Missing `uint8` implementation mimics CUDA behavior
Initial version coded live in https://www.youtube.com/watch?v=shi6Kb5xxvk
Later refinements:
- Switch from 2D dispatch to 1D one (to match CUDA behavior)
- Added batch + channel loops
- Fixed scale computation to match align corners behavior
- Added backward implementation
The backward implementation again mimics CUDA, so it has precision issues for `torch.half`, as well as a somewhat slow simulation of atomic adds using atomic compare-and-exchange of a pair of adjacent values, i.e.
```metal
template <typename T>
static inline void atomic_add_helper(
device atomic<int>* data,
long offset,
float value) {
auto ptr = data + (offset >> 1);
auto old = atomic_load_explicit(ptr, memory_order_relaxed);
union {
int i;
T t[2];
} val;
do {
val.i = old;
val.t[offset & 1] += static_cast<T>(value);
} while (!atomic_compare_exchange_weak_explicit(
ptr, &old, val.i, memory_order_relaxed, memory_order_relaxed));
}
```
Bump basic Metal language version to 3.0, as it's supported on MacOS13 and that's the first version that has `atomic_float`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136123
Approved by: https://github.com/albanD
Summary: Now that we have subprocess parallel compile on by default, we can change the internal compile_threads default to > 1 with a killswitch. There is some jankiness so we can avoid evaluating the justknob at import.
Test Plan: Ran codecache tests with JK on, then canaried locally with JK off
Differential Revision: D62913998
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136246
Approved by: https://github.com/eellison
- Set the new tolerances to ~= N * eps(bfloat16), which should be a comfortable upper bound, where N is the inner dimension of the matmul.
Logic behind the choice of tolerance:
The maximum error of the summation of a series of N numbers in bfloat16 should be `N * epsilon(bfloat16)`. I confirmed by sampling different random seeds that the maximum observed error doesn't exceed this value and is usually much less.
Fixes test failures on Arm® Neoverse™ V1 ( not raised as an issue as this hardware type is not currently covered by linux-aarch64 workflow )
```
Traceback (most recent call last):
File "/var/lib/jenkins/workspace/test/test_torch.py", line 2478, in test_cdist_large
self.assertEqual(expected, actual)
File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3885, in assertEqual
raise error_metas.pop()[0].to_error(
AssertionError: Tensor-likes are not close!
Mismatched elements: 134118 / 1000000 (13.4%)
Greatest absolute difference: 0.03829193115234375 at index (291, 726) (up to 0.005 allowed)
Greatest relative difference: 0.03519868478178978 at index (291, 726) (up to 1.3e-06 allowed)
```
@malfet @jondea
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136315
Approved by: https://github.com/albanD
Summary:
- Added TORCH_LOGS=cache to dump cache stats on exit - supported by RemoteCache.
- Split REMOTE_CACHE_VERSION - it was used for both JKs fx_graph_memcache_version and autotune_memcache_version but they really should be separate (just in case we need to change one but not the other)
- Prepare `_ManifoldCache` for use with other subpath keys
- Move create_cache to be more public and use it in codecache
- Add _InductorMetaTy alias (still just a dict)
- Cleaned up some common cached_autotune calls in triton_heuristics
Test Plan: unit tests
Reviewed By: oulgen
Differential Revision: D62648249
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136456
Approved by: https://github.com/oulgen
Original issue:
https://github.com/pytorch/ao/issues/890
The problem:
TracingContext.flat_params contains the original params, with subclasses not desugared.
Meanwhile, the inductor freezing API works on AOT graphs, where subclasses are already desugared.
flat_params are used only for this logic, so storing the desugared subclasses in them fixes the issue.
Testing:
```
python test/functorch/test_aotdispatch.py -k test_inductor_freezing_with_subclasses
```
Torch AO original failure:
```
python test/integration/test_integration.py -k test_int8_weight_only_quant_with_freeze
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136265
Approved by: https://github.com/bdhirsh
## Description
Fixes the accuracy failure of FP32 `jx_nest_base` of max-autotune.
The current epilogue fusion implementation in the GEMM template assumes that the read of the template buffer and the write of the epilogue output in the epilogue node have the same index (the layouts could be different but the index should be the same).
If this condition is not satisfied, the computation is wrong, leading to a correctness issue for FP32 `jx_nest_base`.
This PR disables epilogue fusion with the GEMM template when the above condition is not satisfied.
### Unsupported epilogue:
`buf1` is the template buffer and `buf2` is the epilogue output buffer.
The store of `buf2`:
401408 * d0 + 100352 * d1 + **7168 * d2** + **1792 * d3** + 128 * d4 + d5
The load of `buf1` in the epilogue node:
401408 * d0 + 100352 * d1 + **1792 * d2** + **25088 * d3** + 128 * d4 + d5
The above two indexes are different.
```
CppTemplateBuffer(name='buf1', layout=FixedLayout('cpu', torch.float32, size=[25088, 128], stride=[128, 1]))
ComputedBuffer(name='buf2', layout=FixedLayout('cpu', torch.float32, size=[8, 4, 14, 4, 14, 128], stride=[401408, 100352, 7168, 1792, 128, 1]), data=Pointwise(
'cpu',
torch.float32,
def inner_fn(index):
i0, i1, i2, i3, i4, i5 = index
tmp0 = ops.load(arg5_1, i5 + 128 * i4 + 1792 * i2 + 25088 * i3 + 100352 * i1 + 401408 * i0)
tmp1 = ops.load(buf0, i5 + 128 * i4 + 1792 * i2 + 25088 * i3 + 100352 * i1 + 401408 * i0)
tmp2 = tmp0 + tmp1
tmp3 = ops.load(buf1, i5 + 128 * i4 + 1792 * i2 + 25088 * i3 + 100352 * i1 + 401408 * i0)
tmp4 = tmp2 + tmp3
return tmp4
,
ranges=[8, 4, 14, 4, 14, 128],
origin_node=clone,
origins=OrderedSet([clone])
))
```
### Supported epilogue:
`buf1` is the template buffer and `buf2` is the epilogue output buffer.
The store of `buf2`:
d0 + 576 * d1 + 32 * d2
The load of `buf1` in the epilogue node:
d0 + 576 * d1 + 32 * d2
The above two indexes are the same.
The layout of `buf2` and `buf1` are different though which is handled by the reindexer:
`buf1`: `size=[324, 32], stride=[32, 1]`
`buf2`: `size=[1, 32, 18, 18], stride=[10368, 1, 576, 32]`
```
CppTemplateBuffer(name='buf1', layout=FixedLayout('cpu', torch.bfloat16, size=[324, 32], stride=[32, 1]))
ComputedBuffer(name='buf2', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 32, 18, 18], stride=[10368, 1, 576, 32]), data=Pointwise(
'cpu',
torch.bfloat16,
def inner_fn(index):
_, i1, i2, i3 = index
tmp0 = ops.load(buf1, i1 + 32 * i3 + 576 * i2)
tmp1 = ops.to_dtype(tmp0, torch.float32, src_dtype=torch.bfloat16)
tmp2 = ops.load(_frozen_param4, i1)
tmp3 = tmp1 * tmp2
tmp4 = ops.load(arg7_1, i1 + 32 * i3 + 576 * i2)
tmp5 = tmp3 + tmp4
tmp6 = ops.to_dtype(tmp5, torch.bfloat16, src_dtype=torch.float32)
return tmp6
,
ranges=[1, 32, 18, 18],
origin_node=convert_element_type_4,
origins=OrderedSet([add, mul, convert_element_type_4])
))
```
## TODO
Add the support for fusions when the indexes are different in a follow-up PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135661
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5
At the moment, this lowers torch._scaled_mm with tensorwise scaling and with rowwise scaling for both A and B.
We probably also want to support either combination of tensorwise and rowwise for A and B, as well as bias support.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136337
Approved by: https://github.com/chenyang78
Summary:
Add a customizable loss function callback to NodeAccuracySummary to
allow users to pass in their own loss function.
Also, fix some type errors and propagate better exception messages when
unexpected tensor comparisons occur. Finally, enhance the robustness of
`generate_numeric_debug_handle` in the case where it is called multiple
times on the same model, by avoiding reuse of the same IDs.
Test Plan: Added a test for this case in `test_numeric_debugger`.
Reviewed By: jerryzh168
Differential Revision: D62898297
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136282
Approved by: https://github.com/jerryzh168
This PR changes how `reconstruct` is done for a ConstDict. As of today, it works as follows:
(1) codegen(...) each pair of key/value
(2) create a new dictionary to hold the new items
(3) clear the original dictionary
(4) update the original dict with the one created in (2)
We do a micro-optimization in the generated bytecode (a Python-level sketch of the effect follows this list) to:
- Only codegen the items that changed.
- Only clear the original dictionary if a key was removed.
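Roughly, the optimized reconstruction is equivalent to the following Python-level sketch (the real change emits bytecode, not source):
```python
def reconstruct_dict(original: dict, final: dict) -> None:
    # Only clear if at least one key was removed.
    if any(k not in final for k in original):
        original.clear()
    # Only (re)write entries that are new or whose value changed.
    for k, v in final.items():
        if k not in original or original[k] is not v:
            original[k] = v
```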
Fixes: #133487
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134876
Approved by: https://github.com/zou3519
1. We want to take option 3 as discussed in https://github.com/pytorch/pytorch/issues/135712, so every time we retry, we first create a new TCPStore server, so that we don't need to append the attempt count as a prefix and can avoid eventual TCPStore sync failures. (This is only for the case where TCPStore sharing is enabled.)
2. We start a new server bound to an ephemeral port (i.e. 0) so it gets assigned to a free port. We then pass that downstream (trainer or c10d). By doing so, TCPStore is managed by the elastic agent rather than having a race condition on binding to a specific port in the trainer.
3. Then the port be broadcasted for dynamic_rendezvous.
One more question: what do we do about the store created from `_create_tcp_store` in torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py? Are we ok with creating a duplicate TCPStore server?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135957
Approved by: https://github.com/d4l3k, https://github.com/c-p-i-o
Fixes#93843
`EmbeddingBag()` / `embedding_bag()` support 1D inputs with offsets to handle raggedness. NJT is a natural fit here as it already maintains offsets of the same form. This PR updates the python-side to support NJT and adds corresponding OpInfo-based NJT tests.
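A hedged sketch of the two equivalent spellings (the NJT call is the new capability; its output is assumed to match the offsets-based call):
```python
import torch

emb = torch.nn.EmbeddingBag(num_embeddings=10, embedding_dim=4, mode="sum")

# Ragged batch expressed as flat 1D indices + offsets.
indices = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])
offsets = torch.tensor([0, 3, 5])  # bags: [1,2,4], [5,4], [3,2,9]
out_offsets = emb(indices, offsets)

# The same raggedness expressed directly as a jagged NestedTensor.
njt = torch.nested.nested_tensor(
    [torch.tensor([1, 2, 4]), torch.tensor([5, 4]), torch.tensor([3, 2, 9])],
    layout=torch.jagged,
)
out_njt = emb(njt)  # assumed to match out_offsets row for row
```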
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135888
Approved by: https://github.com/cpuhrsch
Summary:
After the previous refactor, we can now call load_with_key directly from AOTAutogradCache to use the remote FXGraphCache.
This does *not* implement a remote AOTAutogradCache. It just allows AOTAutogradCache to work with remote FXGraphCache.
Test Plan: (Meta only tests)
Reviewed By: aorenste
Differential Revision: D62384944
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136173
Approved by: https://github.com/oulgen
- Sometimes having access to the `MixedPrecisionPolicy` in the `fsdp_pre_all_gather` is useful. See [here](https://github.com/pytorch/ao/pull/748/files#r1760375325) in the torchao INT8 mixed precision training PR.
- Sometimes having access to the owning `nn.Module` allows for using it for saving state. See [here](https://github.com/pytorch/pytorch/issues/114299#issuecomment-2298692762) for an example.
The major pain point here is how to deal with backward compatibility. For now, we use `inspect.signature` to check whether the user subclass follows the old vs. new signature. However, for the new signature, the `param_dtype` in the post-all-gather is redundant: if the user needed it, they could save it from the `mp_policy` passed into the pre-all-gather now.
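A hedged sketch of the signature-based dispatch (parameter names and counts are illustrative; the real hook signatures may differ):
```python
import inspect

def _call_pre_all_gather(tensor, mesh, module, mp_policy):
    hook = tensor.fsdp_pre_all_gather
    # Bound methods exclude `self`, so the old hook (mesh only) has 1 parameter.
    if len(inspect.signature(hook).parameters) == 1:
        return hook(mesh)                  # old signature
    return hook(mesh, module, mp_policy)   # assumed new signature
```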
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136129
Approved by: https://github.com/weifengpy
Summary: Create another wrapper codegen class to handle ArrayRef for CPU. The goal is to simplify the regular cpp wrapper codegen logic and the generated cpp code.
Test Plan: CI
Differential Revision: D62961885
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136318
Approved by: https://github.com/frank-wei
Improves and enables a commented out test originally introduced in #131912
In `test_custom_tag_metadata_re_export()`, we check the added "custom" metadata to given nodes is preserved and not copied to other nodes after re-exporting
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136048
Approved by: https://github.com/zhxchen17
**Summary**
Fix circular import in `torch/distributed/utils.py` found when running internal test, see D62901023. Curious why this wasn't causing any issue. Is this relevant code deprecated and no longer used?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136286
Approved by: https://github.com/Skylion007
Fixes #131337
- add `arg_type` for workspace_arg, the type is consistent with the type in `generate_workspace_allocation()`.
- do not generate example tensors for `workspace`, and use `generate_workspace_allocation()` instead.
- add workspace allocation generation code to `kernel_autotune_calls`. e.g.
```python
workspace = empty_strided_cuda((1280, ), (1, ), torch.uint8)
workspace.zero_()
.....
triton_spl_fused_add_cumprod_0.run(buf2, arg0_1, arg1_1, workspace, 1, 10000, grid=split_scan_grid(1, 10000), stream=stream0)
del buf2, arg0_1, arg1_1, workspace
```
- add `empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda` to the header of triton autotune code.
The generated cpp has lines like the ones below, so we also implement a `zero_()` for `AtenTensorHandle`.
```cpp
static constexpr int64_t int_array_0[] = {1280L, };
static constexpr int64_t int_array_1[] = {1L, };
AtenTensorHandle workspace_handle;
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(1, int_array_0, int_array_1, cached_torch_dtype_uint8, cached_torch_device_type_cuda, 0, &workspace_handle));
RAIIAtenTensorHandle workspace(workspace_handle);
workspace.zero_();
```
- Fix grid_fn handling for grid computation. Pass in "RBLOCK" to `split_scan_grid`
- Fix dynamic shapes:
Without the fix we generate code that looks like this `workspace = empty_strided_cuda((32*((255 + s0) // 256), ), (1, ), torch.uint8)` when doing triton autotune and `s0` is not defined.
The solution approach is to use `V.graph.sizevars.size_hint(nbytes)` to realize the workspace size for triton autotune. Note that we only realize it for triton autotune code, but not for the cpp cuda code.
- We also generate slightly different cpp code depending on if `abi_compatible` is turned on.
```cpp
RAIIAtenTensorHandle workspace(workspace_handle);
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_zero_(workspace.get()));
```
vs
```cpp
at::Tensor workspace = at::detail::empty_strided_cuda({8L*(c10::div_floor_integer(static_cast<int64_t>((255L + s0)), static_cast<int64_t>(256L))), }, {1L, }, at::kByte, c10::DeviceType::CUDA);
workspace.zero_();
```
Test Plan:
```
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor.py -k GPUTests.test_consecutive_split_cumprod_cuda
python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_consecutive_split_cumprod_cuda_cuda_wrapper
python test/inductor/test_cuda_cpp_wrapper.py DynamicShapesCudaWrapperCudaTests.test_consecutive_split_cumprod_cuda_dynamic_shapes_cuda_wrapper
TORCHINDUCTOR_ABI_COMPATIBLE=1 python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_consecutive_split_cumprod_cuda_cuda_wrapper
TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor.py -k GPUTests.test_consecutive_split_cumprod_cuda
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135552
Approved by: https://github.com/desertfire
Summary:
- Clean up cache test code a bit.
- Removed patch_fbcode() - it turned out to cause flaky issues (imagine if it set fbcode=False and then loaded, for the first time, a module that had a top-level fbcode check).
Test Plan: unit tests
Reviewed By: oulgen
Differential Revision: D62648248
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136215
Approved by: https://github.com/bobrenjc93
**Motivations**:
A topological order of the scheduler nodes that optimize the liveness of buffers can reduce the peak memory utilization. This has been observed and studied e.g., [here](https://arxiv.org/pdf/1910.02653) and [here](https://proceedings.mlr.press/v202/steiner23a/steiner23a.pdf).
**Solutions**:
1. implement a peak memory estimator via liveness analysis (a simplified sketch follows this list)
2. implement a few memory aware topological sorting algorithms and pick the one with the lowest peak memory
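A simplified sketch of the liveness-based estimator (node/buffer bookkeeping here is illustrative, not Inductor's actual scheduler API):
```python
def estimate_peak_memory(order, produces, buf_sizes, last_use):
    """order: topologically sorted node names.
    produces: {node: [buffers it allocates]}.
    buf_sizes: {buffer: size in bytes}.
    last_use: {buffer: index in `order` of its last consumer}."""
    live = peak = 0
    for i, node in enumerate(order):
        for buf in produces.get(node, []):
            live += buf_sizes[buf]          # buffer becomes live when produced
        peak = max(peak, live)
        for buf, last in last_use.items():
            if last == i:
                live -= buf_sizes[buf]      # buffer dies after its last consumer
    return peak
```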
**Results**:
On some models we can reduce the peak memory significantly:
| model | batch size | peak_memory baseline | peak_memory new | ratio |
|:-----------------------------:|:----------:|:--------------------:|:---------------:|:-----:|
| alexnet | 128 | 1.17 | 0.99 | 1.19 |
| vgg16 | 64 | 4.10 | 3.57 | 1.15 |
| DebertaV2ForQuestionAnswering | 1 | 11.60 | 10.56 | 1.10 |
In the presence of compiler based AC, peak memory can be further reduced:
| model | batch size | peak_memory baseline | peak_memory new | ratio |
|:------------------------------:|:----------:|:--------------------:|:---------------:|:-----:|
| AlbertForMaskedLM | 4 | 6.87 | 6.43 | 1.07 |
| AlbertForQuestionAnswering | 4 | 8.69 | 7.76 | 1.12 |
| MobileBertForQuestionAnswering | 128 | 4.67 | 3.90 | 1.20 |
[Here](https://fb.workplace.com/groups/1075192433118967/posts/1499920537312819/?comment_id=1499938843977655&reply_comment_id=1499951630643043) is an internal use case.
**Other infos:**
* neutral model runtime, because the reordering happens after fusion, so the memory saving is _for free_.
* minimal compile time overhead as the algorithm is linear in the number of edges of the inductor graph. For all huggingface benchmark models, the additional compile time is less than 1 second.
* no peak memory regression, since we only adopt a new order if the estimator predicts a lower peak memory. The estimator is unaware of operators' working memory, but for large models the working memory should be negligible. We haven't observed any significant regressions in any of our tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134874
Approved by: https://github.com/yf225
Fixes #134848
For BF16/FP16, when a tensor is specified in the `out` parameter of mean, the mean kernel should use that tensor's storage for the output. That doesn't happen today: an `at::to` in the current code allocates storage again and the `out` tensor's storage is never updated, so it doesn't end up holding the mean output.
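A small illustration of the intended `out=` contract for the reduced-precision dtypes (the exact failing configuration is in the linked issue):
```python
import torch

x = torch.randn(8, 4, dtype=torch.bfloat16)
out = torch.empty(4, dtype=torch.bfloat16)
ret = torch.mean(x, dim=0, out=out)
# With the fix, the result is written into `out`'s storage and `ret` aliases it.
assert ret.data_ptr() == out.data_ptr()
assert torch.allclose(out.float(), x.float().mean(dim=0), atol=1e-2)
```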
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135174
Approved by: https://github.com/soulitzer
Avoid allocating memory or dry-running the submodule during stage init.
Save user-provided input/output metadata during stage init, to allow
lazily initializing the buffers before the first step call.
Later, we plan to build on top of this to add lazy shape inference
(#130856) so that no input/output shapes are required at stage init.
For now, we require input/output tensors for stage init, but these
should be on meta device and stage should not allocate any real memory.
Note: this needs more thorough testing and review, but it worked on the
torchtitan 3d test.
TODO:
- delete 'device' arg from PipelineStage ctor? (move to inferring it from the args tensors passed to the first step call?) Separate PR.
- delete 'output_args' from PipelineStage ctor? we don't actually need
it, but we use it to do shape validation, which is why I didn't remove
it in this PR. Proposal: leave it until we add lazy shape inference?
Fixes #136225, #136226
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136243
Approved by: https://github.com/H-Huang, https://github.com/kwen2501
Summary: The internal profiler behaves differently after turning on triton.autotune_at_compile_time. This needs more investigation, but we are turning it off for this test for now.
Reviewed By: henrylhtsang
Differential Revision: D63035855
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136356
Approved by: https://github.com/henrylhtsang
Summary:
Return from functions instead of using `skipTest`.
This is mostly to make our test report happier.
Skipped tests still show up in our Broken test report.
```
OK (skipped=1)
I0917 16:14:24.749060 1018907 StorageDemandControl.cpp:572] Flushing Demand Control ODS counters
Skipped: Store doesn't support extended APIs
```
Test Plan:
Tested locally.
Test shows up as passed instead of skipped.
```
Cache hits: 99%. Commands: 125048 (cached: 124961, remote: 10, local: 77)
Tests finished: Pass 1. Fail 0. Fatal 0. Skip 0. Build failure 0
```
Differential Revision: D62912065
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136244
Approved by: https://github.com/XilunWu
Original issue:
https://github.com/pytorch/ao/issues/890
The problem:
TracingContext.flat_params contains the original params, with subclasses not desugared,
while the inductor.freezing API works on AOT graphs, which have already desugared subclasses.
flat_params are used only for this logic, and storing desugared subclasses in them fixes the issue.
Testing:
```
python test/functorch/test_aotdispatch.py -k test_inductor_freezing_with_subclasses
```
Torch AO original failure:
```
python test/integration/test_integration.py -k test_int8_weight_only_quant_with_freeze
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136265
Approved by: https://github.com/bdhirsh
Summary:
Add a customizable loss function callback to NodeAccuracySummary to
allow users to pass in their own loss function.
Also, fix some type errors and propagate better exception messages when
unexpected tensor comparisons occur. Finally, enhance the robustness of
`generate_numeric_debug_handle` in the case where it is called multiple
times on the same model, by avoiding reuse of the same IDs.
Test Plan: Added a test for this case in `test_numeric_debugger`.
Reviewed By: jerryzh168
Differential Revision: D62898297
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136282
Approved by: https://github.com/jerryzh168
Fixes https://github.com/pytorch/pytorch/issues/132331
We need another barrier here to ensure that the main thread doesn't stop the profiler while other threads are still using it (and crash). I can reliably reproduce the issue with `pytest -v test/profiler/test_cpp_thread.py -k test_profile_memory --flake-finder`.
### Testing
`pytest -v test/profiler/test_cpp_thread.py --flake-finder` all passes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136304
Approved by: https://github.com/briancoutinho
Summary: The change involves passing the expired timers to the log_debug_info_for_expired_timers function after to_json() has been applied. This change is made to provide a better debugging experience for the user.
Test Plan: unit tests
Reviewed By: gag1jain
Differential Revision: D62408767
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135913
Approved by: https://github.com/gag1jain
Summary:
This logs all operations when the tracing log level is enabled for the `TCPStoreLibUvBackend`. This is very useful for debugging collective operations when issues occur, as it logs all hosts and the keys that they're modifying. To minimize total data we only log the keys and not the values.
This changes the C10D_* macros to be much more efficient -- previously we would always format the log string even if it would never be printed, which is very wasteful for detailed tracing. This now gates them with an if statement to achieve the same behavior with no overhead.
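The same gating idea expressed in Python, purely as an illustration (the actual change is to the C++ `C10D_*` logging macros):
```python
import logging

logger = logging.getLogger("c10d")
key, addr = "init/", "[localhost]:45646"

# Eager: the message is formatted even when debug/trace output is disabled.
# logger.debug("get key:{} address:{}".format(key, addr))

# Gated: formatting only happens when the level is actually enabled.
if logger.isEnabledFor(logging.DEBUG):
    logger.debug("get key:%s address:%s", key, addr)
```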
Test Plan:
```
TORCH_DISTRIBUTED_DEBUG=DETAIL torchrun --nnodes 1 --nproc_per_node 1 --no-python /bin/bash -c "echo foo"
```
```
I0919 09:26:52.352013 34271 TCPStore.cpp:285] [c10d - debug] The server has started on port = 29500.
I0919 09:26:52.352246 34271 socket.cpp:783] [c10d - debug] The client socket will attempt to connect to an IPv6 address of (127.0.0.1, 29500).
I0919 09:26:52.352241 36903 TCPStoreLibUvBackend.cpp:1173] [c10d - debug] Uv main loop running
I0919 09:26:52.352308 34271 socket.cpp:854] [c10d - trace] The client socket is attempting to connect to [localhost]:29500.
I0919 09:26:52.353633 34271 socket.cpp:945] [c10d] The client socket has connected to [localhost]:29500 on SocketImpl(fd=41, addr=[localhost]:45646, remote=[localhost]:29500).
I0919 09:26:52.354422 34271 TCPStore.cpp:321] [c10d - debug] TCP client connected to host 127.0.0.1:29500
I0919 09:26:52.354558 36903 TCPStoreLibUvBackend.cpp:774] [c10d - trace] validate magic:1015412686 address:[localhost]:45646
I0919 09:26:52.354638 36903 TCPStoreLibUvBackend.cpp:789] [c10d - trace] ping nonce:34271 address:[localhost]:45646
I0919 09:26:52.356122 36903 TCPStoreLibUvBackend.cpp:866] [c10d - trace] add key:init/ val:1 address:[localhost]:45646
I0919 09:26:52.356308 36903 TCPStoreLibUvBackend.cpp:930] [c10d - trace] wait key_count:1 address:[localhost]:45646
I0919 09:26:52.356410 36903 TCPStoreLibUvBackend.cpp:846] [c10d - trace] get key:init/ address:[localhost]:45646
I0919 09:26:52.358688 36903 TCPStoreLibUvBackend.cpp:808] [c10d - trace] set key:/none/torchelastic/role_info/0 address:[localhost]:45646
I0919 09:26:52.360177 36903 TCPStoreLibUvBackend.cpp:930] [c10d - trace] wait key_count:1 address:[localhost]:45646
I0919 09:26:52.360296 36903 TCPStoreLibUvBackend.cpp:1004] [c10d - trace] multi_get key_count:1 address:[localhost]:45646
I0919 09:26:52.362076 36903 TCPStoreLibUvBackend.cpp:1036] [c10d - trace] multi_set key_count:1 address:[localhost]:45646
I0919 09:26:52.364001 36903 TCPStoreLibUvBackend.cpp:930] [c10d - trace] wait key_count:1 address:[localhost]:45646
I0919 09:26:52.364091 36903 TCPStoreLibUvBackend.cpp:846] [c10d - trace] get key:/none/torchelastic/assigned_ranks/0 address:[localhost]:45646
```
Differential Revision: D62924454
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136320
Approved by: https://github.com/c-p-i-o, https://github.com/XilunWu
Summary:
Add a third mode where we only print kernel names without dumping any intermediate actual tensor value info.
It can be helpful in quickly identifying the troublesome kernels in CUDA IMA issues.
Thanks to ColinPeppler and henrylhtsang for this "feature request".
Test Plan:
The output can look like this if you set `AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=3`:
{F1871629091}
Differential Revision: D62791371
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136182
Approved by: https://github.com/henrylhtsang
Summary:
X-link: https://github.com/pytorch/benchmark/pull/2454
This adds structured logging overhead at a per compile basis to compilation metrics.
To do so, we track the frame_id_frame_compile_id that trace_structured uses to categorize compiles, and use that as the key in our timing table.
Implementation notes:
- If there are times we call trace_structured without a compile id, the time won't be measured. Not really a good way around that today given the compile id framework of compilation metrics. Strobelight is still the best way to measure on a per job basis.
- We don't actually measure the time it takes to log the compilation metrics itself. Fundamentally, it's not possible to log this properly if we're storing the logging number *in* compilation metrics, since there's no way to measure it before we do it (unless we want discrepancies between dynamo_compile and tlparse, which seems suboptimal). Hopefully for a large job, the cost of structured_logging compilation metrics itself is small.
- I wanted to use frame_phase_timing here, but there's a bunch of ids to iron out, and I don't really want to deal with that headache. compilation_time_metrics is sort of what I want, but that isn't by frame/compile id, so it's also a bit off. Putting it into torch.logging as a separate thing so logging tracks its own overhead seems fine, though.
Test Plan:
Run benchmarks/nanogpt and staging logger. See that the new compilation metric is logged to the staged dynamo_compile table:
https://fburl.com/scuba/logger_staging_jjwu_30582a48f1ff9cf5f4ac50a4c40af/xazjg5xq
Note that the sum(structured_logging_overhead_s) / sum(entire_frame_compile_time) = 8.387 / 124.278 = 6%, which seems reasonable as the overhead for a small compilation like this.
You can also look at samples for a more detailed log of this.
Reviewed By: oulgen
Differential Revision: D62643611
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136142
Approved by: https://github.com/bobrenjc93
Summary:
To facilitate the PSS-2 upgrade, this uses `ndt.NDArray` instead of `np.ndarray` in type annotations. In Numpy-1.19 (PSS-1) it's an alias to `np.ndarray` -- a noop.
In Numpy-1.24, `ndt.NDArray` is a proper generic type, and without this change uses of `np.ndarray` generate this Pyre type error:
```counterexample
Invalid type parameters [24]: Generic type `np.ndarray` expects 2 type parameters.
```
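A small example of the annotation style being adopted, using the standard `numpy.typing` alias `npt` (the diff abbreviates it as `ndt`):
```python
import numpy as np
import numpy.typing as npt

def normalize(x: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
    # npt.NDArray[np.float64] parametrizes np.ndarray by dtype, which the bare
    # `np.ndarray` annotation can no longer express without type parameters.
    return x / np.linalg.norm(x)
```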
Test Plan: Sandcastle plus visual inspection
Differential Revision: D62977370
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136288
Approved by: https://github.com/kit1980
When tensor folding occurs during a matmul operation, the returned tensor is a view. This can cause issues when matmul is used inside a custom function and such a view is then returned as output: it cannot be modified in place, which causes errors.
It is especially problematic when an in-place allreduce is performed after such a function.
The issue is resolved by returning unsafe_view from matmul instead. This aligns the matmul decomposition with the eager implementation in that a non-view tensor is returned.
Test included in this PR reproduces the issue.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134568
Approved by: https://github.com/zou3519
Fixes #127049
There's already a meta func in `meta_registrations.py` for the `add_` and `sub_` methods. I added a second meta function for error checking, i.e. `int.add/sub_(float)` and `bool.add/sub_(other types)`.
Also the corresponding test with Dynamo passes, removed `@xfailIfTorchDynamo`.
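A quick eager illustration of the error the new meta function now also checks for (the exact message wording may differ):
```python
import torch

t = torch.tensor([1, 2, 3])   # int64 tensor
try:
    t.add_(1.5)               # int.add_(float) cannot be done in place
except RuntimeError as e:
    print(e)  # e.g. "result type Float can't be cast to the desired output type Long"
```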
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135864
Approved by: https://github.com/williamwen42
Changes in this PR:
- Monkey-patching `F.scaled_dot_product_attention` with a lambda seems to not work in some cases. This PR avoids using a lambda.
- Running `fullgraph=True` and `fullgraph=False` in the same unit test seems to cause the two cases to interfere with each other and raise errors. This PR splits them into two separate unit tests.
- The checks in the unit tests might not work with compile cache. This PR turns off the cache in order to have a more predictable compile behavior to do unit test on.
Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor_fullgraph_True`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor_fullgraph_False`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor_fullgraph_True`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor_fullgraph_False`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136219
Approved by: https://github.com/yifuwang
Summary:
Quite a few times, we see the NCCL PG abort taking too long. There's no easy way to measure this, so let's add a counter to measure this across the stack.
This will help us measure how much time the NCCL abort takes.
Test Plan:
Unit tests
Reviewed By: c-p-i-o
Differential Revision: D62675010
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136067
Approved by: https://github.com/fduwjj
skip_if_rocm is used only in the multiprocess case (when the UT test class is a child of MultiProcessTestCase). Each individual process can exit with a skip code. If used for a single-process UT, it will cause the UT to fail because the process returns a non-zero exit code. Use skipIfRocm in single-process UTs.
To avoid the above confusion, this PR renamed skip_if_rocm to skip_if_rocm_multiprocess.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136161
Approved by: https://github.com/jithunnair-amd, https://github.com/kwen2501, https://github.com/fegin
Continuation of https://github.com/pytorch/pytorch/pull/131909. This PR makes numpy tests compatible with numpy>=2.0.0. Specifically it deals with APIs that have been removed from numpy-2.0.
Changes in this PR:
1. Use `numpy.exceptions.ComplexWarning` if `numpy.exceptions` namespace is present. In numpy-2.0 `numpy.ComplexWarning` has been removed in favor of using `numpy.exceptions.ComplexWarning` (see [numpy-2.0 migration guide](https://numpy.org/devdocs/numpy_2_0_migration_guide.html#changes-to-namespaces)). Note that `numpy.exceptions` was introduced in numpy-1.25.0 hence does not exist in numpy<=1.24.x.
2. Do the same for `numpy.exceptions.VisibleDeprecationWarning`
3. Use `np.sort(...,axis=0)` over `np.msort()`(`np.msort()` removed in numpy-2.0)
4. Use `np.pad()` over `np.lib.pad()` (`np.lib` removed in numpy-2.0)
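A minimal compatibility shim illustrating the pattern from the list above (only public numpy APIs are assumed):
```python
import numpy as np

try:  # numpy >= 1.25 / 2.0
    from numpy.exceptions import ComplexWarning, VisibleDeprecationWarning
except ImportError:  # numpy <= 1.24.x
    ComplexWarning = np.ComplexWarning
    VisibleDeprecationWarning = np.VisibleDeprecationWarning

a = np.array([[3, 1], [2, 4]])
sorted_a = np.sort(a, axis=0)   # replaces np.msort(a), removed in numpy-2.0
padded = np.pad(a, 1)           # replaces np.lib.pad(a, 1), removed in numpy-2.0
```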
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136152
Approved by: https://github.com/atalman
Summary:
Remove sleep from the `watchdogHandler` function. This sleep unnecessary slows things down during a NCCL timeout.
Flight recorder is configured to take a minute, at most, to dump out its buffer.
This sleep ends up waiting for `8` minutes before destroy is called.
Test Plan: Unit tests.
Differential Revision: D62529875
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135760
Approved by: https://github.com/fduwjj, https://github.com/shuqiangzhang
Summary: Currently we process events in the regular allocation path: we call cudaEventQuery to check on the events, and this path can take some locks in the libcuda driver. It's not strictly necessary to process events in the allocation path; we could move this to a background thread, keep processing events regularly, and put the freed blocks on the free list.
Differential Revision: D62396585
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135524
Approved by: https://github.com/zyan0
Summary:
This diff adds an option to round the non-split blocks in caching allocator so that they can be reused without causing lots of fragmentation for large memory segments.
For example, if we specify the max_split memory size as 400MB, then all allocations larger than 400MB will not be split. Let's say we allocated some 1024MB blocks and these are cached in the allocator. If we request a new 500MB block, we round it to the nearest power-2 division, that's 512MB, and add the default kLargeBuffer of 20MB, giving 532MB; since 532MB is less than the existing 1024MB block, the 1024MB block will not be used for this allocation and a new 512MB block will be created instead. In this diff, we make the kLargeBuffer used for rounding configurable (max_non_split_rounding_size): if 512MB + max_non_split_rounding_size is greater than 1024MB, we will use the 1024MB block and won't create a new 512MB block using cudaMalloc. This option is added so that we can pre-allocate some large blocks and reuse them as much as possible without stalling on cudaMalloc.
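A small worked example of the rounding decision described above (a hypothetical helper that uses round-up-to-power-of-2 as a stand-in for the allocator's power-2-division rounding):
```python
MB = 1024 * 1024

def pick_block(request, rounding_buffer, cached_block):
    rounded = 1
    while rounded < request:      # 500MB rounds up to 512MB
        rounded *= 2
    # Reuse the cached block only if the rounded size plus the configurable
    # rounding buffer reaches it; otherwise a new block is cudaMalloc'ed.
    if rounded + rounding_buffer > cached_block:
        return cached_block
    return rounded

print(pick_block(500 * MB, 20 * MB, 1024 * MB) // MB)    # 512  -> new 512MB block
print(pick_block(500 * MB, 600 * MB, 1024 * MB) // MB)   # 1024 -> reuse the cached block
```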
Differential Revision: D62758758
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136174
Approved by: https://github.com/zyan0
Summary:
# context
* for the root cause and background please refer to this [post](https://fb.workplace.com/groups/1028545332188949/permalink/1042204770823005/)
* the basic idea of this diff is to **short circuit the pytree flatten-unflatten function pairs** between two preserved modules, i.e., EBC/fpEBC and KTRegroupAsDict.
NOTE: There could be multiple EBCs and one single KTRegroupAsDict as shown in the [pic](https://fburl.com/gslide/lcyt8eh3) {F1864810545}
* short-circuiting the EBC-KTRegroupAsDict pairs is very special and a must in most of the cases due to the EBC key-order issue with distributed table lookup.
* hide all the operations behind a control flag `short_circuit_pytree_ebc_regroup` to the torchrec main api call `decapsulate_ir_modules`, which should only be visible to the infra layer, not to the users.
# details
* The `_short_circuit_pytree_ebc_regroup` function finds all the EBCs/fpEBCs and KTRegroupAsDict modules in an unflattened module, retrieves their FQNs, and sorts them into in_fqns (regroup_fqns) and out_fqns (ebc_fqns). Because the fpEBC is currently swapped as a whole, we do some extra FQN logic to filter out the EBCs that belong to an up-level fpEBC.
* a util function `prune_pytree_flatten_unflatten` removes the in-coming and out-going pytree flatten/unflatten function calls in the graph module, based on the given fqns.
WARNING: The flag `short_circuit_pytree_ebc_regroup` should be turned on if EBCs are used and EBC sharding is needed. Assertions are also added if we can't find a `KTRegroupAsDict` module, or `finalize_interpreter_modules` is not `True`.
# additional changes
* absorb the `finalize_interpreter_modules` process inside the torchrec main api `decapsulate_ir_modules`.
* set `graph.owning_module` in export.unflatten as required by the graph modification
* add one more layer of `sparse_module` for closely mimicking the APF model structure.
Test Plan:
# run test
* serializer
```
buck2 run fbcode//mode/opt fbcode//torchrec/ir/tests:test_serializer
```
* apf
```
buck2 run fbcode//mode/opt fbcode//aps_models/ads/gmp/tests/ne/e2e_deterministic_tests:gmp_e2e_ne_tests -- --filter-text 'test_mtml_instagram_model_562438350_single_gpu_with_ir'
```
* local mp run
```
==== Finished E2E deterministic test for mtml_instagram_model_gmp_474023725_non_kjt_unary ====
finished
test_mtml_instagram_model_562438350_single_gpu_with_ir
Imports took: 6.0s! Profile with --import-profiler. --_ |""---__
Executed 1 example in 203.1s: |'.| || . """|
Successful: 1 | || || /|\""-. |
Failed: 0 | || || | | |
Skipped: 0 | || || | \|/ |
Not executed: 8 |."| || --"" '__|
https://testslide.readthedocs.io/ --" |__---"""
```
Differential Revision: D62606738
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136045
Approved by: https://github.com/angelayi
Currently when we deserialize inputs to nodes, we deserialize arguments with default values as kwargs. So deserializing `aten.uniform`, which has the signature `uniform(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)`, will become `uniform(x, from=0, to=1)`. However, this fails when running in Python because `from` is a Python keyword. So the solution here is to not deserialize it as a kwarg.
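A quick demonstration of why the keyword form cannot even be parsed, plus the positional form the deserializer now emits:
```python
import torch

# `from` is a reserved word, so generated code like `uniform(x, from=0, to=1)`
# is rejected by the Python parser.
try:
    compile("uniform(x, from=0, to=1)", "<generated>", "eval")
except SyntaxError as e:
    print("SyntaxError:", e)

# Passing the defaulted arguments positionally works fine.
x = torch.empty(3).uniform_(0, 1)
```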
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136036
Approved by: https://github.com/zhxchen17
`rms_norm()` is a nice-to-have for ViT :)
This PR:
* SymInt-ifies `rms_norm()`, allowing NJT to use the same decomp.
* Adds torch_function-based input validation logic for nested-specific stuff (no normalization supported over the ragged dim for now) on the python NJT side.
* Adds multi-dim support (on non-ragged, non-batch dims) to `mean()` for NJT.
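A small usage sketch of what this enables on a jagged NJT (normalization over the last, non-ragged dim; the exact support surface follows the PR description):
```python
import torch
import torch.nn.functional as F

njt = torch.nested.nested_tensor(
    [torch.randn(2, 8), torch.randn(5, 8)], layout=torch.jagged
)
# Normalizing over the last (non-ragged) dim is supported; normalizing over the
# ragged dim is rejected by the new input validation.
out = F.rms_norm(njt, normalized_shape=(8,))
```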
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135872
Approved by: https://github.com/mikaylagawarecki
ghstack dependencies: #125947
Previous implementation of the `numpy()` method returns `fp64` when the tensor is `fp32`. This is unexpected but seems to be caused by calling `__array__(dtype=None)` on the numpy array. I updated the implementation to implement the `numpy()` method explicitly and added tests to guard the behavior.
This needs to be cherry-picked into torch 2.5
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136162
Approved by: https://github.com/gramalingam, https://github.com/xadupre
When stub files (`*.pyi`) were removed from `optim` (#125556, #125452), some types that existed are no longer available. This pull request adds them back.
Just for reference, these types are used in `pytorch-lightning`'s `LightningCLI`. Command line interfaces are created automatically, and having type hints make them nicer.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136185
Approved by: https://github.com/janeyx99
## Motivation
The default device for tensor.device, both for sharded as well as non-sharded tensors, is set to cuda. Hence while running the FSDP UTs we see the following errors. This change derives the actual device type from the created tensor.
```
[rank3] File "/root/repos/pytorch-training-tests/tests/pytorch/v2.4.0/distributed_hpu/fsdp/test_fsdp_dtensor_state_dict.py", line 143, in test_dtensor_sharded_tensor_state_dict_identical
[rank3] sharded_tensor_sd = ref_model.state_dict()
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1944, in state_dict
[rank3] hook_result = hook(self, destination, prefix, local_metadata)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank3] return func(*args, **kwargs)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py", line 752, in _post_state_dict_hook
[rank3] tensor.device,
[rank3] File "/usr/local/lib/python3.10/dist-packages/typing_extensions.py", line 2853, in wrapper
[rank3] return arg(*args, **kwargs)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/_shard/sharded_tensor/api.py", line 1152, in __torch_function__
[rank3] return dispatch(st_instance, func)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/_shard/sharded_tensor/api.py", line 1134, in dispatch
[rank3] return _SHARDED_OPS[func](types, args, kwargs, st._process_group)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/_shard/op_registry_utils.py", line 33, in wrapper
[rank3] return wrapped_func(types, args, kwargs, process_group)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py", line 52, in tensor_device
[rank3] dev = torch.device(torch.cuda.current_device())
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py", line 878, in current_device
[rank3] _lazy_init()
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py", line 305, in _lazy_init
[rank3] raise AssertionError("Torch not compiled with CUDA enabled")
[rank3] AssertionError: Torch not compiled with CUDA enabled
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134994
Approved by: https://github.com/fegin
Fixes https://github.com/pytorch/pytorch/issues/136064
In the linked repro, this issue was that there was some code like this:
```
# x has dtype torch.float32
def f(x):
y = x.view(torch.float32)
y.copy_(...)
```
Where because `view.dtype` is implemented today to potentially directly return its input, we would end up directly clobbering the proxy for our graph input (replacing its FX proxy value from `arg0_1` to `view_1`). This is not desirable, because we have careful assertions in AOTDispatcher that mutations only ever happen on graph inputs - but this clobbering caused the mutation to appear, from the perspective of the FX graph, like it was happening on a view of the input.
Why is this normally not a problem? Ordinarily, the `ADInplaceOrView` kernel for `view.dtype` will take the output of the view kernel, [and detach() it](https://github.com/pytorch/pytorch/blob/main/tools/autograd/gen_inplace_or_view_type.py#L466) (properly creating a fresh `TensorImpl`).
This does **not** happen, though, if you are executing the kernel from within a `__torch_dispatch__` region: the `ADInplaceOrView` logic has already run above you, so that key will be in the TLS exclude set.
This PR changes eager behavior - at first I considered trying to only change behavior under compile. But this problem isn't technically specific to PT2: if you ever rely on tensor identity from inside of a __torch_dispatch__ call, then we need to make sure the raw `view.dtype` kernel doesn't directly return the input.
I am also making the assumption that "`view.dtype` no-op'ing when the dtype is the same" is not a case worth optimizing in eager mode, and that the overhead of the `TensorImpl` creation is relatively negligible.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136074
Approved by: https://github.com/Skylion007, https://github.com/ezyang, https://github.com/albanD
ghstack dependencies: #136041
As in the title.
Tackles https://github.com/pytorch/ao/pull/821/files#r1759821413
The PR assumes that the existing tuning parameters are good also when using scaling arguments. This needs to be verified as a follow-up task.
Also, this PR redefines triton-contiguous tensors: the tensor must have strides not larger than 1. This will now allow zero strides that previously triggered `contiguous` call although the underlying memory buffer was contiguous.
Re: "a considerable slow-down occurs because tensor data is copied element-wise rather than chunk-wise" - this note should refer to a code (torch or triton?) that implements the element/chunk-wise copy so that we could verify that allowing zero strides indeed would not trigger element-wise copies. Atm, the performance increase in ViT-H benchmarks (that involve using 0 strides) is an evidence that allowing zero strides does not lead to slow-downs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136104
Approved by: https://github.com/cpuhrsch
We introduced the dispatchable backend for a ProcessGroup and collective in https://github.com/pytorch/pytorch/issues/86225. This PR is a follow-up that cleans up the options of a ProcessGroup and asks users to either set the timeout or backend later on, or directly create the backend after creating a PG.
Also, PGNCCL is using the Options class from ProcessGroup, but we should actually use the Options from the Backend class, so this PR aligns the type and name with what we are doing on the cpp side. I don't change the signature of the public API, so it still uses args named "pg_options".
We need to make changes to the test to make it aligned with the change.
This is try to reland D62008954 by fixing internal errors.
Differential Revision: [D62483294](https://our.internmc.facebook.com/intern/diff/D62483294/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135653
Approved by: https://github.com/wz337, https://github.com/H-Huang
Summary:
We refactor FxGraphCache.load into three phases:
- prepare_key, which checks that an inductor input is cacheable and bypasses otherwise
- load_with_key, which tries to lookup the key in the cache
- post compile, where we do some logging and run post compile steps
Splitting it along these lines will allow AOTAutogradCache to use load_with_key and still get access to all of the observability + remote cache logic when accessing FxGraphCache, without needing to pass key components, etc.
Differential Revision: D62314862
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135491
Approved by: https://github.com/oulgen
Fixes #136090
* Add support for isin to tensor half dtypes for CPU (just add a few extra dispatches).
* Seems like the CUDA implementation for bfloat16 was mostly compiled and available all along (it just calls sort internally AND unique). To enable it, we just need to remove an assert to access it (since sort's functionality was updated since the assert was added) and add missing dtype support to unique.
* This unlocks more GPU functionality with minimal code bloat. I also added CPU kernels for the dtypes for parity.
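A short usage example of the newly dispatched dtypes (bfloat16 shown; float16 is analogous):
```python
import torch

elements = torch.tensor([1.0, 2.0, 3.0], dtype=torch.bfloat16)
test_set = torch.tensor([2.0, 5.0], dtype=torch.bfloat16)
print(torch.isin(elements, test_set))   # tensor([False,  True, False])
# Per this PR the same call also works for half dtypes on CUDA, e.g. with .cuda() inputs.
```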
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136114
Approved by: https://github.com/malfet
By default inductor promotes arguments to the common highest dtype.
Having empty token with dtype=torch.float32 results in dtype promotion for effectful ops during lowering of with_effects.
Disabling dtype promotion for this lowering.
Removing previous workaround making token dtype torch.bool.
Testing:
```
python test/distributed/test_c10d_functional_native.py -k test_inductor_dtypeview_memory_lea
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136039
Approved by: https://github.com/bdhirsh, https://github.com/eellison, https://github.com/zou3519
Summary: This implements a default backend proxy that tries to look up a backend via dlsym. What this enables is dynamically loading a module with a backend implementation without having it statically linked with the application.
Differential Revision: D62549295
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135967
Approved by: https://github.com/c-p-i-o
In this PR, we deprecate the _preserve_ops feature in the run_decompositions API. We can't kill this API completely because the Executorch team depends on it. As syncing between the two repos is non-trivial, I just leave this argument as deprecated for now. In the next PR, I will immediately remove it.
After this PR, run_decompositions will only decompose what's inside the decomp table and preserve the rest by default. Note that this feature is only rolled out to OSS for now. Old code path is protected under IS_FBCODE flag.
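A sketch of the resulting OSS behavior (standard `torch.export` API; the default-table handling follows the PR description):
```python
import torch
from torch.export import export

class M(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.gelu(x)

ep = export(M(), (torch.randn(2, 3),))
# Only ops present in the supplied decomp table are decomposed; everything else
# (here, aten.gelu) is preserved by default -- no _preserve_ops argument needed.
ep = ep.run_decompositions(decomp_table={})
print(ep.graph_module.code)
```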
Differential Revision: [D62163161](https://our.internmc.facebook.com/intern/diff/D62163161/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135080
Approved by: https://github.com/justinchuby, https://github.com/avikchaudhuri, https://github.com/bdhirsh
> Ignore FSDP2 forward hook side-effects in AC
Under AC, FSDP2 does not rely on forward hook to all-gather weights to do recomputation, instead it relies on pre-backward hook to do this job:
451eaf0ff2/torch/distributed/_composable/fsdp/_fsdp_state.py (L219-L220)
So when we use `speculate_subgraph` to trace the utils.checkpoint AC region, we don't actually need to worry about FSDP2 forward hook's side effects and can safely ignore it, because we are not and we don't expect to re-run the FSDP2 forward hook during backward recomputation.
----
Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134997
Approved by: https://github.com/zou3519
ghstack dependencies: #135727
Running Torchbench llama with dynamic size failed with
```
File "/localdisk/leslie/torch_inductor_community/pytorch/torch/fx/experimental/symbolic_shapes.py", line 4182, in produce_guards
raise ConstraintViolationError(
torch.fx.experimental.symbolic_shapes.ConstraintViolationError: Constraints violated (L['inputs'][0].size()[0])! For more information, run with TORCH_LOGS="+dynamic".
- Not all values of RelaxedUnspecConstraint(L['inputs'][0].size()[0]) are valid because L['inputs'][0].size()[0] was inferred to be a constant (32).
```
Skip this model for marking dynamic dim.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135960
Approved by: https://github.com/ezyang
This PR changes how `reconstruct` is done for a ConstDict. As of today, it works as follows:
(1) codegen(...) each pair of key/value
(2) create a new dictionary to hold the new items
(3) clear the original dictionary
(4) update the original dict with the one created in (2)
We do a micro optimization in the generated bytecode to:
- Only codegen the items that changed.
- Only clear the original dictionary if a key was removed.
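A Python-level sketch of the optimized reconstruction semantics (illustrative only; Dynamo actually emits bytecode rather than calling a helper like this):
```python
def reconstruct_dict(original: dict, traced: dict) -> None:
    removed = original.keys() - traced.keys()
    if removed:
        # A key was deleted during tracing: fall back to clear + full update.
        original.clear()
        original.update(traced)
    else:
        # Otherwise only write back the items that changed.
        for k, v in traced.items():
            if k not in original or original[k] is not v:
                original[k] = v
```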
Fixes: #133487
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134876
Approved by: https://github.com/zou3519
Split out and modified from https://github.com/pytorch/pytorch/pull/130228. There were a bunch of subtle bugs, e.g. sometimes we need to use torch.ops.aten.{operator}.Tensor vs. other times torch.ops.aten.{operator}.default, or in the case of pow we need to use Tensor_Tensor. I figured it'd be easier to split out adding TensorReferenceAnalysis, add some tests, and do the actual integration in a separate diff.
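An example of the overload-name subtlety mentioned above (these overloads exist on the aten op packets; which one applies depends on the argument types):
```python
import torch

x = torch.rand(3)
torch.ops.aten.add.Tensor(x, x)            # Tensor + Tensor
torch.ops.aten.add.Scalar(x, 1.0)          # Tensor + Scalar
torch.ops.aten.pow.Tensor_Tensor(x, x)     # pow(Tensor, Tensor)
torch.ops.aten.sin.default(x)              # ops with a single overload use .default
```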
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135886
Approved by: https://github.com/ezyang
Updates the pybind11 submodule. The major patch note is an experimental new function, called cpp_conduit, that is added to all pybind11 objects and will make them more compatible across pybind11 versions, settings, and frameworks (such as nanobind). No code changes are needed on our end except updating the submodule.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136087
Approved by: https://github.com/malfet
This PR implements tracing of with contexts with TorchFunction modes which have the default enter/exit behavior (ie pushing/popping the mode)
Typically the bytecode for a context manager looks like this during a graph break:
1. graph call
2. enter context
3. unsupported code
4. exit context
5. resume call
resume fn structure:
1. enter context
2. jump
...
3. exit context
The issue with torch function modes is that side effects will replay any mutations to the torch function stack performed during tracing. So, we do not need to enter and exit around the unsupported code in the original function (doing so would result in a duplicate torch function mode entry during execution of the unsupported code), and we don't need to enter again in the resume function (the mode that was pushed from the side effects bytecode would still be on the stack).
So for torch function modes the structure of our output code is this:
1. graph call
2. mutate tf mode stack to replay mutations
3. unsupported code
4. on exception restore stack
5. resume function
Then our resume fn looks like this:
1. no-op enter torch function mode
2. jump
3. exit tf mode
To implement the no-op enter of the torch function mode I added torch function mode in polyfill which no-op enters, but normally exits. This is needed because we still want to trace the with context in the resume function, and exit properly (the exit instructions will still be in the function, so we need to generate instructions to set up the context).
Separately from the bytecode, dynamo also tracks contexts on the block stack, which is how the SETUP_* instructions are implemented. Naturally at a graph break, we exit these block stacks to properly reset the contexts entirely, so that we can re-enter around the unsupported code soundly. However once again, in the torch function mode case, in the event of a graph break we do not want to perform any exit side effects because we want to preserve the state of the mode stack as is so that we will properly update the stack with the bytecode mentioned in the first section. If we exited here, dynamo would pop the mode off of the symbolic stack, and not update the true python torch function mode stack with the suffix bytecode. All in all, for torch function modes we enter exactly once, update the global torch function mode stack with side effects bytecode, re-read this stack when compiling the resume function, and exit exactly once in the resume function. This matches the semantics of eager exactly.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135422
Approved by: https://github.com/williamwen42
ghstack dependencies: #134732, #133137, #135443, #135444
In preparation for tracing through DeviceContext (defb515306/torch/utils/_device.py (L66))
This PR adds support for calling the setattr of thread local objects. These objects have a slots impl, and since this doesn't appear to have any side effects, we call this setattr impl when replaying mutations, since calling `object.__setattr__` on these objects results in a type error.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135443
Approved by: https://github.com/anijain2305
ghstack dependencies: #134732, #133137
For tracing cond/while in eager, we trace the HOP with the eager backend with metadata torchfunction mode enabled. HOPs disallow the mutation that occurs in this torch function mode, so it is not able to be traced. As a result, we use a custom backend which enters this mode for tracing these HOPs. Thanks to @ydwu4 for the help with implementing this
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134732
Approved by: https://github.com/ydwu4
All of the previous benchmarks are similar; ListOfLinears should be representative enough.
I copied the previous benchmarks from unit tests without much intention, just trying to create a large
number of benchmarks to better observe noise.
This PR keeps only one, we can add more as we see value and regressions in the future.
Also this diff adds a GPU version.
```
collecting compile time instruction count for basic_modules_ListOfLinears_eager
compile time instruction count for iteration 0 is 6479525851
compile time instruction count for iteration 1 is 1024432680
compile time instruction count for iteration 2 is 1019417317
compile time instruction count for iteration 3 is 1013603566
compile time instruction count for iteration 4 is 1008853980
compile time instruction count for iteration 5 is 1009541481
compile time instruction count for iteration 6 is 1005025533
compile time instruction count for iteration 7 is 1004116323
compile time instruction count for iteration 8 is 1000828633
compile time instruction count for iteration 9 is 999788323
collecting compile time instruction count for basic_modules_ListOfLinears_inductor
compile time instruction count for iteration 0 is 40837529730
compile time instruction count for iteration 1 is 18411921909
compile time instruction count for iteration 2 is 18383665161
compile time instruction count for iteration 3 is 18348983522
compile time instruction count for iteration 4 is 18349276590
compile time instruction count for iteration 5 is 18353046274
compile time instruction count for iteration 6 is 18346818581
compile time instruction count for iteration 7 is 18340057998
compile time instruction count for iteration 8 is 18331267320
compile time instruction count for iteration 9 is 18328381338
collecting compile time instruction count for basic_modules_ListOfLinears_inductor_gpu
compile time instruction count for iteration 0 is 15408870979
compile time instruction count for iteration 1 is 10949520859
compile time instruction count for iteration 2 is 11058786167
compile time instruction count for iteration 3 is 11003606719
compile time instruction count for iteration 4 is 10896406770
compile time instruction count for iteration 5 is 10982875189
compile time instruction count for iteration 6 is 10931848275
compile time instruction count for iteration 7 is 10956345008
compile time instruction count for iteration 8 is 11045384499
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135730
Approved by: https://github.com/ezyang, https://github.com/anijain2305
Summary:
Move towards consolidating strobelight profiler implementations between OSS and fbcode. This change is a first step towards that.
- Created a new function to abstract out compile time profiling enablement. This function allows profiler to switch between different function profilers (e.g. Thrift based or CLI based)
- Both OSS and Fbcode now use one compile time profiler in torch/_strobelight
Test Plan:
Tested OSS with following commands:
```
python torch/_strobelight/examples/compile_time_profile_example.py
python torch/_strobelight/examples/cli_function_profiler_example.py
TORCH_COMPILE_STROBELIGHT=TRUE TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python benchmarks/dynamo/huggingface.py --ci --accuracy --timing --explain --inductor --device cuda --training --amp --only XLNetLMHeadModel
```
See test commands for fbcode in comments.
Differential Revision: D62444551
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135953
Approved by: https://github.com/laithsakka
If a node is an AC region output and has a backward hook on it, we intentionally choose to save it.
This is to work around circular dependencies in Traceable FSDP2+AC.
Example:
```
out = fully_shard(utils.checkpoint(module))(x)
norm_out = layer_norm(out)
```
and there is a circular dependency:
1. In backward, grad_input of layer_norm aka. `out_grad` is actually dependent on `out`.
2. `out` depends on `out`'s backward hook created by FSDP2 (which does all-gather for `module` weights) in order to be recomputed.
3. `out`'s FSDP2 backward hook, as is the case for all eager backward hooks, depends on `out_grad` -> circular dependency with (1)!
Solution: check whether `out` has a backward hook, and if so, intentionally save `out` in forward graph outputs. With this, we can break the above circular dependency.
----
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135727
Approved by: https://github.com/Chillee
During enablement of Traceable FSDP2 on internal models, sometimes the user only applies torch.compile to some of the FSDP2 instances but not all of them. Such a mixed usage pattern is not supported by compiled autograd. Here we try to catch such usage and throw an error, so that the user can fix the usage.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135824
Approved by: https://github.com/awgu
When we measure compile time instruction count, in most cases we probably do not want to measure GC instructions, so we disable GC here by default.
If it is needed, we can add an option to allow it, or one can use the regular total instruction count instead of the compile time instruction count.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135768
Approved by: https://github.com/ezyang, https://github.com/anijain2305
https://github.com/pytorch/pytorch/pull/133012 caused a regression on ROCm causing pointwise scan tests to fail
```
ERROR: test_pointwise_associative_scan_tuple_reverse_True_combine_mode_pointwise_cuda
ERROR: test_pointwise_associative_scan_tuple_reverse_False_combine_mode_pointwise_cuda
ERROR: test_pointwise_associative_scan_complex_pytree_reverse_True_combine_mode_pointwise_cuda
ERROR: test_pointwise_associative_scan_complex_pytree_reverse_False_combine_mode_pointwise_cuda
ERROR: test_pointwise_associative_scan_binary_operator_reverse_True_combine_mode_pointwise_cuda
ERROR: test_pointwise_associative_scan_binary_operator_reverse_False_combine_mode_pointwise_cuda
```
Skipping temporarily while triage is underway.
Full log: https://ossci-raw-job-status.s3.amazonaws.com/log/30067645445
```
File "/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/_inductor/graph.py", line 1020, in call_function
out = lowerings[target](*args, **kwargs) # type: ignore[index]
File "/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/_inductor/lowering.py", line 363, in wrapped
out = decomp_fn(*args, **kwargs)
File "/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/_inductor/lowering.py", line 6245, in associative_scan
raise RuntimeError("Unable to generate code for associative_scan op")
torch._inductor.exc.LoweringException: RuntimeError: Unable to generate code for associative_scan op
```
NOTE: even "eager" backend fails
```
File "/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/_higher_order_ops/associative_scan.py", line 338, in associative_scan_op_dense
raise NotImplementedError("associative_scan is not implemented for eager")
NotImplementedError: associative_scan is not implemented for eager
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135995
Approved by: https://github.com/malfet
This PR solves two problems with `sum()` support in NJT:
* `sum()` over a dim with `keepdim=True` returns the wrong shape (i.e. it'll keep the wrong dim). This is a long-standing bug from way back in #112519.
* Historically, we've only supported `sum()` over a dim and not a full reduction. This PR adds the full reduction form (forward only, backward still fails).
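A short example of the two behaviors on a jagged NJT (per the PR description; full-reduction backward is still unsupported):
```python
import torch

nt = torch.nested.nested_tensor(
    [torch.randn(2, 4), torch.randn(3, 4)], layout=torch.jagged
)
s1 = nt.sum(dim=-1, keepdim=True)  # keepdim now keeps the correct (reduced) dim
s2 = nt.sum()                      # full reduction, newly supported (forward only)
```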
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131945
Approved by: https://github.com/davidberard98, https://github.com/jananisriram
Summary:
Previously we only checked dtype and is_dynamic to decide if two quantization specs are equivalent.
This may not work in some cases, e.g. when people use different qscheme or quant_min/quant_max.
This PR adds checks for the other fields as well.
Test Plan:
regression tests
Differential Revision: [D62530974](https://our.internmc.facebook.com/intern/diff/D62530974)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135736
Approved by: https://github.com/sxu
There was a recent strange noise of +5%, -5%.
Using only compile time:
1) avoids GC time.
2) avoids other operations that are not what we are trying to measure. ==> less probable noise.
```
collecting compile time instruction count for sum_floordiv_regression
compile time instruction count for iteration 0 is 8899290248
compile time instruction count for iteration 1 is 1188830489
compile time instruction count for iteration 2 is 1180579615
compile time instruction count for iteration 3 is 1176263131
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135785
Approved by: https://github.com/avikchaudhuri, https://github.com/anijain2305
I am thinking maybe 3 iterations are enough for this one?
- I am keeping eager and inductor, since inductor is 2X eager time.
- Eager dynamic is 2X eager, so keeping this as well.
- Inductor has three tests (dynamic gpu, gpu and cpu).
I am unsure if I am over-profiling here; happy to trim if anyone has suggestions.
```
collecting compile time instruction count for add_loop_eager
compile time instruction count for iteration 0 is 8213664211
compile time instruction count for iteration 1 is 2798628246
compile time instruction count for iteration 2 is 2796811362
compile time instruction count for iteration 3 is 2794438188
compile time instruction count for iteration 4 is 2794634117
collecting compile time instruction count for add_loop_eager_dynamic
compile time instruction count for iteration 0 is 5724108021
compile time instruction count for iteration 1 is 5499908609
compile time instruction count for iteration 2 is 5569101366
compile time instruction count for iteration 3 is 5493806364
compile time instruction count for iteration 4 is 5493169851
collecting compile time instruction count for add_loop_inductor
compile time instruction count for iteration 0 is 49789381222
compile time instruction count for iteration 1 is 25769347393
compile time instruction count for iteration 2 is 25772594322
compile time instruction count for iteration 3 is 25768695952
compile time instruction count for iteration 4 is 25768032314
collecting compile time instruction count for add_loop_inductor_gpu
compile time instruction count for iteration 0 is 23966942581
compile time instruction count for iteration 1 is 23771950919
compile time instruction count for iteration 2 is 23770784286
compile time instruction count for iteration 3 is 23780160875
compile time instruction count for iteration 4 is 23774634465
collecting compile time instruction count for add_loop_inductor_dynamic_gpu
compile time instruction count for iteration 0 is 41505055086
compile time instruction count for iteration 1 is 41293654089
compile time instruction count for iteration 2 is 41301016100
compile time instruction count for iteration 3 is 41306056207
compile time instruction count for iteration 4 is 41308171566
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135809
Approved by: https://github.com/ezyang, https://github.com/anijain2305
Summary:
Fixes https://github.com/pytorch/pytorch/issues/134778
The previous D62304294 broke some executorch tests. It has already been reverted.
In this diff, `_collect_param_buffer_metadata()` is modified so that when a `call_function` node is encountered and its input nodes include `get_attr`, we skip the fields that have been collected previously and only collect the rest of the fields. This prevents over-writing.
Test Plan:
```
buck2 test 'fbcode//mode/dev-nosan' fbcode//executorch/backends/xnnpack/test:test_xnnpack_ops
buck2 test 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_re_export_preserve_handle
buck2 test 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_run_decompositions_preserve_handle
```
Differential Revision: D62514208
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135720
Approved by: https://github.com/zhxchen17, https://github.com/jerryzh168
Fixes #134564
Root cause:
The `lintrunner` wheel released on [pypi.org](https://pypi.org/project/lintrunner/#files) only supports Windows 32-bit and Linux 64-bit. Since compilation of pytorch requires a 64-bit env, on Windows `lintrunner` has to be compiled from the source distribution. `Rust` is its dependency for compilation, as indicated in the error message, and a Visual Studio environment is needed for linking libraries.

Issue when performing `pip install lintrunner` without a Visual Studio environment activated is shown below.
```bash
>python -m pip install lintrunner
Collecting lintrunner
Downloading lintrunner-0.12.5.tar.gz (62 kB)
Installing build dependencies ... done
Getting requirements to build wheel ... done
Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: lintrunner
Building wheel for lintrunner (pyproject.toml) ... error
error: subprocess-exited-with-error
× Building wheel for lintrunner (pyproject.toml) did not run successfully.
│ exit code: 1
╰─> [137 lines of output]
Running `maturin pep517 build-wheel -i C:\Users\\miniforge3\envs\py310\python.exe --compatibility off`
📡 Using build options bindings from pyproject.toml
Compiling proc-macro2 v1.0.79
Compiling unicode-ident v1.0.12
Compiling version_check v0.9.4
Compiling windows_x86_64_msvc v0.52.4
Compiling winapi v0.3.9
Compiling serde v1.0.197
Compiling autocfg v1.2.0
Compiling syn v1.0.109
Compiling lazy_static v1.4.0
Compiling libc v0.2.153
Compiling equivalent v1.0.1
Compiling hashbrown v0.14.3
Compiling memchr v2.7.2
Compiling yansi v1.0.1
Compiling unicode-width v0.1.11
Compiling regex-syntax v0.8.3
Compiling encode_unicode v0.3.6
Compiling cfg-if v1.0.0
Compiling winnow v0.6.5
Compiling cc v1.0.92
error: could not compile `windows_x86_64_msvc` (build script) due to 2 previous errors
warning: build failed, waiting for other jobs to finish...
error: could not compile `serde` (build script) due to 2 previous errors
error: could not compile `proc-macro2` (build script) due to 2 previous errors
error: could not compile `syn` (build script) due to 2 previous errors
error: could not compile `libc` (build script) due to 2 previous errors
error: could not compile `winapi` (build script) due to 2 previous errors
💥 maturin failed
Caused by: Failed to build a native library through cargo
Caused by: Cargo build finished with "exit code: 101": `cargo rustc --manifest-path Cargo.toml --message-format json --release --bins --`
📦 Including license file "LICENSE"
🔗 Found bin bindings
error: linker `link.exe` not found
|
= note: program not found
note: the msvc targets depend on the msvc linker but `link.exe` was not found
note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.
note: VS Code is a different product, and is not sufficient.
error: aborting due to 1 previous error
error: linker `link.exe` not found
|
= note: program not found
note: the msvc targets depend on the msvc linker but `link.exe` was not found
note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.
note: VS Code is a different product, and is not sufficient.
error: aborting due to 1 previous error
error: linker `link.exe` not found
|
= note: program not found
note: the msvc targets depend on the msvc linker but `link.exe` was not found
note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.
note: VS Code is a different product, and is not sufficient.
error: aborting due to 1 previous error
error: linker `link.exe` not found
|
= note: program not found
note: the msvc targets depend on the msvc linker but `link.exe` was not found
note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.
note: VS Code is a different product, and is not sufficient.
error: aborting due to 1 previous error
error: linker `link.exe` not found
|
= note: program not found
note: the msvc targets depend on the msvc linker but `link.exe` was not found
note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.
note: VS Code is a different product, and is not sufficient.
error: aborting due to 1 previous error
error: linker `link.exe` not found
|
= note: program not found
note: the msvc targets depend on the msvc linker but `link.exe` was not found
note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.
note: VS Code is a different product, and is not sufficient.
error: aborting due to 1 previous error
Error: command ['maturin', 'pep517', 'build-wheel', '-i', 'C:\\Users\\\\miniforge3\\envs\\py310\\python.exe', '--compatibility', 'off'] returned non-zero exit status 1
[end of output]
note: This error originates from a subprocess, and is likely not a problem with pip.
ERROR: Failed building wheel for lintrunner
Failed to build lintrunner
ERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (lintrunner)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134567
Approved by: https://github.com/malfet
Summary:
As title. Follow up to add stats summary (mean/min/max, etc) for jit inductor tensor value printing as well.
The inductor python wrapper code level printing would look something like this:
{F1859224287}
Test Plan: CI
Reviewed By: chenyang78
Differential Revision: D62415575
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135887
Approved by: https://github.com/chenyang78
We previously only supported the case where the v_head dim and qk_head dim are the same. When we allowed different head dims, I accidentally kept the query's strides for the output. This PR fixes that bug and also ensures that we always produce output in the same stride order as the input query.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135882
Approved by: https://github.com/yanboliang, https://github.com/Chillee
Summary:
Record remote cache time saved via frame_phase_timing
We add to the "phase" when remote cache hits and saves us time, so that we have a 1:1 correspondence between a frame and time saved.
Test Plan:
Internally run benchmark, see that it's populated in sandbox table after previous diff lands and logger config is actualized.
Show that column exists in table:
https://fburl.com/scuba/logger_staging_jjwu_30582a48f1ff9cf5f4ac50a4c40af/fp2te0ff
Note that an earlier version of D62105258 had the column as a string, so the staging table is a bit messed up. But you can see the most recent samples have the column populated as a float.
Reviewed By: aorenste
Differential Revision: D62106921
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135490
Approved by: https://github.com/aorenste
Summary:
Since https://www.internalfb.com/diff/D62215095 landed there have been many silent errors due to the dependency between functional_tensor and config.
```
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/export/__init__.py", line 64, in <module>
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/export/dynamic_shapes.py", line 23, in <module>
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/export/exported_program.py", line 26, in <module>
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/_higher_order_ops/__init__.py", line 1, in <module>
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/_higher_order_ops/cond.py", line 6, in <module>
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/_subclasses/functional_tensor.py", line 9, in <module>
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/_inductor/config.py", line 44, in <module>
```
https://fburl.com/logarithm/ol5kx0ee
complaining about a cyclic dependency; this diff fixes it.
Test Plan: buck test multipy/runtime:test_deploy_embedded_cuda_interp_without_cuda_available -- --run-disabled TorchpyTest.AcquireMultipleSessionsInDifferentPackages
Reviewed By: aorenste
Differential Revision: D62616765
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135926
Approved by: https://github.com/aorenste, https://github.com/oulgen, https://github.com/Skylion007
This PR implements tracing of `with` contexts for TorchFunction modes which have the default enter/exit behavior (i.e. pushing/popping the mode).
Typically the bytecode for a context manager looks like this during a graph break:
1. graph call
2. enter context
3. unsupported code
4. exit context
5. resume call
resume fn structure:
1. enter context
2. jump
...
3. exit context
The issue with torch function modes is that side effects will replay any mutations to the torch function stack performed during tracing. So, we do not need to enter and exit around the unsupported code in the original function (doing so would result in a duplicate torch function mode entry during execution of the unsupported code), and we don't need to enter again in the resume function (the mode that was pushed from the side effects bytecode would still be on the stack).
So for torch function modes the structure of our output code is this:
1. graph call
2. mutate tf mode stack to replay mutations
3. unsupported code
4. on exception restore stack
5. resume function
Then our resume fn looks like this:
1. no-op enter torch function mode
2. jump
3. exit tf mode
To implement the no-op enter of the torch function mode, I added a torch function mode in polyfill which no-op enters but exits normally. This is needed because we still want to trace the with context in the resume function and exit properly (the exit instructions will still be in the function, so we need to generate instructions to set up the context).
Separately from the bytecode, dynamo also tracks contexts on the block stack, which is how the SETUP_* instructions are implemented. Naturally at a graph break, we exit these block stacks to properly reset the contexts entirely, so that we can re-enter around the unsupported code soundly. However, once again, in the torch function mode case, in the event of a graph break we do not want to perform any exit side effects, because we want to preserve the state of the mode stack as-is so that we will properly update the stack with the bytecode mentioned in the first section. If we exited here, dynamo would pop the mode off of the symbolic stack and not update the true python torch function mode stack with the suffix bytecode.
All in all, for torch function modes we enter exactly once, update the global torch function mode stack with side effects bytecode, re-read this stack when compiling the resume function, and exit exactly once in the resume function. This matches the semantics of eager exactly.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135422
Approved by: https://github.com/williamwen42
ghstack dependencies: #134732, #133137, #135443, #135444
In preparation for tracing through DeviceContext (defb515306/torch/utils/_device.py (L66))
This PR adds support for calling the setattr of thread local objects. These objects have a slots impl, and since that setattr doesn't appear to have any side effects, we call it when replaying mutations; calling `object.__setattr__` on these objects results in a type error.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135443
Approved by: https://github.com/anijain2305
ghstack dependencies: #134732, #133137
For tracing cond/while in eager, we trace the HOP with the eager backend with metadata torchfunction mode enabled. HOPs disallow the mutation that occurs in this torch function mode, so it is not able to be traced. As a result, we use a custom backend which enters this mode for tracing these HOPs. Thanks to @ydwu4 for the help with implementing this
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134732
Approved by: https://github.com/ydwu4
Fix https://github.com/pytorch/pytorch/issues/134095
This is a workaround for loading a full state dict into an FSDP1+TP 2D model.
Since named_parameters() in FSDP1 does not return DTensor, we don't have the information to shard the full_state_dict and load it directly into the 2D model. In order to load a full state dict into an FSDP1+TP 2D model, we need to do the following (a sketch follows the list):
- load the full state dict into a 1D FSDP model
- dcp.save the full/shard state dict into storage
- initialize a 2D FSDP1+TP model
- get the default sharded state dict for the 2D model (full_state_dict=False)
- dcp.load the state dict from storage
- load the state dict into the 2D model
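A minimal sketch of the steps above, assuming an already initialized process group; `model_1d`, `model_2d`, `full_state_dict`, and the checkpoint directory are placeholders for the user's own setup:
```python
import torch.distributed.checkpoint as dcp
from torch.distributed.checkpoint.state_dict import (
    StateDictOptions,
    get_model_state_dict,
    set_model_state_dict,
)

# model_1d / model_2d / full_state_dict are placeholders (see the steps above)
# 1) load the full state dict into the 1D FSDP model
model_1d.load_state_dict(full_state_dict)

# 2) dcp.save the sharded state dict into storage
dcp.save(get_model_state_dict(model_1d), checkpoint_id="ckpt_dir")

# 3) / 4) initialize the 2D FSDP1+TP model and get its default sharded state dict
sharded_sd = get_model_state_dict(
    model_2d, options=StateDictOptions(full_state_dict=False)
)

# 5) / 6) dcp.load from storage, then load the result into the 2D model
dcp.load(sharded_sd, checkpoint_id="ckpt_dir")
set_model_state_dict(model_2d, sharded_sd)
```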
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135763
Approved by: https://github.com/fegin
ghstack dependencies: #135725
Fix https://github.com/pytorch/pytorch/issues/134095
This fixes the distributed state dict full_state_dict option hang during set_state_dict. We switch `_distribute_tensors` in _state_dict_utils.py to use `DTensor.from_local` instead of `distribute_tensor` to support the FSDP2+TP 2D strided sharding use case, as `distribute_tensor` cannot handle strided sharding yet. `distribute_tensor` incurs a scatter behind the scenes, while `DTensor.from_local` takes the local slice from the full tensor on each rank to create the DTensor (no collective). This means it's the user's responsibility to make sure the full_tensor from the full_state_dict is the same across all ranks.
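For illustration only, a simplified comparison of the two APIs on a 1D mesh; it assumes an initialized process group and uses a plain `Shard(0)` placement rather than the strided-sharding placements FSDP2+TP actually produces:
```python
import torch
import torch.distributed as dist
from torch.distributed._tensor import DTensor, Shard, distribute_tensor
from torch.distributed.device_mesh import init_device_mesh

world_size = dist.get_world_size()
mesh = init_device_mesh("cuda", (world_size,))
full = torch.randn(16, 8, device="cuda")  # must be identical on every rank

# distribute_tensor() shards the full tensor via a scatter (a collective).
dt_scattered = distribute_tensor(full, mesh, [Shard(0)])

# DTensor.from_local() just wraps this rank's local slice (no collective).
local = full.chunk(world_size, dim=0)[dist.get_rank()]
dt_wrapped = DTensor.from_local(local, mesh, [Shard(0)])
```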
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135725
Approved by: https://github.com/fegin
Summary: Fixed a bunch of fbcode imports that happened to work but confused autodeps. After this autodeps still suggests "improvements" to TARGETS (which breaks our builds) but at least it can find all the imports.
Test Plan:
```
fbpython fbcode/tools/build/buck/linters/lint_autoformat.py --linter=autodeps --default-exec-timeout=1800 -- fbcode/caffe2/TARGETS fbcode/caffe2/test/TARGETS
```
Before:
```
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/testing.py:229) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https://fbur$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_export.py:87) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https://fburl$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/test_serdes.py:9) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https://fb$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_serdes.py:10) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https://fburl$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_retraceability.py:7) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https:$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/test_retraceability.py:6) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See ht$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_export_nonstrict.py:7) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See http$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/test_export_nonstrict.py:6) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See $
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/test_export_training_ir_to_run_decomp.py:8) when processing rule "test_export". Please make sure it's listed in the srcs parameter of an$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_export_training_ir_to_run_decomp.py:10) when processing rule "test_export". Please make sure it's listed in the srcs parameter of anoth$
ERROR while processing caffe2/test/TARGETS: Found "//python/typeshed_internal:typeshed_internal_library" owner for "cv2" but it is protected by visibility rules: [] (from caffe2/test/test_bundled_images.py:7) when processing rule "test_bundled_$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "caffe2.test.profiler_test_cpp_thread_lib" (from caffe2/test/profiler/test_cpp_thread.py:29) when processing rule "profiler_test_cpp_thread". Please make sure it's listed in t$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "torch._utils_internal.get_file_path_2" (from caffe2/test/test_custom_ops.py:23) when processing rule "custom_ops". Please make sure it's listed in the srcs parameter of anoth$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "torch._utils_internal.get_file_path_2" (from caffe2/test/test_public_bindings.py:13) when processing rule "public_bindings". Please make sure it's listed in the srcs paramete$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "torch._C._profiler.symbolize_tracebacks" (from caffe2/test/test_cuda.py:3348) when processing rule "test_cuda". Please make sure it's listed in the srcs parameter of another $
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "torch._C._profiler.gather_traceback" (from caffe2/test/test_cuda.py:3348) when processing rule "test_cuda". Please make sure it's listed in the srcs parameter of another rule$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for include <torch/csrc/autograd/profiler_kineto.h> (from caffe2/test/profiler/test_cpp_thread.cpp:2) when processing profiler_test_cpp_thread_lib. Some things to try:
```
Differential Revision: D62049222
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135614
Approved by: https://github.com/oulgen, https://github.com/laithsakka
Fixes #131337
- add `arg_type` for workspace_arg, the type is consistent with the type in `generate_workspace_allocation()`.
- do not generate example tensors for `workspace`, and use `generate_workspace_allocation()` instead.
- add workspace allocation generation code to `kernel_autotune_calls`. e.g.
```python
workspace = empty_strided_cuda((1280, ), (1, ), torch.uint8)
workspace.zero_()
.....
triton_spl_fused_add_cumprod_0.run(buf2, arg0_1, arg1_1, workspace, 1, 10000, grid=split_scan_grid(1, 10000), stream=stream0)
del buf2, arg0_1, arg1_1, workspace
```
- add `empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda` to the header of triton autotune code.
The generated cpp has lines like below, so we also implement a `zero_()` for `AtenTensorHandle`.
```cpp
static constexpr int64_t int_array_0[] = {1280L, };
static constexpr int64_t int_array_1[] = {1L, };
AtenTensorHandle workspace_handle;
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(1, int_array_0, int_array_1, cached_torch_dtype_uint8, cached_torch_device_type_cuda, 0, &workspace_handle));
RAIIAtenTensorHandle workspace(workspace_handle);
workspace.zero_();
```
- Fix handling of the grid_fn for grid computation. Pass in "RBLOCK" to `split_scan_grid`
- Fix dynamic shapes:
Without the fix we generate code that looks like this `workspace = empty_strided_cuda((32*((255 + s0) // 256), ), (1, ), torch.uint8)` when doing triton autotune and `s0` is not defined.
The solution approach is to use `V.graph.sizevars.size_hint(nbytes)` to realize the workspace size for triton autotune. Note that we only realize it for triton autotune code, but not for the cpp cuda code.
- We also generate slightly different cpp code depending on if `abi_compatible` is turned on.
```cpp
RAIIAtenTensorHandle workspace(workspace_handle);
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_zero_(workspace.get()));
```
vs
```cpp
at::Tensor workspace = at::detail::empty_strided_cuda({8L*(c10::div_floor_integer(static_cast<int64_t>((255L + s0)), static_cast<int64_t>(256L))), }, {1L, }, at::kByte, c10::DeviceType::CUDA);
workspace.zero_();
```
Test Plan:
```
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor.py -k GPUTests.test_consecutive_split_cumprod_cuda
python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_consecutive_split_cumprod_cuda_cuda_wrapper
python test/inductor/test_cuda_cpp_wrapper.py DynamicShapesCudaWrapperCudaTests.test_consecutive_split_cumprod_cuda_dynamic_shapes_cuda_wrapper
TORCHINDUCTOR_ABI_COMPATIBLE=1 python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_consecutive_split_cumprod_cuda_cuda_wrapper
TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor.py -k GPUTests.test_consecutive_split_cumprod_cuda
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135552
Approved by: https://github.com/desertfire
Previously we were accommodating `torch._dynamo.mark_dynamic()` for export's dynamic shapes. Here we clean things up and ignore it, requiring users to specify dynamism via export's `dynamic_shapes` input instead.
Note: there are 4 decorators relevant to export: `mark_dynamic, maybe_mark_dynamic, mark_static, mark_unbacked`. User calls that involve export have only used `mark_dynamic()`, and we use `maybe_mark_dynamic` under the hood for `Dim.AUTO`, but we could start using the others. One reason I decided not to warn and to just silently ignore is that these decorators cause the tensors to carry dynamic info, and it'll be hard to tell whether the markers came from export or from user calls when re-exporting with the same inputs.
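As a rough sketch of the export-native way to request dynamism after this change (the module below is just for illustration):
```python
import torch
from torch.export import Dim, export

class M(torch.nn.Module):
    def forward(self, x):
        return x * 2

x = torch.randn(4, 8)
# torch._dynamo.mark_dynamic(x, 0) is now ignored by export; instead,
# dynamism is requested through the dynamic_shapes argument.
ep = export(M(), (x,), dynamic_shapes={"x": {0: Dim("batch")}})
```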
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135536
Approved by: https://github.com/avikchaudhuri
Optimized dynamic quantization for aarch64 was enabled by #126687 and #134897
This PR fixes an issue for aarch64 where on a [cache miss](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp#L592) (e.g. if input dimensions change) [ideep::matmul_forward::compute ](https://github.com/intel/ideep/blob/pytorch-rls-v3.5.3-2/include/ideep/operators/matmul.hpp#L160) (wrongly) runs with the [default lowp_kind (u8s8)](https://github.com/intel/ideep/blob/pytorch-rls-v3.5.3-2/include/ideep/operators/matmul.hpp#L174) which is not supported by oneDNN+ACL (Arm Compute Library), causing the workload to fall back to a much slower oneDNN gemm:jit kernel
Example:
```python
import torch
DIM = 4096
INPUT_SIZE1 = 32
INPUT_SIZE2 = 16
class LinearNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(DIM, DIM, bias=False)

    def forward(self, x):
        x = self.fc1(x)
        return x

input1 = torch.randn(size=(INPUT_SIZE1, DIM))
input2 = torch.randn(size=(INPUT_SIZE2, DIM))

with torch.no_grad():
    model = LinearNet()
    model = torch.ao.quantization.quantize_dynamic(model, {torch.nn.Linear})
    model(input1)  # this goes to ACL lowp_gemm
    print("=" * 50)
    model(input2)  # this goes to gemm:jit without this PR, and to ACL with this PR
```
In the code snippet above:
- The matmul from `model(input1)` goes to oneDNN+ACL (in both cases, with and without the PR)
- The matmul from `model(input2)`: **Without this PR**: there's a cache miss (different input shapes) and matmul_forward::compute is run with the default lowp_kind (u8s8). Hence the matmul falls back to gemm:jit in oneDNN. However, **With this PR** the matmul goes to oneDNN+ACL which is around 10x faster than oneDNN+jit.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135058
Approved by: https://github.com/jondea, https://github.com/malfet
Summary:
Sometimes we only want to generate a replacement for a matched pattern
once we know some information about the nodes in the pattern.
So far, we have found this the most useful to do matches based on specific
shapes of tensors flowing into functions.
Use a callback function similar to `match_filters`. By default this isn't used.
Had to make `replacement` a None-able parameter because Callable was
already used to detect a case where a graph needed to be traced.
Differential Revision: D62412628
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135553
Approved by: https://github.com/SherlockNoMad
This PR:
* Implements the pre-existing `nt.to_padded_tensor(padding_val)` ATen op via the FBGEMM kernel + appropriate view gymnastics (since that kernel only handles 2D values)
* Introduces a new `_nested_from_padded_tensor` op for the reverse conversion, implemented via the reverse FBGEMM kernel + view gymnastics
* Note: there is currently no public API for this; design booted to a future PR
TODO:
* ~~Propagate min / max sequence length via the new factory function `_nested_from_padded_tensor`~~
* ~~Verify that Inductor does computation fusion via test logic~~
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125947
Approved by: https://github.com/soulitzer
This PR resolves #134408. It adds an additional test, which passes locally.
Do you think we should add a post-check to ensure `args` and `kwargs` are not both `None`? It seems to be possible to have modules without inputs.
This PR does not include any such post-check.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134643
Approved by: https://github.com/zou3519
**Summary**
1. This PR removes the public API `compute_local_shape` and replaces its use with the more general API `compute_local_shape_and_global_offset` (see the sketch after this list).
2. To keep `compute_local_shape_and_global_offset` consistent with `compute_local_shape` on empty shards, it now returns local tensor shape `(0,)` for empty shards which is more aligned with DTensor's semantics on non-participating ranks.
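A hedged sketch of the replacement API named in item 1; the module path below is where the helper currently lives internally and may change:
```python
from torch.distributed._tensor import Shard
from torch.distributed._tensor._utils import compute_local_shape_and_global_offset
from torch.distributed.device_mesh import init_device_mesh

mesh = init_device_mesh("cuda", (4,))  # assumes a 4-rank process group
local_shape, global_offset = compute_local_shape_and_global_offset(
    (10, 8), mesh, [Shard(0)]
)
# e.g. on rank 0 this is ((3, 8), (0, 0)); an empty shard now reports a local
# shape of (0,), matching DTensor's semantics for non-participating ranks.
```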
**Test**
`pytest test/distributed/_tensor/test_dtensor.py`
`pytest test/distributed/_tensor/test_init.py`
`pytest test/distributed/_tensor/test_tensor_ops.py`
Differential Revision: [D62415591](https://our.internmc.facebook.com/intern/diff/D62415591)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135554
Approved by: https://github.com/tianyu-l, https://github.com/wz337
When the input format for group norm is NHWC and the device is privateuseone, it introduces an additional transpose operation. To avoid this issue, a check for the privateuseone device needs to be added here.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135575
Approved by: https://github.com/ezyang
Summary:
Skip test_prepare_qat_conv_bn_fusion_getitem_placeholder when we use training ir, since it's only for bn-getitem pattern, but the pattern doesn't exist in training ir.
Remove BLOCK_LIST since it's empty.
Now all internal unittests will use training ir.
Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' caffe2/test/quantization:test_quantization -- -r test_prepare_qat_conv_bn_fusion_getitem_placeholder
buck2 run 'fbcode//mode/dev-nosan' caffe2/test:quantization_pt2e_qat -- -r test_prepare_qat_conv_bn_fusion_getitem_placeholder
```
Differential Revision: D62387987
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135729
Approved by: https://github.com/tugsbayasgalan
Summary:
We observed another long computation issue for OBA_AFOC pyper model, thus adding a pattern to avoid the perf regression
- Only happens in A100
- Do not want to use force_shape_pad since it will pad all GEMMs, which may not be optimal. The Optimus pass has more flexibility to customize which GEMM shapes get padded and do the corresponding padding
- To enable it, we pass the pass to the config, where "k_threshold_to_pad" can be customized:
inductor_config.patch(post_grad_fusion_options={"pad_aten_mm_pass": {"k_threshold_to_pad" : 8388608}})
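A hedged usage sketch of the config shown above, applied as a context manager around compilation (`model` and `inputs` are placeholders; 8388608 is just the example threshold):
```python
import torch
import torch._inductor.config as inductor_config

with inductor_config.patch(
    post_grad_fusion_options={"pad_aten_mm_pass": {"k_threshold_to_pad": 8388608}}
):
    compiled_model = torch.compile(model)  # `model` / `inputs` are placeholders
    out = compiled_model(*inputs)
```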
Test Plan:
# unit test
```
buck2 test mode/opt //caffe2/test/inductor:pad_mm
```
Buck UI: https://www.internalfb.com/buck2/58b0f272-f405-45be-bc8d-aec2dc4d5841
Test UI: https://www.internalfb.com/intern/testinfra/testrun/10133099209954651
Network: Up: 9.0KiB Down: 142B (reSessionID-8eb71a37-a5ca-4aff-a4f1-93ade3e47e4e)
Jobs completed: 9. Time elapsed: 3:18.0s.
Cache hits: 0%. Commands: 3 (cached: 0, remote: 0, local: 3)
Tests finished: Pass 17. Fail 0. Fatal 0. Skip 0. Build failure 0
# e2e test
see [D62388582](https://www.internalfb.com/diff/D62388582)
Differential Revision: D62220158
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135167
Approved by: https://github.com/jackiexu1992
When cpu offloading is enabled, if a user loads a GPU state dict, FSDP2 will throw a less obvious error at backward:
```
RuntimeError: attempting to assign a gradient with device type 'cpu' to a tensor with device type 'cuda'. Please ensure that the gradient and the tensor are on the same device
```
This PR throws the error more explicitly by specifying which parameters should be moved because of cpu offloading:
```
FSDP parameters should be materialized on cpu when enabling cpu offloading. For example, load cpu state dict or call module.to_empty(device="cpu"). Found following parameters on non-cpu device: ['0.weight']
```
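A minimal sketch of the expected user-side fix, assuming `model` is the FSDP2-wrapped module and `gpu_state_dict` is the offending GPU state dict:
```python
# Either load a CPU copy of the state dict...
cpu_state_dict = {k: v.cpu() for k, v in gpu_state_dict.items()}
model.load_state_dict(cpu_state_dict)

# ...or, for meta-device initialized modules, materialize parameters on CPU first.
model.to_empty(device="cpu")
```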
`pytest -s test/distributed/_composable/fsdp/test_fully_shard_state_dict.py -k test_dp_state_dict_cpu_offload`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135156
Approved by: https://github.com/awgu
Using `fsdp.set_` for the unsharded_param in-place update causes difficult-to-debug errors when enabling Traceable FSDP2 on TorchTune models. In this PR, we change it to use `fsdp.copy_`, which fixes the error and also strictly follows eager semantics (i.e. if the user explicitly stores an alias of the unsharded_param during execution of the user's module code, that alias will get updated correctly when the unsharded_param is copy_-ed into; whereas if we just swap out the unsharded_param storage via set_, that user-saved alias will not get updated, which is not good).
This PR also implements the graph pass to remove the resizes and copy if there is a resize_(full) -> copy_ -> resize_(0) pattern.
------
Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_trace_fsdp_copy_`
- `pytest -rA test/dynamo/test_repros.py::ReproTests::test_partitioner_cse_respects_mutation_boundaries`
- `pytest -rA test/dynamo/test_repros.py::ReproTests::test_fsdp_set_input_mutation_applied_when_input_gets_no_gradients`
- `pytest -rA test/inductor/test_pattern_matcher.py::TestPatternMatcher::test_mutation_op_matching`
- `python test/inductor/test_distributed_patterns.py DistributedPatternTests.test_fake_distributed_aot_eager`
- `PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=1 PYTORCH_TEST_WITH_CROSSREF=1 python test/functorch/test_aotdispatch.py TestEagerFusionOpInfoCPU.test_aot_autograd_exhaustive_norm_cpu_float32`
- `python test/distributed/test_inductor_collectives.py TestCollectivesInductor.test_backwards`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133730
Approved by: https://github.com/bdhirsh
This PR is a replacement for https://github.com/pytorch/pytorch/pull/133085 for pushing a quick fix for RMSNorm.
The original author is @kkontny
Previous PR summary:
Since FP16 has quite small dynamic range it is very easy to overflow while computing `at::pow(input, 2)` , and it happens in real world computation.
I've tried to use the fused `nn.RMSNorm` implementation instead of `LlamaRMSNorm` inside the `transformers` implementation of Llama (`src/transformers/models/llama/modeling_llama.py`). It started to give wrong answers in FP16 while still giving good results in FP32. I figured out this happens due to overflow while computing the square of the input tensor.
Original `LLamaRMSNorm` implementation upcasts input to fp32 to prevent this and give better numerical stability.
```
class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)
```
The proposed commit fixes the issue: FP16 in RMSNorm has to be treated in a special way to be usable in real-world implementations.
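As a quick, self-contained illustration of the overflow (independent of RMSNorm itself): squaring even moderate FP16 values exceeds the FP16 max of ~65504, while the FP32 upcast stays finite.
```python
import torch

x = torch.full((8,), 300.0, dtype=torch.float16)
print((x * x).mean())                  # inf: 300**2 = 90000 > 65504 (fp16 max)
print((x.float() * x.float()).mean())  # 90000.0, finite in fp32
```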
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134106
Approved by: https://github.com/mikaylagawarecki, https://github.com/eqy
Notable changes:
1. Enable CudaGraph related tests
2. Fix UT problems
3. EXPERIMENTAL Navi31 support. User should enable Navi31 support with Env Var `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1`
Known Problems:
1. `test/test_transformers.py` will show massive failures and/or NaN outputs with `--use-pytest`
+ Update: Confirmed skip `class TestSDPAPrivateUse1Only` can fix the problem with `--use-pytest`
Note:
AOTriton 0.7b adds support for nested tensors + SDPA but needs more work (and consequently a separate PR) to enable it.
Fixes #133540
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134498
Approved by: https://github.com/pruthvistony, https://github.com/jeffdaily, https://github.com/malfet
Summary: Update the SDPA decomposition to match the updated strides from D62009189, which aligns strides with `aten._scaled_dot_product_attention_math.default` and makes `t.permute().contiguous().permute()` no longer necessary.
Test Plan: CI
Differential Revision: D62278378
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135297
Approved by: https://github.com/drisspg
Summary: as title
Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test:test_export -- -r test_conv_dynamic
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test:fx -- -r matcher
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r x86
```
CI
Differential Revision: D62448302
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135623
Approved by: https://github.com/tugsbayasgalan
* Add pytorchbot to list of approvers for file
* Add labels to the auto created PR
The auto-generated PR is currently not merging due to some failing tests on the slow workflow that were supposed to be moved back to normal.
I don't know if this has much value; clearly we've been managing without the update.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135390
Approved by: https://github.com/ZainRizvi
Summary: For S444023
Test Plan:
Revert prevented the NaN errors - f639391901
Training job ran for 7767 iterations. NaN errors show up within the first 1k.
Reviewed By: nmacchioni
Differential Revision: D62224747
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135186
Approved by: https://github.com/kit1980
Summary:
These kernels are still utilized directly when using relu/sigmoid/tanh tensors from here: https://fburl.com/code/k6n7ofzd
However, on Mac Catalyst we were always returning `nil`, which in most cases rendered the entire graph useless and most often left stray `MPSTemporaryImage` references that were never written into.
This fixes the issue completely by making sure that we always return the valid kernels back, so they can be executed.
Test Plan: Test with segmentation net that uses a combination of relu and other tensors together - run this via Mac Catalyst build - it works! {F1858576745}
Reviewed By: MichaelTay
Differential Revision: D62430010
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135595
Approved by: https://github.com/MichaelTay
Some customers would like to run the NaN checks on the fly, so we are improving their efficiency.
## Benchmarking
Allreduce 2G floats. `TORCH_NCCL_NAN_CHECK=1`
Red kernel: ncclAllreduce
Blue kernel: Nan check
<img width="1093" alt="Screenshot 2024-09-06 at 10 00 05 PM" src="https://github.com/user-attachments/assets/5501bc31-024f-4115-adb2-dd66eb4025d3">
## Comparison with torch ops:
Let's say a user manually check for NaNs with the following torch ops before all-reduce:
```
torch.any(torch.isnan(x))
```
<img width="1091" alt="Screenshot 2024-09-06 at 10 14 53 PM" src="https://github.com/user-attachments/assets/1f8b5f63-c955-4612-bb96-241b6c69959b">
So our perf is on-par with torch ops.
## Changes
- Load from vidmem using "big packs" of 16 bytes
- Bump `blockDim.x` from 256 to 512
- Separate loads and checks into two loops, each of 8 iterations
- Unroll the loops
- Templated functions for checking NaN in a "big pack" based on dtype
Special thanks to @jbachan from NCCL!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135414
Approved by: https://github.com/wconstab
While designing something else that needs TCPStore, I spent some time digging into the TCPStore codebase and found the code a little challenging to understand without proper documentation. Although people from the OSS community must be smarter than me, I still want to document my findings in the code so that devs and users can use them as a reference down the road.
Also, for libuv, we need to prefix private variables with an "_", so this is a pure renaming of private variables such as `tcpServer`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130496
Approved by: https://github.com/wconstab
When FileCheck is destructed without execution, it should output all rules.
For example:
```
>>> fc = FileCheck().check("test")
>>> del fc
You have not run this instance of FileCheck!
FileCheck checks:
CHECK: test
```
Additionally, unit tests for the Python interface of FileCheck will be added.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135345
Approved by: https://github.com/eellison
Fixes #127519
Currently in torchrun rendezvous, there are only two rendezvous backends supported out of the box: `C10d` and `Etcd`. The changes in this PR enable distributed elastic users to bring their own out-of-tree rendezvous backend implementations as Python packages.
#### AUTHORING NEW PLUGIN
Any new plugin will be a python package exposing entry-points. For example, the structure of redis plugin is as follows:
```
plugin_root
|_ pyproject.toml
|_ src
|_ redis
|_ __init__.py
|_ redis_store.py
|_ redis_backend.py
```
The contents of the `pyproject.toml` should indicate that this package exposes a torchrun entry point by mentioning the group name `torchrun.plugins`. The `pyproject.toml` for the redis plugin would be as follows:
```
[project]
name = "redis"
version = "0.0.1"
[project.entry-points.'torchrun.plugins']
redis = 'redis'
```
The `src/redis/__init__.py` file would contain functions that return the plugin name and plugin handler. The contents of `__init__.py` for redis would be as follows:
```
def getPluginHandler():
def _create_redis_handler(params: RendezvousParameters):
from redis_rendezvous_backend import create_backend
backend, store = create_backend(params)
return create_handler(store, backend, params)
return _create_redis_handler
```
The files `redis_store` and `redis_backend` contain the implementation of [Store](41189b0da4/torch/_C/_distributed_c10d.pyi (L171)) and [RendezvousBackend](e782918b8e/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py (L61)) respectively.
#### USER EXPERIENCE
Before using the plugin for the first time, the user has to install the plugin packages. For example, published packages can be installed using `pip3 install <plugin-name>`, and a plugin in the local file system can be installed using `pip3 install -e <plugin-location>`.
Once installed, the new backend can be used in torchrun as follows:
```
torchrun --rdzv-backend=redis --rdzv-endpoint=redis-container:6379 --nnodes=3 --nproc-per-node=1 --max-restarts=3 --rdzv-id=1 test.py
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132633
Approved by: https://github.com/fduwjj
* Added a cpp loader, AOTIModelPackageLoader, which can load the .pt2, build the .so, and create a runner. The python-facing API is that users can directly call the `run` function, whereas in cpp users can directly access the `runner_` if they are more familiar with that. I couldn't figure out how to bind the `get_runner()` function to python...
* Added a new config, `aot_inductor.package_cpp_only` which will **not** package the so. This means that whenever the package is loaded, we will need to build the so. This is turned off by default so that new environments do not need to rebuild their so. The `package_cpp_only` is a feature which torchchat intends to use to provide flexibility to users.
* Added a new config, `aot_inductor.metadata` which stores user-provided metadata, serialized to the pt2 as a json file. It also stores the device used when exporting, "cuda" or "cpu", so that during load time, we can use that data to determine which AOTIModelContainerRunner to use. The metadata can be accessed through `loader.get_metadata()`. TODO is to move this metadata to the toplevel `package_aoti` function so that we can remove the metadata as a config.
* Separated out `package_aoti` as a standalone function, instead of it automatically being called in inductor. This is to prepare for the case where users will compile multiple models, and want to bundle it in one package. The specific use case is in torchchat, where we want to package the separately-exported encoder and decoder layers. An example of how to use this is in `test_multiple_methods`.
* `load_package` will load a singular model, given the model name.
* The loader doesn't support windows for now, I think I need to add some more casing to make the build commands work on windows?
Differential Revision: [D62329906](https://our.internmc.facebook.com/intern/diff/D62329906)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135374
Approved by: https://github.com/desertfire, https://github.com/malfet
Previously, Inductor was allowed to modify the stride/storage_offset
(layout) for inputs to user-defined triton kernels. This can cause
silent incorrectness because most triton kernels are written for a
specific striding pattern (usually contiguous).
This PR adds a config to allow the user to choose Inductor's behavior on
this. The options are:
- "flexible_layout" (default): Inductor can modify the layout for inputs
to user-defined triton kernels as much as it wants.
- "needs_fixed_stride_order": Inductor must preserve the stride order
(when compared to tracing) for inputs to user-defined triton kernels.
This matches our handling for custom operators. In the future, we'll
want a "needs_exact_strides" option (this is the safest option).
Test Plan:
- new test
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135530
Approved by: https://github.com/FindHao, https://github.com/oulgen
Fixes #132964
This change is to optimize torch.sum() performance by increasing max_values_per_thread in setReduceConfig() for ROCm platform.
By increasing this parameter, it uses fewer threadblocks and improved the performance.
Test:
Tested on MI300x and H100, and now the MI300x perf improved to 3205GByte/s from ~1690GByte/s for the test case and is slightly better than H100 (3136GByte/s).
Also tested with other different sizes of tensors and also see perf improvement.
```python
import torch
from triton.testing import do_bench
x = torch.randn(2**30, device='cuda')
ms = do_bench(lambda: x.sum(dim=-1))
bandwidth_gbyte = x.numel() * x.dtype.itemsize / (10**9)
time_s = ms / 1000
bw_per_second = bandwidth_gbyte / time_s
print(bw_per_second)
```
Co-author: @carlobertolli
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135397
Approved by: https://github.com/eqy, https://github.com/malfet
When a kernel does not have mutated args (this is quite common?), benchmarking the cost of cloning actually benchmarks a no-op. This still takes >100ms since triton.testing.do_bench will allocate 100 ms budget to run the kernel.
Skipping this benchmarking can save quite some compilation time if the code path is hit multiple times. Let's say, if the code path is hit 100 times when the graph is large, we would save >10s.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135533
Approved by: https://github.com/jansel
ghstack dependencies: #135531
Fix https://github.com/pytorch/pytorch/issues/134768 .
When we benchmark the latency for a fused node set, we do benchmarking twice:
1. benchmark the latency of the kernel including cloning mutated args
2. benchmark the latency of cloning mutated args without running the kernel
We subtract result 2 from result 1 to get the latency of the kernel itself.
But when the tensors are not on cuda device 0, we get equal numbers for result 1 and result 2 no matter how much work the kernel does. The root cause is that in `triton.testing.do_bench` the `torch.cuda.synchronize` call syncs the current cuda device (which is device 0 if it's not overridden). But since the tensors and kernels are located on another device, the sync actually does nothing (unless there happen to be other kernels on device 0).
The fix is to set the correct current device in our benchmarking code.
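A minimal sketch of the fix, assuming `fn` and its tensor arguments live on a single CUDA device: make that device current while benchmarking so the `torch.cuda.synchronize()` inside `do_bench` targets the right GPU.
```python
import torch
from triton.testing import do_bench

def bench_on_args_device(fn, *args):
    device = args[0].device  # assumes all tensor args share one CUDA device
    with torch.cuda.device(device):
        return do_bench(lambda: fn(*args))
```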
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135531
Approved by: https://github.com/jansel
This PR adds a private API `_set_unshard_async_op` that allows for running pre-forward and pre-backward all-gathers using the `async_op=True` path so that all-gather allocations happen in the default stream to avoid inter-stream fragmentation.
If using this option, forward requires explicit prefetching e.g. via the `unshard(async_op=True)` API for overlap. fp32 -> bf16 casts and the all-gather copy-in will not overlap with compute.
Differential Revision: [D62401551](https://our.internmc.facebook.com/intern/diff/D62401551)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135523
Approved by: https://github.com/weifengpy
Summary:
In S445839, we had HTA break because of the "stream" parameter that was added to gpu traces. This brought up discussions regarding hardening our post-processing of said inputs so as not to break the JSON schema or downstream tools. For this reason, this diff does the following:
1. Only allow int, double, bool and string values to be processed as kwinputs for JSON output. We can handle lists if needed in the future.
2. Make sure that any boolean is lowercased when converted to a string so that the JSON does not break when parsing it
3. Force stream parameter to be an int
Test Plan: Added unit tests to ensure that the list of requirements above is true for kwargs only.
Differential Revision: D62304843
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135365
Approved by: https://github.com/aaronenyeshi
Summary: This test is flaky when run after `test_dynamic_shapes_persistent_reduction_mixed_x_dim_cuda_cuda_wrapper` because the TestCase sets config options globally in its setUp() that stick around for subsequent tests. For test isolation, we use a contextlib.ExitStack pattern in other tests to patch the config options and restore them in tearDown(). Update all TestCases in `test/inductor/test_combo_kernels.py` to use that pattern.
Test Plan:
```
python test/inductor/test_combo_kernels.py
python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_dynamic_shapes_persistent_reduction_mixed_x_dim_cuda_cuda_wrapper TestCudaWrapper.test_randint_cuda_cuda_wrapper
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135370
Approved by: https://github.com/jansel
## Description
Fixes the FP32 accuracy failure of `resmlp_12_224` and BF16 accuracy failure of `volo_d1_224` in timm.
In this PR, we check whether the input is contiguous in the following way:
If it has `FixedLayout`, we know the accurate strides. For `FlexibleLayout`, if its data is a `ComputedBuffer`, we could get the fill order of the buffer to decide whether it's contiguous. For the other cases, we won't use GEMM template as we can't infer whether it's contiguous.
## Additional context
The current GEMM template only supports this case: `input.get_stride()[-1] == 1`. In `resmlp_12_224`, when we run into this check, the layout of `input` is a `FlexibleLayout`. The reason is that when realizing the input which is a `View` IR, the `convert_to_reinterpret_view` call fails:
d14fe3ffed/torch/_inductor/ir.py (L4712-L4715)
And it finally runs into this `copy_input` and returns a `FlexibleLayout`.
d14fe3ffed/torch/_inductor/ir.py (L4722)
When checking its stride, this `FlexibleLayout` indeed satisfies `input.get_stride()[-1] == 1` but it is later decided as a `FixedLayout` with `size = (3072, 196), stride = (1, 3072)`, which is not supported by the GEMM template, thus causing accuracy issue in this model.
The `FlexibleLayout` is converted to `FixedLayout` during [CppPackedGemmTemplate.add_choices](d14fe3ffed/torch/_inductor/mkldnn_lowerings.py (L1051)) which calls [slice_nd](d14fe3ffed/torch/_inductor/codegen/cpp_template_kernel.py (L150)) when rendering the kernel (`slice_nd(X)`). When creating the `SliceView` IR, [as_storage_and_layout](d14fe3ffed/torch/_inductor/ir.py (L2288)) invokes
[decide_layout](d14fe3ffed/torch/_inductor/ir.py (L2135)) and converts it to a `FixedLayout` with `size = (3072, 196), stride = (1, 3072)`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134982
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/jansel
Fixes#135432
In the current implementation, if we try to store a symbolic number in Tensor's constructor, it assumes that the tensor's dtype and the symbolic number's type are matched, which is not the case.
In other words, if we try to store a `SymInt`, current implementation assumes tensor's dtype is `torch.int32`, `torch.int64` or something. And if we try to store a `SymFloat`, it assumes tensor's dtype is `torch.float32` or `torch.float64`. However, the tensor's dtype could also be `torch.float32` or something else when we try to store `SymInt`, which would be wrong.
This PR stores symbolic numbers according to the tensor's scalar type by wrapping the guarded number of `SymInt` and `SymFloat` into a PyObject.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135433
Approved by: https://github.com/ezyang
Fixes two things:
- For regular PyTorch ops, the default layout constraint tag is always
flexible_layout. This was a bug with #135238
- Mark the new quantized _wrapped_linear_prepack ops as flexible_layout.
The metas for these are incorrect, I didn't want to fix them (and
changing the default requires the metas actually be correct).
Test Plan:
- The next PR up in the stack. The PRs are split because the next one is
riskier.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135391
Approved by: https://github.com/albanD
This PR extends the current ring attention to support load-balancing shards -- the context/sequence is divided into `2 * world_size` shards and each rank gets `rank` and `(world_size * 2 - rank - 1)` shards. The data re-shuffling is done in the `context_parallel` API.
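A tiny illustration of the shard assignment described above (pure bookkeeping, no collectives):
```python
def shards_for_rank(rank: int, world_size: int) -> tuple[int, int]:
    # the sequence is split into 2 * world_size chunks; each rank pairs an
    # "early" chunk with the matching "late" chunk to balance the causal work
    return rank, 2 * world_size - rank - 1

world_size = 4
print([shards_for_rank(r, world_size) for r in range(world_size)])
# [(0, 7), (1, 6), (2, 5), (3, 4)]
```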
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132442
Approved by: https://github.com/wconstab
Fix #134686.
PR https://github.com/pytorch/pytorch/pull/132729 makes GEMM template faster for one of the GEMMs in xcit_large_24_p8_224:
SingleProcess AUTOTUNE benchmarking takes 1.7088 seconds and 1.9207 seconds precompiling
AUTOTUNE linear_unary(12544x3072, 768x3072, 768)
cpp_packed_gemm_2 2.9371 ms 100.0%
_linear_pointwise 3.1584 ms 93.0%
But it is slower than ATen in the e2e run due to different cache behavior. Access to the input data (12544x3072) is LLC latency bound, and bottlenecks are seen due to memory synchronization (data transfers and coherence updates across processors). This PR tries to mitigate the problem by cooperatively loading different chunks of the input data from the different processors that share it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135438
Approved by: https://github.com/leslie-fang-intel
Fix example: Address broadcasting error in the addition of `attn_bias` and `attn_mask`, and correct device assignment for newly created variables in the method.
1. Adding `attn_bias += attn_mask` results in a broadcasting error. The expected shape of `attn_bias` is (L, S), so the output should also have the shape (L, S). However, when the input shape is (N, num_heads, L, S), broadcasting occurs, leading to an output shape of (N, num_heads, L, S), which is not desired.
2. `attn_bias` is a newly created variable within the method, but it is not assigned to the correct device.
**This is my retry of PR #130209 . The PR has been merged into commit `d4a79d4a7c746068d25fe5cf9333495561f4ce1f`, but the modifications were overwritten by subsequent commits.**
Co-authored-by: mikaylagawarecki <mikaylagawarecki@gmail.com>
@mikaylagawarecki provided a more elegant implementation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135427
Approved by: https://github.com/ezyang
f"Based on rollout percentage of {experiment_settings.rollout_perc}%, enabling experiment {experiment_name}."
)
enabled=True
ifenabled:
label=experiment_name
ifexperiment_name==LF_FLEET_EXPERIMENT:
# We give some special treatment to the "lf" experiment since determines the fleet we use
# - If it's enabled, then we always list it's prefix first
# - If we're in the canary branch, then we append ".c" to the lf prefix
ifis_canary:
label+=CANARY_FLEET_SUFFIX
fleet_prefix=label
else:
prefixes.append(label)
iflen(prefixes)>1:
log.error(
f'Failed to determine if user has opted-in to feature {feature}. Using fallback "{fallback}". Exception: {e}'
f"Only a fleet and one other experiment can be enabled for a job at any time. Enabling {prefixes[0]} and ignoring the rest, which are {', '.join(prefixes[1:])}"
return experiment_name in user_optins.get(user, [])
def get_runner_prefix(
rollout_state: str,
workflow_requestors: Iterable[str],
branch: str,
is_canary: bool = False,
) -> str:
"""
Used to dynamically optin jobs to specific runner-type variants.
settings = parse_settings(rollout_state)
user_optins = parse_users(rollout_state)
Returns:
The runner-type's variant name if the user has opted in to the feature, otherwise returns an empty string.
This variantname is prefixed to the runner-type in the label.
"""
try:
userlist = {u.lstrip("#").strip("\n\t@ ") for u in rollout_state.split()}
all_opted_in_users = set()
for user in userlist:
for i in user.split(","):
if i == feature:
all_opted_in_users.add(user.split(",")[0])
opted_in_requestors = {
usr for usr in workflow_requestors if usr in all_opted_in_users
}
fleet_prefix = ""
prefixes = []
for experiment_name, experiment_settings in settings.experiments.items():
enabled = False
if opted_in_requestors:
if not experiment_settings.all_branches and is_exception_branch(branch):
log.info(
f"Feature {feature} is enabled for {', '.join(opted_in_requestors)}. Using feature {feature}."
f"Branch {branch} is an exception branch. Not enabling experiment {experiment_name}."
)
return feature
else:
log.info(
f"Feature {feature} is disabled for {', '.join(workflow_requestors)}. Using fallback \"{fallback}\"."
)
return fallback
continue
except Exception as e:
# Is any workflow_requestor opted in to this experiment?
opted_in_users = [
requestor
for requestor in workflow_requestors
if is_user_opted_in(requestor, user_optins, experiment_name)
]
if opted_in_users:
log.info(
f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}."
)
enabled = True
elif experiment_settings.rollout_perc:
# If no user is opted in, then we randomly enable the experiment based on the rollout percentage
if random.uniform(0, 100) <= experiment_settings.rollout_perc:
log.info(
f"Based on rollout percentage of {experiment_settings.rollout_perc}%, enabling experiment {experiment_name}."
)
enabled = True
if enabled:
label = experiment_name
if experiment_name == LF_FLEET_EXPERIMENT:
# We give some special treatment to the "lf" experiment since determines the fleet we use
# - If it's enabled, then we always list it's prefix first
# - If we're in the canary branch, then we append ".c" to the lf prefix
if is_canary:
label += CANARY_FLEET_SUFFIX
fleet_prefix = label
else:
prefixes.append(label)
if len(prefixes) > 1:
log.error(
f'Failed to determine if user has opted-in to feature {feature}. Using fallback "{fallback}". Exception: {e}'
f"Only a fleet and one other experiment can be enabled for a job at any time. Enabling {prefixes[0]} and ignoring the rest, which are {', '.join(prefixes[1:])}"
)
return fallback
prefixes = prefixes[:1]
# Fleet always comes first
if fleet_prefix:
prefixes.insert(0, fleet_prefix)
return ".".join(prefixes) + "." if prefixes else ""
- [Building PyTorch with ASAN](#building-pytorch-with-asan)
- [Getting `ccache` to work](#getting-ccache-to-work)
CUDA, MSVC, and PyTorch versions are interdependent; please install matching versions.
Note: There's a [compilation issue](https://github.com/oneapi-src/oneDNN/issues/812) in several Visual Studio 2019 versions since 16.7.1, so please make sure your Visual Studio 2019 version is not in 16.7.1 ~ 16.7.5
## Running clang-tidy
[Clang-Tidy](https://clang.llvm.org/extra/clang-tidy/index.html) is a C++
linter and static analysis tool based on the clang compiler. We run clang-tidy
in our CI to make sure that new C++ code is safe, sane and efficient. See the
#### Prerequisites
If you are installing from source, you will need:
- Python 3.8 or later (for Linux, Python 3.8.1+ is needed)
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux)
- Visual Studio or Visual Studio Build Tool on Windows
\* PyTorch CI uses Visual C++ BuildTools, which come with Visual Studio Enterprise,
Professional, or Community Editions. You can also install the build tools from
https://visualstudio.microsoft.com/visual-cpp-build-tools/. The build tools *do not*
come with Visual Studio Code by default.
\* We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.
If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following:
If you want to compile with Intel GPU support, follow these instructions.
If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
Other potentially useful environment variables may be found in `setup.py`.
If you want to build legacy python code, please refer to [Building on legacy code and CUDA](https://github.com/pytorch/pytorch/blob/main/CONTRIBUTING.md#building-on-legacy-code-and-cuda)
**CPU-only builds**
In this mode PyTorch computations will run on your CPU, not your GPU.
The patch release process takes around 4-5 weeks to complete.
### Issue Tracker for Patch releases
For patch releases, an issue tracker needs to be created. We require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from the previous RC. An example of this would look like: