[dynamo] Remove some files from dynamo_expected_failures

Some tests in `test/dynamo` are marked as "expected failure when testing with `PYTORCH_TEST_WITH_DYNAMO=1`, i.e., we added files of those test names in the `dynamo_expected_failures` folder. However, a lot of those dynamo tests seem to be passing with `PYTORCH_TEST_WITH_DYNAMO=1`, so this patch removes them from `dynamo_expected_failures`.
Recomend pip install -r requirements in the unit testing guidelines. (#137797 )
2025-11-03 07:24:58 +08:00 · 2024-10-25 13:26:41 -07:00 · 2024-10-25 18:47:44 +00:00 · 2024-10-25 18:13:57 +00:00 · 2024-10-25 18:12:34 +00:00 · 2024-10-25 18:09:50 +00:00
2131 changed files with 70750 additions and 30225 deletions
--- a/.buckconfig.oss
+++ b/.buckconfig.oss
@ -21,6 +21,3 @@
  cxx = /usr/bin/clang++
  cxxpp = /usr/bin/clang++
  ld = /usr/bin/clang++
-
-[project]
-  default_flavors_mode=all
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -355,6 +355,12 @@ case "$image" in
    CONDA_CMAKE=yes
    VISION=yes
    ;;
+  pytorch-linux-jammy-py3-clang18-asan)
+    ANACONDA_PYTHON_VERSION=3.10
+    CLANG_VERSION=18
+    CONDA_CMAKE=yes
+    VISION=yes
+    ;;
  pytorch-linux-jammy-py3.9-gcc11)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
@ -381,6 +387,13 @@ case "$image" in
    HALIDE=yes
    TRITON=yes
    ;;
+  pytorch-linux-jammy-py3.12-triton-cpu)
+    CUDA_VERSION=12.4
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=11
+    CONDA_CMAKE=yes
+    TRITON_CPU=yes
+    ;;
  pytorch-linux-focal-linter)
    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
    # We will need to update mypy version eventually, but that's for another day. The task
@ -510,6 +523,7 @@ docker build \
       --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
       --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
       --build-arg "TRITON=${TRITON}" \
+       --build-arg "TRITON_CPU=${TRITON_CPU}" \
       --build-arg "ONNX=${ONNX}" \
       --build-arg "DOCS=${DOCS}" \
       --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-cd1c833b079adb324871dcbbe75b43d42ffc0ade
+ca4783992ed7602a39528ba304d61f00396b2a5a
--- a/.ci/docker/ci_commit_pins/triton-cpu.txt
+++ b/.ci/docker/ci_commit_pins/triton-cpu.txt
@ -0,0 +1 @@
+c7711371cace304afe265c1ffa906415ab82fc66
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-5fe38ffd73c2ac6ed6323b554205186696631c6f
+cf34004b8a67d290a962da166f5aa2fc66751326
--- a/.ci/docker/common/install_clang.sh
+++ b/.ci/docker/common/install_clang.sh
@ -13,11 +13,18 @@ if [ -n "$CLANG_VERSION" ]; then
  elif [[ $UBUNTU_VERSION == 22.04 ]]; then
    # work around ubuntu apt-get conflicts
    sudo apt-get -y -f install
+    wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add  -
+    if [[ $CLANG_VERSION == 18 ]]; then
+      apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main"
+    fi
  fi

  sudo apt-get update
-  apt-get install -y --no-install-recommends clang-"$CLANG_VERSION"
-  apt-get install -y --no-install-recommends llvm-"$CLANG_VERSION"
+  if [[ $CLANG_VERSION -ge 18 ]]; then
+    apt-get install -y libomp-${CLANG_VERSION}-dev libclang-rt-${CLANG_VERSION}-dev clang-"$CLANG_VERSION" llvm-"$CLANG_VERSION"
+  else
+    apt-get install -y --no-install-recommends clang-"$CLANG_VERSION" llvm-"$CLANG_VERSION"
+  fi

  # Install dev version of LLVM.
  if [ -n "$LLVMDEV" ]; then
--- a/.ci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@ -65,23 +65,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then

  # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
  if [[ $(uname -m) == "aarch64" ]]; then
-    CONDA_COMMON_DEPS="astunparse pyyaml setuptools openblas==0.3.25=*openmp* ninja==1.11.1 scons==4.5.2"
-
-    if [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then
-      NUMPY_VERSION=1.24.4
-    else
-      NUMPY_VERSION=1.26.2
-    fi
+    conda_install "openblas==0.3.25=*openmp*"
  else
-    CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
-
-    if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.13" ]; then
-      NUMPY_VERSION=1.26.0
-    else
-      NUMPY_VERSION=1.21.2
-    fi
+    conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
  fi
-  conda_install ${CONDA_COMMON_DEPS}

  # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
  # and libpython-static for torch deploy
@ -103,8 +90,6 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then

  # Install some other packages, including those needed for Python test reporting
  pip_install -r /opt/conda/requirements-ci.txt
-  pip_install numpy=="$NUMPY_VERSION"
-  pip_install -U scikit-learn

  if [ -n "$DOCS" ]; then
    apt-get update
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -105,7 +105,7 @@ function install_121 {
 }

 function install_124 {
-  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
+  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
  # install CUDA 12.4.1 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
@ -137,6 +137,39 @@ function install_124 {
  ldconfig
 }

+function install_126 {
+  echo "Installing CUDA 12.6.2 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
+  rm -rf /usr/local/cuda-12.6 /usr/local/cuda
+  # install CUDA 12.6.2 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run
+  chmod +x cuda_12.6.2_560.35.03_linux.run
+  ./cuda_12.6.2_560.35.03_linux.run --toolkit --silent
+  rm -f cuda_12.6.2_560.35.03_linux.run
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
+
+  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cudnn
+
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl
+
+  install_cusparselt_062
+
+  ldconfig
+}
+
 function prune_118 {
    echo "Pruning CUDA 11.8 and cuDNN"
    #####################################################################################
@ -227,12 +260,46 @@ function prune_124 {
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a

  #####################################################################################
-  # CUDA 12.1 prune visual tools
+  # CUDA 12.4 prune visual tools
  #####################################################################################
  export CUDA_BASE="/usr/local/cuda-12.4/"
  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
 }

+function prune_126 {
+  echo "Pruning CUDA 12.6"
+  #####################################################################################
+  # CUDA 12.6 prune static libs
+  #####################################################################################
+  export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
+  export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
+
+  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+  if [[ -n "$OVERRIDE_GENCODE" ]]; then
+      export GENCODE=$OVERRIDE_GENCODE
+  fi
+  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
+      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+  fi
+
+  # all CUDA libs except CuDNN and CuBLAS
+  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+  # prune CuDNN and CuBLAS
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+  #####################################################################################
+  # CUDA 12.6 prune visual tools
+  #####################################################################################
+  export CUDA_BASE="/usr/local/cuda-12.6/"
+  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
+}
+
 # idiomatic parameter and option handling in sh
 while test $# -gt 0
 do
@ -243,6 +310,8 @@ do
        ;;
    12.4) install_124; prune_124
        ;;
+    12.6) install_126; prune_126
+        ;;
    *) echo "bad argument $1"; exit 1
        ;;
    esac
--- a/.ci/docker/common/install_cuda_aarch64.sh
+++ b/.ci/docker/common/install_cuda_aarch64.sh
@ -5,19 +5,19 @@ set -ex

 NCCL_VERSION=v2.21.5-1

-function install_cusparselt_052 {
+function install_cusparselt_062 {
    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
    mkdir tmp_cusparselt && pushd tmp_cusparselt
-    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
-    tar xf libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
-    cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/include/* /usr/local/cuda/include/
-    cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.6.2.3-archive.tar.xz
+    tar xf libcusparse_lt-linux-sbsa-0.6.2.3-archive.tar.xz
+    cp -a libcusparse_lt-linux-sbsa-0.6.2.3-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-sbsa-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/
    popd
    rm -rf tmp_cusparselt
 }

 function install_124 {
-  echo "Installing CUDA 12.4.1 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
+  echo "Installing CUDA 12.4.1 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
  # install CUDA 12.4.1 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux_sbsa.run
@ -44,7 +44,7 @@ function install_124 {
  cd ..
  rm -rf nccl

-  install_cusparselt_052
+  install_cusparselt_062

  ldconfig
 }
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -32,7 +32,7 @@ pip_install coloredlogs packaging

 pip_install onnxruntime==1.18.1
 pip_install onnx==1.16.2
-pip_install onnxscript==0.1.0.dev20240831 --no-deps
+pip_install onnxscript==0.1.0.dev20241009 --no-deps
 # required by onnxscript
 pip_install ml_dtypes

--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -15,8 +15,11 @@ conda_reinstall() {
 if [ -n "${XPU_VERSION}" ]; then
  TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
  TRITON_TEXT_FILE="triton-xpu"
+elif [ -n "${TRITON_CPU}" ]; then
+  TRITON_REPO="https://github.com/triton-lang/triton-cpu"
+  TRITON_TEXT_FILE="triton-cpu"
 else
-  TRITON_REPO="https://github.com/openai/triton"
+  TRITON_REPO="https://github.com/triton-lang/triton"
  TRITON_TEXT_FILE="triton"
 fi

@ -44,9 +47,10 @@ chown -R jenkins /var/lib/jenkins/triton
 chgrp -R jenkins /var/lib/jenkins/triton
 pushd /var/lib/jenkins/

-as_jenkins git clone ${TRITON_REPO} triton
+as_jenkins git clone --recursive ${TRITON_REPO} triton
 cd triton
 as_jenkins git checkout ${TRITON_PINNED_COMMIT}
+as_jenkins git submodule update --init --recursive
 cd python

 # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -41,13 +41,16 @@ function install_ubuntu() {
        libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
        libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
        mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
+    if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
+        apt-get install -y intel-ocloc
+    fi
    # Development Packages
    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
    # Install Intel Support Packages
    if [ -n "$XPU_VERSION" ]; then
-        apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION} intel-pti-dev
+        apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION} intel-pti-dev-0.9
    else
-        apt-get install -y intel-for-pytorch-gpu-dev intel-pti-dev
+        apt-get install -y intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9
    fi

    # Cleanup
@ -97,7 +100,7 @@ EOF
        intel-igc-opencl-devel level-zero-devel intel-gsc-devel libmetee-devel \
        level-zero-devel
    # Install Intel Support Packages
-    yum install -y intel-for-pytorch-gpu-dev intel-pti-dev
+    yum install -y intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9

    # Cleanup
    dnf clean all
@ -131,7 +134,7 @@ function install_sles() {
    zypper install -y libigdfcl-devel intel-igc-cm libigfxcmrt-devel level-zero-devel

    # Install Intel Support Packages
-    zypper install -y intel-for-pytorch-gpu-dev intel-pti-dev
+    zypper install -y intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9

 }

--- a/.ci/docker/conda/Dockerfile
+++ b/.ci/docker/conda/Dockerfile
@ -70,6 +70,10 @@ FROM cuda as cuda12.4
 RUN bash ./install_cuda.sh 12.4
 ENV DESIRED_CUDA=12.4

+FROM cuda as cuda12.6
+RUN bash ./install_cuda.sh 12.6
+ENV DESIRED_CUDA=12.6
+
 # Install MNIST test data
 FROM base as mnist
 ADD ./common/install_mnist.sh install_mnist.sh
@ -79,6 +83,7 @@ FROM base as all_cuda
 COPY --from=cuda11.8  /usr/local/cuda-11.8 /usr/local/cuda-11.8
 COPY --from=cuda12.1  /usr/local/cuda-12.1 /usr/local/cuda-12.1
 COPY --from=cuda12.4  /usr/local/cuda-12.4 /usr/local/cuda-12.4
+COPY --from=cuda12.6  /usr/local/cuda-12.6 /usr/local/cuda-12.6

 # Final step
 FROM ${BASE_TARGET} as final
--- a/.ci/docker/manywheel/build_scripts/ssl-check.py
+++ b/.ci/docker/manywheel/build_scripts/ssl-check.py
@ -1,10 +1,12 @@
 # cf. https://github.com/pypa/manylinux/issues/53

+import sys
+from urllib.request import urlopen
+
+
 GOOD_SSL = "https://google.com"
 BAD_SSL = "https://self-signed.badssl.com"

-import sys
-

 print("Testing SSL certificate checking for Python:", sys.version)

@ -12,14 +14,8 @@ if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):
    print("This version never checks SSL certs; skipping tests")
    sys.exit(0)

-if sys.version_info[0] >= 3:
-    from urllib.request import urlopen

-    EXC = OSError
-else:
-    from urllib import urlopen
-
-    EXC = IOError
+EXC = OSError

 print(f"Connecting to {GOOD_SSL} should work")
 urlopen(GOOD_SSL)
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -5,7 +5,7 @@
 #Pinned versions: 1.6
 #test that import:

-boto3==1.19.12
+boto3==1.35.42
 #Description: AWS SDK for python
 #Pinned versions: 1.19.12, 1.16.34
 #test that import:
@ -118,7 +118,7 @@ numba==0.55.2 ; python_version == "3.10"

 #numpy
 #Description: Provides N-dimensional arrays and linear algebra
-#Pinned versions: 1.20
+#Pinned versions: 1.26.2
 #test that import: test_view_ops.py, test_unary_ufuncs.py, test_type_promotion.py,
 #test_type_info.py, test_torch.py, test_tensorexpr_pybind.py, test_tensorexpr.py,
 #test_tensorboard.py, test_tensor_creation_ops.py, test_static_runtime.py,
@ -128,6 +128,10 @@ numba==0.55.2 ; python_version == "3.10"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
+numpy==1.21.2; python_version == "3.9"
+numpy==1.22.4; python_version == "3.10"
+numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
+numpy==2.1.2; python_version >= "3.13"

 #onnxruntime
 #Description: scoring engine for Open Neural Network Exchange (ONNX) models
@ -139,9 +143,9 @@ opt-einsum==3.3
 #Pinned versions: 3.3
 #test that import: test_linalg.py

-optree==0.12.1
+optree==0.13.0
 #Description: A library for tree manipulation
-#Pinned versions: 0.12.1
+#Pinned versions: 0.13.0
 #test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
 #test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
 #common_utils.py, test_eager_transforms.py, test_python_dispatch.py,
@ -322,13 +326,12 @@ lxml==5.0.0

 PyGithub==2.3.0

-sympy==1.12.1 ; python_version == "3.8"
 sympy==1.13.1 ; python_version >= "3.9"
 #Description: Required by coremltools, also pinned in .github/requirements/pip-requirements-macOS.txt
 #Pinned versions:
 #test that import:

-onnx==1.16.1
+onnx==1.17.0
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
@ -342,3 +345,26 @@ parameterized==0.8.1
 #Description: Parameterizes unittests, both the tests themselves and the entire testing class
 #Pinned versions:
 #test that import:
+
+#Description: required for testing torch/distributed/_tools/sac_estimator.py
+#Pinned versions: 1.24.0
+#test that import: test_sac_estimator.py
+
+pwlf==2.2.1 ; python_version >= "3.8"
+#Description: required for testing torch/distributed/_tools/sac_estimator.py
+#Pinned versions: 2.2.1
+#test that import: test_sac_estimator.py
+
+
+# To build PyTorch itself
+astunparse
+PyYAML
+setuptools
+
+ninja==1.11.1 ; platform_machine == "aarch64"
+scons==4.5.2 ; platform_machine == "aarch64"
+
+pulp==2.9.0 ; python_version >= "3.8"
+#Description: required for testing ilp formulaiton under torch/distributed/_tools
+#Pinned versions: 2.9.0
+#test that import: test_sac_ilp.py
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -147,6 +147,13 @@ COPY ci_commit_pins/triton.txt triton.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton.txt

+ARG TRITON_CPU
+COPY ./common/install_triton.sh install_triton.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt
+RUN if [ -n "${TRITON_CPU}" ]; then bash ./install_triton.sh; fi
+RUN rm install_triton.sh common_utils.sh triton-cpu.txt
+
 ARG EXECUTORCH
 # Build and install executorch
 COPY ./common/install_executorch.sh install_executorch.sh
--- a/.ci/libtorch/build.sh
+++ b/.ci/libtorch/build.sh
@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+# This is mostly just a shim to manywheel/build.sh
+# TODO: Make this a dedicated script to build just libtorch
+
+set -ex
+
+SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
--- a/.ci/manywheel/LICENSE
+++ b/.ci/manywheel/LICENSE
@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016 manylinux
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/.ci/manywheel/build.sh
+++ b/.ci/manywheel/build.sh
@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+case "${GPU_ARCH_TYPE:-BLANK}" in
+    BLANK)
+        # Legacy behavior for CircleCI
+        bash "${SCRIPTPATH}/build_cuda.sh"
+        ;;
+    cuda)
+        bash "${SCRIPTPATH}/build_cuda.sh"
+        ;;
+    rocm)
+        bash "${SCRIPTPATH}/build_rocm.sh"
+        ;;
+    cpu | cpu-cxx11-abi | cpu-s390x | xpu)
+        bash "${SCRIPTPATH}/build_cpu.sh"
+        ;;
+    *)
+        echo "Un-recognized GPU_ARCH_TYPE '${GPU_ARCH_TYPE}', exiting..."
+        exit 1
+        ;;
+esac
--- a/.ci/manywheel/build_common.sh
+++ b/.ci/manywheel/build_common.sh
@ -0,0 +1,505 @@
+#!/usr/bin/env bash
+# meant to be called only from the neighboring build.sh and build_cpu.sh scripts
+
+set -ex
+SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
+
+
+# Require only one python installation
+if [[ -z "$DESIRED_PYTHON" ]]; then
+    echo "Need to set DESIRED_PYTHON env variable"
+    exit 1
+fi
+if [[ -n "$BUILD_PYTHONLESS" && -z "$LIBTORCH_VARIANT" ]]; then
+    echo "BUILD_PYTHONLESS is set, so need LIBTORCH_VARIANT to also be set"
+    echo "LIBTORCH_VARIANT should be one of shared-with-deps shared-without-deps static-with-deps static-without-deps"
+    exit 1
+fi
+
+# Function to retry functions that sometimes timeout or have flaky failures
+retry () {
+    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+}
+
+# TODO move this into the Docker images
+OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    retry yum install -q -y zip openssl
+elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
+    retry yum install -q -y zip openssl
+elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
+    retry dnf install -q -y zip openssl
+elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
+    # TODO: Remove this once nvidia package repos are back online
+    # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968
+    # shellcheck disable=SC2046
+    sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list")
+
+    retry apt-get update
+    retry apt-get -y install zip openssl
+fi
+
+# We use the package name to test the package by passing this to 'pip install'
+# This is the env variable that setup.py uses to name the package. Note that
+# pip 'normalizes' the name first by changing all - to _
+if [[ -z "$TORCH_PACKAGE_NAME" ]]; then
+    TORCH_PACKAGE_NAME='torch'
+fi
+
+if [[ -z "$TORCH_NO_PYTHON_PACKAGE_NAME" ]]; then
+    TORCH_NO_PYTHON_PACKAGE_NAME='torch_no_python'
+fi
+
+TORCH_PACKAGE_NAME="$(echo $TORCH_PACKAGE_NAME | tr '-' '_')"
+TORCH_NO_PYTHON_PACKAGE_NAME="$(echo $TORCH_NO_PYTHON_PACKAGE_NAME | tr '-' '_')"
+echo "Expecting the built wheels to all be called '$TORCH_PACKAGE_NAME' or '$TORCH_NO_PYTHON_PACKAGE_NAME'"
+
+# Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if
+# PYTORCH_BUILD_NUMBER > 1
+build_version="$PYTORCH_BUILD_VERSION"
+build_number="$PYTORCH_BUILD_NUMBER"
+if [[ -n "$OVERRIDE_PACKAGE_VERSION" ]]; then
+    # This will be the *exact* version, since build_number<1
+    build_version="$OVERRIDE_PACKAGE_VERSION"
+    build_number=0
+fi
+if [[ -z "$build_version" ]]; then
+    build_version=1.0.0
+fi
+if [[ -z "$build_number" ]]; then
+    build_number=1
+fi
+export PYTORCH_BUILD_VERSION=$build_version
+export PYTORCH_BUILD_NUMBER=$build_number
+
+export CMAKE_LIBRARY_PATH="/opt/intel/lib:/lib:$CMAKE_LIBRARY_PATH"
+export CMAKE_INCLUDE_PATH="/opt/intel/include:$CMAKE_INCLUDE_PATH"
+
+if [[ -e /opt/openssl ]]; then
+    export OPENSSL_ROOT_DIR=/opt/openssl
+    export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH
+fi
+
+# If given a python version like 3.6m or 2.7mu, convert this to the format we
+# expect. The binary CI jobs pass in python versions like this; they also only
+# ever pass one python version, so we assume that DESIRED_PYTHON is not a list
+# in this case
+if [[ -n "$DESIRED_PYTHON" && $DESIRED_PYTHON =~ ([0-9].[0-9]+)t ]]; then
+    python_digits="$(echo $DESIRED_PYTHON | tr -cd [:digit:])"
+    py_majmin="${DESIRED_PYTHON}"
+    DESIRED_PYTHON="cp${python_digits}-cp${python_digits}t"
+elif [[ -n "$DESIRED_PYTHON" && "$DESIRED_PYTHON" != cp* ]]; then
+    python_nodot="$(echo $DESIRED_PYTHON | tr -d m.u)"
+    DESIRED_PYTHON="cp${python_nodot}-cp${python_nodot}"
+    if [[ ${python_nodot} -ge 310 ]]; then
+        py_majmin="${DESIRED_PYTHON:2:1}.${DESIRED_PYTHON:3:2}"
+    else
+        py_majmin="${DESIRED_PYTHON:2:1}.${DESIRED_PYTHON:3:1}"
+    fi
+fi
+
+pydir="/opt/python/$DESIRED_PYTHON"
+export PATH="$pydir/bin:$PATH"
+echo "Will build for Python version: ${DESIRED_PYTHON} with ${python_installation}"
+
+mkdir -p /tmp/$WHEELHOUSE_DIR
+
+export PATCHELF_BIN=/usr/local/bin/patchelf
+patchelf_version=$($PATCHELF_BIN --version)
+echo "patchelf version: " $patchelf_version
+if [[ "$patchelf_version" == "patchelf 0.9" ]]; then
+    echo "Your patchelf version is too old. Please use version >= 0.10."
+    exit 1
+fi
+
+########################################################
+# Compile wheels as well as libtorch
+#######################################################
+if [[ -z "$PYTORCH_ROOT" ]]; then
+    echo "Need to set PYTORCH_ROOT env variable"
+    exit 1
+fi
+pushd "$PYTORCH_ROOT"
+python setup.py clean
+retry pip install -qr requirements.txt
+case ${DESIRED_PYTHON} in
+  cp31*)
+    retry pip install -q --pre numpy==2.1.0
+    ;;
+  # Should catch 3.9+
+  *)
+    retry pip install -q --pre numpy==2.0.2
+    ;;
+esac
+
+if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    export _GLIBCXX_USE_CXX11_ABI=1
+else
+    export _GLIBCXX_USE_CXX11_ABI=0
+fi
+
+if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
+    echo "Calling build_amd.py at $(date)"
+    python tools/amd_build/build_amd.py
+fi
+
+# This value comes from binary_linux_build.sh (and should only be set to true
+# for master / release branches)
+BUILD_DEBUG_INFO=${BUILD_DEBUG_INFO:=0}
+
+if [[ $BUILD_DEBUG_INFO == "1" ]]; then
+    echo "Building wheel and debug info"
+else
+    echo "BUILD_DEBUG_INFO was not set, skipping debug info"
+fi
+
+if [[ "$DISABLE_RCCL" = 1 ]]; then
+    echo "Disabling NCCL/RCCL in pyTorch"
+    USE_RCCL=0
+    USE_NCCL=0
+    USE_KINETO=0
+else
+    USE_RCCL=1
+    USE_NCCL=1
+    USE_KINETO=1
+fi
+
+echo "Calling setup.py bdist at $(date)"
+
+if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+    echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
+    time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
+    BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 \
+    BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
+    USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
+    python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
+    echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
+    echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
+    time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
+    BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \
+    BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
+    USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
+    python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR --cmake
+    echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
+else
+    time CMAKE_ARGS=${CMAKE_ARGS[@]} \
+        EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
+        BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
+        USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
+        python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
+fi
+echo "Finished setup.py bdist at $(date)"
+
+# Build libtorch packages
+if [[ -n "$BUILD_PYTHONLESS" ]]; then
+    # Now build pythonless libtorch
+    # Note - just use whichever python we happen to be on
+    python setup.py clean
+
+    if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
+        STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
+    fi
+
+    mkdir -p build
+    pushd build
+    echo "Calling tools/build_libtorch.py at $(date)"
+    time CMAKE_ARGS=${CMAKE_ARGS[@]} \
+         EXTRA_CAFFE2_CMAKE_FLAGS="${EXTRA_CAFFE2_CMAKE_FLAGS[@]} $STATIC_CMAKE_FLAG" \
+         python ../tools/build_libtorch.py
+    echo "Finished tools/build_libtorch.py at $(date)"
+    popd
+
+    mkdir -p libtorch/{lib,bin,include,share}
+    cp -r build/build/lib libtorch/
+
+    # for now, the headers for the libtorch package will just be copied in
+    # from one of the wheels (this is from when this script built multiple
+    # wheels at once)
+    ANY_WHEEL=$(ls /tmp/$WHEELHOUSE_DIR/torch*.whl | head -n1)
+    unzip -d any_wheel $ANY_WHEEL
+    if [[ -d any_wheel/torch/include ]]; then
+        cp -r any_wheel/torch/include libtorch/
+    else
+        cp -r any_wheel/torch/lib/include libtorch/
+    fi
+    cp -r any_wheel/torch/share/cmake libtorch/share/
+    rm -rf any_wheel
+
+    echo $PYTORCH_BUILD_VERSION > libtorch/build-version
+    echo "$(pushd $PYTORCH_ROOT && git rev-parse HEAD)" > libtorch/build-hash
+
+    mkdir -p /tmp/$LIBTORCH_HOUSE_DIR
+
+    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+        LIBTORCH_ABI="cxx11-abi-"
+    else
+        LIBTORCH_ABI=
+    fi
+
+    zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch
+    cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \
+       /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip
+fi
+
+popd
+
+#######################################################################
+# ADD DEPENDENCIES INTO THE WHEEL
+#
+# auditwheel repair doesn't work correctly and is buggy
+# so manually do the work of copying dependency libs and patchelfing
+# and fixing RECORDS entries correctly
+######################################################################
+
+fname_with_sha256() {
+    HASH=$(sha256sum $1 | cut -c1-8)
+    DIRNAME=$(dirname $1)
+    BASENAME=$(basename $1)
+    # Do not rename nvrtc-builtins.so as they are dynamically loaded
+    # by libnvrtc.so
+    # Similarly don't mangle libcudnn and libcublas library names
+    if [[ $BASENAME == "libnvrtc-builtins.s"* || $BASENAME == "libcudnn"* || $BASENAME == "libcublas"*  ]]; then
+        echo $1
+    else
+        INITNAME=$(echo $BASENAME | cut -f1 -d".")
+        ENDNAME=$(echo $BASENAME | cut -f 2- -d".")
+        echo "$DIRNAME/$INITNAME-$HASH.$ENDNAME"
+    fi
+}
+
+fname_without_so_number() {
+    LINKNAME=$(echo $1 | sed -e 's/\.so.*/.so/g')
+    echo "$LINKNAME"
+}
+
+make_wheel_record() {
+    FPATH=$1
+    if echo $FPATH | grep RECORD >/dev/null 2>&1; then
+        # if the RECORD file, then
+        echo "$FPATH,,"
+    else
+        HASH=$(openssl dgst -sha256 -binary $FPATH | openssl base64 | sed -e 's/+/-/g' | sed -e 's/\//_/g' | sed -e 's/=//g')
+        FSIZE=$(ls -nl $FPATH | awk '{print $5}')
+        echo "$FPATH,sha256=$HASH,$FSIZE"
+    fi
+}
+
+replace_needed_sofiles() {
+    find $1 -name '*.so*' | while read sofile; do
+        origname=$2
+        patchedname=$3
+        if [[ "$origname" != "$patchedname" ]] || [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
+            set +e
+            origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
+            ERRCODE=$?
+            set -e
+            if [ "$ERRCODE" -eq "0" ]; then
+                echo "patching $sofile entry $origname to $patchedname"
+                $PATCHELF_BIN --replace-needed $origname $patchedname $sofile
+            fi
+        fi
+    done
+}
+
+echo 'Built this wheel:'
+ls /tmp/$WHEELHOUSE_DIR
+mkdir -p "/$WHEELHOUSE_DIR"
+mv /tmp/$WHEELHOUSE_DIR/torch*linux*.whl /$WHEELHOUSE_DIR/
+
+if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+    mv /tmp/$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/ || true
+fi
+
+if [[ -n "$BUILD_PYTHONLESS" ]]; then
+    mkdir -p /$LIBTORCH_HOUSE_DIR
+    mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR
+    rm -rf /tmp/$LIBTORCH_HOUSE_DIR
+fi
+rm -rf /tmp/$WHEELHOUSE_DIR
+rm -rf /tmp_dir
+mkdir /tmp_dir
+pushd /tmp_dir
+
+for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.whl /$LIBTORCH_HOUSE_DIR/libtorch*.zip; do
+
+    # if the glob didn't match anything
+    if [[ ! -e $pkg ]]; then
+        continue
+    fi
+
+    rm -rf tmp
+    mkdir -p tmp
+    cd tmp
+    cp $pkg .
+
+    unzip -q $(basename $pkg)
+    rm -f $(basename $pkg)
+
+    if [[ -d torch ]]; then
+        PREFIX=torch
+    else
+        PREFIX=libtorch
+    fi
+
+    if [[ $pkg != *"without-deps"* ]]; then
+        # copy over needed dependent .so files over and tag them with their hash
+        patched=()
+        for filepath in "${DEPS_LIST[@]}"; do
+            filename=$(basename $filepath)
+            destpath=$PREFIX/lib/$filename
+            if [[ "$filepath" != "$destpath" ]]; then
+                cp $filepath $destpath
+            fi
+
+            # ROCm workaround for roctracer dlopens
+            if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
+                patchedpath=$(fname_without_so_number $destpath)
+            # Keep the so number for XPU dependencies
+            elif [[ "$DESIRED_CUDA" == *"xpu"* ]]; then
+                patchedpath=$destpath
+            else
+                patchedpath=$(fname_with_sha256 $destpath)
+            fi
+            patchedname=$(basename $patchedpath)
+            if [[ "$destpath" != "$patchedpath" ]]; then
+                mv $destpath $patchedpath
+            fi
+            patched+=("$patchedname")
+            echo "Copied $filepath to $patchedpath"
+        done
+
+        echo "patching to fix the so names to the hashed names"
+        for ((i=0;i<${#DEPS_LIST[@]};++i)); do
+            replace_needed_sofiles $PREFIX ${DEPS_SONAME[i]} ${patched[i]}
+            # do the same for caffe2, if it exists
+            if [[ -d caffe2 ]]; then
+                replace_needed_sofiles caffe2 ${DEPS_SONAME[i]} ${patched[i]}
+            fi
+        done
+
+        # copy over needed auxiliary files
+        for ((i=0;i<${#DEPS_AUX_SRCLIST[@]};++i)); do
+            srcpath=${DEPS_AUX_SRCLIST[i]}
+            dstpath=$PREFIX/${DEPS_AUX_DSTLIST[i]}
+            mkdir -p $(dirname $dstpath)
+            cp $srcpath $dstpath
+        done
+    fi
+
+    # set RPATH of _C.so and similar to $ORIGIN, $ORIGIN/lib
+    find $PREFIX -maxdepth 1 -type f -name "*.so*" | while read sofile; do
+        echo "Setting rpath of $sofile to ${C_SO_RPATH:-'$ORIGIN:$ORIGIN/lib'}"
+        $PATCHELF_BIN --set-rpath ${C_SO_RPATH:-'$ORIGIN:$ORIGIN/lib'} ${FORCE_RPATH:-} $sofile
+        $PATCHELF_BIN --print-rpath $sofile
+    done
+
+    # set RPATH of lib/ files to $ORIGIN
+    find $PREFIX/lib -maxdepth 1 -type f -name "*.so*" | while read sofile; do
+        echo "Setting rpath of $sofile to ${LIB_SO_RPATH:-'$ORIGIN'}"
+        $PATCHELF_BIN --set-rpath ${LIB_SO_RPATH:-'$ORIGIN'} ${FORCE_RPATH:-} $sofile
+        $PATCHELF_BIN --print-rpath $sofile
+    done
+
+    # regenerate the RECORD file with new hashes
+    record_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/RECORD/g')
+    if [[ -e $record_file ]]; then
+        echo "Generating new record file $record_file"
+        : > "$record_file"
+        # generate records for folders in wheel
+        find * -type f | while read fname; do
+            make_wheel_record "$fname" >>"$record_file"
+        done
+    fi
+
+    if [[ $BUILD_DEBUG_INFO == "1" ]]; then
+        pushd "$PREFIX/lib"
+
+        # Duplicate library into debug lib
+        cp libtorch_cpu.so libtorch_cpu.so.dbg
+
+        # Keep debug symbols on debug lib
+        strip --only-keep-debug libtorch_cpu.so.dbg
+
+        # Remove debug info from release lib
+        strip --strip-debug libtorch_cpu.so
+
+        objcopy libtorch_cpu.so --add-gnu-debuglink=libtorch_cpu.so.dbg
+
+        # Zip up debug info
+        mkdir -p /tmp/debug
+        mv libtorch_cpu.so.dbg /tmp/debug/libtorch_cpu.so.dbg
+        CRC32=$(objcopy --dump-section .gnu_debuglink=>(tail -c4 | od -t x4 -An | xargs echo) libtorch_cpu.so)
+
+        pushd /tmp
+        PKG_NAME=$(basename "$pkg" | sed 's/\.whl$//g')
+        zip /tmp/debug-whl-libtorch-"$PKG_NAME"-"$CRC32".zip /tmp/debug/libtorch_cpu.so.dbg
+        cp /tmp/debug-whl-libtorch-"$PKG_NAME"-"$CRC32".zip "$PYTORCH_FINAL_PACKAGE_DIR"
+        popd
+
+        popd
+    fi
+
+    # zip up the wheel back
+    zip -rq $(basename $pkg) $PREIX*
+
+    # replace original wheel
+    rm -f $pkg
+    mv $(basename $pkg) $pkg
+    cd ..
+    rm -rf tmp
+done
+
+# Copy wheels to host machine for persistence before testing
+if [[ -n "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
+    mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
+    if [[ -n "$BUILD_PYTHONLESS" ]]; then
+        cp /$LIBTORCH_HOUSE_DIR/libtorch*.zip "$PYTORCH_FINAL_PACKAGE_DIR"
+    else
+        cp /$WHEELHOUSE_DIR/torch*.whl "$PYTORCH_FINAL_PACKAGE_DIR"
+    fi
+fi
+
+# remove stuff before testing
+rm -rf /opt/rh
+if ls /usr/local/cuda* >/dev/null 2>&1; then
+    rm -rf /usr/local/cuda*
+fi
+
+
+# Test that all the wheels work
+if [[ -z "$BUILD_PYTHONLESS" ]]; then
+  export OMP_NUM_THREADS=4 # on NUMA machines this takes too long
+  pushd $PYTORCH_ROOT/test
+
+  # Install the wheel for this Python version
+  if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+    pip uninstall -y "$TORCH_NO_PYTHON_PACKAGE_NAME" || true
+  fi
+
+  pip uninstall -y "$TORCH_PACKAGE_NAME"
+
+  if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+    pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
+  fi
+
+  pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
+
+  # Print info on the libraries installed in this wheel
+  # Rather than adjust find command to skip non-library files with an embedded *.so* in their name,
+  # since this is only for reporting purposes, we add the || true to the ldd command.
+  installed_libraries=($(find "$pydir/lib/python${py_majmin}/site-packages/torch/" -name '*.so*'))
+  echo "The wheel installed all of the libraries: ${installed_libraries[@]}"
+  for installed_lib in "${installed_libraries[@]}"; do
+      ldd "$installed_lib" || true
+  done
+
+  # Run the tests
+  echo "$(date) :: Running tests"
+  pushd "$PYTORCH_ROOT"
+
+  #TODO: run_tests.sh and check_binary.sh should be moved to pytorch/pytorch project
+  LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \
+          "/builder/run_tests.sh" manywheel "${py_majmin}" "$DESIRED_CUDA"
+  popd
+  echo "$(date) :: Finished tests"
+fi
--- a/.ci/manywheel/build_cpu.sh
+++ b/.ci/manywheel/build_cpu.sh
@ -0,0 +1,99 @@
+#!/usr/bin/env bash
+
+set -ex
+
+GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
+
+export TH_BINARY_BUILD=1
+export USE_CUDA=0
+
+# Keep an array of cmake variables to add to
+if [[ -z "$CMAKE_ARGS" ]]; then
+    # These are passed to tools/build_pytorch_libs.sh::build()
+    CMAKE_ARGS=()
+fi
+if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
+    # These are passed to tools/build_pytorch_libs.sh::build_caffe2()
+    EXTRA_CAFFE2_CMAKE_FLAGS=()
+fi
+
+DIR_SUFFIX=cpu
+if [[ "$GPU_ARCH_TYPE" == "xpu" ]]; then
+    DIR_SUFFIX=xpu
+    # Refer https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-5.html
+    source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh
+    source /opt/intel/oneapi/pti/latest/env/vars.sh
+    export USE_STATIC_MKL=1
+fi
+
+WHEELHOUSE_DIR="wheelhouse$DIR_SUFFIX"
+LIBTORCH_HOUSE_DIR="libtorch_house$DIR_SUFFIX"
+if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
+    if [[ -z "$BUILD_PYTHONLESS" ]]; then
+        PYTORCH_FINAL_PACKAGE_DIR="/remote/wheelhouse$DIR_SUFFIX"
+    else
+        PYTORCH_FINAL_PACKAGE_DIR="/remote/libtorch_house$DIR_SUFFIX"
+    fi
+fi
+mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
+
+OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
+elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
+    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
+elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
+    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
+elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
+    if [[ "$(uname -m)" == "s390x" ]]; then
+        LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
+    else
+        LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
+    fi
+fi
+
+DEPS_LIST=(
+    "$LIBGOMP_PATH"
+)
+
+DEPS_SONAME=(
+    "libgomp.so.1"
+)
+
+if [[ "$GPU_ARCH_TYPE" == "xpu" ]]; then
+    echo "Bundling with xpu support package libs."
+    DEPS_LIST+=(
+        "/opt/intel/oneapi/compiler/latest/lib/libsycl-preview.so.7"
+        "/opt/intel/oneapi/compiler/latest/lib/libOpenCL.so.1"
+        "/opt/intel/oneapi/compiler/latest/lib/libxptifw.so"
+        "/opt/intel/oneapi/compiler/latest/lib/libsvml.so"
+        "/opt/intel/oneapi/compiler/latest/lib/libirng.so"
+        "/opt/intel/oneapi/compiler/latest/lib/libimf.so"
+        "/opt/intel/oneapi/compiler/latest/lib/libintlc.so.5"
+        "/opt/intel/oneapi/compiler/latest/lib/libpi_level_zero.so"
+        "/opt/intel/oneapi/pti/latest/lib/libpti_view.so.0.9"
+        "/opt/intel/oneapi/pti/latest/lib/libpti.so.0.9"
+    )
+    DEPS_SONAME+=(
+        "libsycl-preview.so.7"
+        "libOpenCL.so.1"
+        "libxptifw.so"
+        "libsvml.so"
+        "libirng.so"
+        "libimf.so"
+        "libintlc.so.5"
+        "libpi_level_zero.so"
+        "libpti_view.so.0.9"
+        "libpti.so.0.9"
+    )
+fi
+
+rm -rf /usr/local/cuda*
+
+SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
+if [[ -z "$BUILD_PYTHONLESS" ]]; then
+    BUILD_SCRIPT=build_common.sh
+else
+    BUILD_SCRIPT=build_libtorch.sh
+fi
+source ${SOURCE_DIR}/${BUILD_SCRIPT}
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -0,0 +1,290 @@
+#!/usr/bin/env bash
+
+set -ex
+
+SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P ))"
+
+export TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
+export NCCL_ROOT_DIR=/usr/local/cuda
+export TH_BINARY_BUILD=1
+export USE_STATIC_CUDNN=1
+export USE_STATIC_NCCL=1
+export ATEN_STATIC_CUDA=1
+export USE_CUDA_STATIC_LINK=1
+export INSTALL_TEST=0 # dont install test binaries into site-packages
+export USE_CUPTI_SO=0
+export USE_CUSPARSELT=${USE_CUSPARSELT:-1} # Enable if not disabled by libtorch build
+
+# Keep an array of cmake variables to add to
+if [[ -z "$CMAKE_ARGS" ]]; then
+    # These are passed to tools/build_pytorch_libs.sh::build()
+    CMAKE_ARGS=()
+fi
+if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
+    # These are passed to tools/build_pytorch_libs.sh::build_caffe2()
+    EXTRA_CAFFE2_CMAKE_FLAGS=()
+fi
+
+# Determine CUDA version and architectures to build for
+#
+# NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
+# because in some cases a single Docker image can have multiple CUDA versions
+# on it, and `nvcc --version` might not show the CUDA version we want.
+if [[ -n "$DESIRED_CUDA" ]]; then
+    # If the DESIRED_CUDA already matches the format that we expect
+    if [[ ${DESIRED_CUDA} =~ ^[0-9]+\.[0-9]+$ ]]; then
+        CUDA_VERSION=${DESIRED_CUDA}
+    else
+        # cu90, cu92, cu100, cu101
+        if [[ ${#DESIRED_CUDA} -eq 4 ]]; then
+            CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}"
+        elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then
+            CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}"
+        fi
+    fi
+    echo "Using CUDA $CUDA_VERSION as determined by DESIRED_CUDA"
+
+    # There really has to be a better way to do this - eli
+    # Possibly limiting builds to specific cuda versions be delimiting images would be a choice
+    if [[ "$OS_NAME" == *"Ubuntu"* ]]; then
+        echo "Switching to CUDA version ${DESIRED_CUDA}"
+        /builder/conda/switch_cuda_version.sh "${DESIRED_CUDA}"
+    fi
+else
+    CUDA_VERSION=$(nvcc --version|grep release|cut -f5 -d" "|cut -f1 -d",")
+    echo "CUDA $CUDA_VERSION Detected"
+fi
+
+cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
+
+TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
+case ${CUDA_VERSION} in
+    12.4)
+        if [[ "$GPU_ARCH_TYPE" = "cuda-aarch64" ]]; then
+            TORCH_CUDA_ARCH_LIST="9.0"
+        else
+            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0+PTX"
+        fi
+        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
+        ;;
+    12.1)
+        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
+        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
+        ;;
+    11.8)
+        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};3.7;9.0"
+        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
+        ;;
+    11.[67])
+        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};3.7"
+        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
+        ;;
+    *)
+        echo "unknown cuda version $CUDA_VERSION"
+        exit 1
+        ;;
+esac
+
+export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
+echo "${TORCH_CUDA_ARCH_LIST}"
+
+# Package directories
+WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot"
+LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot"
+if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
+    if [[ -z "$BUILD_PYTHONLESS" ]]; then
+        PYTORCH_FINAL_PACKAGE_DIR="/remote/wheelhouse$cuda_version_nodot"
+    else
+        PYTORCH_FINAL_PACKAGE_DIR="/remote/libtorch_house$cuda_version_nodot"
+    fi
+fi
+mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
+
+OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
+elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
+    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
+elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
+    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
+elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
+    LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
+fi
+
+DEPS_LIST=(
+    "$LIBGOMP_PATH"
+)
+DEPS_SONAME=(
+    "libgomp.so.1"
+)
+
+if [[ $USE_CUSPARSELT == "1" ]]; then
+        DEPS_SONAME+=(
+            "libcusparseLt.so.0"
+        )
+        DEPS_LIST+=(
+            "/usr/local/cuda/lib64/libcusparseLt.so.0"
+        )
+fi
+
+if [[ $CUDA_VERSION == "12.1" || $CUDA_VERSION == "12.4" ]]; then
+    export USE_STATIC_CUDNN=0
+    # Try parallelizing nvcc as well
+    export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
+
+    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
+        echo "Bundling with cudnn and cublas."
+        DEPS_LIST+=(
+            "/usr/local/cuda/lib64/libcudnn_adv.so.9"
+            "/usr/local/cuda/lib64/libcudnn_cnn.so.9"
+            "/usr/local/cuda/lib64/libcudnn_graph.so.9"
+            "/usr/local/cuda/lib64/libcudnn_ops.so.9"
+            "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9"
+            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9"
+            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9"
+            "/usr/local/cuda/lib64/libcudnn.so.9"
+            "/usr/local/cuda/lib64/libcublas.so.12"
+            "/usr/local/cuda/lib64/libcublasLt.so.12"
+            "/usr/local/cuda/lib64/libcudart.so.12"
+            "/usr/local/cuda/lib64/libnvToolsExt.so.1"
+            "/usr/local/cuda/lib64/libnvrtc.so.12"
+            "/usr/local/cuda/lib64/libnvrtc-builtins.so"
+        )
+        DEPS_SONAME+=(
+            "libcudnn_adv.so.9"
+            "libcudnn_cnn.so.9"
+            "libcudnn_graph.so.9"
+            "libcudnn_ops.so.9"
+            "libcudnn_engines_runtime_compiled.so.9"
+            "libcudnn_engines_precompiled.so.9"
+            "libcudnn_heuristic.so.9"
+            "libcudnn.so.9"
+            "libcublas.so.12"
+            "libcublasLt.so.12"
+            "libcudart.so.12"
+            "libnvToolsExt.so.1"
+            "libnvrtc.so.12"
+            "libnvrtc-builtins.so"
+        )
+    else
+        echo "Using nvidia libs from pypi."
+        CUDA_RPATHS=(
+            '$ORIGIN/../../nvidia/cublas/lib'
+            '$ORIGIN/../../nvidia/cuda_cupti/lib'
+            '$ORIGIN/../../nvidia/cuda_nvrtc/lib'
+            '$ORIGIN/../../nvidia/cuda_runtime/lib'
+            '$ORIGIN/../../nvidia/cudnn/lib'
+            '$ORIGIN/../../nvidia/cufft/lib'
+            '$ORIGIN/../../nvidia/curand/lib'
+            '$ORIGIN/../../nvidia/cusolver/lib'
+            '$ORIGIN/../../nvidia/cusparse/lib'
+            '$ORIGIN/../../nvidia/nccl/lib'
+            '$ORIGIN/../../nvidia/nvtx/lib'
+        )
+        CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
+        export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
+        export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
+        export FORCE_RPATH="--force-rpath"
+        export USE_STATIC_NCCL=0
+        export USE_SYSTEM_NCCL=1
+        export ATEN_STATIC_CUDA=0
+        export USE_CUDA_STATIC_LINK=0
+        export USE_CUPTI_SO=1
+        export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
+        export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
+    fi
+elif [[ $CUDA_VERSION == "11.8" ]]; then
+    export USE_STATIC_CUDNN=0
+    # Try parallelizing nvcc as well
+    export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
+    # Bundle ptxas into the wheel, see https://github.com/pytorch/pytorch/pull/119750
+    export BUILD_BUNDLE_PTXAS=1
+
+    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
+        echo "Bundling with cudnn and cublas."
+        DEPS_LIST+=(
+            "/usr/local/cuda/lib64/libcudnn_adv.so.9"
+            "/usr/local/cuda/lib64/libcudnn_cnn.so.9"
+            "/usr/local/cuda/lib64/libcudnn_graph.so.9"
+            "/usr/local/cuda/lib64/libcudnn_ops.so.9"
+            "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9"
+            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9"
+            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9"
+            "/usr/local/cuda/lib64/libcudnn.so.9"
+            "/usr/local/cuda/lib64/libcublas.so.11"
+            "/usr/local/cuda/lib64/libcublasLt.so.11"
+            "/usr/local/cuda/lib64/libcudart.so.11.0"
+            "/usr/local/cuda/lib64/libnvToolsExt.so.1"
+            "/usr/local/cuda/lib64/libnvrtc.so.11.2"    # this is not a mistake, it links to more specific cuda version
+            "/usr/local/cuda/lib64/libnvrtc-builtins.so.11.8"
+        )
+        DEPS_SONAME+=(
+            "libcudnn_adv.so.9"
+            "libcudnn_cnn.so.9"
+            "libcudnn_graph.so.9"
+            "libcudnn_ops.so.9"
+            "libcudnn_engines_runtime_compiled.so.9"
+            "libcudnn_engines_precompiled.so.9"
+            "libcudnn_heuristic.so.9"
+            "libcudnn.so.9"
+            "libcublas.so.11"
+            "libcublasLt.so.11"
+            "libcudart.so.11.0"
+            "libnvToolsExt.so.1"
+            "libnvrtc.so.11.2"
+            "libnvrtc-builtins.so.11.8"
+        )
+    else
+        echo "Using nvidia libs from pypi."
+        CUDA_RPATHS=(
+            '$ORIGIN/../../nvidia/cublas/lib'
+            '$ORIGIN/../../nvidia/cuda_cupti/lib'
+            '$ORIGIN/../../nvidia/cuda_nvrtc/lib'
+            '$ORIGIN/../../nvidia/cuda_runtime/lib'
+            '$ORIGIN/../../nvidia/cudnn/lib'
+            '$ORIGIN/../../nvidia/cufft/lib'
+            '$ORIGIN/../../nvidia/curand/lib'
+            '$ORIGIN/../../nvidia/cusolver/lib'
+            '$ORIGIN/../../nvidia/cusparse/lib'
+            '$ORIGIN/../../nvidia/nccl/lib'
+            '$ORIGIN/../../nvidia/nvtx/lib'
+        )
+        CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
+        export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
+        export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
+        export FORCE_RPATH="--force-rpath"
+        export USE_STATIC_NCCL=0
+        export USE_SYSTEM_NCCL=1
+        export ATEN_STATIC_CUDA=0
+        export USE_CUDA_STATIC_LINK=0
+        export USE_CUPTI_SO=1
+        export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
+        export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
+    fi
+else
+    echo "Unknown cuda version $CUDA_VERSION"
+    exit 1
+fi
+
+# builder/test.sh requires DESIRED_CUDA to know what tests to exclude
+export DESIRED_CUDA="$cuda_version_nodot"
+
+# Switch `/usr/local/cuda` to the desired CUDA version
+rm -rf /usr/local/cuda || true
+ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda
+
+# Switch `/usr/local/magma` to the desired CUDA version
+rm -rf /usr/local/magma || true
+ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
+
+export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
+export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0
+export CUDNN_VERSION=$(ls /usr/local/cuda/lib64/libcudnn.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev)
+
+SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
+if [[ -z "$BUILD_PYTHONLESS" ]]; then
+    BUILD_SCRIPT=build_common.sh
+else
+    BUILD_SCRIPT=build_libtorch.sh
+fi
+source $SCRIPTPATH/${BUILD_SCRIPT}
--- a/.ci/manywheel/build_libtorch.sh
+++ b/.ci/manywheel/build_libtorch.sh
@ -0,0 +1,353 @@
+#!/usr/bin/env bash
+# meant to be called only from the neighboring build.sh and build_cpu.sh scripts
+
+set -e pipefail
+SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
+
+# Require only one python installation
+if [[ -z "$DESIRED_PYTHON" ]]; then
+    echo "Need to set DESIRED_PYTHON env variable"
+    exit 1
+fi
+if [[ -n "$BUILD_PYTHONLESS" && -z "$LIBTORCH_VARIANT" ]]; then
+    echo "BUILD_PYTHONLESS is set, so need LIBTORCH_VARIANT to also be set"
+    echo "LIBTORCH_VARIANT should be one of shared-with-deps shared-without-deps static-with-deps static-without-deps"
+    exit 1
+fi
+
+# Function to retry functions that sometimes timeout or have flaky failures
+retry () {
+    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+}
+
+# TODO move this into the Docker images
+OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    retry yum install -q -y zip openssl
+elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
+    retry yum install -q -y zip openssl
+elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
+    retry dnf install -q -y zip openssl
+elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
+    # TODO: Remove this once nvidia package repos are back online
+    # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968
+    # shellcheck disable=SC2046
+    sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list")
+    retry apt-get update
+    retry apt-get -y install zip openssl
+fi
+
+# Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if
+# PYTORCH_BUILD_NUMBER > 1
+build_version="$PYTORCH_BUILD_VERSION"
+build_number="$PYTORCH_BUILD_NUMBER"
+if [[ -n "$OVERRIDE_PACKAGE_VERSION" ]]; then
+    # This will be the *exact* version, since build_number<1
+    build_version="$OVERRIDE_PACKAGE_VERSION"
+    build_number=0
+fi
+if [[ -z "$build_version" ]]; then
+    build_version=1.0.0
+fi
+if [[ -z "$build_number" ]]; then
+    build_number=1
+fi
+export PYTORCH_BUILD_VERSION=$build_version
+export PYTORCH_BUILD_NUMBER=$build_number
+
+export CMAKE_LIBRARY_PATH="/opt/intel/lib:/lib:$CMAKE_LIBRARY_PATH"
+export CMAKE_INCLUDE_PATH="/opt/intel/include:$CMAKE_INCLUDE_PATH"
+
+# set OPENSSL_ROOT_DIR=/opt/openssl if it exists
+if [[ -e /opt/openssl ]]; then
+    export OPENSSL_ROOT_DIR=/opt/openssl
+    export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH
+fi
+
+# If given a python version like 3.6m or 2.7mu, convert this to the format we
+# expect. The binary CI jobs pass in python versions like this; they also only
+# ever pass one python version, so we assume that DESIRED_PYTHON is not a list
+# in this case
+if [[ -n "$DESIRED_PYTHON" && "$DESIRED_PYTHON" != cp* ]]; then
+    python_nodot="$(echo $DESIRED_PYTHON | tr -d m.u)"
+    DESIRED_PYTHON="cp${python_nodot}-cp${python_nodot}"
+fi
+pydir="/opt/python/$DESIRED_PYTHON"
+export PATH="$pydir/bin:$PATH"
+
+export PATCHELF_BIN=/usr/local/bin/patchelf
+patchelf_version=`$PATCHELF_BIN --version`
+echo "patchelf version: " $patchelf_version
+if [[ "$patchelf_version" == "patchelf 0.9" ]]; then
+    echo "Your patchelf version is too old. Please use version >= 0.10."
+    exit 1
+fi
+
+########################################################
+# Compile wheels as well as libtorch
+#######################################################
+if [[ -z "$PYTORCH_ROOT" ]]; then
+    echo "Need to set PYTORCH_ROOT env variable"
+    exit 1
+fi
+pushd "$PYTORCH_ROOT"
+python setup.py clean
+retry pip install -qr requirements.txt
+retry pip install -q numpy==2.0.1
+
+if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    export _GLIBCXX_USE_CXX11_ABI=1
+else
+    export _GLIBCXX_USE_CXX11_ABI=0
+fi
+
+if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
+    echo "Calling build_amd.py at $(date)"
+    python tools/amd_build/build_amd.py
+    # TODO remove this work-around once pytorch sources are updated
+    export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
+fi
+
+echo "Calling setup.py install at $(date)"
+
+if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
+    STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
+fi
+
+(
+    set -x
+
+    mkdir -p build
+
+    time CMAKE_ARGS=${CMAKE_ARGS[@]} \
+        EXTRA_CAFFE2_CMAKE_FLAGS="${EXTRA_CAFFE2_CMAKE_FLAGS[@]} $STATIC_CMAKE_FLAG" \
+        # TODO: Remove this flag once https://github.com/pytorch/pytorch/issues/55952 is closed
+        CFLAGS='-Wno-deprecated-declarations' \
+        BUILD_LIBTORCH_CPU_WITH_DEBUG=1 \
+        python setup.py install
+
+    mkdir -p libtorch/{lib,bin,include,share}
+
+    # Make debug folder separate so it doesn't get zipped up with the rest of
+    # libtorch
+    mkdir debug
+
+    # Copy over all lib files
+    cp -rv build/lib/*                libtorch/lib/
+    cp -rv build/lib*/torch/lib/*     libtorch/lib/
+
+    # Copy over all include files
+    cp -rv build/include/*            libtorch/include/
+    cp -rv build/lib*/torch/include/* libtorch/include/
+
+    # Copy over all of the cmake files
+    cp -rv build/lib*/torch/share/*   libtorch/share/
+
+    # Split libtorch into debug / release version
+    cp libtorch/lib/libtorch_cpu.so libtorch/lib/libtorch_cpu.so.dbg
+
+    # Keep debug symbols on debug lib
+    strip --only-keep-debug libtorch/lib/libtorch_cpu.so.dbg
+
+    # Remove debug info from release lib
+    strip --strip-debug libtorch/lib/libtorch_cpu.so
+
+    # Add a debug link to the release lib to the debug lib (debuggers will then
+    # search for symbols in a file called libtorch_cpu.so.dbg in some
+    # predetermined locations) and embed a CRC32 of the debug library into the .so
+    cd libtorch/lib
+
+    objcopy libtorch_cpu.so --add-gnu-debuglink=libtorch_cpu.so.dbg
+    cd ../..
+
+    # Move the debug symbols to its own directory so it doesn't get processed /
+    # zipped with all the other libraries
+    mv libtorch/lib/libtorch_cpu.so.dbg debug/libtorch_cpu.so.dbg
+
+    echo "${PYTORCH_BUILD_VERSION}" > libtorch/build-version
+    echo "$(pushd $PYTORCH_ROOT && git rev-parse HEAD)" > libtorch/build-hash
+
+)
+
+if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    LIBTORCH_ABI="cxx11-abi-"
+else
+    LIBTORCH_ABI=
+fi
+
+(
+    set -x
+
+    mkdir -p /tmp/$LIBTORCH_HOUSE_DIR
+
+    # objcopy installs a CRC32 into libtorch_cpu above so, so add that to the name here
+    CRC32=$(objcopy --dump-section .gnu_debuglink=>(tail -c4 | od -t x4 -An | xargs echo) libtorch/lib/libtorch_cpu.so)
+
+    # Zip debug symbols
+    zip /tmp/$LIBTORCH_HOUSE_DIR/debug-libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION-$CRC32.zip debug/libtorch_cpu.so.dbg
+
+    # Zip and copy libtorch
+    zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch
+    cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \
+       /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip
+)
+
+
+popd
+
+#######################################################################
+# ADD DEPENDENCIES INTO THE WHEEL
+#
+# auditwheel repair doesn't work correctly and is buggy
+# so manually do the work of copying dependency libs and patchelfing
+# and fixing RECORDS entries correctly
+######################################################################
+
+fname_with_sha256() {
+    HASH=$(sha256sum $1 | cut -c1-8)
+    DIRNAME=$(dirname $1)
+    BASENAME=$(basename $1)
+    if [[ $BASENAME == "libnvrtc-builtins.so" || $BASENAME == "libcudnn"* ]]; then
+        echo $1
+    else
+        INITNAME=$(echo $BASENAME | cut -f1 -d".")
+        ENDNAME=$(echo $BASENAME | cut -f 2- -d".")
+        echo "$DIRNAME/$INITNAME-$HASH.$ENDNAME"
+    fi
+}
+
+fname_without_so_number() {
+    LINKNAME=$(echo $1 | sed -e 's/\.so.*/.so/g')
+    echo "$LINKNAME"
+}
+
+make_wheel_record() {
+    FPATH=$1
+    if echo $FPATH | grep RECORD >/dev/null 2>&1; then
+        # if the RECORD file, then
+        echo "$FPATH,,"
+    else
+        HASH=$(openssl dgst -sha256 -binary $FPATH | openssl base64 | sed -e 's/+/-/g' | sed -e 's/\//_/g' | sed -e 's/=//g')
+        FSIZE=$(ls -nl $FPATH | awk '{print $5}')
+        echo "$FPATH,sha256=$HASH,$FSIZE"
+    fi
+}
+
+echo 'Built this package:'
+(
+    set -x
+    mkdir -p /$LIBTORCH_HOUSE_DIR
+    mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR
+    rm -rf /tmp/$LIBTORCH_HOUSE_DIR
+)
+TMP_DIR=$(mktemp -d)
+trap "rm -rf ${TMP_DIR}" EXIT
+pushd "${TMP_DIR}"
+
+for pkg in /$LIBTORCH_HOUSE_DIR/libtorch*.zip; do
+
+    # if the glob didn't match anything
+    if [[ ! -e $pkg ]]; then
+        continue
+    fi
+
+    rm -rf tmp
+    mkdir -p tmp
+    cd tmp
+    cp $pkg .
+
+    unzip -q $(basename $pkg)
+    rm -f $(basename $pkg)
+
+    PREFIX=libtorch
+
+    if [[ $pkg != *"without-deps"* ]]; then
+        # copy over needed dependent .so files over and tag them with their hash
+        patched=()
+        for filepath in "${DEPS_LIST[@]}"; do
+            filename=$(basename $filepath)
+            destpath=$PREFIX/lib/$filename
+            if [[ "$filepath" != "$destpath" ]]; then
+                cp $filepath $destpath
+            fi
+
+            if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
+                patchedpath=$(fname_without_so_number $destpath)
+            else
+                patchedpath=$(fname_with_sha256 $destpath)
+            fi
+            patchedname=$(basename $patchedpath)
+            if [[ "$destpath" != "$patchedpath" ]]; then
+                mv $destpath $patchedpath
+            fi
+            patched+=("$patchedname")
+            echo "Copied $filepath to $patchedpath"
+        done
+
+        echo "patching to fix the so names to the hashed names"
+        for ((i=0;i<${#DEPS_LIST[@]};++i)); do
+            find $PREFIX -name '*.so*' | while read sofile; do
+                origname=${DEPS_SONAME[i]}
+                patchedname=${patched[i]}
+                if [[ "$origname" != "$patchedname" ]] || [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
+                    set +e
+                    origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
+                    ERRCODE=$?
+                    set -e
+                    if [ "$ERRCODE" -eq "0" ]; then
+                        echo "patching $sofile entry $origname to $patchedname"
+                        $PATCHELF_BIN --replace-needed $origname $patchedname $sofile
+                    fi
+                fi
+            done
+        done
+
+        # copy over needed auxiliary files
+        for ((i=0;i<${#DEPS_AUX_SRCLIST[@]};++i)); do
+            srcpath=${DEPS_AUX_SRCLIST[i]}
+            dstpath=$PREFIX/${DEPS_AUX_DSTLIST[i]}
+            mkdir -p $(dirname $dstpath)
+            cp $srcpath $dstpath
+        done
+    fi
+
+    # set RPATH of _C.so and similar to $ORIGIN, $ORIGIN/lib
+    find $PREFIX -maxdepth 1 -type f -name "*.so*" | while read sofile; do
+        echo "Setting rpath of $sofile to " '$ORIGIN:$ORIGIN/lib'
+        $PATCHELF_BIN --set-rpath '$ORIGIN:$ORIGIN/lib' $sofile
+        $PATCHELF_BIN --print-rpath $sofile
+    done
+
+    # set RPATH of lib/ files to $ORIGIN
+    find $PREFIX/lib -maxdepth 1 -type f -name "*.so*" | while read sofile; do
+        echo "Setting rpath of $sofile to " '$ORIGIN'
+        $PATCHELF_BIN --set-rpath '$ORIGIN' $sofile
+        $PATCHELF_BIN --print-rpath $sofile
+    done
+
+    # regenerate the RECORD file with new hashes
+    record_file=`echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/RECORD/g'`
+    if [[ -e $record_file ]]; then
+        echo "Generating new record file $record_file"
+        rm -f $record_file
+        # generate records for folders in wheel
+        find * -type f | while read fname; do
+            echo $(make_wheel_record $fname) >>$record_file
+        done
+    fi
+
+    # zip up the wheel back
+    zip -rq $(basename $pkg) $PREFIX*
+
+    # replace original wheel
+    rm -f $pkg
+    mv $(basename $pkg) $pkg
+    cd ..
+    rm -rf tmp
+done
+
+# Copy wheels to host machine for persistence before testing
+if [[ -n "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
+    cp /$LIBTORCH_HOUSE_DIR/libtorch*.zip "$PYTORCH_FINAL_PACKAGE_DIR"
+    cp /$LIBTORCH_HOUSE_DIR/debug-libtorch*.zip "$PYTORCH_FINAL_PACKAGE_DIR"
+fi
--- a/.ci/manywheel/build_rocm.sh
+++ b/.ci/manywheel/build_rocm.sh
@ -0,0 +1,263 @@
+#!/usr/bin/env bash
+
+set -ex
+
+export ROCM_HOME=/opt/rocm
+export MAGMA_HOME=$ROCM_HOME/magma
+# TODO: libtorch_cpu.so is broken when building with Debug info
+export BUILD_DEBUG_INFO=0
+
+# TODO Are these all used/needed?
+export TH_BINARY_BUILD=1
+export USE_STATIC_CUDNN=1
+export USE_STATIC_NCCL=1
+export ATEN_STATIC_CUDA=1
+export USE_CUDA_STATIC_LINK=1
+export INSTALL_TEST=0 # dont install test binaries into site-packages
+# Set RPATH instead of RUNPATH when using patchelf to avoid LD_LIBRARY_PATH override
+export FORCE_RPATH="--force-rpath"
+
+# Keep an array of cmake variables to add to
+if [[ -z "$CMAKE_ARGS" ]]; then
+    # These are passed to tools/build_pytorch_libs.sh::build()
+    CMAKE_ARGS=()
+fi
+if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
+    # These are passed to tools/build_pytorch_libs.sh::build_caffe2()
+    EXTRA_CAFFE2_CMAKE_FLAGS=()
+fi
+
+# Determine ROCm version and architectures to build for
+#
+# NOTE: We should first check `DESIRED_CUDA` when determining `ROCM_VERSION`
+if [[ -n "$DESIRED_CUDA" ]]; then
+    if ! echo "${DESIRED_CUDA}"| grep "^rocm" >/dev/null 2>/dev/null; then
+        export DESIRED_CUDA="rocm${DESIRED_CUDA}"
+    fi
+    # rocm3.7, rocm3.5.1
+    ROCM_VERSION="$DESIRED_CUDA"
+    echo "Using $ROCM_VERSION as determined by DESIRED_CUDA"
+else
+    echo "Must set DESIRED_CUDA"
+    exit 1
+fi
+
+# Package directories
+WHEELHOUSE_DIR="wheelhouse$ROCM_VERSION"
+LIBTORCH_HOUSE_DIR="libtorch_house$ROCM_VERSION"
+if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
+    if [[ -z "$BUILD_PYTHONLESS" ]]; then
+        PYTORCH_FINAL_PACKAGE_DIR="/remote/wheelhouse$ROCM_VERSION"
+    else
+        PYTORCH_FINAL_PACKAGE_DIR="/remote/libtorch_house$ROCM_VERSION"
+    fi
+fi
+mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
+
+# To make version comparison easier, create an integer representation.
+ROCM_VERSION_CLEAN=$(echo ${ROCM_VERSION} | sed s/rocm//)
+save_IFS="$IFS"
+IFS=. ROCM_VERSION_ARRAY=(${ROCM_VERSION_CLEAN})
+IFS="$save_IFS"
+if [[ ${#ROCM_VERSION_ARRAY[@]} == 2 ]]; then
+    ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
+    ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
+    ROCM_VERSION_PATCH=0
+elif [[ ${#ROCM_VERSION_ARRAY[@]} == 3 ]]; then
+    ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
+    ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
+    ROCM_VERSION_PATCH=${ROCM_VERSION_ARRAY[2]}
+else
+    echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
+    exit 1
+fi
+ROCM_INT=$(($ROCM_VERSION_MAJOR * 10000 + $ROCM_VERSION_MINOR * 100 + $ROCM_VERSION_PATCH))
+
+# Required ROCm libraries
+ROCM_SO_FILES=(
+    "libMIOpen.so"
+    "libamdhip64.so"
+    "libhipblas.so"
+    "libhipfft.so"
+    "libhiprand.so"
+    "libhipsolver.so"
+    "libhipsparse.so"
+    "libhsa-runtime64.so"
+    "libamd_comgr.so"
+    "libmagma.so"
+    "librccl.so"
+    "librocblas.so"
+    "librocfft.so"
+    "librocm_smi64.so"
+    "librocrand.so"
+    "librocsolver.so"
+    "librocsparse.so"
+    "libroctracer64.so"
+    "libroctx64.so"
+    "libhipblaslt.so"
+    "libhiprtc.so"
+)
+
+if [[ $ROCM_INT -ge 60100 ]]; then
+    ROCM_SO_FILES+=("librocprofiler-register.so")
+fi
+
+if [[ $ROCM_INT -ge 60200 ]]; then
+    ROCM_SO_FILES+=("librocm-core.so")
+fi
+
+OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
+    LIBNUMA_PATH="/usr/lib64/libnuma.so.1"
+    LIBELF_PATH="/usr/lib64/libelf.so.1"
+    LIBTINFO_PATH="/usr/lib64/libtinfo.so.5"
+    LIBDRM_PATH="/opt/amdgpu/lib64/libdrm.so.2"
+    LIBDRM_AMDGPU_PATH="/opt/amdgpu/lib64/libdrm_amdgpu.so.1"
+    if [[ $ROCM_INT -ge 60100 ]]; then
+        # Below libs are direct dependencies of libhipsolver
+        LIBSUITESPARSE_CONFIG_PATH="/lib64/libsuitesparseconfig.so.4"
+        LIBCHOLMOD_PATH="/lib64/libcholmod.so.2"
+        # Below libs are direct dependencies of libcholmod
+        LIBAMD_PATH="/lib64/libamd.so.2"
+        LIBCAMD_PATH="/lib64/libcamd.so.2"
+        LIBCCOLAMD_PATH="/lib64/libccolamd.so.2"
+        LIBCOLAMD_PATH="/lib64/libcolamd.so.2"
+        LIBSATLAS_PATH="/lib64/atlas/libsatlas.so.3"
+        # Below libs are direct dependencies of libsatlas
+        LIBGFORTRAN_PATH="/lib64/libgfortran.so.3"
+        LIBQUADMATH_PATH="/lib64/libquadmath.so.0"
+    fi
+    MAYBE_LIB64=lib64
+elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
+    LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
+    LIBNUMA_PATH="/usr/lib/x86_64-linux-gnu/libnuma.so.1"
+    LIBELF_PATH="/usr/lib/x86_64-linux-gnu/libelf.so.1"
+    if [[ $ROCM_INT -ge 50300 ]]; then
+        LIBTINFO_PATH="/lib/x86_64-linux-gnu/libtinfo.so.6"
+    else
+        LIBTINFO_PATH="/lib/x86_64-linux-gnu/libtinfo.so.5"
+    fi
+    LIBDRM_PATH="/usr/lib/x86_64-linux-gnu/libdrm.so.2"
+    LIBDRM_AMDGPU_PATH="/usr/lib/x86_64-linux-gnu/libdrm_amdgpu.so.1"
+    if [[ $ROCM_INT -ge 60100 ]]; then
+        # Below libs are direct dependencies of libhipsolver
+        LIBCHOLMOD_PATH="/lib/x86_64-linux-gnu/libcholmod.so.3"
+        # Below libs are direct dependencies of libcholmod
+        LIBSUITESPARSE_CONFIG_PATH="/lib/x86_64-linux-gnu/libsuitesparseconfig.so.5"
+        LIBAMD_PATH="/lib/x86_64-linux-gnu/libamd.so.2"
+        LIBCAMD_PATH="/lib/x86_64-linux-gnu/libcamd.so.2"
+        LIBCCOLAMD_PATH="/lib/x86_64-linux-gnu/libccolamd.so.2"
+        LIBCOLAMD_PATH="/lib/x86_64-linux-gnu/libcolamd.so.2"
+        LIBMETIS_PATH="/lib/x86_64-linux-gnu/libmetis.so.5"
+        LIBLAPACK_PATH="/lib/x86_64-linux-gnu/liblapack.so.3"
+        LIBBLAS_PATH="/lib/x86_64-linux-gnu/libblas.so.3"
+        # Below libs are direct dependencies of libblas
+        LIBGFORTRAN_PATH="/lib/x86_64-linux-gnu/libgfortran.so.5"
+        LIBQUADMATH_PATH="/lib/x86_64-linux-gnu/libquadmath.so.0"
+    fi
+    MAYBE_LIB64=lib
+fi
+OS_SO_PATHS=($LIBGOMP_PATH $LIBNUMA_PATH\
+             $LIBELF_PATH $LIBTINFO_PATH\
+             $LIBDRM_PATH $LIBDRM_AMDGPU_PATH\
+             $LIBSUITESPARSE_CONFIG_PATH\
+             $LIBCHOLMOD_PATH $LIBAMD_PATH\
+             $LIBCAMD_PATH $LIBCCOLAMD_PATH\
+             $LIBCOLAMD_PATH $LIBSATLAS_PATH\
+             $LIBGFORTRAN_PATH $LIBQUADMATH_PATH\
+             $LIBMETIS_PATH $LIBLAPACK_PATH\
+             $LIBBLAS_PATH)
+OS_SO_FILES=()
+for lib in "${OS_SO_PATHS[@]}"
+do
+    file_name="${lib##*/}" # Substring removal of path to get filename
+    OS_SO_FILES[${#OS_SO_FILES[@]}]=$file_name # Append lib to array
+done
+
+# PyTorch-version specific
+# AOTriton dependency only for PyTorch >= 2.4
+if (( $(echo "${PYTORCH_VERSION} 2.4" | awk '{print ($1 >= $2)}') )); then
+    ROCM_SO_FILES+=("libaotriton_v2.so")
+fi
+
+# rocBLAS library files
+ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
+ROCBLAS_LIB_DST=lib/rocblas/library
+ARCH=$(echo $PYTORCH_ROCM_ARCH | sed 's/;/|/g') # Replace ; seperated arch list to bar for grep
+ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
+OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
+ROCBLAS_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES)
+
+# hipblaslt library files
+HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library
+HIPBLASLT_LIB_DST=lib/hipblaslt/library
+ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH)
+OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx)
+HIPBLASLT_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES)
+
+# ROCm library files
+ROCM_SO_PATHS=()
+for lib in "${ROCM_SO_FILES[@]}"
+do
+    file_path=($(find $ROCM_HOME/lib/ -name "$lib")) # First search in lib
+    if [[ -z $file_path ]]; then
+        if [ -d "$ROCM_HOME/lib64/" ]; then
+            file_path=($(find $ROCM_HOME/lib64/ -name "$lib")) # Then search in lib64
+        fi
+    fi
+    if [[ -z $file_path ]]; then
+        file_path=($(find $ROCM_HOME/ -name "$lib")) # Then search in ROCM_HOME
+    fi
+    if [[ -z $file_path ]]; then
+        echo "Error: Library file $lib is not found." >&2
+        exit 1
+    fi
+    ROCM_SO_PATHS[${#ROCM_SO_PATHS[@]}]="$file_path" # Append lib to array
+done
+
+DEPS_LIST=(
+    ${ROCM_SO_PATHS[*]}
+    ${OS_SO_PATHS[*]}
+)
+
+DEPS_SONAME=(
+    ${ROCM_SO_FILES[*]}
+    ${OS_SO_FILES[*]}
+)
+
+DEPS_AUX_SRCLIST=(
+    "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_SRC/}"
+    "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_SRC/}"
+    "/opt/amdgpu/share/libdrm/amdgpu.ids"
+)
+
+DEPS_AUX_DSTLIST=(
+    "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_DST/}"
+    "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_DST/}"
+    "share/libdrm/amdgpu.ids"
+)
+
+# MIOpen library files
+MIOPEN_SHARE_SRC=$ROCM_HOME/share/miopen/db
+MIOPEN_SHARE_DST=share/miopen/db
+MIOPEN_SHARE_FILES=($(ls $MIOPEN_SHARE_SRC | grep -E $ARCH))
+DEPS_AUX_SRCLIST+=(${MIOPEN_SHARE_FILES[@]/#/$MIOPEN_SHARE_SRC/})
+DEPS_AUX_DSTLIST+=(${MIOPEN_SHARE_FILES[@]/#/$MIOPEN_SHARE_DST/})
+
+# RCCL library files
+RCCL_SHARE_SRC=$ROCM_HOME/share/rccl/msccl-algorithms
+RCCL_SHARE_DST=share/rccl/msccl-algorithms
+RCCL_SHARE_FILES=($(ls $RCCL_SHARE_SRC))
+DEPS_AUX_SRCLIST+=(${RCCL_SHARE_FILES[@]/#/$RCCL_SHARE_SRC/})
+DEPS_AUX_DSTLIST+=(${RCCL_SHARE_FILES[@]/#/$RCCL_SHARE_DST/})
+
+echo "PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH}"
+
+SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
+if [[ -z "$BUILD_PYTHONLESS" ]]; then
+    BUILD_SCRIPT=build_common.sh
+else
+    BUILD_SCRIPT=build_libtorch.sh
+fi
+source $SCRIPTPATH/${BUILD_SCRIPT}
--- a/.ci/manywheel/test_wheel.sh
+++ b/.ci/manywheel/test_wheel.sh
@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+set -e
+
+yum install -y wget git
+
+rm -rf /usr/local/cuda*
+
+# Install Anaconda
+if ! ls /py
+then
+    echo "Miniconda needs to be installed"
+    wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
+    bash ~/miniconda.sh -b -p /py
+else
+    echo "Miniconda is already installed"
+fi
+
+export PATH="/py/bin:$PATH"
+
+# Anaconda token
+if ls /remote/token
+then
+   source /remote/token
+fi
+
+conda install -y conda-build anaconda-client
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -178,7 +178,7 @@ fi
 # sccache will fail for CUDA builds if all cores are used for compiling
 # gcc 7 with sccache seems to have intermittent OOM issue if all cores are used
 if [ -z "$MAX_JOBS" ]; then
-  if { [[ "$BUILD_ENVIRONMENT" == *cuda* ]] || [[ "$BUILD_ENVIRONMENT" == *gcc7* ]]; } && which sccache > /dev/null; then
+  if { [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; } && which sccache > /dev/null; then
    export MAX_JOBS=$(($(nproc) - 1))
  fi
 fi
@ -203,10 +203,12 @@ if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
 fi

 if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
-  export LDSHARED="clang --shared"
-  export USE_CUDA=0
+  if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
+    export USE_CUDA=1
+  fi
  export USE_ASAN=1
-  export UBSAN_FLAGS="-fno-sanitize-recover=all;-fno-sanitize=float-divide-by-zero;-fno-sanitize=float-cast-overflow"
+  export REL_WITH_DEB_INFO=1
+  export UBSAN_FLAGS="-fno-sanitize-recover=all"
  unset USE_LLVM
 fi

@ -218,10 +220,6 @@ if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then
    export USE_PRECOMPILED_HEADERS=1
 fi

-if [[ "${BUILD_ENVIRONMENT}" == *linux-focal-py3.7-gcc7-build*  ]]; then
-  export USE_GLOO_WITH_OPENSSL=ON
-fi
-
 if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
  export BUILD_STATIC_RUNTIME_BENCHMARK=ON
 fi
@ -278,7 +276,6 @@ else
    # set only when building other architectures
    # or building non-XLA tests.
    if [[ "$BUILD_ENVIRONMENT" != *rocm*  &&
-          "$BUILD_ENVIRONMENT" != *s390x*   &&
          "$BUILD_ENVIRONMENT" != *xla* ]]; then
      if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
        # Install numpy-2.0.2 for builds which are backward compatible with 1.X
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -191,9 +191,22 @@ function install_torchrec_and_fbgemm() {
  pip_uninstall torchrec-nightly
  pip_uninstall fbgemm-gpu-nightly
  pip_install setuptools-git-versioning scikit-build pyre-extensions
+
+  # TODO (huydhn): I still have no clue on why sccache doesn't work with only fbgemm_gpu here, but it
+  # seems to be an sccache-related issue
+  if [[ "$IS_A100_RUNNER" == "1" ]]; then
+    unset CMAKE_CUDA_COMPILER_LAUNCHER
+    sudo mv /opt/cache/bin /opt/cache/bin-backup
+  fi
+
  # See https://github.com/pytorch/pytorch/issues/106971
  CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 --user "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
+
+  if [[ "$IS_A100_RUNNER" == "1" ]]; then
+    export CMAKE_CUDA_COMPILER_LAUNCHER=/opt/cache/bin/sccache
+    sudo mv /opt/cache/bin-backup /opt/cache/bin
+  fi
 }

 function clone_pytorch_xla() {
--- a/.ci/pytorch/create_test_cert.py
+++ b/.ci/pytorch/create_test_cert.py
@ -45,8 +45,7 @@ def create_cert(path, C, ST, L, O, key):
        .not_valid_before(datetime.now(timezone.utc))
        .not_valid_after(
            # Our certificate will be valid for 10 days
-            datetime.now(timezone.utc)
-            + timedelta(days=10)
+            datetime.now(timezone.utc) + timedelta(days=10)
        )
        .add_extension(
            x509.BasicConstraints(ca=True, path_length=None),
@ -91,8 +90,7 @@ def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key):
        .not_valid_before(datetime.now(timezone.utc))
        .not_valid_after(
            # Our certificate will be valid for 10 days
-            datetime.now(timezone.utc)
-            + timedelta(days=10)
+            datetime.now(timezone.utc) + timedelta(days=10)
            # Sign our certificate with our private key
        )
        .sign(private_ca_key, hashes.SHA256())
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -49,16 +49,16 @@ NUM_TEST_SHARDS="${NUM_TEST_SHARDS:=1}"
 export VALGRIND=ON
 # export TORCH_INDUCTOR_INSTALL_GXX=ON
 if [[ "$BUILD_ENVIRONMENT" == *clang9* ]]; then
-  # clang9 appears to miscompile code involving c10::optional<c10::SymInt>,
+  # clang9 appears to miscompile code involving std::optional<c10::SymInt>,
  # such that valgrind complains along these lines:
  #
  # Conditional jump or move depends on uninitialised value(s)
  #    at 0x40303A: ~optional_base (Optional.h:281)
  #    by 0x40303A: call (Dispatcher.h:448)
-  #    by 0x40303A: call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, c10::optional<c10::SymInt>) (basic.cpp:10)
+  #    by 0x40303A: call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::optional<c10::SymInt>) (basic.cpp:10)
  #    by 0x403700: main (basic.cpp:16)
  #  Uninitialised value was created by a stack allocation
-  #    at 0x402AAA: call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, c10::optional<c10::SymInt>) (basic.cpp:6)
+  #    at 0x402AAA: call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::optional<c10::SymInt>) (basic.cpp:6)
  #
  # The problem does not appear with gcc or newer versions of clang (we tested
  # clang14).  So we suppress valgrind testing for clang9 specifically.
@ -72,7 +72,7 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* ]]; then
  #
  # using namespace at;
  #
-  # Tensor call(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset) {
+  # Tensor call(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, std::optional<c10::SymInt> storage_offset) {
  #   auto op = c10::Dispatcher::singleton()
  #       .findSchemaOrThrow(at::_ops::as_strided::name, at::_ops::as_strided::overload_name)
  #       .typed<at::_ops::as_strided::schema>();
@ -81,7 +81,7 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* ]]; then
  #
  # int main(int argv) {
  #   Tensor b = empty({3, 4});
-  #   auto z = call(b, b.sym_sizes(), b.sym_strides(), c10::nullopt);
+  #   auto z = call(b, b.sym_sizes(), b.sym_strides(), std::nullopt);
  # }
  export VALGRIND=OFF
 fi
@ -196,6 +196,9 @@ install_tlparse
 # ASAN test is not working
 if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
    export ASAN_OPTIONS=detect_leaks=0:symbolize=1:detect_stack_use_after_return=true:strict_init_order=true:detect_odr_violation=1:detect_container_overflow=0:check_initialization_order=true:debug=true
+    if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
+        export ASAN_OPTIONS="${ASAN_OPTIONS}:protect_shadow_gap=0"
+    fi
    export UBSAN_OPTIONS=print_stacktrace=1:suppressions=$PWD/ubsan.supp
    export PYTORCH_TEST_WITH_ASAN=1
    export PYTORCH_TEST_WITH_UBSAN=1
@ -233,8 +236,8 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
    # it depends on a ton of dynamic libraries that most programs aren't gonna
    # have, and it applies to child processes.

-    # TODO: get rid of the hardcoded path
-    export LD_PRELOAD=/usr/lib/llvm-15/lib/clang/15.0.7/lib/linux/libclang_rt.asan-x86_64.so
+    LD_PRELOAD=$(clang --print-file-name=libclang_rt.asan-x86_64.so)
+    export LD_PRELOAD
    # Disable valgrind for asan
    export VALGRIND=OFF

@ -281,7 +284,7 @@ test_python_shard() {

  # modify LD_LIBRARY_PATH to ensure it has the conda env.
  # This set of tests has been shown to be buggy without it for the split-build
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running

  assert_git_not_dirty
 }
@ -307,7 +310,8 @@ test_dynamo_shard() {
    --exclude-distributed-tests \
    --exclude-torch-export-tests \
    --shard "$1" "$NUM_TEST_SHARDS" \
-    --verbose
+    --verbose \
+    --upload-artifacts-while-running
  assert_git_not_dirty
 }

@ -320,6 +324,7 @@ test_inductor_distributed() {
  python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
  python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
  python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py --verbose
+  python test/run_test.py -i distributed/_composable/test_replicate_with_compiler.py --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
@ -331,11 +336,12 @@ test_inductor_distributed() {
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
+  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_compile.py --verbose
  python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose

  # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
  # with if required # gpus aren't available
-  python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives --verbose
+  python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_compute_comm_reordering --verbose
  assert_git_not_dirty
 }

@ -369,21 +375,27 @@ test_inductor_aoti() {
  CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
 }

-test_inductor_cpp_wrapper_abi_compatible() {
-  export TORCHINDUCTOR_ABI_COMPATIBLE=1
+test_inductor_cpp_wrapper() {
+  export TORCHINDUCTOR_CPP_WRAPPER=1
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

-  echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
-  PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper
-  python test/run_test.py --include inductor/test_cuda_cpp_wrapper inductor/test_cpu_repro
-
-  TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
+  python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
    --training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
    --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
  python benchmarks/dynamo/check_accuracy.py \
    --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
    --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_timm_training.csv"
+
+  python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
+    --bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
+  python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
+    --bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
+  python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
+    --bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
+  python benchmarks/dynamo/check_accuracy.py \
+    --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
+    --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
 }

 # "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@ -403,7 +415,7 @@ pr_time_benchmarks() {
  PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks source benchmarks/dynamo/pr_time_benchmarks/benchmark_runner.sh "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "benchmarks/dynamo/pr_time_benchmarks/benchmarks"
  echo "benchmark results on current PR: "
  cat  "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv"
-
+  PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks python benchmarks/dynamo/pr_time_benchmarks/check_results.py "benchmarks/dynamo/pr_time_benchmarks/expected_results.csv" "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "$TEST_REPORTS_DIR/new_expected_results.csv"
 }

 if [[ "${TEST_CONFIG}" == *pr_time_benchmarks* ]]; then
@ -511,7 +523,7 @@ test_perf_for_dashboard() {
              "${target_flag[@]}" --"$mode" --"$dtype" --export --disable-cudagraphs "$@" \
              --output "$TEST_REPORTS_DIR/${backend}_export_${suite}_${dtype}_${mode}_${device}_${target}.csv"
        fi
-        TORCHINDUCTOR_ABI_COMPATIBLE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
+        $TASKSET python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_aot_inductor_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
@ -566,13 +578,6 @@ test_single_dynamo_benchmark() {
    test_perf_for_dashboard "$suite" \
      "${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}"
  else
-    if [[ "${TEST_CONFIG}" == *aot_inductor* && "${TEST_CONFIG}" != *cpu_aot_inductor* ]]; then
-      # Test AOTInductor with the ABI-compatible mode on CI
-      # This can be removed once the ABI-compatible mode becomes default.
-      # For CPU device, we perfer non ABI-compatible mode on CI when testing AOTInductor.
-      export TORCHINDUCTOR_ABI_COMPATIBLE=1
-    fi
-
    if [[ "${TEST_CONFIG}" == *_avx2* ]]; then
      TEST_CONFIG=${TEST_CONFIG//_avx2/}
    fi
@ -606,6 +611,11 @@ test_inductor_halide() {
  assert_git_not_dirty
 }

+test_inductor_triton_cpu() {
+  python test/run_test.py --include inductor/test_triton_cpu_backend.py --verbose
+  assert_git_not_dirty
+}
+
 test_dynamo_benchmark() {
  # Usage: test_dynamo_benchmark huggingface 0
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
@ -643,32 +653,12 @@ test_inductor_torchbench_smoketest_perf() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

-  # Test some models in the cpp wrapper mode
-  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-    --bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-    --bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-    --bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-  python benchmarks/dynamo/check_accuracy.py \
-    --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
-    --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
-
  python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
    --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
    --output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
  # The threshold value needs to be actively maintained to make this check useful
  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4

-  TORCHINDUCTOR_ABI_COMPATIBLE=1 python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \
-    --export-aot-inductor --only nanogpt --output "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv"
-  # The threshold value needs to be actively maintained to make this check useful
-  # The perf number of nanogpt seems not very stable, e.g.
-  # https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314,
-  # and thus we lower its threshold to reduce flakiness. If this continues to be a problem,
-  # we switch to use some other model.
-  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.9
-
  # Check memory compression ratio for a few models
  for test in hf_Albert timm_vision_transformer; do
    python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
@ -712,6 +702,10 @@ test_inductor_set_cpu_affinity(){
    export KMP_BLOCKTIME=1
  fi
  cores=$(test_inductor_get_core_number)
+  # Set number of cores to 16 on Aarch64 for performance runs.
+  if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then
+    cores=16
+  fi
  export OMP_NUM_THREADS=$cores
  end_core=$((cores-1))
  export TASKSET="taskset -c 0-$end_core"
@ -748,19 +742,9 @@ test_inductor_torchbench_cpu_smoketest_perf(){
    fi
    cat "$output_name"
    # The threshold value needs to be actively maintained to make this check useful.
-    python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target"
+    # Allow 1% variance for CPU perf to accommodate perf fluctuation
+    python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target" -s 0.99
  done
-
-  # Add a few ABI-compatible accuracy tests for CPU. These can be removed once we turn on ABI-compatible as default.
-  TORCHINDUCTOR_ABI_COMPATIBLE=1 python benchmarks/dynamo/timm_models.py --device cpu --accuracy \
-    --bfloat16 --inference --export-aot-inductor --disable-cudagraphs --only adv_inception_v3 \
-    --output "$TEST_REPORTS_DIR/aot_inductor_smoke_test.csv"
-  TORCHINDUCTOR_ABI_COMPATIBLE=1 python benchmarks/dynamo/timm_models.py --device cpu --accuracy \
-    --bfloat16 --inference --export-aot-inductor --disable-cudagraphs --only beit_base_patch16_224 \
-    --output "$TEST_REPORTS_DIR/aot_inductor_smoke_test.csv"
-  python benchmarks/dynamo/check_accuracy.py \
-    --actual "$TEST_REPORTS_DIR/aot_inductor_smoke_test.csv" \
-    --expected "benchmarks/dynamo/ci_expected_accuracy/aot_inductor_timm_inference.csv"
 }

 test_torchbench_gcp_smoketest(){
@ -1371,7 +1355,7 @@ test_executorch() {
  echo "Run ExecuTorch regression tests for some models"
  # TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
  # shellcheck disable=SC1091
-  source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''
+  source .ci/scripts/test_model.sh mv3 cmake xnnpack-quantization-delegation ''

  popd

@ -1435,6 +1419,8 @@ elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
  test_inductor_halide
+elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
+  test_inductor_triton_cpu
 elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
  test_inductor_micro_benchmark
 elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
@ -1451,14 +1437,13 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
  else
    install_torchaudio cuda
  fi
-  install_torchtext
  install_torchvision
  TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install git+https://github.com/pytorch/ao.git
  id=$((SHARD_NUMBER-1))
  # https://github.com/opencv/opencv-python/issues/885
  pip_install opencv-python==4.8.0.74
  if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
-    checkout_install_torchbench hf_Bert hf_Albert nanogpt timm_vision_transformer
+    checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
    PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
  elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
    checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \
@ -1477,9 +1462,11 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
    fi
    PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
  fi
-elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then
+elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
+  install_torchaudio cuda
  install_torchvision
-  test_inductor_cpp_wrapper_abi_compatible
+  checkout_install_torchbench hf_T5 llama moco
+  PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
  install_torchvision
  test_inductor_shard "${SHARD_NUMBER}"
--- a/.ci/pytorch/win-build.sh
+++ b/.ci/pytorch/win-build.sh
@ -26,7 +26,7 @@ fi
 export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers

 set +ex
-grep -E -R 'PyLong_(From|As)(Unsigned|)Long\(' --exclude=python_numbers.h --exclude=eval_frame.c torch/
+grep -E -R 'PyLong_(From|As)(Unsigned|)Long\(' --exclude=python_numbers.h  --exclude=pythoncapi_compat.h --exclude=eval_frame.c torch/
 PYLONG_API_CHECK=$?
 if [[ $PYLONG_API_CHECK == 0 ]]; then
  echo "Usage of PyLong_{From,As}{Unsigned}Long API may lead to overflow errors on Windows"
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -46,6 +46,9 @@ python -m pip install tlparse==0.3.25
 # Install parameterized
 python -m pip install parameterized==0.8.1

+# Install pulp for testing ilps under torch\distributed\_tools
+python -m pip install pulp==2.9.0
+
 run_tests() {
    # Run nvidia-smi if available
    for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
--- a/.circleci/scripts/binary_linux_test.sh
+++ b/.circleci/scripts/binary_linux_test.sh
@ -27,12 +27,11 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
  source activate testenv >/dev/null
 elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
  python_path="/opt/python/cp\$python_nodot-cp\${python_nodot}"
-  # Prior to Python 3.8 paths were suffixed with an 'm'
-  if [[ -d  "\${python_path}/bin" ]]; then
-    export PATH="\${python_path}/bin:\$PATH"
-  elif [[ -d "\${python_path}m/bin" ]]; then
-    export PATH="\${python_path}m/bin:\$PATH"
+  if [[ "\$python_nodot" = *t ]]; then
+    python_digits="\$(echo $DESIRED_PYTHON | tr -cd [:digit:])"
+    python_path="/opt/python/cp\$python_digits-cp\${python_digits}t"
  fi
+  export PATH="\${python_path}/bin:\$PATH"
 fi

 EXTRA_CONDA_FLAGS=""
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@ -114,6 +114,12 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
    fi
 fi

+USE_GLOO_WITH_OPENSSL="ON"
+if [[ "$GPU_ARCH_TYPE" =~ .*aarch64.* ]]; then
+  USE_GLOO_WITH_OPENSSL="OFF"
+  USE_GOLD_LINKER="OFF"
+fi
+
 cat >"$envfile" <<EOL
 # =================== The following code will be executed inside Docker container ===================
 export TZ=UTC
@ -153,7 +159,7 @@ export DOCKER_IMAGE="$DOCKER_IMAGE"


 export USE_GOLD_LINKER="${USE_GOLD_LINKER}"
-export USE_GLOO_WITH_OPENSSL="ON"
+export USE_GLOO_WITH_OPENSSL="${USE_GLOO_WITH_OPENSSL}"
 # =================== The above code will be executed inside Docker container ===================
 EOL

--- a/.clang-format
+++ b/.clang-format
@ -44,7 +44,9 @@ ContinuationIndentWidth: 4
 Cpp11BracedListStyle: true
 DerivePointerAlignment: false
 DisableFormat:   false
-ForEachMacros:   [ FOR_EACH_RANGE, FOR_EACH, ]
+ForEachMacros:
+  - FOR_EACH_RANGE
+  - FOR_EACH
 IncludeCategories:
  - Regex:           '^<.*\.h(pp)?>'
    Priority:        1
@ -58,6 +60,24 @@ IndentWrappedFunctionNames: false
 KeepEmptyLinesAtTheStartOfBlocks: false
 MacroBlockBegin: ''
 MacroBlockEnd:   ''
+Macros:
+  - >-
+    PyObject_HEAD_INIT(type)={
+        /* this is not exactly match with PyObject_HEAD_INIT in Python source code
+         * but it is enough for clang-format */
+        { 0xFFFFFFFF },
+        (type)
+    },
+  - >-
+    PyVarObject_HEAD_INIT(type, size)={
+        {
+            /* manually expand PyObject_HEAD_INIT(type) above
+             * because clang-format do not support recursive expansion */
+            { 0xFFFFFFFF },
+            (type)
+        },
+        (size)
+    },
 MaxEmptyLinesToKeep: 1
 NamespaceIndentation: None
 PenaltyBreakBeforeFirstCallParameter: 1
@ -79,7 +99,11 @@ SpacesInContainerLiterals: true
 SpacesInCStyleCastParentheses: false
 SpacesInParentheses: false
 SpacesInSquareBrackets: false
-Standard:        Cpp11
+Standard:        c++17
+StatementMacros:
+  - PyObject_HEAD
+  - PyObject_VAR_HEAD
+  - PyException_HEAD
 TabWidth:        8
 UseTab:          Never
 ---
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@ -1,38 +0,0 @@
-If you have a question or would like help and support, please ask at our
-[forums](https://discuss.pytorch.org/).
-
-If you are submitting a feature request, please preface the title with [feature request].
-If you are submitting a bug report, please fill in the following details.
-
-## Issue description
-
-Provide a short description.
-
-## Code example
-
-Please try to provide a minimal example to repro the bug.
-Error messages and stack traces are also helpful.
-
-## System Info
-Please copy and paste the output from our
-[environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py)
-(or fill out the checklist below manually).
-
-You can get the script and run it with:
-```
-wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py
-# For security purposes, please check the contents of collect_env.py before running it.
-python collect_env.py
-```
-
- PyTorch or Caffe2:
- How you installed PyTorch (conda, pip, source):
- Build command you used (if compiling from source):
- OS:
- PyTorch version:
- Python version:
- CUDA/cuDNN version:
- GPU models and configuration:
- GCC version (if compiling from source):
- CMake version:
- Versions of any other relevant libraries:
--- a/.github/ISSUE_TEMPLATE/ci-sev.md
+++ b/.github/ISSUE_TEMPLATE/ci-sev.md
@ -5,7 +5,8 @@ about: Tracking incidents for PyTorch's CI infra.

 > NOTE: Remember to label this issue with "`ci: sev`"

-**MERGE BLOCKING** <!-- remove this line if you don't want this SEV to block merges -->
+ <!-- uncomment the below line if you don't want this SEV to block merges -->
+ <!--  **MERGE BLOCKING** -->

 ## Current Status
 *Status could be: preemptive, ongoing, mitigated, closed. Also tell people if they need to take action to fix it (i.e. rebase)*.
--- a/.github/actions/checkout-pytorch/action.yml
+++ b/.github/actions/checkout-pytorch/action.yml
@ -18,8 +18,14 @@ inputs:
 runs:
  using: composite
  steps:
+    - name: Check if in a container runner
+      shell: bash
+      id: check_container_runner
+      run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
+
    - name: Clean workspace
      shell: bash
+      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
      env:
        NO_SUDO: ${{ inputs.no-sudo }}
      run: |
--- a/.github/actions/linux-test/action.yml
+++ b/.github/actions/linux-test/action.yml
@ -85,15 +85,25 @@ runs:
      with:
        docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

-    - name: Check if in a ARC runner
+    - name: Check if in a container runner
      shell: bash
-      id: check_arc_runner
-      run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"
+      id: check_container_runner
+      run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"

    - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
      id: install-nvidia-driver
      uses: pytorch/test-infra/.github/actions/setup-nvidia@main
-      if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
+      if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
+
+    - name: Setup GPU_FLAG for docker run
+      id: setup-gpu-flag
+      run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+      if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
+
+    - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
+      id: setup-sscache-port-flag
+      run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
+      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}

    - name: Lock NVIDIA A100 40GB Frequency
      shell: bash
@ -101,7 +111,7 @@ runs:
        sudo nvidia-smi -pm 1
        sudo nvidia-smi -ac 1215,1410
        nvidia-smi
-      if: contains(matrix.runner, 'a100')
+      if: ${{ contains(matrix.runner, 'a100') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

    - name: Start monitoring script
      id: monitor-script
@ -172,6 +182,7 @@ runs:
        NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
        TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
        SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+        SCCACHE_REGION: us-east-1
        SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
        SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
        DOCKER_IMAGE: ${{ inputs.docker-image }}
@ -181,6 +192,9 @@ runs:
        PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
        DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
        HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
+        SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
+        IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
+
      shell: bash
      run: |
        set -x
@ -199,6 +213,7 @@ runs:
        # shellcheck disable=SC2086,SC2090
        container_name=$(docker run \
          ${GPU_FLAG:-} \
+          ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
          -e BUILD_ENVIRONMENT \
          -e PR_NUMBER \
          -e GITHUB_ACTIONS \
@ -227,6 +242,7 @@ runs:
          -e PR_LABELS \
          -e MAX_JOBS="$(nproc --ignore=2)" \
          -e SCCACHE_BUCKET \
+          -e SCCACHE_REGION \
          -e SCCACHE_S3_KEY_PREFIX \
          -e XLA_CUDA \
          -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
@ -234,7 +250,9 @@ runs:
          -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
          -e SKIP_SCCACHE_INITIALIZATION=1 \
          -e HUGGING_FACE_HUB_TOKEN \
+          -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
          -e DASHBOARD_TAG \
+          -e IS_A100_RUNNER \
          --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
          --security-opt seccomp=unconfined \
          --cap-add=SYS_PTRACE \
@ -305,7 +323,7 @@ runs:

    - name: Teardown Linux
      uses: pytorch/test-infra/.github/actions/teardown-linux@main
-      if: always()
+      if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'

    # NB: We are currently having an intermittent GPU-related issue on G5 runners with
    # A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does
--- a/.github/actions/pytest-cache-download/action.yml
+++ b/.github/actions/pytest-cache-download/action.yml
@ -26,7 +26,7 @@ runs:
        retry_wait_seconds: 30
        command: |
          set -eu
-          python3 -m pip install boto3==1.19.12
+          python3 -m pip install boto3==1.35.42

    - name: Download the cache
      shell: bash
--- a/.github/actions/pytest-cache-upload/action.yml
+++ b/.github/actions/pytest-cache-upload/action.yml
@ -33,7 +33,7 @@ runs:
        retry_wait_seconds: 30
        command: |
          set -eu
-          python3 -m pip install boto3==1.19.12
+          python3 -m pip install boto3==1.35.42

    - name: Upload the cache
      shell: bash
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@ -20,7 +20,7 @@ runs:
          elif [[ $runner_name_str == *"gcp"* ]]; then
            echo "Runner is from Google Cloud Platform, No info on ec2 metadata"
          else
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          fi
        }
        echo "ami-id: $(get_ec2_metadata ami-id)"
@ -28,14 +28,14 @@ runs:
        echo "instance-type: $(get_ec2_metadata instance-type)"
        echo "system info $(uname -a)"

-    - name: Check if in a ARC runner
+    - name: Check if in a container runner
      shell: bash
-      id: check_arc_runner
-      run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)"  >> $GITHUB_OUTPUT
+      id: check_container_runner
+      run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"

    - name: Start docker if docker deamon is not running
      shell: bash
-      if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
+      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
      run: |
        if systemctl is-active --quiet docker; then
            echo "Docker daemon is running...";
@ -73,7 +73,7 @@ runs:
        env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

    - name: Kill any existing containers, clean up images
-      if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
+      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
      shell: bash
      run: |
        # ignore expansion of "docker ps -q" since it could be empty
@ -116,7 +116,7 @@ runs:
    - name: Check that the docker daemon is running
      shell: bash
      continue-on-error: true
-      if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'true' }}
+      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
      run: |
        set +x

--- a/.github/actions/setup-win/action.yml
+++ b/.github/actions/setup-win/action.yml
@ -18,7 +18,7 @@ runs:
          # Pulled from instance metadata endpoint for EC2
          # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
          category=$1
-          curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
        }
        echo "ami-id: $(get_ec2_metadata ami-id)"
        echo "instance-id: $(get_ec2_metadata instance-id)"
--- a/.github/actions/upload-test-artifacts/action.yml
+++ b/.github/actions/upload-test-artifacts/action.yml
@ -28,7 +28,7 @@ runs:
      run: |
        # Remove any previous test jsons if they exist
        rm -f test-jsons-*.zip
-        zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json'
+        zip -r "test-jsons-${FILE_SUFFIX}.zip" test/test-reports -i '*.json'

    - name: Zip test reports for upload
      if: runner.os != 'Windows' && !inputs.use-gha
@ -38,7 +38,7 @@ runs:
      run: |
        # Remove any previous test reports if they exist
        rm -f test-reports-*.zip
-        zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' -i '*.csv'
+        zip -r "test-reports-${FILE_SUFFIX}.zip" test/test-reports -i '*.xml' -i '*.csv'

    - name: Zip usage log for upload
      if: runner.os != 'Windows' && !inputs.use-gha
@ -53,8 +53,8 @@ runs:
        if [ -f 'usage_log.txt' ]; then
            zip "logs-${FILE_SUFFIX}.zip" 'usage_log.txt'
        fi
-        if ls test/**/*.log 1> /dev/null 2>&1; then
-            zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log'
+        if find "test/test-reports" -name "*.log" 2>/dev/null | grep -q .; then
+            zip -r "logs-${FILE_SUFFIX}.zip" test/test-reports -i '*.log'
        fi

    - name: Zip debugging artifacts for upload
@ -77,7 +77,7 @@ runs:
        FILE_SUFFIX: ${{ inputs.file-suffix }}
      run: |
        # -ir => recursive include all files in pattern
-        7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json'
+        7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\test-reports\*.json'

    - name: Zip test reports for upload
      if: runner.os == 'Windows' && !inputs.use-gha
@ -86,7 +86,7 @@ runs:
        FILE_SUFFIX: ${{ inputs.file-suffix }}
      run: |
        # -ir => recursive include all files in pattern
-        7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' -ir'!test\*.csv'
+        7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\test-reports\*.xml' -ir'!test\test-reports\*.csv'

    - name: Zip usage log for upload
      if: runner.os == 'Windows' && !inputs.use-gha
@ -96,7 +96,7 @@ runs:
        FILE_SUFFIX: ${{ inputs.file-suffix }}
      run: |
        # -ir => recursive include all files in pattern
-        7z a "logs-$Env:FILE_SUFFIX.zip" 'usage_log.txt' -ir'!test\*.log'
+        7z a "logs-$Env:FILE_SUFFIX.zip" 'usage_log.txt' -ir'!test\test-reports\*.log'

    # S3 upload
    - name: Store Test Downloaded JSONs on S3
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-ba696ea3dfec4cbe693bf06a84c75dc196077f5b
+79047bf6bdec9e32c4cffd0f9835b347781fefbf
--- a/.github/ci_commit_pins/torchbench.txt
+++ b/.github/ci_commit_pins/torchbench.txt
@ -1 +1 @@
-23512dbebd44a11eb84afbf53c3c071dd105297e
+e522b45cd4535b9dfe067aa68d7315755df38f48
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -98,3 +98,9 @@
 "module: distributed_checkpoint":
 - torch/distributed/checkpoint/**
 - test/distributed/checkpoint/**
+
+"module: compiled autograd":
+- torch/csrc/dynamo/python_compiled_autograd.cpp
+- torch/csrc/dynamo/compiled_autograd.h
+- torch/_dynamo/compiled_autograd.py
+- torch/inductor/test_compiled_autograd.py
--- a/.github/lf-canary-scale-config.yml
+++ b/.github/lf-canary-scale-config.yml
@ -1,251 +0,0 @@
-
-# This file is generated by .github/scripts/validate_scale_config.py in test-infra
-# It defines runner types that will be provisioned by by LF Self-hosted runners
-
-# scale-config.yml:
-#   Powers what instance types are available for GHA auto-scaled
-#   runners. Runners listed here will be available as self hosted
-#   runners, configuration is directly pulled from the main branch.
-#
-#
-# NOTES:
-#  - Linux runners are by default non-ephemeral to reduce the amount of CreateInstaces calls
-#    to avoid RequestLimitExceeded issues
-#  - When updating this file, run the following command to validate the YAML and to generate
-#    corresponding versions of scale-config for the pytorch/pytorch repo and merge the
-#    pytorch/pytorch changes before merging these changes.
-#    `python .github/scripts/validate_scale_config.py --test-infra-repo-root [path_to_test-infra_root] --pytorch-repo-root [path_to_pytorch_root]``
-#
-# TODO: Add some documentation on how the auto-scaling works
-#
-# NOTE: Default values,
-#
-# runner_types:
-#   runner_label:
-#     instance_type: m4.large
-#     os: linux
-#     max_available: 20
-#     disk_size: 50
-#     is_ephemeral: true
-
-runner_types:
-  lf.c.linux.12xlarge:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.10xlarge.avx2:
-    disk_size: 200
-    instance_type: m4.10xlarge
-    is_ephemeral: false
-    max_available: 450
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.24xl.spr-metal:
-    disk_size: 200
-    instance_type: c7i.metal-24xl
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.16xlarge.spr:
-    disk_size: 200
-    instance_type: c7i.16xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.9xlarge.ephemeral:
-    disk_size: 200
-    instance_type: c5.9xlarge
-    is_ephemeral: true
-    max_available: 50
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-    variants:
-      am2:
-        ami: amzn2-ami-hvm-2.0.20240306.2-x86_64-ebs
-  lf.c.linux.12xlarge.ephemeral:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: true
-    max_available: 300
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.16xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.16xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.24xlarge:
-    disk_size: 150
-    instance_type: c5.24xlarge
-    is_ephemeral: false
-    max_available: 500
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.24xlarge.ephemeral:
-    disk_size: 150
-    instance_type: c5.24xlarge
-    is_ephemeral: true
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.2xlarge:
-    disk_size: 150
-    instance_type: c5.2xlarge
-    is_ephemeral: false
-    max_available: 3120
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.4xlarge:
-    disk_size: 150
-    instance_type: c5.4xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.4xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.8xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.8xlarge
-    is_ephemeral: false
-    max_available: 400
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.g4dn.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.12xlarge
-    is_ephemeral: false
-    max_available: 250
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.g4dn.metal.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.metal
-    is_ephemeral: false
-    max_available: 300
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.g5.48xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.48xlarge
-    is_ephemeral: false
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.g5.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.12xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.g5.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.4xlarge
-    is_ephemeral: false
-    max_available: 2400
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.g6.4xlarge.experimental.nvidia.gpu:
-    disk_size: 150
-    instance_type: g6.4xlarge
-    is_ephemeral: false
-    max_available: 50
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.large:
-    max_available: 1200
-    disk_size: 15
-    instance_type: c5.large
-    is_ephemeral: false
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.c.linux.arm64.2xlarge:
-    disk_size: 256
-    instance_type: t4g.2xlarge
-    is_ephemeral: false
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.c.linux.arm64.m7g.4xlarge:
-    disk_size: 256
-    instance_type: m7g.4xlarge
-    is_ephemeral: false
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.c.linux.arm64.2xlarge.ephemeral:
-    disk_size: 256
-    instance_type: t4g.2xlarge
-    is_ephemeral: true
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.c.linux.arm64.m7g.4xlarge.ephemeral:
-    disk_size: 256
-    instance_type: m7g.4xlarge
-    is_ephemeral: true
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.c.linux.arm64.m7g.metal:
-    disk_size: 256
-    instance_type: m7g.metal
-    is_ephemeral: false
-    max_available: 100
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.c.windows.g4dn.xlarge:
-    disk_size: 256
-    instance_type: g4dn.xlarge
-    is_ephemeral: true
-    max_available: 100
-    os: windows
-  lf.c.windows.g4dn.xlarge.nonephemeral:
-    disk_size: 256
-    instance_type: g4dn.xlarge
-    is_ephemeral: false
-    max_available: 100
-    os: windows
-  lf.c.windows.4xlarge:
-    disk_size: 256
-    instance_type: c5d.4xlarge
-    is_ephemeral: true
-    max_available: 420
-    os: windows
-  lf.c.windows.4xlarge.nonephemeral:
-    disk_size: 256
-    instance_type: c5d.4xlarge
-    is_ephemeral: false
-    max_available: 420
-    os: windows
-  lf.c.windows.8xlarge.nvidia.gpu:
-    disk_size: 256
-    instance_type: p3.2xlarge
-    is_ephemeral: true
-    max_available: 300
-    os: windows
-  lf.c.windows.8xlarge.nvidia.gpu.nonephemeral:
-    disk_size: 256
-    instance_type: p3.2xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: windows
-  lf.c.windows.g5.4xlarge.nvidia.gpu:
-    disk_size: 256
-    instance_type: g5.4xlarge
-    is_ephemeral: false
-    max_available: 250
-    os: windows
--- a/.github/lf-scale-config.yml
+++ b/.github/lf-scale-config.yml
@ -1,251 +0,0 @@
-
-# This file is generated by .github/scripts/validate_scale_config.py in test-infra
-# It defines runner types that will be provisioned by by LF Self-hosted runners
-
-# scale-config.yml:
-#   Powers what instance types are available for GHA auto-scaled
-#   runners. Runners listed here will be available as self hosted
-#   runners, configuration is directly pulled from the main branch.
-#
-#
-# NOTES:
-#  - Linux runners are by default non-ephemeral to reduce the amount of CreateInstaces calls
-#    to avoid RequestLimitExceeded issues
-#  - When updating this file, run the following command to validate the YAML and to generate
-#    corresponding versions of scale-config for the pytorch/pytorch repo and merge the
-#    pytorch/pytorch changes before merging these changes.
-#    `python .github/scripts/validate_scale_config.py --test-infra-repo-root [path_to_test-infra_root] --pytorch-repo-root [path_to_pytorch_root]``
-#
-# TODO: Add some documentation on how the auto-scaling works
-#
-# NOTE: Default values,
-#
-# runner_types:
-#   runner_label:
-#     instance_type: m4.large
-#     os: linux
-#     max_available: 20
-#     disk_size: 50
-#     is_ephemeral: true
-
-runner_types:
-  lf.linux.12xlarge:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.10xlarge.avx2:
-    disk_size: 200
-    instance_type: m4.10xlarge
-    is_ephemeral: false
-    max_available: 450
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.24xl.spr-metal:
-    disk_size: 200
-    instance_type: c7i.metal-24xl
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.16xlarge.spr:
-    disk_size: 200
-    instance_type: c7i.16xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.9xlarge.ephemeral:
-    disk_size: 200
-    instance_type: c5.9xlarge
-    is_ephemeral: true
-    max_available: 50
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-    variants:
-      am2:
-        ami: amzn2-ami-hvm-2.0.20240306.2-x86_64-ebs
-  lf.linux.12xlarge.ephemeral:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: true
-    max_available: 300
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.16xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.16xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.24xlarge:
-    disk_size: 150
-    instance_type: c5.24xlarge
-    is_ephemeral: false
-    max_available: 500
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.24xlarge.ephemeral:
-    disk_size: 150
-    instance_type: c5.24xlarge
-    is_ephemeral: true
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.2xlarge:
-    disk_size: 150
-    instance_type: c5.2xlarge
-    is_ephemeral: false
-    max_available: 3120
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.4xlarge:
-    disk_size: 150
-    instance_type: c5.4xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.4xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.8xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.8xlarge
-    is_ephemeral: false
-    max_available: 400
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.g4dn.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.12xlarge
-    is_ephemeral: false
-    max_available: 250
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.g4dn.metal.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.metal
-    is_ephemeral: false
-    max_available: 300
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.g5.48xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.48xlarge
-    is_ephemeral: false
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.g5.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.12xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.g5.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.4xlarge
-    is_ephemeral: false
-    max_available: 2400
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.g6.4xlarge.experimental.nvidia.gpu:
-    disk_size: 150
-    instance_type: g6.4xlarge
-    is_ephemeral: false
-    max_available: 50
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.large:
-    max_available: 1200
-    disk_size: 15
-    instance_type: c5.large
-    is_ephemeral: false
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.arm64.2xlarge:
-    disk_size: 256
-    instance_type: t4g.2xlarge
-    is_ephemeral: false
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.linux.arm64.m7g.4xlarge:
-    disk_size: 256
-    instance_type: m7g.4xlarge
-    is_ephemeral: false
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.linux.arm64.2xlarge.ephemeral:
-    disk_size: 256
-    instance_type: t4g.2xlarge
-    is_ephemeral: true
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.linux.arm64.m7g.4xlarge.ephemeral:
-    disk_size: 256
-    instance_type: m7g.4xlarge
-    is_ephemeral: true
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.linux.arm64.m7g.metal:
-    disk_size: 256
-    instance_type: m7g.metal
-    is_ephemeral: false
-    max_available: 100
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.windows.g4dn.xlarge:
-    disk_size: 256
-    instance_type: g4dn.xlarge
-    is_ephemeral: true
-    max_available: 100
-    os: windows
-  lf.windows.g4dn.xlarge.nonephemeral:
-    disk_size: 256
-    instance_type: g4dn.xlarge
-    is_ephemeral: false
-    max_available: 100
-    os: windows
-  lf.windows.4xlarge:
-    disk_size: 256
-    instance_type: c5d.4xlarge
-    is_ephemeral: true
-    max_available: 420
-    os: windows
-  lf.windows.4xlarge.nonephemeral:
-    disk_size: 256
-    instance_type: c5d.4xlarge
-    is_ephemeral: false
-    max_available: 420
-    os: windows
-  lf.windows.8xlarge.nvidia.gpu:
-    disk_size: 256
-    instance_type: p3.2xlarge
-    is_ephemeral: true
-    max_available: 300
-    os: windows
-  lf.windows.8xlarge.nvidia.gpu.nonephemeral:
-    disk_size: 256
-    instance_type: p3.2xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: windows
-  lf.windows.g5.4xlarge.nvidia.gpu:
-    disk_size: 256
-    instance_type: g5.4xlarge
-    is_ephemeral: false
-    max_available: 250
-    os: windows
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -16,11 +16,13 @@ ciflow_push_tags:
 - ciflow/nightly
 - ciflow/periodic
 - ciflow/rocm
+- ciflow/s390
 - ciflow/slow
 - ciflow/trunk
 - ciflow/unstable
 - ciflow/xpu
 - ciflow/torchbench
+- ciflow/autoformat
 retryable_workflows:
 - pull
 - trunk
--- a/.github/requirements-gha-cache.txt
+++ b/.github/requirements-gha-cache.txt
@ -4,7 +4,7 @@
 #   docs/cpp/requirements.txt
 #   functorch/docs/requirements.txt
 #   .ci/docker/requirements-ci.txt
-boto3==1.19.12
+boto3==1.35.42
 jinja2==3.1.4
 lintrunner==0.10.7
 ninja==1.10.0.post1
--- a/.github/requirements/pip-requirements-iOS.txt
+++ b/.github/requirements/pip-requirements-iOS.txt
@ -1,4 +1,4 @@
 # iOS simulator requirements
 coremltools==5.0b5
 protobuf==3.20.2
-optree==0.12.1
+optree==0.13.0
--- a/.github/requirements/pip-requirements-macOS.txt
+++ b/.github/requirements/pip-requirements-macOS.txt
@ -1,4 +1,4 @@
-boto3==1.19.12
+boto3==1.35.42
 hypothesis==6.56.4
 expecttest==0.2.1
 fbscribelogger==0.1.6
@ -27,7 +27,7 @@ pytest-cpp==2.3.0
 rockset==1.0.3
 z3-solver==4.12.2.0
 tensorboard==2.13.0
-optree==0.12.1
+optree==0.13.0
 # NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in
 # which the stringify metadata is wrong when escaping double quote
 protobuf==3.20.2
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -77,6 +77,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -333,7 +334,7 @@ def generate_wheels_matrix(
        package_type = "manywheel"

    if python_versions is None:
-        python_versions = FULL_PYTHON_VERSIONS + ["3.13"]
+        python_versions = FULL_PYTHON_VERSIONS + ["3.13", "3.13t"]

    if arches is None:
        # Define default compute archivectures
@ -368,8 +369,15 @@ def generate_wheels_matrix(

            # TODO: Enable python 3.13 on rocm, aarch64, windows
            if (
-                gpu_arch_type == "rocm" or (os != "linux" and os != "linux-s390x")
-            ) and python_version == "3.13":
+                gpu_arch_type == "rocm"
+                or os not in ["linux", "linux-s390x", "macos-arm64"]
+            ) and python_version in ["3.13", "3.13t"]:
+                continue
+
+            # TODO: Enable python 3.13t on xpu and cpu-s390x or MacOS
+            if (
+                gpu_arch_type in ["xpu", "cpu-s390x"] or os == "macos-arm64"
+            ) and python_version == "3.13t":
                continue

            if use_split_build and (
@ -403,7 +411,7 @@ def generate_wheels_matrix(
                        "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                        "package_type": package_type,
                        "pytorch_extra_install_requirements": (
-                            PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]  # fmt: skip
+                            PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]
                            if os != "linux-aarch64"
                            else ""
                        ),
@ -451,7 +459,7 @@ def generate_wheels_matrix(
                            ".", "_"
                        ),
                        "pytorch_extra_install_requirements": (
-                            PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"]  # fmt: skip
+                            PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.4"]
                            if os != "linux" and gpu_arch_type != "xpu"
                            else ""
                        ),
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -114,20 +114,21 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
            isolated_workflow=True,
        ),
    ),
-    BinaryBuildWorkflow(
-        os=OperatingSystem.LINUX,
-        package_type="manywheel",
-        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
-            OperatingSystem.LINUX,
-            use_split_build=True,
-            arches=["11.8", "12.1", "12.4", "cpu"],
-        ),
-        ciflow_config=CIFlowConfig(
-            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
-            isolated_workflow=True,
-        ),
-        use_split_build=True,
-    ),
+    # See https://github.com/pytorch/pytorch/issues/138750
+    #   BinaryBuildWorkflow(
+    #     os=OperatingSystem.LINUX,
+    #     package_type="manywheel",
+    #     build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+    #         OperatingSystem.LINUX,
+    #         use_split_build=True,
+    #         arches=["11.8", "12.1", "12.4", "cpu"],
+    #     ),
+    #     ciflow_config=CIFlowConfig(
+    #         labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
+    #         isolated_workflow=True,
+    #     ),
+    #     use_split_build=True,
+    # ),
    BinaryBuildWorkflow(
        os=OperatingSystem.LINUX,
        package_type="conda",
@ -180,21 +181,22 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
        ),
        branches="main",
    ),
-    BinaryBuildWorkflow(
-        os=OperatingSystem.LINUX,
-        package_type="manywheel",
-        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
-            OperatingSystem.LINUX,
-            arches=["11.8", "12.1", "12.4"],
-            python_versions=["3.9"],
-            use_split_build=True,
-        ),
-        ciflow_config=CIFlowConfig(
-            labels={LABEL_CIFLOW_PERIODIC},
-        ),
-        branches="main",
-        use_split_build=True,
-    ),
+    # See https://github.com/pytorch/pytorch/issues/138750
+    # BinaryBuildWorkflow(
+    #     os=OperatingSystem.LINUX,
+    #     package_type="manywheel",
+    #     build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+    #         OperatingSystem.LINUX,
+    #         arches=["11.8", "12.1", "12.4"],
+    #         python_versions=["3.9"],
+    #         use_split_build=True,
+    #     ),
+    #     ciflow_config=CIFlowConfig(
+    #         labels={LABEL_CIFLOW_PERIODIC},
+    #     ),
+    #     branches="main",
+    #     use_split_build=True,
+    # ),
    BinaryBuildWorkflow(
        os=OperatingSystem.LINUX,
        package_type="libtorch",
--- a/.github/scripts/lintrunner.sh
+++ b/.github/scripts/lintrunner.sh
@ -41,7 +41,8 @@ RC=0
 if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then
    echo ""
    echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner -m origin/main\`. (If you don't get the same results, run \'lintrunner init\' to update your local linter)\e[0m"
-    echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"
+    echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions. To apply suggested patches automatically, use the -a flag. Before pushing another commit,\e[0m"
+    echo -e "\e[1m\e[36mplease verify locally and ensure everything passes.\e[0m"
    RC=1
 fi

--- a/.github/scripts/runner_determinator.py
+++ b/.github/scripts/runner_determinator.py
@ -1,5 +1,9 @@
 # flake8: noqa: G004

+# Note: Copies of this script in runner_determinator.py and _runner-determinator.yml
+#       must be kept in sync. You can do it easily by running the following command:
+#           python .github/scripts/update_runner_determinator.py
+
 """
 This runner determinator is used to determine which set of runners to run a
 GitHub job on. It uses the first comment of a GitHub issue (by default
@ -35,7 +39,8 @@ Example config:
    experiments:
      lf:
        rollout_percent: 25
-
+        all_branches: false
+        default: true
    ---

    # Opt-ins:
@ -53,7 +58,7 @@ import os
 import random
 from argparse import ArgumentParser
 from logging import LogRecord
-from typing import Any, Dict, Iterable, List, NamedTuple, Tuple
+from typing import Any, Dict, FrozenSet, Iterable, List, NamedTuple, Tuple

 import yaml
 from github import Auth, Github
@ -79,6 +84,12 @@ class Experiment(NamedTuple):
    rollout_perc: float = (
        0  # Percentage of workflows to experiment on when user is not opted-in.
    )
+    all_branches: bool = (
+        False  # If True, the experiment is also enabled on the exception branches
+    )
+    default: bool = (
+        True  # If True, the experiment is enabled by default for all queries
+    )

    # Add more fields as needed

@ -133,6 +144,12 @@ def set_github_output(key: str, value: str) -> None:
        f.write(f"{key}={value}\n")


+def _str_comma_separated_to_set(value: str) -> FrozenSet[str]:
+    return frozenset(
+        filter(lambda itm: itm != "", map(str.strip, value.strip(" \n\t").split(",")))
+    )
+
+
 def parse_args() -> Any:
    parser = ArgumentParser("Get dynamic rollout settings")
    parser.add_argument("--github-token", type=str, required=True, help="GitHub token")
@ -167,6 +184,13 @@ def parse_args() -> Any:
        required=True,
        help="Current GitHub ref type, branch or tag",
    )
+    parser.add_argument(
+        "--eligible-experiments",
+        type=_str_comma_separated_to_set,
+        required=False,
+        default="",
+        help="comma separated list of experiments to check, if omitted all experiments marked with default=True are checked",
+    )

    return parser.parse_args()

@ -212,7 +236,7 @@ def get_potential_pr_author(

 def is_exception_branch(branch: str) -> bool:
    """
-    Branches that get opted out of all experiments and should always use Meta runners
+    Branches that get opted out of experiments by default, until they're explicitly enabled.
    """
    return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}

@ -338,7 +362,11 @@ def is_user_opted_in(user: str, user_optins: UserOptins, experiment_name: str) -


 def get_runner_prefix(
-    rollout_state: str, workflow_requestors: Iterable[str], is_canary: bool = False
+    rollout_state: str,
+    workflow_requestors: Iterable[str],
+    branch: str,
+    eligible_experiments: FrozenSet[str] = frozenset(),
+    is_canary: bool = False,
 ) -> str:
    settings = parse_settings(rollout_state)
    user_optins = parse_users(rollout_state)
@ -346,7 +374,24 @@ def get_runner_prefix(
    fleet_prefix = ""
    prefixes = []
    for experiment_name, experiment_settings in settings.experiments.items():
-        enabled = False
+        if not experiment_settings.all_branches and is_exception_branch(branch):
+            log.info(
+                f"Branch {branch} is an exception branch. Not enabling experiment {experiment_name}."
+            )
+            continue
+
+        if eligible_experiments:
+            if experiment_name not in eligible_experiments:
+                exp_list = ", ".join(eligible_experiments)
+                log.info(
+                    f"Skipping experiment '{experiment_name}', as it is not in the eligible_experiments list: {exp_list}"
+                )
+                continue
+        elif not experiment_settings.default:
+            log.info(
+                f"Skipping experiment '{experiment_name}', as it is not a default experiment"
+            )
+            continue

        # Is any workflow_requestor opted in to this experiment?
        opted_in_users = [
@ -355,11 +400,13 @@ def get_runner_prefix(
            if is_user_opted_in(requestor, user_optins, experiment_name)
        ]

+        enabled = False
        if opted_in_users:
            log.info(
                f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}."
            )
            enabled = True
+
        elif experiment_settings.rollout_perc:
            # If no user is opted in, then we randomly enable the experiment based on the rollout percentage
            if random.uniform(0, 100) <= experiment_settings.rollout_perc:
@ -407,35 +454,35 @@ def get_rollout_state_from_issue(github_token: str, repo: str, issue_num: int) -
 def main() -> None:
    args = parse_args()

-    if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
-        log.info(
-            f"Exception branch: '{args.github_branch}', using Meta runners and no experiments."
+    runner_label_prefix = DEFAULT_LABEL_PREFIX
+
+    try:
+        rollout_state = get_rollout_state_from_issue(
+            args.github_token, args.github_issue_repo, args.github_issue
        )
-        runner_label_prefix = DEFAULT_LABEL_PREFIX
-    else:
-        try:
-            rollout_state = get_rollout_state_from_issue(
-                args.github_token, args.github_issue_repo, args.github_issue
-            )

-            username = get_potential_pr_author(
-                args.github_token,
-                args.github_repo,
-                args.github_actor,
-                args.github_ref_type,
-                args.github_branch,
-            )
+        username = get_potential_pr_author(
+            args.github_token,
+            args.github_repo,
+            args.github_actor,
+            args.github_ref_type,
+            args.github_branch,
+        )

-            is_canary = args.github_repo == "pytorch/pytorch-canary"
+        is_canary = args.github_repo == "pytorch/pytorch-canary"

-            runner_label_prefix = get_runner_prefix(
-                rollout_state, (args.github_issue_owner, username), is_canary
-            )
+        runner_label_prefix = get_runner_prefix(
+            rollout_state,
+            (args.github_issue_owner, username),
+            args.github_branch,
+            args.eligible_experiments,
+            is_canary,
+        )

-        except Exception as e:
-            log.error(
-                f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
-            )
+    except Exception as e:
+        log.error(
+            f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
+        )

    set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)

--- a/.github/scripts/test_runner_determinator.py
+++ b/.github/scripts/test_runner_determinator.py
@ -4,6 +4,10 @@ from unittest.mock import Mock, patch
 import runner_determinator as rd


+USER_BRANCH = "somebranch"
+EXCEPTION_BRANCH = "main"
+
+
 class TestRunnerDeterminatorIssueParser(TestCase):
    def test_parse_settings(self) -> None:
        settings_text = """
@ -12,6 +16,7 @@ class TestRunnerDeterminatorIssueParser(TestCase):
                rollout_perc: 25
            otherExp:
                rollout_perc: 0
+                default: false
        ---

        Users:
@ -28,7 +33,7 @@ class TestRunnerDeterminatorIssueParser(TestCase):
            "lf settings not parsed correctly",
        )
        self.assertTupleEqual(
-            rd.Experiment(rollout_perc=0),
+            rd.Experiment(rollout_perc=0, default=False),
            settings.experiments["otherExp"],
            "otherExp settings not parsed correctly",
        )
@ -42,7 +47,7 @@ class TestRunnerDeterminatorIssueParser(TestCase):
                rollout_perc: 25
            otherExp:
                rollout_perc: 0
-
+                default: false
        ```

        ---
@ -61,7 +66,41 @@ class TestRunnerDeterminatorIssueParser(TestCase):
            "lf settings not parsed correctly",
        )
        self.assertTupleEqual(
-            rd.Experiment(rollout_perc=0),
+            rd.Experiment(rollout_perc=0, default=False),
+            settings.experiments["otherExp"],
+            "otherExp settings not parsed correctly",
+        )
+
+    def test_parse_all_branches_setting(self) -> None:
+        settings_text = """
+        ```
+        experiments:
+            lf:
+                rollout_perc: 25
+                all_branches: true
+            otherExp:
+                all_branches: True
+                rollout_perc: 0
+        ```
+
+        ---
+
+        Users:
+        @User1,lf
+        @User2,lf,otherExp
+
+        """
+
+        settings = rd.parse_settings(settings_text)
+
+        self.assertTupleEqual(
+            rd.Experiment(rollout_perc=25, all_branches=True),
+            settings.experiments["lf"],
+            "lf settings not parsed correctly",
+        )
+        self.assertTrue(settings.experiments["otherExp"].all_branches)
+        self.assertTupleEqual(
+            rd.Experiment(rollout_perc=0, all_branches=True),
            settings.experiments["otherExp"],
            "otherExp settings not parsed correctly",
        )
@ -119,7 +158,7 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
        @User2,lf,otherExp

        """
-        prefix = rd.get_runner_prefix(settings_text, ["User1"])
+        prefix = rd.get_runner_prefix(settings_text, ["User1"], USER_BRANCH)
        self.assertEqual("lf.", prefix, "Runner prefix not correct for User1")

    def test_opted_in_user_two_experiments(self) -> None:
@ -136,9 +175,67 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
        @User2,lf,otherExp

        """
-        prefix = rd.get_runner_prefix(settings_text, ["User2"])
+        prefix = rd.get_runner_prefix(settings_text, ["User2"], USER_BRANCH)
        self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for User2")

+    def test_opted_in_user_two_experiments_default(self) -> None:
+        settings_text = """
+        experiments:
+            lf:
+                rollout_perc: 0
+            otherExp:
+                rollout_perc: 0
+                default: false
+        ---
+
+        Users:
+        @User1,lf
+        @User2,lf,otherExp
+
+        """
+        prefix = rd.get_runner_prefix(settings_text, ["User2"], USER_BRANCH)
+        self.assertEqual("lf.", prefix, "Runner prefix not correct for User2")
+
+    def test_opted_in_user_two_experiments_default_exp(self) -> None:
+        settings_text = """
+        experiments:
+            lf:
+                rollout_perc: 0
+            otherExp:
+                rollout_perc: 0
+                default: false
+        ---
+
+        Users:
+        @User1,lf
+        @User2,lf,otherExp
+
+        """
+        prefix = rd.get_runner_prefix(
+            settings_text, ["User2"], USER_BRANCH, frozenset(["lf", "otherExp"])
+        )
+        self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for User2")
+
+    def test_opted_in_user_two_experiments_default_exp_2(self) -> None:
+        settings_text = """
+        experiments:
+            lf:
+                rollout_perc: 0
+            otherExp:
+                rollout_perc: 0
+                default: false
+        ---
+
+        Users:
+        @User1,lf
+        @User2,lf,otherExp
+
+        """
+        prefix = rd.get_runner_prefix(
+            settings_text, ["User2"], USER_BRANCH, frozenset(["otherExp"])
+        )
+        self.assertEqual("otherExp.", prefix, "Runner prefix not correct for User2")
+
    @patch("random.uniform", return_value=50)
    def test_opted_out_user(self, mock_uniform: Mock) -> None:
        settings_text = """
@ -154,7 +251,7 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
        @User2,lf,otherExp

        """
-        prefix = rd.get_runner_prefix(settings_text, ["User3"])
+        prefix = rd.get_runner_prefix(settings_text, ["User3"], USER_BRANCH)
        self.assertEqual("", prefix, "Runner prefix not correct for user")

    @patch("random.uniform", return_value=10)
@ -174,9 +271,80 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
        """

        # User3 is opted out, but is pulled into both experiments by the 10% rollout
-        prefix = rd.get_runner_prefix(settings_text, ["User3"])
+        prefix = rd.get_runner_prefix(settings_text, ["User3"], USER_BRANCH)
        self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")

+    @patch("random.uniform", return_value=10)
+    def test_opted_out_user_was_pulled_in_by_rollout_excl_nondefault(
+        self, mock_uniform: Mock
+    ) -> None:
+        settings_text = """
+        experiments:
+            lf:
+                rollout_perc: 25
+            otherExp:
+                rollout_perc: 25
+                default: false
+        ---
+
+        Users:
+        @User1,lf
+        @User2,lf,otherExp
+
+        """
+
+        # User3 is opted out, but is pulled into default experiments by the 10% rollout
+        prefix = rd.get_runner_prefix(settings_text, ["User3"], USER_BRANCH)
+        self.assertEqual("lf.", prefix, "Runner prefix not correct for user")
+
+    @patch("random.uniform", return_value=10)
+    def test_opted_out_user_was_pulled_in_by_rollout_filter_exp(
+        self, mock_uniform: Mock
+    ) -> None:
+        settings_text = """
+        experiments:
+            lf:
+                rollout_perc: 25
+            otherExp:
+                rollout_perc: 25
+                default: false
+        ---
+
+        Users:
+        @User1,lf
+        @User2,lf,otherExp
+
+        """
+
+        # User3 is opted out, but is pulled into default experiments by the 10% rollout
+        prefix = rd.get_runner_prefix(
+            settings_text, ["User3"], USER_BRANCH, frozenset(["otherExp"])
+        )
+        self.assertEqual("otherExp.", prefix, "Runner prefix not correct for user")
+
+    @patch("random.uniform", return_value=25)
+    def test_opted_out_user_was_pulled_out_by_rollout_filter_exp(
+        self, mock_uniform: Mock
+    ) -> None:
+        settings_text = """
+        experiments:
+            lf:
+                rollout_perc: 10
+            otherExp:
+                rollout_perc: 50
+                default: false
+        ---
+
+        Users:
+        @User1,lf
+        @User2,lf,otherExp
+
+        """
+
+        # User3 is opted out, but is pulled into default experiments by the 10% rollout
+        prefix = rd.get_runner_prefix(settings_text, ["User3"], USER_BRANCH)
+        self.assertEqual("", prefix, "Runner prefix not correct for user")
+
    def test_lf_prefix_always_comes_first(self) -> None:
        settings_text = """
        experiments:
@ -192,7 +360,7 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):

        """

-        prefix = rd.get_runner_prefix(settings_text, ["User2"])
+        prefix = rd.get_runner_prefix(settings_text, ["User2"], USER_BRANCH)
        self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")

    def test_ignores_commented_users(self) -> None:
@ -210,7 +378,7 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):

        """

-        prefix = rd.get_runner_prefix(settings_text, ["User1"])
+        prefix = rd.get_runner_prefix(settings_text, ["User1"], USER_BRANCH)
        self.assertEqual("", prefix, "Runner prefix not correct for user")

    def test_ignores_extra_experiments(self) -> None:
@ -229,9 +397,44 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):

        """

-        prefix = rd.get_runner_prefix(settings_text, ["User1"])
+        prefix = rd.get_runner_prefix(settings_text, ["User1"], USER_BRANCH)
        self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")

+    def test_disables_experiment_on_exception_branches_when_not_explicitly_opted_in(
+        self,
+    ) -> None:
+        settings_text = """
+        experiments:
+            lf:
+                rollout_perc: 100
+        ---
+
+        Users:
+        @User,lf,otherExp
+
+        """
+
+        prefix = rd.get_runner_prefix(settings_text, ["User1"], EXCEPTION_BRANCH)
+        self.assertEqual("", prefix, "Runner prefix not correct for user")
+
+    def test_allows_experiment_on_exception_branches_when_explicitly_opted_in(
+        self,
+    ) -> None:
+        settings_text = """
+        experiments:
+            lf:
+                rollout_perc: 100
+                all_branches: true
+        ---
+
+        Users:
+        @User,lf,otherExp
+
+        """
+
+        prefix = rd.get_runner_prefix(settings_text, ["User1"], EXCEPTION_BRANCH)
+        self.assertEqual("lf.", prefix, "Runner prefix not correct for user")
+

 if __name__ == "__main__":
    main()
--- a/.github/scripts/test_trymerge.py
+++ b/.github/scripts/test_trymerge.py
@ -12,7 +12,7 @@ import json
 import os
 import warnings
 from hashlib import sha256
-from typing import Any, Dict, List, Optional
+from typing import Any, List, Optional
 from unittest import main, mock, skip, TestCase
 from urllib.error import HTTPError

@ -24,7 +24,6 @@ from trymerge import (
    find_matching_merge_rule,
    get_classifications,
    get_drci_classifications,
-    get_rockset_results,
    gh_get_team_members,
    GitHubPR,
    JobCheckState,
@ -42,7 +41,6 @@ if "GIT_REMOTE_URL" not in os.environ:
    os.environ["GIT_REMOTE_URL"] = "https://github.com/pytorch/pytorch"

 GQL_MOCKS = "gql_mocks.json.gz"
-ROCKSET_MOCKS = "rockset_mocks.json.gz"
 DRCI_MOCKS = "drci_mocks.json.gz"


@ -77,16 +75,11 @@ def mock_query(
        if err.code == 401 or err.code == 403:
            err_msg = f"If you are seeing this message during workflow run, please make sure to update {file_name}"
            err_msg += f" locally, by deleting it and running {os.path.basename(__file__)} with"
-            err_msg += " GitHub Personal Access Token passed via GITHUB_TOKEN,"
-            err_msg += " the rockset api key passed via ROCKSET_API_KEY,"
+            err_msg += " GitHub Personal Access Token passed via GITHUB_TOKEN"
            err_msg += " and drci api key passed via DRCI_BOT_KEY environment variables"
-            if (
-                os.getenv("GITHUB_TOKEN") is None
-                or os.getenv("ROCKSET_API_KEY") is None
-                or os.getenv("DRCI_BOT_KEY") is None
-            ):
+            if os.getenv("GITHUB_TOKEN") is None or os.getenv("DRCI_BOT_KEY") is None:
                err_msg = (
-                    "Failed to update cached queries as GITHUB_TOKEN or ROCKSET_API_KEY or DRCI_BOT_KEY "
+                    "Failed to update cached queries as GITHUB_TOKEN or DRCI_BOT_KEY "
                    + "is not defined. "
                    + err_msg
                )
@ -110,16 +103,6 @@ def mocked_gh_graphql(query: str, **kwargs: Any) -> Any:
    return mock_query(gh_graphql_wrapper, GQL_MOCKS, key_function, query, kwargs)


-def mocked_rockset_results(head_sha: str, merge_base: str, num_retries: int = 3) -> Any:
-    return mock_query(
-        get_rockset_results,
-        ROCKSET_MOCKS,
-        lambda x, y: f"{x} {y}",
-        head_sha,
-        merge_base,
-    )
-
-
 def mocked_drci_classifications(pr_num: int, project: str, num_retries: int = 3) -> Any:
    return mock_query(
        get_drci_classifications,
@ -273,10 +256,6 @@ def xla_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule]:
    ]


-def empty_rockset_results(head_sha: str, merge_base: str) -> List[Dict[str, Any]]:
-    return []
-
-
 class DummyGitRepo(GitRepo):
    def __init__(self) -> None:
        super().__init__(get_git_repo_dir(), get_git_remote_name())
@ -288,7 +267,6 @@ class DummyGitRepo(GitRepo):
        return "super awsome commit message"


-@mock.patch("trymerge.get_rockset_results", side_effect=empty_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch(
    "trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
@ -604,7 +582,6 @@ class TestTryMerge(TestCase):
            mocked_gh_fetch_merge_base.assert_called_once()


-@mock.patch("trymerge.get_rockset_results", side_effect=mocked_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(
@ -843,7 +820,7 @@ class TestBypassFailures(TestCase):
        checks = pr.get_checkrun_conclusions()

        # Known flaky failure takes precedence over ignore current (need to set the
-        # merge base here to get the results from Rockset, and that categorize the
+        # merge base here to get the results from Dr. CI, and that categorize the
        # broken trunk failure too
        checks = get_classifications(
            pr.pr_num,
@ -929,7 +906,6 @@ class TestBypassFailures(TestCase):
        )


-@mock.patch("trymerge.get_rockset_results", side_effect=mocked_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch("trymerge.get_drci_classifications", return_value={})
@ -1008,7 +984,6 @@ class TestBypassFailuresOnSandCastle(TestCase):
        self.assertTrue(len(failed) == 2)


-@mock.patch("trymerge.get_rockset_results", side_effect=mocked_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -452,8 +452,6 @@ RE_DIFF_REV = re.compile(r"^Differential Revision:.+?(D[0-9]+)", re.MULTILINE)
 CIFLOW_LABEL = re.compile(r"^ciflow/.+")
 CIFLOW_TRUNK_LABEL = re.compile(r"^ciflow/trunk")
 MERGE_RULE_PATH = Path(".github") / "merge_rules.yaml"
-ROCKSET_MERGES_COLLECTION = "merges"
-ROCKSET_MERGES_WORKSPACE = "commons"
 REMOTE_MAIN_BRANCH = "origin/main"
 DRCI_CHECKRUN_NAME = "Dr.CI"
 INTERNAL_CHANGES_CHECKRUN_NAME = "Meta Internal-Only Changes Check"
@ -1180,7 +1178,7 @@ class GitHubPR:
        merge_commit_sha = repo.rev_parse(name=self.default_branch())

        if comment_id and self.pr_num:
-            # Finally, upload the record to Rockset. The list of pending and failed
+            # Finally, upload the record to s3. The list of pending and failed
            # checks are at the time of the merge
            save_merge_record(
                comment_id=comment_id,
@ -1202,7 +1200,7 @@ class GitHubPR:
                ignore_current=bool(ignore_current_checks),
            )
        else:
-            print("Missing comment ID or PR number, couldn't upload to Rockset")
+            print("Missing comment ID or PR number, couldn't upload to s3")

        # Usually Github will see that the commit has "resolves <pr_num>" in the
        # commit message and close the PR, but sometimes it doesn't, leading to
@ -1481,7 +1479,7 @@ def find_matching_merge_rule(

        # Categorize all checks when skip_mandatory_checks (force merge) is set. Do it here
        # where the list of checks is readily available. These records will be saved into
-        # Rockset merge records
+        # s3 merge records
        (
            pending_mandatory_checks,
            failed_mandatory_checks,
@ -1508,7 +1506,7 @@ def checks_to_str(checks: List[Tuple[str, Optional[str]]]) -> str:


 def checks_to_markdown_bullets(
-    checks: List[Tuple[str, Optional[str], Optional[int]]]
+    checks: List[Tuple[str, Optional[str], Optional[int]]],
 ) -> List[str]:
    return [
        f"- [{c[0]}]({c[1]})" if c[1] is not None else f"- {c[0]}" for c in checks[:5]
@ -1568,7 +1566,7 @@ def save_merge_record(
    This saves the merge records as a json, which can later be uploaded to s3
    """

-    # Prepare the record to be written into Rockset
+    # Prepare the record to be written into s3
    data = [
        {
            "comment_id": comment_id,
@ -1590,7 +1588,8 @@ def save_merge_record(
            "ignore_current": ignore_current,
            "error": error,
            # This is a unique identifier for the record for deduping purposes
-            # in rockset.  Any unique string would work
+            # in Rockset.  Any unique string would work.  This will not be used
+            # after we migrate off Rockset
            "_id": f"{project}-{pr_num}-{comment_id}-{os.environ.get('GITHUB_RUN_ID')}",
        }
    ]
@ -1600,36 +1599,6 @@ def save_merge_record(
        json.dump(data, f)


-@retries_decorator(rc=[])
-def get_rockset_results(head_sha: str, merge_base: str) -> List[Dict[str, Any]]:
-    query = f"""
-SELECT
-    w.name as workflow_name,
-    j.id,
-    j.name,
-    j.conclusion,
-    j.completed_at,
-    j.html_url,
-    j.head_sha,
-    j.torchci_classification.captures as failure_captures,
-    LENGTH(j.steps) as steps,
-FROM
-    commons.workflow_job j join commons.workflow_run w on w.id = j.run_id
-where
-    j.head_sha in ('{head_sha}','{merge_base}')
-"""
-    try:
-        import rockset  # type: ignore[import]
-
-        res = rockset.RocksetClient(
-            host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
-        ).sql(query)
-        return cast(List[Dict[str, Any]], res.results)
-    except ModuleNotFoundError:
-        print("Could not use RockSet as rocket dependency is missing")
-        return []
-
-
@retries_decorator()
 def get_drci_classifications(pr_num: int, project: str = "pytorch") -> Any:
    """
@ -2067,7 +2036,7 @@ def categorize_checks(
    pending_checks: List[Tuple[str, Optional[str], Optional[int]]] = []
    failed_checks: List[Tuple[str, Optional[str], Optional[int]]] = []

-    # failed_checks_categorization is used to keep track of all ignorable failures when saving the merge record on Rockset
+    # failed_checks_categorization is used to keep track of all ignorable failures when saving the merge record on s3
    failed_checks_categorization: Dict[str, List[Any]] = defaultdict(list)

    # If required_checks is not set or empty, consider all names are relevant
@ -2126,7 +2095,7 @@ def categorize_checks(
    ):
        failed_checks = failed_checks + flaky_or_broken_trunk

-    # The list of failed_checks_categorization is returned so that it can be saved into the Rockset merge record
+    # The list of failed_checks_categorization is returned so that it can be saved into the s3 merge record
    return (pending_checks, failed_checks, failed_checks_categorization)


@ -2410,7 +2379,7 @@ def main() -> None:
        handle_exception(e)

        if args.comment_id and args.pr_num:
-            # Finally, upload the record to Rockset, we don't have access to the
+            # Finally, upload the record to s3, we don't have access to the
            # list of pending and failed checks here, but they are not really
            # needed at the moment
            save_merge_record(
@ -2433,7 +2402,7 @@ def main() -> None:
                error=str(e),
            )
        else:
-            print("Missing comment ID or PR number, couldn't upload to Rockset")
+            print("Missing comment ID or PR number, couldn't upload to s3")
    finally:
        if not args.check_mergeability:
            gh_remove_label(
--- a/.github/scripts/update_runner_determinator.py
+++ b/.github/scripts/update_runner_determinator.py
@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+
+import re
+
+
+# Read the contents of runner_determinator.py
+with open(".github/scripts/runner_determinator.py") as script_file:
+    script_content = script_file.read()
+
+# Indent the script content by 10 spaces to match destination indentation
+indented_script_content = "\n".join(
+    [" " * 10 + line if line else line for line in script_content.splitlines()]
+)
+
+# Read the contents of _runner-determinator.yml
+with open(".github/workflows/_runner-determinator.yml") as yml_file:
+    yml_content = yml_file.read()
+
+# Replace the content between the markers
+new_yml_content = re.sub(
+    r"(cat <<EOF > runner_determinator.py\n)(.*?)(\n\s+EOF)",
+    lambda match: match.group(1) + indented_script_content + match.group(3),
+    yml_content,
+    flags=re.DOTALL,
+)
+
+# Save the modified content back to _runner-determinator.yml
+with open(".github/workflows/_runner-determinator.yml", "w") as yml_file:
+    yml_file.write(new_yml_content)
+
+print("Updated _runner-determinator.yml with the contents of runner_determinator.py")
--- a/.github/templates/common.yml.j2
+++ b/.github/templates/common.yml.j2
@ -25,7 +25,7 @@ concurrency:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -40,6 +40,16 @@ concurrency:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@ -54,7 +54,7 @@ env:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -68,6 +68,7 @@ jobs:
    needs: get-label-type
    with:!{{ upload.binary_env_as_input(config) }}
      {%- if "aarch64" in build_environment %}
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      {%- elif "s390x" in build_environment %}
@ -102,6 +103,7 @@ jobs:
      build_name: !{{ config["build_name"] }}
      build_environment: !{{ build_environment }}
      {%- if "aarch64" in build_environment %}
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.2xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
      {%- elif "s390x" in build_environment %}
--- a/.github/templates/windows_binary_build_workflow.yml.j2
+++ b/.github/templates/windows_binary_build_workflow.yml.j2
@ -55,7 +55,7 @@ env:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
--- a/.github/workflows/_bazel-build-test.yml
+++ b/.github/workflows/_bazel-build-test.yml
@ -91,14 +91,14 @@ jobs:
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

-      - name: Check if in a ARC runner
+      - name: Check if in a container runner
        shell: bash
-        id: check_arc_runner
-        run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"
+        id: check_container_runner
+        run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
-        if: ${{ inputs.cuda-version != 'cpu' && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
+        if: ${{ inputs.cuda-version != 'cpu' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

      - name: Output disk space left
        run: |
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@ -271,7 +271,9 @@ jobs:
          )
          docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
          if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
-            docker exec -t "${container_name}" bash -c "bash /builder/aarch64_linux/aarch64_ci_build.sh"
+            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/aarch64_linux/aarch64_ci_build.sh"
+          elif [[ ${{ inputs.PACKAGE_TYPE }} == "manywheel" || ${{ inputs.PACKAGE_TYPE }} == "libtorch" ]]; then
+            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
          else
            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/${{ inputs.PACKAGE_TYPE }}/build.sh"
          fi
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@ -114,22 +114,32 @@ jobs:
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

-      - name: Check if in a ARC runner
+      - name: Check if in a container runner
        shell: bash
-        id: check_arc_runner
-        run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"
+        id: check_container_runner
+        run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        id: install-nvidia-driver
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
-        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
+        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
+
+      - name: Setup GPU_FLAG for docker run
+        id: setup-gpu-flag
+        run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
+        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
+
+      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
+        id: setup-sscache-port-flag
+        run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
+        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}

      - name: Lock NVIDIA A100 40GB Frequency
        run: |
          sudo nvidia-smi -pm 1
          sudo nvidia-smi -ac 1215,1410
          nvidia-smi
-        if: contains(matrix.runner, 'a100')
+        if: ${{ contains(matrix.runner, 'a100') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

      - name: Start monitoring script
        id: monitor-script
@ -208,6 +218,7 @@ jobs:
          NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
          TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+          SCCACHE_REGION: us-east-1
          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
          SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
          DOCKER_IMAGE: ${{ inputs.docker-image }}
@ -218,7 +229,8 @@ jobs:
          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
-
+          IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
+          ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
        run: |
          set -x

@ -236,6 +248,7 @@ jobs:
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
+            ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
            -e BUILD_ENVIRONMENT \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
@ -265,6 +278,7 @@ jobs:
            -e PR_LABELS \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
+            -e SCCACHE_REGION \
            -e SCCACHE_S3_KEY_PREFIX \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
@ -274,6 +288,8 @@ jobs:
            -e HUGGING_FACE_HUB_TOKEN \
            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
            -e DASHBOARD_TAG \
+            -e IS_A100_RUNNER \
+            -e ARTIFACTS_FILE_SUFFIX \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
@ -343,7 +359,7 @@ jobs:

      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
-        if: always()
+        if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'

      # NB: We are currently having an intermittent GPU-related issue on G5 runners with
      # A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does
--- a/.github/workflows/_runner-determinator.yml
+++ b/.github/workflows/_runner-determinator.yml
@ -3,6 +3,11 @@ name: Check whether the workflow owner can use ARC runners
 on:
  workflow_call:
    inputs:
+      check_experiments:
+        required: false
+        type: string
+        description: |
+          List of experiments for this workfow. If not defined, all default experiments are included.
      triggering_actor:
        required: true
        type: string
@ -35,6 +40,8 @@ on:

 jobs:
  runner-determinator:
+    # Don't run on forked repos
+    if: github.repository_owner == 'pytorch'
    runs-on: ubuntu-latest
    outputs:
      label-type: ${{ steps.set-condition.outputs.label-type }}
@ -43,6 +50,7 @@ jobs:
      ISSUE_NUMBER: ${{ inputs.issue_number }}
      TRIGGERING_ACTOR: ${{ inputs.triggering_actor }}
      ISSUE_OWNER: ${{ inputs.issue_owner }}
+      CHECK_EXPERIMENTS: ${{ inputs.check_experiments }}
    steps:
      # - name: Checkout PyTorch
      #   uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
@ -59,6 +67,10 @@ jobs:
          cat <<EOF > runner_determinator.py
          # flake8: noqa: G004

+          # Note: Copies of this script in runner_determinator.py and _runner-determinator.yml
+          #       must be kept in sync. You can do it easily by running the following command:
+          #           python .github/scripts/update_runner_determinator.py
+
          """
          This runner determinator is used to determine which set of runners to run a
          GitHub job on. It uses the first comment of a GitHub issue (by default
@ -94,7 +106,8 @@ jobs:
              experiments:
                lf:
                  rollout_percent: 25
-
+                  all_branches: false
+                  default: true
              ---

              # Opt-ins:
@ -112,7 +125,7 @@ jobs:
          import random
          from argparse import ArgumentParser
          from logging import LogRecord
-          from typing import Any, Dict, Iterable, List, NamedTuple, Tuple
+          from typing import Any, Dict, FrozenSet, Iterable, List, NamedTuple, Tuple

          import yaml
          from github import Auth, Github
@ -138,6 +151,12 @@ jobs:
              rollout_perc: float = (
                  0  # Percentage of workflows to experiment on when user is not opted-in.
              )
+              all_branches: bool = (
+                  False  # If True, the experiment is also enabled on the exception branches
+              )
+              default: bool = (
+                  True  # If True, the experiment is enabled by default for all queries
+              )

              # Add more fields as needed

@ -192,6 +211,12 @@ jobs:
                  f.write(f"{key}={value}\n")


+          def _str_comma_separated_to_set(value: str) -> FrozenSet[str]:
+              return frozenset(
+                  filter(lambda itm: itm != "", map(str.strip, value.strip(" \n\t").split(",")))
+              )
+
+
          def parse_args() -> Any:
              parser = ArgumentParser("Get dynamic rollout settings")
              parser.add_argument("--github-token", type=str, required=True, help="GitHub token")
@ -226,6 +251,13 @@ jobs:
                  required=True,
                  help="Current GitHub ref type, branch or tag",
              )
+              parser.add_argument(
+                  "--eligible-experiments",
+                  type=_str_comma_separated_to_set,
+                  required=False,
+                  default="",
+                  help="comma separated list of experiments to check, if omitted all experiments marked with default=True are checked",
+              )

              return parser.parse_args()

@ -271,7 +303,7 @@ jobs:

          def is_exception_branch(branch: str) -> bool:
              """
-              Branches that get opted out of all experiments and should always use Meta runners
+              Branches that get opted out of experiments by default, until they're explicitly enabled.
              """
              return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}

@ -397,7 +429,11 @@ jobs:


          def get_runner_prefix(
-              rollout_state: str, workflow_requestors: Iterable[str], is_canary: bool = False
+              rollout_state: str,
+              workflow_requestors: Iterable[str],
+              branch: str,
+              eligible_experiments: FrozenSet[str] = frozenset(),
+              is_canary: bool = False,
          ) -> str:
              settings = parse_settings(rollout_state)
              user_optins = parse_users(rollout_state)
@ -405,7 +441,24 @@ jobs:
              fleet_prefix = ""
              prefixes = []
              for experiment_name, experiment_settings in settings.experiments.items():
-                  enabled = False
+                  if not experiment_settings.all_branches and is_exception_branch(branch):
+                      log.info(
+                          f"Branch {branch} is an exception branch. Not enabling experiment {experiment_name}."
+                      )
+                      continue
+
+                  if eligible_experiments:
+                      if experiment_name not in eligible_experiments:
+                          exp_list = ", ".join(eligible_experiments)
+                          log.info(
+                              f"Skipping experiment '{experiment_name}', as it is not in the eligible_experiments list: {exp_list}"
+                          )
+                          continue
+                  elif not experiment_settings.default:
+                      log.info(
+                          f"Skipping experiment '{experiment_name}', as it is not a default experiment"
+                      )
+                      continue

                  # Is any workflow_requestor opted in to this experiment?
                  opted_in_users = [
@ -414,11 +467,13 @@ jobs:
                      if is_user_opted_in(requestor, user_optins, experiment_name)
                  ]

+                  enabled = False
                  if opted_in_users:
                      log.info(
                          f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}."
                      )
                      enabled = True
+
                  elif experiment_settings.rollout_perc:
                      # If no user is opted in, then we randomly enable the experiment based on the rollout percentage
                      if random.uniform(0, 100) <= experiment_settings.rollout_perc:
@ -466,35 +521,35 @@ jobs:
          def main() -> None:
              args = parse_args()

-              if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
-                  log.info(
-                      f"Exception branch: '{args.github_branch}', using Meta runners and no experiments."
+              runner_label_prefix = DEFAULT_LABEL_PREFIX
+
+              try:
+                  rollout_state = get_rollout_state_from_issue(
+                      args.github_token, args.github_issue_repo, args.github_issue
                  )
-                  runner_label_prefix = DEFAULT_LABEL_PREFIX
-              else:
-                  try:
-                      rollout_state = get_rollout_state_from_issue(
-                          args.github_token, args.github_issue_repo, args.github_issue
-                      )

-                      username = get_potential_pr_author(
-                          args.github_token,
-                          args.github_repo,
-                          args.github_actor,
-                          args.github_ref_type,
-                          args.github_branch,
-                      )
+                  username = get_potential_pr_author(
+                      args.github_token,
+                      args.github_repo,
+                      args.github_actor,
+                      args.github_ref_type,
+                      args.github_branch,
+                  )

-                      is_canary = args.github_repo == "pytorch/pytorch-canary"
+                  is_canary = args.github_repo == "pytorch/pytorch-canary"

-                      runner_label_prefix = get_runner_prefix(
-                          rollout_state, (args.github_issue_owner, username), is_canary
-                      )
+                  runner_label_prefix = get_runner_prefix(
+                      rollout_state,
+                      (args.github_issue_owner, username),
+                      args.github_branch,
+                      args.eligible_experiments,
+                      is_canary,
+                  )

-                  except Exception as e:
-                      log.error(
-                          f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
-                      )
+              except Exception as e:
+                  log.error(
+                      f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
+                  )

              set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)

@ -523,4 +578,5 @@ jobs:
            --github-actor "$TRIGGERING_ACTOR" \
            --github-issue-owner "$ISSUE_OWNER" \
            --github-ref-type "$curr_ref_type" \
-            --github-repo "$GITHUB_REPOSITORY"
+            --github-repo "$GITHUB_REPOSITORY" \
+            --eligible-experiments "$CHECK_EXPERIMENTS" \
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@ -68,9 +68,10 @@ jobs:
        shell: bash
    steps:
      # Duplicated in win-test because this MUST go before a checkout
-      - name: Enable git symlinks on Windows and disable fsmonitor daemon
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
        shell: bash
        run: |
+          git config --global core.longpaths true
          git config --global core.symlinks true

          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@ -46,9 +46,10 @@ jobs:
        shell: bash
    steps:
      # Duplicated in win-build because this MUST go before a checkout
-      - name: Enable git symlinks on Windows and disable fsmonitor daemon
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
        shell: bash
        run: |
+          git config --global core.longpaths true
          git config --global core.symlinks true

          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
@ -189,7 +190,7 @@ jobs:
        run: |
          pushd "${PYTORCH_FINAL_PACKAGE_DIR}"
          # shellcheck disable=SC2046,SC2102
-          python3 -mpip install $(echo *.whl)[opt-einsum,optree] optree==0.12.1
+          python3 -mpip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0
          popd

          .ci/pytorch/win-test.sh
--- a/.github/workflows/build-conda-images.yml
+++ b/.github/workflows/build-conda-images.yml
@ -35,7 +35,7 @@ jobs:
    runs-on: linux.9xlarge.ephemeral
    strategy:
      matrix:
-        cuda_version: ["11.8", "12.1", "12.4", "cpu"]
+        cuda_version: ["11.8", "12.1", "12.4", "12.6", "cpu"]
    env:
      CUDA_VERSION: ${{ matrix.cuda_version }}
    steps:
@ -62,5 +62,11 @@ jobs:
          fi
      - name: Build Docker Image
        if: env.WITH_PUSH == 'true'
-        run: |
-          .ci/docker/conda/build.sh conda-builder${{ matrix.cuda_version == 'cpu' && ':' || ':cuda' }}${{matrix.cuda_version}}
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/conda/build.sh conda-builder${{ matrix.cuda_version == 'cpu' && ':' || ':cuda' }}${{matrix.cuda_version}}
--- a/.github/workflows/build-libtorch-images.yml
+++ b/.github/workflows/build-libtorch-images.yml
@ -31,7 +31,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -72,8 +72,14 @@ jobs:
          fi
      - name: Build Docker Image
        if: env.WITH_PUSH == 'true'
-        run: |
-          .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cuda${{matrix.cuda_version}}
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cuda${{matrix.cuda_version}}
  build-docker-rocm:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
@ -108,8 +114,14 @@ jobs:
          fi
      - name: Build Docker Image
        if: env.WITH_PUSH == 'true'
-        run: |
-          .ci/docker/libtorch/build.sh libtorch-cxx11-builder:rocm${{matrix.rocm_version}}
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/libtorch/build.sh libtorch-cxx11-builder:rocm${{matrix.rocm_version}}
  build-docker-cpu:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
@ -138,5 +150,11 @@ jobs:
          fi
      - name: Build Docker Image
        if: env.WITH_PUSH == 'true'
-        run: |
-          .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cpu
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cpu
--- a/.github/workflows/build-manywheel-images.yml
+++ b/.github/workflows/build-manywheel-images.yml
@ -35,7 +35,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -78,8 +78,14 @@ jobs:
          fi
      - name: Build Docker Image
        if: env.WITH_PUSH == 'true'
-        run: |
-          .ci/docker/manywheel/build.sh manylinux-builder:cuda${{matrix.cuda_version}}
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinux-builder:cuda${{matrix.cuda_version}}
  # NOTE: manylinux_2_28 are still experimental, see https://github.com/pytorch/pytorch/issues/123649
  build-docker-cuda-manylinux_2_28:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
@ -117,8 +123,14 @@ jobs:
          fi
      - name: Build Docker Image
        if: env.WITH_PUSH == 'true'
-        run: |
-          .ci/docker/manywheel/build.sh manylinux2_28-builder:cuda${{matrix.cuda_version}}
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinux2_28-builder:cuda${{matrix.cuda_version}}
  build-docker-cuda-aarch64:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
@ -151,8 +163,14 @@ jobs:
          fi
      - name: Build Docker Image
        if: env.WITH_PUSH == 'true'
-        run: |
-          .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}}
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}}
  build-docker-rocm:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
@ -187,8 +205,14 @@ jobs:
          fi
      - name: Build Docker Image
        if: env.WITH_PUSH == 'true'
-        run: |
-          .ci/docker/manywheel/build.sh manylinux-builder:rocm${{matrix.rocm_version}}
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinux-builder:rocm${{matrix.rocm_version}}
  build-docker-cpu:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
@ -217,8 +241,14 @@ jobs:
          fi
      - name: Build Docker Image
        if: env.WITH_PUSH == 'true'
-        run: |
-          .ci/docker/manywheel/build.sh manylinux-builder:cpu
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinux-builder:cpu
  build-docker-cpu-manylinux_2_28:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
@ -249,8 +279,14 @@ jobs:
          fi
      - name: Build Docker Image
        if: env.WITH_PUSH == 'true'
-        run: |
-          .ci/docker/manywheel/build.sh manylinux2_28-builder:cpu
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinux2_28-builder:cpu
  build-docker-cpu-aarch64:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
@ -281,8 +317,14 @@ jobs:
          fi
      - name: Build Docker Image
        if: env.WITH_PUSH == 'true'
-        run: |
-          .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cpu-aarch64
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cpu-aarch64
  build-docker-cpu-aarch64-2_28:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
@ -316,8 +358,14 @@ jobs:
        env:
          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          .ci/docker/manywheel/build.sh manylinux2_28_aarch64-builder:cpu-aarch64
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinux2_28_aarch64-builder:cpu-aarch64
  build-docker-cpu-cxx11-abi:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
@ -348,8 +396,14 @@ jobs:
          fi
      - name: Build Docker Image
        if: env.WITH_PUSH == 'true'
-        run: |
-          .ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi
  build-docker-xpu:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
@ -380,5 +434,11 @@ jobs:
          fi
      - name: Build Docker Image
        if: env.WITH_PUSH == 'true'
-        run: |
-          .ci/docker/manywheel/build.sh manylinux2_28-builder:xpu
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinux2_28-builder:xpu
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@ -29,7 +29,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -43,7 +43,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        py_vers: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
+        py_vers: [ "3.9", "3.10", "3.11", "3.12" ]
        device: ["cuda", "rocm", "xpu"]
        include:
          - device: "rocm"
@ -91,9 +91,6 @@ jobs:

          # Determine python executable for given version
          case $PY_VERS in
-          3.8)
-            PYTHON_EXECUTABLE=/opt/python/cp38-cp38/bin/python
-            ;;
          3.9)
            PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python
            ;;
@ -214,7 +211,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        py_vers: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
+        py_vers: [ "3.9", "3.10", "3.11", "3.12" ]
    timeout-minutes: 40
    env:
      DOCKER_IMAGE: pytorch/conda-builder:cpu
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@ -18,7 +18,7 @@ on:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -32,7 +32,7 @@ permissions: read-all
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -67,6 +67,7 @@ jobs:
          pytorch-linux-jammy-py3.12-halide,
          pytorch-linux-jammy-xpu-2024.0-py3,
          pytorch-linux-jammy-py3-clang15-asan,
+          pytorch-linux-jammy-py3-clang18-asan,
          pytorch-linux-focal-py3-clang10-onnx,
          pytorch-linux-focal-linter,
          pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter,
@ -78,7 +79,9 @@ jobs:
          - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
            runner: linux.arm64.m7g.4xlarge
            timeout-minutes: 600
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}"
+    # Docker uploads fail from LF runners, see https://github.com/pytorch/pytorch/pull/137358
+    # runs-on: "${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}"
+    runs-on: "${{ matrix.runner }}"
    env:
      DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }}
    steps:
@ -120,7 +123,7 @@ jobs:
          IMAGE_NAME: ${{ matrix.docker-image-name }}
        with:
          shell: bash
-          timeout_minutes: 15
+          timeout_minutes: 30
          max_attempts: 5
          retry_wait_seconds: 90
          command: |
--- a/.github/workflows/docker-release.yml
+++ b/.github/workflows/docker-release.yml
@ -36,7 +36,7 @@ permissions: read-all
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -39,7 +39,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -60,11 +60,12 @@ jobs:
      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
      use_split_build: False
      DESIRED_PYTHON: "3.9"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_9-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cpu-aarch64-test:  # Testing
@ -86,6 +87,7 @@ jobs:
      DESIRED_PYTHON: "3.9"
      build_name: manywheel-py3_9-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.2xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
    secrets:
@ -130,6 +132,7 @@ jobs:
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.9"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_9-cuda-aarch64
@ -177,11 +180,12 @@ jobs:
      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
      use_split_build: False
      DESIRED_PYTHON: "3.10"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cpu-aarch64-test:  # Testing
@ -203,6 +207,7 @@ jobs:
      DESIRED_PYTHON: "3.10"
      build_name: manywheel-py3_10-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.2xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
    secrets:
@ -247,6 +252,7 @@ jobs:
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.10"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64
@ -294,11 +300,12 @@ jobs:
      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
      use_split_build: False
      DESIRED_PYTHON: "3.11"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cpu-aarch64-test:  # Testing
@ -320,6 +327,7 @@ jobs:
      DESIRED_PYTHON: "3.11"
      build_name: manywheel-py3_11-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.2xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
    secrets:
@ -364,6 +372,7 @@ jobs:
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.11"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64
@ -411,11 +420,12 @@ jobs:
      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
      use_split_build: False
      DESIRED_PYTHON: "3.12"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cpu-aarch64-test:  # Testing
@ -437,6 +447,7 @@ jobs:
      DESIRED_PYTHON: "3.12"
      build_name: manywheel-py3_12-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.2xlarge
      ALPINE_IMAGE: "arm64v8/alpine"
    secrets:
@ -481,6 +492,7 @@ jobs:
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.12"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.arm64.m7g.4xlarge.ephemeral
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64
--- a/.github/workflows/generated-linux-binary-conda-nightly.yml
+++ b/.github/workflows/generated-linux-binary-conda-nightly.yml
@ -39,7 +39,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
--- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-main.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-main.yml
@ -34,7 +34,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
--- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml
@ -39,7 +39,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
--- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-main.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-main.yml
@ -34,7 +34,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
--- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml
@ -39,7 +39,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
--- a/.github/workflows/generated-linux-binary-manywheel-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-main.yml
@ -34,7 +34,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -153,7 +153,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_4
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_4-test:  # Testing
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@ -39,7 +39,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -343,7 +343,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_4
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_4-test:  # Testing
@ -1029,7 +1029,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_4
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_4-test:  # Testing
@ -1785,7 +1785,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_4
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_4-test:  # Testing
@ -2471,7 +2471,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_4
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_4-test:  # Testing
@ -3157,7 +3157,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_4
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_4-test:  # Testing
@ -3324,3 +3324,353 @@ jobs:
      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
    uses: ./.github/workflows/_binary-upload.yml
+
+  manywheel-py3_13t-cpu-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_13t-cpu
+      build_environment: linux-binary-manywheel
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13t-cpu-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_13t-cpu-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      build_name: manywheel-py3_13t-cpu
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.4xlarge
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13t-cpu-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_13t-cpu-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      build_name: manywheel-py3_13t-cpu
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
+
+  manywheel-py3_13t-cpu-cxx11-abi-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-main
+      DESIRED_DEVTOOLSET: cxx11-abi
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_13t-cpu-cxx11-abi
+      build_environment: linux-binary-manywheel
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13t-cpu-cxx11-abi-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_13t-cpu-cxx11-abi-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-main
+      DESIRED_DEVTOOLSET: cxx11-abi
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      build_name: manywheel-py3_13t-cpu-cxx11-abi
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.4xlarge
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13t-cpu-cxx11-abi-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_13t-cpu-cxx11-abi-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu-cxx11-abi
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+      DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-main
+      DESIRED_DEVTOOLSET: cxx11-abi
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      build_name: manywheel-py3_13t-cpu-cxx11-abi
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
+
+  manywheel-py3_13t-cuda11_8-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_13t-cuda11_8
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13t-cuda11_8-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_13t-cuda11_8-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      build_name: manywheel-py3_13t-cuda11_8
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.4xlarge.nvidia.gpu
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13t-cuda11_8-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_13t-cuda11_8-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      build_name: manywheel-py3_13t-cuda11_8
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
+
+  manywheel-py3_13t-cuda12_1-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu121
+      GPU_ARCH_VERSION: 12.1
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_13t-cuda12_1
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13t-cuda12_1-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_13t-cuda12_1-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu121
+      GPU_ARCH_VERSION: 12.1
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      build_name: manywheel-py3_13t-cuda12_1
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.4xlarge.nvidia.gpu
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13t-cuda12_1-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_13t-cuda12_1-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu121
+      GPU_ARCH_VERSION: 12.1
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      build_name: manywheel-py3_13t-cuda12_1
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
+
+  manywheel-py3_13t-cuda12_4-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_VERSION: 12.4
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_13t-cuda12_4
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13t-cuda12_4-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_13t-cuda12_4-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_VERSION: 12.4
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      build_name: manywheel-py3_13t-cuda12_4
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.4xlarge.nvidia.gpu
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13t-cuda12_4-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_13t-cuda12_4-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_VERSION: 12.4
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
+      use_split_build: False
+      DESIRED_PYTHON: "3.13t"
+      build_name: manywheel-py3_13t-cuda12_4
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
--- a/.github/workflows/generated-linux-binary-manywheel-split-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-split-main.yml
@ -1,182 +0,0 @@
-# @generated DO NOT EDIT MANUALLY
-
-# Template is at:    .github/templates/linux_binary_build_workflow.yml.j2
-# Generation script: .github/scripts/generate_ci_workflows.py
-name: linux-binary-manywheel-split
-
-
-on:
-  push:
-    branches:
-      - main
-    tags:
-      - 'ciflow/periodic/*'
-  workflow_dispatch:
-
-env:
-  # Needed for conda builds
-  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
-  ANACONDA_USER: pytorch
-  AWS_DEFAULT_REGION: us-east-1
-  BINARY_ENV_FILE: /tmp/env
-  BUILD_ENVIRONMENT: linux-binary-manywheel-split
-  BUILDER_ROOT: /builder
-  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  PR_NUMBER: ${{ github.event.pull_request.number }}
-  PYTORCH_FINAL_PACKAGE_DIR: /artifacts
-  PYTORCH_ROOT: /pytorch
-  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-  SKIP_ALL_TESTS: 0
-concurrency:
-  group: linux-binary-manywheel-split-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
-
-jobs:
-  get-label-type:
-    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-  manywheel-py3_9-cuda11_8-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
-      use_split_build: True
-      DESIRED_PYTHON: "3.9"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_9-cuda11_8
-      build_environment: linux-binary-manywheel-split
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_9-cuda11_8-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_9-cuda11_8-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
-      use_split_build: True
-      DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-cuda11_8
-      build_environment: linux-binary-manywheel-split
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  manywheel-py3_9-cuda12_1-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu121
-      GPU_ARCH_VERSION: 12.1
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
-      use_split_build: True
-      DESIRED_PYTHON: "3.9"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_9-cuda12_1
-      build_environment: linux-binary-manywheel-split
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_9-cuda12_1-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_9-cuda12_1-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu121
-      GPU_ARCH_VERSION: 12.1
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
-      use_split_build: True
-      DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-cuda12_1
-      build_environment: linux-binary-manywheel-split
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  manywheel-py3_9-cuda12_4-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu124
-      GPU_ARCH_VERSION: 12.4
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
-      use_split_build: True
-      DESIRED_PYTHON: "3.9"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_9-cuda12_4
-      build_environment: linux-binary-manywheel-split
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_9-cuda12_4-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_9-cuda12_4-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu124
-      GPU_ARCH_VERSION: 12.4
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
-      use_split_build: True
-      DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-cuda12_4
-      build_environment: linux-binary-manywheel-split
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-manywheel-split-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-split-nightly.yml
--- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml
@ -39,7 +39,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -64,7 +64,7 @@ jobs:
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      build_name: manywheel-py3_9-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cpu-s390x-test:  # Testing
@ -133,7 +133,7 @@ jobs:
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      build_name: manywheel-py3_10-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cpu-s390x-test:  # Testing
@ -202,7 +202,7 @@ jobs:
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      build_name: manywheel-py3_11-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cpu-s390x-test:  # Testing
@ -271,7 +271,7 @@ jobs:
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      build_name: manywheel-py3_12-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cpu-s390x-test:  # Testing
@ -340,7 +340,7 @@ jobs:
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      build_name: manywheel-py3_13-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cpu-s390x-test:  # Testing
--- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
@ -46,7 +46,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
@ -162,7 +162,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
@ -278,7 +278,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
@ -394,7 +394,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
@ -496,3 +496,119 @@ jobs:
      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
    uses: ./.github/workflows/_binary-upload.yml
+  wheel-py3_13-cpu-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    runs-on: macos-14-xlarge
+    timeout-minutes: 240
+    env:
+      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
+      BUILDER_ROOT: ${{ github.workspace }}/builder
+      PACKAGE_TYPE: wheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      SKIP_ALL_TESTS: 1
+      DESIRED_PYTHON: "3.13"
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          # shellcheck disable=SC2129
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          # shellcheck disable=SC2129
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          # shellcheck disable=SC2129
+          echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
+      - name: Install conda and dependencies
+        run: |
+          # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
+          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
+          chmod +x "${RUNNER_TEMP}/conda.sh"
+          /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
+          echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
+          if [ -d "/Applications/Xcode_14.3.1.app" ]; then
+            echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
+          elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
+            echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
+          fi
+      - name: Checkout PyTorch
+        uses: malfet/checkout@silent-checkout
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          quiet-checkout: true
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: Checkout pytorch/builder
+        uses: malfet/checkout@silent-checkout
+        with:
+          ref: main
+          submodules: recursive
+          repository: pytorch/builder
+          path: builder
+          quiet-checkout: true
+      - name: Clean pytorch/builder checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: builder
+      - name: Install sccache (only for non-forked PRs, and pushes to trunk)
+        uses: nick-fields/retry@v3.0.0
+        if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache
+            sudo chmod +x /usr/local/bin/sccache
+            echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
+      - name: Populate binary env
+        run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+      - name: Build PyTorch binary
+        run: |
+          # shellcheck disable=SC1091
+          source "${RUNNER_TEMP}/anaconda/bin/activate"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh"
+      - uses: actions/upload-artifact@v4.4.0
+        if: always()
+        with:
+          name: wheel-py3_13-cpu
+          retention-days: 14
+          if-no-files-found: error
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+  wheel-py3_13-cpu-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: wheel-py3_13-cpu-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: wheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cpu
+      GPU_ARCH_TYPE: cpu
+      DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main
+      DESIRED_PYTHON: "3.13"
+      build_name: wheel-py3_13-cpu
+      use_s3: False
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+      conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
+      conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
+    uses: ./.github/workflows/_binary-upload.yml
--- a/.github/workflows/generated-windows-binary-conda-nightly.yml
+++ b/.github/workflows/generated-windows-binary-conda-nightly.yml
@ -34,7 +34,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -64,7 +64,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -75,6 +75,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -178,7 +188,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -189,6 +199,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -310,7 +330,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -321,6 +341,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -425,7 +455,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -436,6 +466,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -558,7 +598,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -569,6 +609,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -673,7 +723,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -684,6 +734,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -806,7 +866,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -817,6 +877,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -921,7 +991,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -932,6 +1002,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -1053,7 +1133,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -1064,6 +1144,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -1167,7 +1257,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -1178,6 +1268,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -1299,7 +1399,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -1310,6 +1410,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -1414,7 +1524,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -1425,6 +1535,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -1547,7 +1667,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -1558,6 +1678,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -1662,7 +1792,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -1673,6 +1803,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -1795,7 +1935,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -1806,6 +1946,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -1910,7 +2060,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -1921,6 +2071,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -2042,7 +2202,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -2053,6 +2213,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -2156,7 +2326,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -2167,6 +2337,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -2288,7 +2468,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -2299,6 +2479,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -2403,7 +2593,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -2414,6 +2604,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -2536,7 +2736,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -2547,6 +2747,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -2651,7 +2861,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -2662,6 +2872,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -2784,7 +3004,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -2795,6 +3015,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -2899,7 +3129,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -2910,6 +3140,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -3031,7 +3271,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -3042,6 +3282,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -3145,7 +3395,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -3156,6 +3406,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -3277,7 +3537,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -3288,6 +3548,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -3392,7 +3662,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -3403,6 +3673,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -3525,7 +3805,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -3536,6 +3816,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -3640,7 +3930,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -3651,6 +3941,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -3773,7 +4073,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -3784,6 +4084,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -3888,7 +4198,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -3899,6 +4209,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
--- a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml
@ -27,7 +27,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -61,7 +61,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -72,6 +72,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -179,7 +189,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -190,6 +200,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
--- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
@ -34,7 +34,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -68,7 +68,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -79,6 +79,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -186,7 +196,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -197,6 +207,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -326,7 +346,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -337,6 +357,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -445,7 +475,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -456,6 +486,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -586,7 +626,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -597,6 +637,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -705,7 +755,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -716,6 +766,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -846,7 +906,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -857,6 +917,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -965,7 +1035,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -976,6 +1046,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
--- a/.github/workflows/generated-windows-binary-libtorch-release-main.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-release-main.yml
@ -27,7 +27,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -61,7 +61,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -72,6 +72,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -179,7 +189,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -190,6 +200,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
--- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
@ -34,7 +34,7 @@ concurrency:
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -68,7 +68,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -79,6 +79,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -186,7 +196,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -197,6 +207,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -326,7 +346,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -337,6 +357,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -445,7 +475,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -456,6 +486,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -586,7 +626,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -597,6 +637,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -705,7 +755,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -716,6 +766,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -846,7 +906,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -857,6 +917,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
@ -965,7 +1035,7 @@ jobs:
            # Pulled from instance metadata endpoint for EC2
            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
            category=$1
-            curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
          }
          echo "ami-id: $(get_ec2_metadata ami-id)"
          echo "instance-id: $(get_ec2_metadata instance-id)"
@ -976,6 +1046,16 @@ jobs:
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
      # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
      - name: Enable long paths on Windows
        shell: powershell
--- a/.github/workflows/generated-windows-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml
--- a/.github/workflows/inductor-cu124.yml
+++ b/.github/workflows/inductor-cu124.yml
@ -20,7 +20,8 @@ permissions: read-all
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch'
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -56,7 +57,7 @@ jobs:
          { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
          { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
          { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor_cpp_wrapper_abi_compatible", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_cpp_wrapper", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
        ]}
    secrets:
      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
--- a/.github/workflows/inductor-micro-benchmark-x86.yml
+++ b/.github/workflows/inductor-micro-benchmark-x86.yml
@ -17,6 +17,7 @@ permissions: read-all

 jobs:
  linux-jammy-cpu-py3_9-gcc11-inductor-build:
+    if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch'
    name: linux-jammy-cpu-py3.9-gcc11-inductor
    uses: ./.github/workflows/_linux-build.yml
    with:
--- a/.github/workflows/inductor-micro-benchmark.yml
+++ b/.github/workflows/inductor-micro-benchmark.yml
@ -18,7 +18,8 @@ permissions: read-all
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch'
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
--- a/.github/workflows/inductor-perf-compare.yml
+++ b/.github/workflows/inductor-perf-compare.yml
@ -13,30 +13,43 @@ concurrency:
 permissions: read-all

 jobs:
-  get-label-type:
-    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+  get-default-label-prefix:
+    name: get-default-label-prefix
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}

+  get-test-label-type:
+    name: get-test-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch'
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+      check_experiments: "awsa100"
+
  linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
    name: cuda12.1-py3.10-gcc9-sm80
    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
+    needs:
+      - get-default-label-prefix
+      - get-test-label-type
    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
      cuda-arch-list: '8.0'
      test-matrix: |
        { include: [
-          { config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
-          { config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.gcp.a100" },
-          { config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.gcp.a100" },
-          { config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
+          { config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "${{ needs.get-test-label-type.outputs.label-type }}linux.gcp.a100" },
+          { config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "${{ needs.get-test-label-type.outputs.label-type }}linux.gcp.a100" },
+          { config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "${{ needs.get-test-label-type.outputs.label-type }}linux.gcp.a100" },
+          { config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "${{ needs.get-test-label-type.outputs.label-type }}linux.gcp.a100" },
        ]}
    secrets:
      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
--- a/.github/workflows/inductor-perf-test-nightly-a10g.yml
+++ b/.github/workflows/inductor-perf-test-nightly-a10g.yml
@ -70,7 +70,8 @@ permissions: read-all
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch'
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
--- a/.github/workflows/inductor-perf-test-nightly-aarch64.yml
+++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml
@ -50,7 +50,8 @@ permissions: read-all
 jobs:
  get-label-type:
    name: get-label-type
-    uses: ./.github/workflows/_runner-determinator.yml
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: github.event_name != 'schedule' || github.repository == 'pytorch/pytorch'
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
--- a/Show More
+++ b/Show More