Revert "[ROCm] change preferred blas lib defaults (#150249 )" (#150658 )

This reverts commit 8b6bc59e9552689e115445649b76917b9487a181.
Update expected results for pr_time_benchmarks (#150620 )
2025-10-23 23:04:52 +08:00 · 2025-04-03 20:39:27 -04:00 · 2025-04-03 10:14:13 -04:00 · 2025-04-03 09:26:58 -04:00 · 2025-04-02 17:36:04 -07:00 · 2025-04-02 20:24:52 -04:00
1527 changed files with 59113 additions and 32230 deletions
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -20,7 +20,7 @@ cd /
 # on the mounted pytorch repo
 git config --global --add safe.directory /pytorch
 pip install -r /pytorch/requirements.txt
-pip install auditwheel
+pip install auditwheel==6.2.0
 if [ "$DESIRED_CUDA" = "cpu" ]; then
    echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -39,7 +39,7 @@ def build_ArmComputeLibrary() -> None:
            "clone",
            "https://github.com/ARM-software/ComputeLibrary.git",
            "-b",
-            "v24.09",
+            "v25.02",
            "--depth",
            "1",
            "--shallow-submodules",
@ -99,10 +99,14 @@ def update_wheel(wheel_path, desired_cuda) -> None:
        if "126" in desired_cuda:
            libs_to_copy += [
                "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.6",
+                "/usr/local/cuda/lib64/libcufile.so.0",
+                "/usr/local/cuda/lib64/libcufile_rdma.so.1",
            ]
        elif "128" in desired_cuda:
            libs_to_copy += [
                "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
+                "/usr/local/cuda/lib64/libcufile.so.0",
+                "/usr/local/cuda/lib64/libcufile_rdma.so.1",
            ]
    else:
        libs_to_copy += [
@ -204,7 +208,7 @@ if __name__ == "__main__":
        else:
            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
    elif branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
+        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "

    if enable_mkldnn:
        build_ArmComputeLibrary()
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
@ -329,7 +329,7 @@ def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None
        ]
    )
    host.run_cmd(
-        f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v24.09 {git_clone_flags}"
+        f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}"
    )

    host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")
@ -761,7 +761,7 @@ def start_build(
        version = host.check_output("cat pytorch/version.txt").strip()[:-2]
        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
    if branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
+        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
    if enable_mkldnn:
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -1,4 +1,8 @@
 #!/bin/bash
+# The purpose of this script is to:
+# 1. Extract the set of parameters to be used for a docker build based on the provided image name.
+# 2. Run docker build with the parameters found in step 1.
+# 3. Run the built image and print out the expected and actual versions of packages installed.

 set -ex

@ -95,11 +99,11 @@ fi
 # configuration, so we hardcode everything here rather than do it
 # from scratch
 case "$image" in
-  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.4.1
+  pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11)
+    CUDA_VERSION=12.6.3
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
+    GCC_VERSION=11
    PROTOBUF=yes
    DB=yes
    VISION=yes
@ -154,6 +158,65 @@ case "$image" in
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
+  pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9)
+    CUDA_VERSION=12.6.3
+    CUDNN_VERSION=9
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    ;;
+  pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.6.3
+    CUDNN_VERSION=9
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
+  pytorch-linux-focal-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.6.3
+    CUDNN_VERSION=9
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
+  pytorch-linux-focal-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.6.3
+    CUDNN_VERSION=9
+    ANACONDA_PYTHON_VERSION=3.13
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
  pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
    CUDA_VERSION=11.8.0
    CUDNN_VERSION=9
@ -322,7 +385,7 @@ case "$image" in
    EXECUTORCH=yes
    ;;
  pytorch-linux-jammy-py3.12-halide)
-    CUDA_VERSION=12.4
+    CUDA_VERSION=12.6
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=11
    CONDA_CMAKE=yes
@ -330,7 +393,7 @@ case "$image" in
    TRITON=yes
    ;;
  pytorch-linux-jammy-py3.12-triton-cpu)
-    CUDA_VERSION=12.4
+    CUDA_VERSION=12.6
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=11
    CONDA_CMAKE=yes
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-5e4d6b6380d575e48e37e9d987fded4ec588e7bc
+01a22b6f16d117454b7d21ebdc691b0785b84a7f
--- a/.ci/docker/ci_commit_pins/nccl-cu12.txt
+++ b/.ci/docker/ci_commit_pins/nccl-cu12.txt
@ -1 +1 @@
-v2.25.1-1
+v2.26.2-1
--- a/.ci/docker/ci_commit_pins/triton-xpu.txt
+++ b/.ci/docker/ci_commit_pins/triton-xpu.txt
@ -1 +1 @@
-e98b6fcb8df5b44eb0d0addb6767c573d37ba024
+0bcc8265e677e5321606a3311bf71470f14456a8
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-4b3bb1f8da0ded6ccd572dd1358ef45af5a1befe
+96316ce50fade7e209553aba4898cd9b82aab83b
--- a/.ci/docker/common/install_acl.sh
+++ b/.ci/docker/common/install_acl.sh
@ -1,6 +1,6 @@
 set -euo pipefail

-readonly version=v24.04
+readonly version=v25.02
 readonly src_host=https://github.com/ARM-software
 readonly src_repo=ComputeLibrary

--- a/.ci/docker/common/install_base.sh
+++ b/.ci/docker/common/install_base.sh
@ -37,7 +37,7 @@ install_ubuntu() {
  if [[ "$UBUNTU_VERSION" == "20.04"* && "$CUDA_VERSION" == "11.8"* ]]; then
    maybe_libnccl_dev="libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8 --allow-downgrades --allow-change-held-packages"
  elif [[ "$UBUNTU_VERSION" == "20.04"* && "$CUDA_VERSION" == "12.4"* ]]; then
-    maybe_libnccl_dev="libnccl2=2.25.1-1+cuda12.4 libnccl-dev=2.25.1-1+cuda12.4 --allow-downgrades --allow-change-held-packages"
+    maybe_libnccl_dev="libnccl2=2.26.2-1+cuda12.4 libnccl-dev=2.26.2-1+cuda12.4 --allow-downgrades --allow-change-held-packages"
  else
    maybe_libnccl_dev=""
  fi
--- a/.ci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@ -66,7 +66,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then

  # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
  if [[ $(uname -m) == "aarch64" ]]; then
-    conda_install "openblas==0.3.28=*openmp*"
+    conda_install "openblas==0.3.29=*openmp*"
  else
    conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
  fi
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -2,7 +2,7 @@

 set -ex

-NCCL_VERSION=v2.25.1-1
+NCCL_VERSION=v2.26.2-1
 CUDNN_VERSION=9.5.1.17

 function install_cusparselt_040 {
--- a/.ci/docker/common/install_cuda_aarch64.sh
+++ b/.ci/docker/common/install_cuda_aarch64.sh
@ -3,19 +3,8 @@

 set -ex

-NCCL_VERSION=v2.21.5-1
-CUDNN_VERSION=9.5.1.17
-
-function install_cusparselt_062 {
-    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
-    mkdir tmp_cusparselt && pushd tmp_cusparselt
-    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.6.2.3-archive.tar.xz
-    tar xf libcusparse_lt-linux-sbsa-0.6.2.3-archive.tar.xz
-    cp -a libcusparse_lt-linux-sbsa-0.6.2.3-archive/include/* /usr/local/cuda/include/
-    cp -a libcusparse_lt-linux-sbsa-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/
-    popd
-    rm -rf tmp_cusparselt
-}
+NCCL_VERSION=v2.26.2-1
+CUDNN_VERSION=9.8.0.87

 function install_cusparselt_063 {
    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
@ -28,140 +17,7 @@ function install_cusparselt_063 {
    rm -rf tmp_cusparselt
 }

-function install_124 {
-  CUDNN_VERSION=9.1.0.70
-  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
-  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
-  # install CUDA 12.4.1 in the same container
-  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux_sbsa.run
-  chmod +x cuda_12.4.1_550.54.15_linux_sbsa.run
-  ./cuda_12.4.1_550.54.15_linux_sbsa.run --toolkit --silent
-  rm -f cuda_12.4.1_550.54.15_linux_sbsa.run
-  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
-
-  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-  mkdir tmp_cudnn && cd tmp_cudnn
-  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
-  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
-  cd ..
-  rm -rf tmp_cudnn
-
-  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
-  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
-  git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
-  cd nccl && make -j src.build
-  cp -a build/include/* /usr/local/cuda/include/
-  cp -a build/lib/* /usr/local/cuda/lib64/
-  cd ..
-  rm -rf nccl
-
-  install_cusparselt_063
-
-  ldconfig
-}
-
-function prune_124 {
-  echo "Pruning CUDA 12.4"
-  #####################################################################################
-  # CUDA 12.4 prune static libs
-  #####################################################################################
-  export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
-  export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
-
-  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
-  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
-
-  if [[ -n "$OVERRIDE_GENCODE" ]]; then
-      export GENCODE=$OVERRIDE_GENCODE
-  fi
-
-  # all CUDA libs except CuDNN and CuBLAS
-  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
-      | xargs -I {} bash -c \
-                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
-
-  # prune CuDNN and CuBLAS
-  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
-  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
-
-  #####################################################################################
-  # CUDA 12.4 prune visual tools
-  #####################################################################################
-  export CUDA_BASE="/usr/local/cuda-12.4/"
-  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
-}
-
-function install_126 {
-  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
-  rm -rf /usr/local/cuda-12.6 /usr/local/cuda
-  # install CUDA 12.6.3 in the same container
-  wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux_sbsa.run
-  chmod +x cuda_12.6.3_560.35.05_linux_sbsa.run
-  ./cuda_12.6.3_560.35.05_linux_sbsa.run --toolkit --silent
-  rm -f cuda_12.6.3_560.35.05_linux_sbsa.run
-  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
-
-  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-  mkdir tmp_cudnn && cd tmp_cudnn
-  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
-  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
-  cd ..
-  rm -rf tmp_cudnn
-
-  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
-  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
-  git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
-  cd nccl && make -j src.build
-  cp -a build/include/* /usr/local/cuda/include/
-  cp -a build/lib/* /usr/local/cuda/lib64/
-  cd ..
-  rm -rf nccl
-
-  install_cusparselt_063
-
-  ldconfig
-}
-
-function prune_126 {
-  echo "Pruning CUDA 12.6"
-  #####################################################################################
-  # CUDA 12.6 prune static libs
-  #####################################################################################
-  export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
-  export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
-
-  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
-  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
-
-  if [[ -n "$OVERRIDE_GENCODE" ]]; then
-      export GENCODE=$OVERRIDE_GENCODE
-  fi
-  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
-      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
-  fi
-
-  # all CUDA libs except CuDNN and CuBLAS
-  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
-      | xargs -I {} bash -c \
-                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
-
-  # prune CuDNN and CuBLAS
-  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
-  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
-
-  #####################################################################################
-  # CUDA 12.6 prune visual tools
-  #####################################################################################
-  export CUDA_BASE="/usr/local/cuda-12.6/"
-  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
-}
-
 function install_128 {
-  CUDNN_VERSION=9.7.1.26
  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
  # install CUDA 12.8.0 in the same container
@ -198,10 +54,6 @@ function install_128 {
 while test $# -gt 0
 do
    case "$1" in
-    12.4) install_124; prune_124
-        ;;
-    12.6) install_126; prune_126
-        ;;
    12.8) install_128;
        ;;
    *) echo "bad argument $1"; exit 1
--- a/.ci/docker/common/install_executorch.sh
+++ b/.ci/docker/common/install_executorch.sh
@ -53,7 +53,7 @@ setup_executorch() {
  export EXECUTORCH_BUILD_PYBIND=ON
  export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

-  as_jenkins .ci/scripts/setup-linux.sh cmake || true
+  as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
  popd
 }

--- a/.ci/docker/common/install_ninja.sh
+++ b/.ci/docker/common/install_ninja.sh
@ -4,10 +4,15 @@ set -ex

 [ -n "$NINJA_VERSION" ]

-url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip"
+arch=$(uname -m)
+if [ "$arch" == "aarch64" ]; then
+    url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux-aarch64.zip"
+else
+    url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip"
+fi

 pushd /tmp
 wget --no-verbose --output-document=ninja-linux.zip "$url"
 unzip ninja-linux.zip -d /usr/local/bin
 rm -f ninja-linux.zip
-popd
+popd
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -32,7 +32,7 @@ pip_install coloredlogs packaging

 pip_install onnxruntime==1.18.1
 pip_install onnx==1.17.0
-pip_install onnxscript==0.1.0 --no-deps
+pip_install onnxscript==0.2.2 --no-deps
 # required by onnxscript
 pip_install ml_dtypes

--- a/.ci/docker/common/install_openblas.sh
+++ b/.ci/docker/common/install_openblas.sh
@ -4,7 +4,7 @@
 set -ex

 cd /
-git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.28 --depth 1 --shallow-submodules
+git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.29 --depth 1 --shallow-submodules


 OPENBLAS_BUILD_FLAGS="
--- a/.ci/docker/common/install_rocm_drm.sh
+++ b/.ci/docker/common/install_rocm_drm.sh
@ -115,7 +115,7 @@ index a5007ffc..13fa07fc 100644
 	if (!fp) {
 -		fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE,
 -			strerror(errno));
-+		fprintf(stderr, "amdgpu.ids: No such file or directory\n");
+		//fprintf(stderr, "amdgpu.ids: No such file or directory\n");
 		return;
 	}

--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -60,15 +60,15 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}"
  # Triton needs at least gcc-9 to build
  apt-get install -y g++-9

-  CXX=g++-9 pip_install -e .
+  CXX=g++-9 pip_install .
 elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
  # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
  add-apt-repository -y ppa:ubuntu-toolchain-r/test
  apt-get install -y g++-9

-  CXX=g++-9 pip_install -e .
+  CXX=g++-9 pip_install .
 else
-  pip_install -e .
+  pip_install .
 fi

 if [ -n "${CONDA_CMAKE}" ]; then
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -47,6 +47,9 @@ function install_ubuntu() {
    # Development Packages
    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
    # Install Intel Support Packages
+    if [[ "$XPU_VERSION" == "2025.0" ]]; then
+        XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl=2025.0.1-6"
+    fi
    apt-get install -y ${XPU_PACKAGES}

    # Cleanup
@ -82,6 +85,9 @@ gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.
 EOF

    # Install Intel Support Packages
+    if [[ "$XPU_VERSION" == "2025.0" ]]; then
+        XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl-2025.0.1-6"
+    fi
    yum install -y ${XPU_PACKAGES}
    # The xpu-smi packages
    dnf install -y xpu-smi
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -39,7 +39,7 @@ case ${GPU_ARCH_TYPE} in
        BASE_TARGET=rocm
        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
        GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx942"
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
        ;;
    *)
--- a/.ci/docker/manywheel/Dockerfile_2014
+++ b/.ci/docker/manywheel/Dockerfile_2014
@ -1,153 +0,0 @@
-# syntax = docker/dockerfile:experimental
-ARG ROCM_VERSION=3.7
-ARG BASE_CUDA_VERSION=10.2
-ARG GPU_IMAGE=nvidia/cuda:${BASE_CUDA_VERSION}-devel-centos7
-FROM quay.io/pypa/manylinux2014_x86_64 as base
-
-ENV LC_ALL en_US.UTF-8
-ENV LANG en_US.UTF-8
-ENV LANGUAGE en_US.UTF-8
-
-RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
-RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
-RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
-RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
-RUN yum install -y yum-utils centos-release-scl sudo
-RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
-RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
-ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
-ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
-
-# cmake
-RUN yum install -y cmake3 && \
-    ln -s /usr/bin/cmake3 /usr/bin/cmake
-FROM base as openssl
-# Install openssl (this must precede `build python` step)
-# (In order to have a proper SSL module, Python is compiled
-# against a recent openssl [see env vars above], which is linked
-# statically. We delete openssl afterwards.)
-ADD ./common/install_openssl.sh install_openssl.sh
-RUN bash ./install_openssl.sh && rm install_openssl.sh
-
-
-
-# remove unncessary python versions
-RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
-RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
-RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
-RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
-
-FROM base as cuda
-ARG BASE_CUDA_VERSION=10.2
-# Install CUDA
-ADD ./common/install_cuda.sh install_cuda.sh
-RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
-
-FROM base as intel
-# MKL
-ADD ./common/install_mkl.sh install_mkl.sh
-RUN bash ./install_mkl.sh && rm install_mkl.sh
-
-FROM base as magma
-ARG BASE_CUDA_VERSION=10.2
-# Install magma
-ADD ./common/install_magma.sh install_magma.sh
-RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
-
-FROM base as jni
-# Install java jni header
-ADD ./common/install_jni.sh install_jni.sh
-ADD ./java/jni.h jni.h
-RUN bash ./install_jni.sh && rm install_jni.sh
-
-FROM base as libpng
-# Install libpng
-ADD ./common/install_libpng.sh install_libpng.sh
-RUN bash ./install_libpng.sh && rm install_libpng.sh
-
-FROM ${GPU_IMAGE} as common
-RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
-RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
-RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
-ENV LC_ALL en_US.UTF-8
-ENV LANG en_US.UTF-8
-ENV LANGUAGE en_US.UTF-8
-RUN yum install -y \
-        aclocal \
-        autoconf \
-        automake \
-        bison \
-        bzip2 \
-        curl \
-        diffutils \
-        file \
-        git \
-        make \
-        patch \
-        perl \
-        unzip \
-        util-linux \
-        wget \
-        which \
-        xz \
-        yasm
-RUN yum install -y \
-    https://repo.ius.io/ius-release-el7.rpm \
-    https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
-
-RUN yum swap -y git git236-core
-# git236+ would refuse to run git commands in repos owned by other users
-# Which causes version check to fail, as pytorch repo is bind-mounted into the image
-# Override this behaviour by treating every folder as safe
-# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
-RUN git config --global --add safe.directory "*"
-
-ENV SSL_CERT_FILE=/opt/_internal/certs.pem
-# Install LLVM version
-COPY --from=openssl            /opt/openssl                          /opt/openssl
-COPY --from=base               /opt/python                           /opt/python
-COPY --from=base               /opt/_internal                        /opt/_internal
-COPY --from=base               /usr/local/bin/auditwheel             /usr/local/bin/auditwheel
-COPY --from=intel              /opt/intel                            /opt/intel
-COPY --from=base               /usr/local/bin/patchelf               /usr/local/bin/patchelf
-COPY --from=libpng             /usr/local/bin/png*                   /usr/local/bin/
-COPY --from=libpng             /usr/local/bin/libpng*                /usr/local/bin/
-COPY --from=libpng             /usr/local/include/png*               /usr/local/include/
-COPY --from=libpng             /usr/local/include/libpng*            /usr/local/include/
-COPY --from=libpng             /usr/local/lib/libpng*                /usr/local/lib/
-COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/lib/pkgconfig
-COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h
-
-FROM common as cpu_final
-ARG BASE_CUDA_VERSION=10.2
-RUN yum install -y yum-utils centos-release-scl
-RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
-RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
-ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
-ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
-
-# cmake
-RUN yum install -y cmake3 && \
-    ln -s /usr/bin/cmake3 /usr/bin/cmake
-
-# ninja
-RUN yum install -y http://repo.okay.com.mx/centos/7/x86_64/release/okay-release-1-1.noarch.rpm
-RUN yum install -y ninja-build
-
-FROM cpu_final as cuda_final
-RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
-COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
-COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
-
-FROM common as rocm_final
-ARG ROCM_VERSION=3.7
-# Install ROCm
-ADD ./common/install_rocm.sh install_rocm.sh
-RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
-# cmake is already installed inside the rocm base image, but both 2 and 3 exist
-# cmake3 is needed for the later MIOpen custom build, so that step is last.
-RUN yum install -y cmake3 && \
-    rm -f /usr/bin/cmake && \
-    ln -s /usr/bin/cmake3 /usr/bin/cmake
-ADD ./common/install_miopen.sh install_miopen.sh
-RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
--- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@ -38,6 +38,12 @@ RUN yum install -y \
  sudo \
  gcc-toolset-${GCCTOOLSET_VERSION}-toolchain

+# (optional) Install non-default Ninja version
+ARG NINJA_VERSION
+COPY ./common/install_ninja.sh install_ninja.sh
+RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
+RUN rm install_ninja.sh
+
 # Ensure the expected devtoolset is used
 ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -48,7 +48,7 @@ case ${GPU_ARCH_TYPE} in
        TARGET=final
        DOCKER_TAG=cpu-aarch64
        GPU_IMAGE=arm64v8/almalinux:8
-        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11 --build-arg NINJA_VERSION=1.12.1"
        MANY_LINUX_VERSION="2_28_aarch64"
        ;;
    cpu-cxx11-abi)
@ -97,7 +97,7 @@ case ${GPU_ARCH_TYPE} in
            DEVTOOLSET_VERSION="11"
            GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
        fi
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101"
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
    xpu)
@ -121,7 +121,8 @@ fi
 (
    set -x

-    if [ "$(uname -m)" != "s390x" ]; then
+    # Only activate this if in CI
+    if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
        # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
        # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
        sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
@ -139,7 +140,7 @@ fi
        "${TOPDIR}/.ci/docker/"
 )

-GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GITHUB_REF=${GITHUB_REF:-"dev")}
 GIT_BRANCH_NAME=${GITHUB_REF##*/}
 GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
 DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
--- a/.ci/docker/manywheel/build_scripts/build_utils.sh
+++ b/.ci/docker/manywheel/build_scripts/build_utils.sh
@ -3,7 +3,7 @@
 # Script used only in CD pipeline

 OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/
-CURL_DOWNLOAD_URL=https://curl.askapache.com/download
+CURL_DOWNLOAD_URL=https://curl.se/download

 AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf

--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -90,10 +90,10 @@ librosa>=0.6.2 ; python_version < "3.11"
 #Pinned versions:
 #test that import:

-mypy==1.13.0
+mypy==1.14.0
 # Pin MyPy version because new errors are likely to appear with each release
 #Description: linter
-#Pinned versions: 1.10.0
+#Pinned versions: 1.14.0
 #test that import: test_typing.py, test_type_hints.py

 networkx==2.8.8
@ -294,7 +294,7 @@ ghstack==0.8.0
 #Pinned versions: 0.8.0
 #test that import:

-jinja2==3.1.5
+jinja2==3.1.6
 #Description: jinja2 template engine
 #Pinned versions: 3.1.4
 #test that import:
@ -339,7 +339,7 @@ onnx==1.17.0
 #Pinned versions:
 #test that import:

-onnxscript==0.1.0
+onnxscript==0.2.2
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
--- a/.ci/docker/triton_version.txt
+++ b/.ci/docker/triton_version.txt
@ -1 +1 @@
-3.2.0
+3.3.0
--- a/.ci/magma/Makefile
+++ b/.ci/magma/Makefile
@ -12,7 +12,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \
 	-e DESIRED_CUDA=${DESIRED_CUDA} \
 	-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \
-	"pytorch/manylinux-builder:cuda${DESIRED_CUDA}-main" \
+	"pytorch/manylinux2_28-builder:cuda${DESIRED_CUDA}-main" \
 	magma/build_magma.sh

 .PHONY: all
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -54,11 +54,11 @@ cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
 TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
 case ${CUDA_VERSION} in
    12.8)
-        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0;10.0;12.0+PTX" #Ripping out 5.0 and 6.0 due to ld error
+        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" #removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8 and will be removed in future releases
        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
        ;;
    12.6)
-        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0+PTX"
+        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
        ;;
    12.4)
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -173,6 +173,7 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  source /opt/intel/oneapi/compiler/latest/env/vars.sh
  # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
  export USE_KINETO=0
+  export TORCH_XPU_ARCH_LIST=pvc
 fi

 # sccache will fail for CUDA builds if all cores are used for compiling
@ -191,7 +192,7 @@ fi

 # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
 # memory to build and will OOM
-if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
+if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]] && [ -z "$MAX_JOBS_OVERRIDE" ]; then
  echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
  echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
  export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
@ -377,8 +378,10 @@ else
    # This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
    # is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
    # 16 CPUs
-    MAX_JOBS=$(nproc --ignore=4)
-    export MAX_JOBS
+    if [ -z "$MAX_JOBS_OVERRIDE" ]; then
+      MAX_JOBS=$(nproc --ignore=4)
+      export MAX_JOBS
+    fi

    # NB: Install outside of source directory (at the same level as the root
    # pytorch folder) so that it doesn't get cleaned away prior to docker push.
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -202,7 +202,7 @@ function install_torchrec_and_fbgemm() {

 function clone_pytorch_xla() {
  if [[ ! -d ./xla ]]; then
-    git clone --recursive --quiet https://github.com/pytorch/xla.git
+    git clone --recursive -b r2.7 https://github.com/pytorch/xla.git
    pushd xla
    # pin the xla hash so that we don't get broken by changes to xla
    git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"
--- a/.ci/pytorch/smoke_test/max_autotune.py
+++ b/.ci/pytorch/smoke_test/max_autotune.py
@ -46,7 +46,9 @@ def train(args, model, device, train_loader, optimizer, epoch):
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print(
-                f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}"  # noqa: B950
+                f"Train Epoch: {epoch} "
+                f"[{batch_idx * len(data)}/{len(train_loader.dataset)} "
+                f"({100.0 * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}"
            )
            if args.dry_run:
                break
@ -71,7 +73,9 @@ def test(model, device, test_loader):
    test_loss /= len(test_loader.dataset)

    print(
-        f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n"  # noqa: B950
+        f"\nTest set: Average loss: {test_loss:.4f}, "
+        f"Accuracy: {correct}/{len(test_loader.dataset)} "
+        f"({100.0 * correct / len(test_loader.dataset):.0f}%)\n"
    )


--- a/.ci/pytorch/smoke_test/smoke_test.py
+++ b/.ci/pytorch/smoke_test/smoke_test.py
@ -166,6 +166,10 @@ def test_cuda_gds_errors_captured() -> None:
    major_version = int(torch.version.cuda.split(".")[0])
    minor_version = int(torch.version.cuda.split(".")[1])

+    if target_os == "windows":
+        print(f"{target_os} is not supported for GDS smoke test")
+        return
+
    if major_version < 12 or (major_version == 12 and minor_version < 6):
        print("CUDA version is not supported for GDS smoke test")
        return
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -314,6 +314,13 @@ test_python() {
  assert_git_not_dirty
 }

+test_lazy_tensor_meta_reference_disabled() {
+  export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1
+  echo "Testing lazy tensor operations without meta reference"
+  time python test/run_test.py --include lazy/test_ts_opinfo.py --verbose
+  export -n TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE
+}
+

 test_dynamo_wrapped_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
@ -476,6 +483,8 @@ elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--backend aot_eager)
 elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--export-aot-inductor)
+elif [[ "${TEST_CONFIG}" == *max_autotune_inductor* ]]; then
+  DYNAMO_BENCHMARK_FLAGS+=(--inductor --inductor-compile-mode max-autotune)
 elif [[ "${TEST_CONFIG}" == *inductor* && "${TEST_CONFIG}" != *perf* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--inductor)
 fi
@ -490,6 +499,59 @@ else
  DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
 fi

+test_cachebench() {
+  TEST_REPORTS_DIR=$(pwd)/test/test-reports
+  mkdir -p "$TEST_REPORTS_DIR"
+
+  local BENCHMARK
+  if [[ "${SHARD_NUMBER}" == 1 ]]; then
+    local BENCHMARK=torchbench
+  elif [[ "${SHARD_NUMBER}" == 2 ]]; then
+    local BENCHMARK=huggingface
+  else
+    echo "invalid SHARD_NUMBER: ${SHARD_NUMBER}"
+    exit 1
+  fi
+
+  local mode_options=("training" "inference")
+
+  for mode in "${mode_options[@]}"; do
+    $TASKSET python "benchmarks/dynamo/cachebench.py" \
+        --mode "$mode" \
+        --device cuda \
+        --benchmark "$BENCHMARK" \
+        --repeat 3 \
+        --output "$TEST_REPORTS_DIR/cachebench_${BENCHMARK}_${mode}.json"
+
+    $TASKSET python "benchmarks/dynamo/cachebench.py" \
+        --mode "$mode" \
+        --dynamic \
+        --device cuda \
+        --benchmark "$BENCHMARK" \
+        --repeat 3 \
+        --output "$TEST_REPORTS_DIR/cachebench_${BENCHMARK}_${mode}_dynamic.json"
+  done
+}
+
+test_verify_cachebench() {
+  TMP_TEST_REPORTS_DIR=$(mktemp -d)
+  TEST_OUTPUT="$TMP_TEST_REPORTS_DIR/test.json"
+
+  $TASKSET python "benchmarks/dynamo/cachebench.py" \
+      --mode training \
+      --device cpu \
+      --model nanogpt \
+      --benchmark torchbench \
+      --output "$TEST_OUTPUT"
+
+  # -s checks file exists and is non empty
+  if [[ ! -s "$TEST_OUTPUT" ]]; then
+    echo "Cachebench failed to produce an output."
+    echo "Run 'python benchmarks/dynamo/cachebench.py' to make sure it works"
+    exit 1
+  fi
+}
+
 test_perf_for_dashboard() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
@ -518,6 +580,8 @@ test_perf_for_dashboard() {
    test_inductor_set_cpu_affinity
  elif [[ "${TEST_CONFIG}" == *cuda_a10g* ]]; then
    device=cuda_a10g
+  elif [[ "${TEST_CONFIG}" == *h100* ]]; then
+    device=cuda_h100
  elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
    device=rocm
  fi
@ -698,6 +762,8 @@ test_dynamo_benchmark() {
      fi
    elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
+    elif [[ "${TEST_CONFIG}" == *max_autotune_inductor* ]]; then
+      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
    else
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
      test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
@ -1107,8 +1173,9 @@ build_xla() {
  apply_patches
  SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
  # These functions are defined in .circleci/common.sh in pytorch/xla repo
-  retry install_deps_pytorch_xla $XLA_DIR $USE_CACHE
+  retry install_pre_deps_pytorch_xla $XLA_DIR $USE_CACHE
  CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch:${CMAKE_PREFIX_PATH}" XLA_SANDBOX_BUILD=1 build_torch_xla $XLA_DIR
+  retry install_post_deps_pytorch_xla
  assert_git_not_dirty
 }

@ -1415,7 +1482,7 @@ test_executorch() {
  bash examples/models/llama3_2_vision/install_requirements.sh
  # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
  # from the PR
-  bash .ci/scripts/setup-linux.sh cmake
+  bash .ci/scripts/setup-linux.sh --build-tool cmake

  echo "Run ExecuTorch unit tests"
  pytest -v -n auto
@ -1439,7 +1506,7 @@ test_executorch() {
 test_linux_aarch64() {
  python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
        test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
-        test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops \
+        test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
        --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose

  # Dynamo tests
@ -1507,6 +1574,16 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
  install_torchvision
  id=$((SHARD_NUMBER-1))
  test_dynamo_benchmark timm_models "$id"
+elif [[ "${TEST_CONFIG}" == cachebench ]]; then
+  install_torchaudio cuda
+  install_torchvision
+  checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
+  PYTHONPATH=$(pwd)/torchbench test_cachebench
+elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
+  install_torchaudio cpu
+  install_torchvision
+  checkout_install_torchbench nanogpt
+  PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
 elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
  if [[ "${TEST_CONFIG}" == *cpu* ]]; then
    install_torchaudio cpu
@ -1562,6 +1639,7 @@ elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
  test_python_shard "$SHARD_NUMBER"
  test_aten
 elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
+  test_lazy_tensor_meta_reference_disabled
  test_without_numpy
  install_torchvision
  test_python_shard 1
--- a/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat
+++ b/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat
@ -17,32 +17,24 @@ curl -L -o "%INSTALLER_FILE%" %DOWNLOAD_URL%
 :: Install the Visual Studio Build Tools with C++ components
 echo Installing Visual Studio Build Tools with C++ components...
 echo Installing MSVC %MSVC_VERSION%
-if "%MSVC_VERSION%" == "latest" (
-    "%INSTALLER_FILE%" --norestart --nocache --quiet --wait --installPath "%DEPENDENCIES_DIR%\VSBuildTools" ^
-        --add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
-        --add Microsoft.VisualStudio.Component.VC.ASAN ^
-        --add Microsoft.VisualStudio.Component.VC.CMake.Project ^
-        --add Microsoft.VisualStudio.Component.VC.Tools.ARM64 ^
-        --add Microsoft.VisualStudio.Component.VC.Tools.x86.x64
-) else if "%MSVC_VERSION%" == "14.40" (
-    "%INSTALLER_FILE%" --norestart --nocache --quiet --wait --installPath "%DEPENDENCIES_DIR%\VSBuildTools" ^
-        --add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
-        --add Microsoft.VisualStudio.Component.VC.ASAN ^
-        --add Microsoft.VisualStudio.Component.VC.CMake.Project ^
-        --add Microsoft.VisualStudio.Component.VC.14.40.17.10.ARM64 ^
-        --add Microsoft.VisualStudio.Component.VC.14.40.17.10.x86.x64
-) else if "%MSVC_VERSION%" == "14.36" (
-    "%INSTALLER_FILE%" --norestart --nocache --quiet --wait --installPath "%DEPENDENCIES_DIR%\VSBuildTools" ^
-        --add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
-        --add Microsoft.VisualStudio.Component.VC.ASAN ^
-        --add Microsoft.VisualStudio.Component.VC.CMake.Project ^
-        --add Microsoft.VisualStudio.Component.VC.14.36.17.6.ARM64 ^
-        --add Microsoft.VisualStudio.Component.VC.14.36.17.6.x86.x64
-)
+"%INSTALLER_FILE%" --norestart --quiet --wait --installPath "%DEPENDENCIES_DIR%\VSBuildTools" ^
+    --add Microsoft.VisualStudio.Workload.VCTools ^
+    --add Microsoft.VisualStudio.Component.Windows10SDK ^
+    --add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
+    --add Microsoft.VisualStudio.Component.VC.ASAN ^
+    --add Microsoft.VisualStudio.Component.VC.CMake.Project ^
+    --add Microsoft.VisualStudio.Component.VC.CoreBuildTools ^
+    --add Microsoft.VisualStudio.Component.VC.CoreIde ^
+    --add Microsoft.VisualStudio.Component.VC.Redist.14.Latest ^
+    --add Microsoft.VisualStudio.Component.VC.Tools.ARM64EC ^
+    --add Microsoft.VisualStudio.Component.VC.Tools.ARM64 ^
+    --add Microsoft.VisualStudio.Component.VC.Tools.x86.x64
+
+echo exitcode = %errorlevel%

 :: Check if installation was successful
 if %errorlevel% neq 0 (
-    echo "Failed to install Visual Studio Build Tools with C++ components. (exitcode = %errorlevel%)"
+    echo Failed to install Visual Studio Build Tools with C++ components.
    exit /b 1
 )

--- a/.ci/pytorch/windows/arm64/bootstrap_python.bat
+++ b/.ci/pytorch/windows/arm64/bootstrap_python.bat
@ -6,22 +6,25 @@ echo Dependency Python installation started.
 if not exist "%DOWNLOADS_DIR%" mkdir %DOWNLOADS_DIR%
 if not exist "%DEPENDENCIES_DIR%" mkdir %DEPENDENCIES_DIR%

-if "%PYTHON_VERSION%"=="Python312" (
-    echo Python version is set to Python312
-    set DOWNLOAD_URL="https://www.python.org/ftp/python/3.12.7/python-3.12.7-arm64.exe"
-) else if "%PYTHON_VERSION%"=="Python311" (
-    echo Python version is set to Python311
-    set DOWNLOAD_URL="https://www.python.org/ftp/python/3.11.9/python-3.11.9-arm64.exe"
+if "%DESIRED_PYTHON%" == "3.13" (
+    echo Python version is set to 3.13
+    set DOWNLOAD_URL=https://www.python.org/ftp/python/3.13.2/python-3.13.2-arm64.exe
+) else if "%DESIRED_PYTHON%" == "3.12" (
+    echo Python version is set to 3.12
+    set DOWNLOAD_URL=https://www.python.org/ftp/python/3.12.7/python-3.12.7-arm64.exe
+) else if "%DESIRED_PYTHON%" == "3.11" (
+    echo Python version is set to 3.11
+    set DOWNLOAD_URL=https://www.python.org/ftp/python/3.11.9/python-3.11.9-arm64.exe
 ) else (
-    echo PYTHON_VERSION not defined, Python version is set to Python312
-    set DOWNLOAD_URL="https://www.python.org/ftp/python/3.12.7/python-3.12.7-arm64.exe"
+    echo DESIRED_PYTHON not defined, Python version is set to 3.12
+    set DOWNLOAD_URL=https://www.python.org/ftp/python/3.12.7/python-3.12.7-arm64.exe
 )

 set INSTALLER_FILE=%DOWNLOADS_DIR%\python-installer.exe

 :: Download installer
 echo Downloading Python...
-curl -L -o "%INSTALLER_FILE%" %DOWNLOAD_URL%
+curl -L -o "%INSTALLER_FILE%" "%DOWNLOAD_URL%"

 :: Install Python
 echo Installing Python...
--- a/.ci/pytorch/windows/arm64/bootstrap_tests.bat
+++ b/.ci/pytorch/windows/arm64/bootstrap_tests.bat
@ -14,7 +14,7 @@ where python
 :: install dependencies
 python -m pip install --upgrade pip
 pip install -r requirements.txt
-pip install pytest numpy
+pip install pytest numpy protobuf expecttest hypothesis

 :: find file name for pytorch wheel
 for /f "delims=" %%f in ('dir /b "%PYTORCH_FINAL_PACKAGE_DIR%" ^| findstr "torch-"') do set "TORCH_WHEEL_FILENAME=%PYTORCH_FINAL_PACKAGE_DIR%\%%f"
--- a/.ci/pytorch/windows/arm64/smoke_test.bat
+++ b/.ci/pytorch/windows/arm64/smoke_test.bat
@ -1,8 +1,6 @@
@echo off
 setlocal

-set "ORIG_PATH=%PATH%"
-
 if "%PACKAGE_TYPE%" == "wheel" goto wheel
 if "%PACKAGE_TYPE%" == "libtorch" goto libtorch

@ -10,21 +8,7 @@ echo "unknown package type"
 exit /b 1

 :wheel
-echo "install wheel package"
-
-echo Running pip install...
-pip install -q --pre numpy protobuf
-echo Error level after pip install: %ERRORLEVEL%
-if errorlevel 1 exit /b 1
-
-for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do pip install "%%i"
-if errorlevel 1 exit /b 1
-
-goto smoke_test
-
-:smoke_test
-python -c "import torch"
-if ERRORLEVEL 1 exit /b 1
+call %PYTORCH_ROOT%\.ci\pytorch\windows\arm64\bootstrap_tests.bat

 echo Running python rnn_smoke.py...
 python %PYTORCH_ROOT%\.ci\pytorch\test_example_code\rnn_smoke_win_arm64.py
@ -39,10 +23,12 @@ goto end
 :libtorch
 echo "install and test libtorch"

-for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *-latest.zip') do tar -xf "%%i" -C tmp
+if not exist tmp mkdir tmp
+
+for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *-latest.zip') do C:\Windows\System32\tar.exe -xf "%%i" -C tmp
 if ERRORLEVEL 1 exit /b 1

-pushd tmp\libtorch
+pushd tmp

 set VC_VERSION_LOWER=14
 set VC_VERSION_UPPER=36
@ -60,6 +46,4 @@ if ERRORLEVEL 1 exit /b 1
 .\simple-torch-test.exe
 if ERRORLEVEL 1 exit /b 1

-:end
-set "PATH=%ORIG_PATH%"
-popd
+:end
--- a/.ci/pytorch/windows/internal/smoke_test.bat
+++ b/.ci/pytorch/windows/internal/smoke_test.bat
@ -71,11 +71,20 @@ if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install --pre numpy==2.1.2
 if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install --pre numpy==2.0.2 protobuf
 if "%DESIRED_PYTHON%" == "3.11" %PYTHON_EXEC% -m pip install --pre numpy==2.0.2 protobuf
 if "%DESIRED_PYTHON%" == "3.10" %PYTHON_EXEC% -m pip install --pre numpy==2.0.2 protobuf
-if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install --pre numpy==2.0.2 protobuf
+if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install --pre numpy==2.0.2 protobuf networkx

 if errorlevel 1 exit /b 1

-for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do %PYTHON_EXEC% -m pip install "%%i"
+if "%PYTORCH_BUILD_VERSION:dev=%" NEQ "%PYTORCH_BUILD_VERSION%" (
+    set "CHANNEL=nightly"
+) else (
+    set "CHANNEL=test"
+)
+
+set "EXTRA_INDEX= "
+if "%CUDA_VERSION%" == "xpu" set "EXTRA_INDEX=--index-url https://download.pytorch.org/whl/%CHANNEL%/xpu"
+
+for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do %PYTHON_EXEC% -m pip install "%%i" %EXTRA_INDEX%
 if errorlevel 1 exit /b 1

 goto smoke_test
--- a/.ci/pytorch/windows/internal/xpu_install.bat
+++ b/.ci/pytorch/windows/internal/xpu_install.bat
@ -47,9 +47,9 @@ set XPU_EXTRA_INSTALLED=0
 set XPU_EXTRA_UNINSTALL=0

 if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.0] (
-    set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/efc86abd-cb77-452e-a03f-a741895b8ece/intel-deep-learning-essentials-2025.0.0.336_offline.exe
+    set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe
    set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
-    set XPU_BUNDLE_VERSION=2025.0.0+335
+    set XPU_BUNDLE_VERSION=2025.0.1+20
    set XPU_BUNDLE_INSTALLED=0
    set XPU_BUNDLE_UNINSTALL=0
    set XPU_EXTRA_URL=NULL
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@ -31,9 +31,9 @@ fi
 export DOCKER_IMAGE=${DOCKER_IMAGE:-}
 if [[ -z "$DOCKER_IMAGE" ]]; then
  if [[ "$DESIRED_CUDA" == cpu ]]; then
-    export DOCKER_IMAGE="pytorch/manylinux:cpu"
+    export DOCKER_IMAGE="pytorch/manylinux2_28:cpu"
  else
-    export DOCKER_IMAGE="pytorch/manylinux-builder:${DESIRED_CUDA:2}"
+    export DOCKER_IMAGE="pytorch/manylinux2_28-builder:${DESIRED_CUDA:2}"
  fi
 fi

@ -74,6 +74,12 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)

 # Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
 TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
+
+# CUDA 12.8 builds have triton for Linux and Linux aarch64 binaries.
+if [[ "$DESIRED_CUDA" == cu128 ]]; then
+  TRITON_CONSTRAINT="platform_system == 'Linux'"
+fi
+
 if [[ "$PACKAGE_TYPE" =~ .*wheel.* &&  -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
  TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
  if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
@ -98,11 +104,11 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
 fi

 # Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package
-if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* && $(uname) == "Linux" ]]; then
-    TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
+if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
+    TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}"
    if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
        TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt)
-        TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
+        TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+git${TRITON_SHORTHASH}"
    fi
    if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
        export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
--- a/.circleci/scripts/binary_windows_arm64_build.sh
+++ b/.circleci/scripts/binary_windows_arm64_build.sh
@ -0,0 +1,22 @@
+#!/bin/bash
+set -eux -o pipefail
+
+source "${BINARY_ENV_FILE:-/c/w/env}"
+mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
+
+export USE_SCCACHE=1
+export SCCACHE_IGNORE_SERVER_IO_ERROR=1
+
+echo "Free space on filesystem before build:"
+df -h
+
+export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
+
+if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
+    pytorch/.ci/pytorch/windows/arm64/build_libtorch.bat
+elif [[ "$PACKAGE_TYPE" == 'wheel' ]]; then
+    pytorch/.ci/pytorch/windows/arm64/build_pytorch.bat
+fi
+
+echo "Free space on filesystem after build:"
+df -h
--- a/.circleci/scripts/binary_windows_arm64_test.sh
+++ b/.circleci/scripts/binary_windows_arm64_test.sh
@ -0,0 +1,6 @@
+#!/bin/bash
+set -eux -o pipefail
+
+source "${BINARY_ENV_FILE:-/c/w/env}"
+
+pytorch/.ci/pytorch/windows/arm64/smoke_test.bat
--- a/.circleci/scripts/binary_windows_build.sh
+++ b/.circleci/scripts/binary_windows_build.sh
@ -13,6 +13,7 @@ export VC_YEAR=2022
 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
    export USE_SCCACHE=0
    export XPU_VERSION=2025.0
+    export XPU_ENABLE_KINETO=1
 fi

 echo "Free space on filesystem before build:"
--- a/.clang-tidy
+++ b/.clang-tidy
@ -12,6 +12,7 @@ bugprone-*,
 -bugprone-macro-parentheses,
 -bugprone-lambda-function-name,
 -bugprone-reserved-identifier,
+-bugprone-return-const-ref-from-parameter,
 -bugprone-swapped-arguments,
 clang-analyzer-core.*,
 clang-analyzer-cplusplus.*,
@ -24,6 +25,7 @@ cppcoreguidelines-*,
 -cppcoreguidelines-avoid-non-const-global-variables,
 -cppcoreguidelines-interfaces-global-init,
 -cppcoreguidelines-macro-usage,
+-cppcoreguidelines-macro-to-enum,
 -cppcoreguidelines-owning-memory,
 -cppcoreguidelines-pro-bounds-array-to-pointer-decay,
 -cppcoreguidelines-pro-bounds-constant-array-index,
@ -46,6 +48,7 @@ misc-*,
 -misc-no-recursion,
 -misc-non-private-member-variables-in-classes,
 -misc-unused-using-decls,
+-misc-use-internal-linkage,
 modernize-*,
 -modernize-macro-to-enum,
 -modernize-return-braced-init-list,
@ -55,6 +58,7 @@ modernize-*,
 -modernize-use-trailing-return-type,
 -modernize-use-nodiscard,
 performance-*,
+-performance-enum-size,
 readability-container-size-empty,
 readability-delete-null-pointer,
 readability-duplicate-include
--- a/.flake8
+++ b/.flake8
@ -38,6 +38,7 @@ per-file-ignores =
    torchgen/api/types/__init__.py: F401,F403
    torchgen/executorch/api/types/__init__.py: F401,F403
    test/dynamo/test_higher_order_ops.py: B950
+    test/dynamo/test_error_messages.py: B950
    torch/testing/_internal/dynamo_test_failures.py: B950
    # TOR901 is only for test, we want to ignore it for everything else.
    # It's not easy to configure this without affecting other per-file-ignores,
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -1,5 +1,7 @@
 self-hosted-runner:
  labels:
+    # GitHub hosted runner that actionlint doesn't recognize because actionlint version (1.6.21) is too old
+    - ubuntu-24.04
    # GitHub hosted x86 Linux runners
    - linux.20_04.4x
    - linux.20_04.16x
@ -10,7 +12,6 @@ self-hosted-runner:
    - linux.9xlarge.ephemeral
    - am2.linux.9xlarge.ephemeral
    - linux.12xlarge
-    - linux.12xlarge.ephemeral
    - linux.24xlarge
    - linux.24xlarge.ephemeral
    - linux.arm64.2xlarge
@ -42,6 +43,8 @@ self-hosted-runner:
    - windows.8xlarge.nvidia.gpu
    - windows.8xlarge.nvidia.gpu.nonephemeral
    - windows.g5.4xlarge.nvidia.gpu
+    # Windows ARM64 runners
+    - windows-11-arm64
    # Organization-wide AMD hosted runners
    - linux.rocm.gpu
    - linux.rocm.gpu.2
--- a/.github/actions/checkout-pytorch/action.yml
+++ b/.github/actions/checkout-pytorch/action.yml
@ -40,6 +40,11 @@ runs:
        fi
        mkdir "${GITHUB_WORKSPACE}"

+        # Use all available CPUs for fetching
+        cd "${GITHUB_WORKSPACE}"
+        git config --global fetch.parallel 0
+        git config --global submodule.fetchJobs 0
+
    - name: Checkout PyTorch
      uses: actions/checkout@v4
      with:
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-f084f34bbb743fada85f66b0ed8041387565e69c
+c670ad81fda266b6598aeeef434583eb98197ae8
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-b2b890e962f5fb6f481e5da2eb4a43bb990d0f1b
+r2.7
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -98,7 +98,7 @@
 - test/distributed/**
 - torch/testing/_internal/distributed/**

-"module: distributed_checkpoint":
+"release notes: distributed (checkpoint)":
 - torch/distributed/checkpoint/**
 - test/distributed/checkpoint/**

--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@ -334,6 +334,7 @@
  - XiaobingSuper
  - jgong5
  - mingfeima
+  - EikanWang
  mandatory_checks_name:
  - EasyCLA
  - Lint
@ -366,6 +367,7 @@
  - jgong5
  - vfdev-5
  - leslie-fang-intel
+  - EikanWang
  mandatory_checks_name:
  - EasyCLA
  - Lint
@ -379,6 +381,7 @@
  approved_by:
  - leslie-fang-intel
  - jgong5
+  - EikanWang
  mandatory_checks_name:
  - EasyCLA
  - Lint
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -7,6 +7,7 @@ ciflow_push_tags:
 - ciflow/inductor
 - ciflow/inductor-periodic
 - ciflow/inductor-rocm
+- ciflow/inductor-perf-test-nightly-rocm
 - ciflow/inductor-perf-compare
 - ciflow/inductor-micro-benchmark
 - ciflow/inductor-micro-benchmark-cpu-x86
@ -16,6 +17,7 @@ ciflow_push_tags:
 - ciflow/nightly
 - ciflow/periodic
 - ciflow/rocm
+- ciflow/rocm-mi300
 - ciflow/s390
 - ciflow/slow
 - ciflow/trunk
--- a/.github/requirements-gha-cache.txt
+++ b/.github/requirements-gha-cache.txt
@ -5,7 +5,7 @@
 #   functorch/docs/requirements.txt
 #   .ci/docker/requirements-ci.txt
 boto3==1.35.42
-jinja2==3.1.5
+jinja2==3.1.6
 lintrunner==0.10.7
 ninja==1.10.0.post1
 nvidia-ml-py==11.525.84
--- a/.github/scripts/build_triton_wheel.py
+++ b/.github/scripts/build_triton_wheel.py
@ -123,7 +123,7 @@ def main() -> None:
    parser = ArgumentParser("Build Triton binaries")
    parser.add_argument("--release", action="store_true")
    parser.add_argument(
-        "--device", type=str, default="cuda", choices=["cuda", "rocm", "xpu"]
+        "--device", type=str, default="cuda", choices=["cuda", "rocm", "xpu", "aarch64"]
    )
    parser.add_argument("--py-version", type=str)
    parser.add_argument("--commit-hash", type=str)
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@ -39,9 +39,9 @@ SUPPORTED_PERIODICAL_MODES: dict[str, Callable[[Optional[str]], bool]] = {
 }

 # The link to the published list of disabled jobs
-DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json"
+DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=n.FT07XR3dLMwOLBwmRNquyYSeGk8Het"
 # and unstable jobs
-UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json"
+UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=.Ox7WAXa21I1PVqadHyPfhMRPhl0aCnD"

 # Some constants used to handle disabled and unstable jobs
 JOB_NAME_SEP = "/"
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -16,16 +16,15 @@ from typing import Optional


 # NOTE: Also update the CUDA sources in tools/nightly.py when changing this list
-CUDA_ARCHES = ["11.8", "12.4", "12.6", "12.8"]
+CUDA_ARCHES = ["11.8", "12.6", "12.8"]
+CUDA_STABLE = "12.6"
 CUDA_ARCHES_FULL_VERSION = {
    "11.8": "11.8.0",
-    "12.4": "12.4.1",
    "12.6": "12.6.3",
    "12.8": "12.8.0",
 }
 CUDA_ARCHES_CUDNN_VERSION = {
    "11.8": "9",
-    "12.4": "9",
    "12.6": "9",
    "12.8": "9",
 }
@ -41,7 +40,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]

 CPU_S390X_ARCH = ["cpu-s390x"]

-CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64"]
+CUDA_AARCH64_ARCHES = ["12.8-aarch64"]


 PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
@ -58,21 +57,6 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
-    "12.4": (
-        "nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'"
-    ),
    "12.6": (
        "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@ -84,7 +68,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -100,19 +84,23 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
    "xpu": (
-        "intel-cmplr-lib-rt==2025.0.2 | "
-        "intel-cmplr-lib-ur==2025.0.2 | "
-        "intel-cmplr-lic-rt==2025.0.2 | "
-        "intel-sycl-rt==2025.0.2 | "
+        "intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | "
+        "intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | "
+        "intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | "
+        "intel-sycl-rt==2025.0.4; platform_system == 'Linux' | "
+        "intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | "
+        "intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | "
+        "intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | "
+        "intel-sycl-rt==2025.0.5; platform_system == 'Windows' | "
        "tcmlib==1.2.0 | "
        "umf==0.9.1 | "
-        "intel-pti==0.10.0"
+        "intel-pti==0.10.1"
    ),
 }

@ -246,14 +234,8 @@ def generate_libtorch_matrix(
        if os == "linux":
            arches += CUDA_ARCHES
            arches += ROCM_ARCHES
-            # skip CUDA 12.8 builds for libtorch
-            if "12.8" in arches:
-                arches.remove("12.8")
        elif os == "windows":
            arches += CUDA_ARCHES
-            # skip CUDA 12.8 builds on Windows
-            if "12.8" in arches:
-                arches.remove("12.8")
    if libtorch_variants is None:
        libtorch_variants = [
            "shared-with-deps",
@ -281,11 +263,15 @@ def generate_libtorch_matrix(
                        gpu_arch_type, gpu_arch_version
                    ),
                    "libtorch_variant": libtorch_variant,
-                    "libtorch_config": abi_version if os == "windows" else "",
-                    "devtoolset": abi_version if os != "windows" else "",
+                    "libtorch_config": abi_version
+                    if os in ("windows", "windows-arm64")
+                    else "",
+                    "devtoolset": abi_version
+                    if os not in ("windows", "windows-arm64")
+                    else "",
                    "container_image": (
                        LIBTORCH_CONTAINER_IMAGES[(arch_version, abi_version)]
-                        if os != "windows"
+                        if os not in ("windows", "windows-arm64")
                        else ""
                    ),
                    "package_type": "libtorch",
@ -318,9 +304,6 @@ def generate_wheels_matrix(
            arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
        elif os == "windows":
            arches += CUDA_ARCHES + XPU_ARCHES
-            # skip CUDA 12.8 builds on Windows until available
-            if "12.8" in arches:
-                arches.remove("12.8")
        elif os == "linux-aarch64":
            # Separate new if as the CPU type is different and
            # uses different build/test scripts
@ -349,7 +332,7 @@ def generate_wheels_matrix(
                continue

            if use_split_build and (
-                arch_version not in ["12.6", "12.4", "11.8", "cpu"] or os != "linux"
+                arch_version not in ["12.6", "12.8", "11.8", "cpu"] or os != "linux"
            ):
                raise RuntimeError(
                    "Split build is only supported on linux with cuda 12*, 11.8, and cpu.\n"
@ -360,26 +343,27 @@ def generate_wheels_matrix(
            # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install

            if (
-                arch_version in ["12.8", "12.6", "12.4", "11.8"]
+                arch_version in ["12.8", "12.6", "11.8"]
                and os == "linux"
                or arch_version in CUDA_AARCH64_ARCHES
            ):
+                desired_cuda = translate_desired_cuda(gpu_arch_type, gpu_arch_version)
                ret.append(
                    {
                        "python_version": python_version,
                        "gpu_arch_type": gpu_arch_type,
                        "gpu_arch_version": gpu_arch_version,
-                        "desired_cuda": translate_desired_cuda(
-                            gpu_arch_type, gpu_arch_version
-                        ),
+                        "desired_cuda": desired_cuda,
                        "use_split_build": "True" if use_split_build else "False",
                        "devtoolset": "cxx11-abi",
                        "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                        "package_type": package_type,
                        "pytorch_extra_install_requirements": (
-                            PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]
-                            if os != "linux-aarch64"
-                            else ""
+                            PYTORCH_EXTRA_INSTALL_REQUIREMENTS[
+                                f"{desired_cuda[2:4]}.{desired_cuda[4:]}"  # for cuda-aarch64: cu126 -> 12.6
+                            ]
+                            if os == "linux-aarch64"
+                            else PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]
                        ),
                        "build_name": (
                            f"{package_type}-py{python_version}-{gpu_arch_type}"
@ -389,8 +373,8 @@ def generate_wheels_matrix(
                        ),  # include special case for aarch64 build, remove the -aarch64 postfix
                    }
                )
-                # Special build building to use on Colab. Python 3.11 for 12.4 CUDA
-                if python_version == "3.11" and arch_version == "12.4":
+                # Special build building to use on Colab. Python 3.11 for 12.6 CUDA
+                if python_version == "3.11" and arch_version == CUDA_STABLE:
                    ret.append(
                        {
                            "python_version": python_version,
@ -433,7 +417,7 @@ def generate_wheels_matrix(
                        "pytorch_extra_install_requirements": (
                            PYTORCH_EXTRA_INSTALL_REQUIREMENTS["xpu"]
                            if gpu_arch_type == "xpu"
-                            else PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.4"]
+                            else PYTORCH_EXTRA_INSTALL_REQUIREMENTS[CUDA_STABLE]
                            if os != "linux"
                            else ""
                        ),
@ -445,5 +429,4 @@ def generate_wheels_matrix(

 validate_nccl_dep_consistency("12.8")
 validate_nccl_dep_consistency("12.6")
-validate_nccl_dep_consistency("12.4")
 validate_nccl_dep_consistency("11.8")
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -96,6 +96,7 @@ class BinaryBuildWorkflow:
 class OperatingSystem:
    LINUX = "linux"
    WINDOWS = "windows"
+    WINDOWS_ARM64 = "windows-arm64"
    MACOS = "macos"
    MACOS_ARM64 = "macos-arm64"
    LINUX_AARCH64 = "linux-aarch64"
@ -151,7 +152,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
        package_type="manywheel",
        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
            OperatingSystem.LINUX,
-            arches=["11.8", "12.4", "12.6", "12.8"],
+            arches=["11.8", "12.6", "12.8"],
            python_versions=["3.9"],
        ),
        branches="main",
@ -261,6 +262,52 @@ WINDOWS_BINARY_SMOKE_WORKFLOWS = [
    ),
 ]

+WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS = [
+    BinaryBuildWorkflow(
+        os=OperatingSystem.WINDOWS_ARM64,
+        package_type="wheel",
+        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+            OperatingSystem.WINDOWS_ARM64,
+            arches=["cpu"],
+            python_versions=["3.12"],
+        ),
+        ciflow_config=CIFlowConfig(
+            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
+            isolated_workflow=True,
+        ),
+    ),
+    BinaryBuildWorkflow(
+        os=OperatingSystem.WINDOWS_ARM64,
+        package_type="libtorch",
+        abi_version=generate_binary_build_matrix.RELEASE,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.WINDOWS_ARM64,
+            generate_binary_build_matrix.RELEASE,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        ciflow_config=CIFlowConfig(
+            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
+            isolated_workflow=True,
+        ),
+    ),
+    BinaryBuildWorkflow(
+        os=OperatingSystem.WINDOWS_ARM64,
+        package_type="libtorch",
+        abi_version=generate_binary_build_matrix.DEBUG,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.WINDOWS_ARM64,
+            generate_binary_build_matrix.DEBUG,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        ciflow_config=CIFlowConfig(
+            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
+            isolated_workflow=True,
+        ),
+    ),
+]
+
 MACOS_BINARY_BUILD_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.MACOS_ARM64,
@ -355,6 +402,10 @@ def main() -> None:
            jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
            WINDOWS_BINARY_SMOKE_WORKFLOWS,
        ),
+        (
+            jinja_env.get_template("windows_arm64_binary_build_workflow.yml.j2"),
+            WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS,
+        ),
        (
            jinja_env.get_template("macos_binary_build_workflow.yml.j2"),
            MACOS_BINARY_BUILD_WORKFLOWS,
--- a/.github/scripts/get_ci_variable.py
+++ b/.github/scripts/get_ci_variable.py
@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+"""Helper script - Return CI variables such as stable cuda, min python version, etc."""
+
+import argparse
+import sys
+
+
+def main(args: list[str]) -> None:
+    import generate_binary_build_matrix
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--cuda-stable-version",
+        action="store_true",
+        help="get cuda stable version",
+    )
+    parser.add_argument(
+        "--min-python-version",
+        action="store_true",
+        help="get min supported python version",
+    )
+    options = parser.parse_args(args)
+    if options.cuda_stable_version:
+        return print(generate_binary_build_matrix.CUDA_STABLE)
+    if options.min_python_version:
+        return print(generate_binary_build_matrix.FULL_PYTHON_VERSIONS[0])
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/.github/scripts/github_utils.py
+++ b/.github/scripts/github_utils.py
@ -57,10 +57,10 @@ def gh_fetch_url_and_headers(
            print(
                f"""{url}
                Rate limit exceeded:
-                Used: {err.headers['X-RateLimit-Used']}
-                Limit: {err.headers['X-RateLimit-Limit']}
-                Remaining: {err.headers['X-RateLimit-Remaining']}
-                Resets at: {err.headers['x-RateLimit-Reset']}"""
+                Used: {err.headers["X-RateLimit-Used"]}
+                Limit: {err.headers["X-RateLimit-Limit"]}
+                Remaining: {err.headers["X-RateLimit-Remaining"]}
+                Resets at: {err.headers["x-RateLimit-Reset"]}"""
            )
        else:
            print(f"Error fetching {url} {err}")
--- a/.github/scripts/label_utils.py
+++ b/.github/scripts/label_utils.py
@ -63,9 +63,9 @@ def gh_get_labels(org: str, repo: str) -> list[str]:
    update_labels(labels, info)

    last_page = get_last_page_num_from_header(header)
-    assert (
-        last_page > 0
-    ), "Error reading header info to determine total number of pages of labels"
+    assert last_page > 0, (
+        "Error reading header info to determine total number of pages of labels"
+    )
    for page_number in range(2, last_page + 1):  # skip page 1
        _, info = request_for_labels(prefix + f"&page={page_number}")
        update_labels(labels, info)
--- a/.github/scripts/pytest_caching_utils.py
+++ b/.github/scripts/pytest_caching_utils.py
@ -33,7 +33,7 @@ class PRIdentifier(str):
    __slots__ = ()

    def __new__(cls, value: str) -> "PRIdentifier":
-        md5 = hashlib.md5(value.encode("utf-8")).hexdigest()
+        md5 = hashlib.md5(value.encode("utf-8"), usedforsecurity=False).hexdigest()
        return super().__new__(cls, md5)


--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -485,7 +485,7 @@ def get_check_run_name_prefix(workflow_run: Any) -> str:
    if workflow_run is None:
        return ""
    else:
-        return f'{workflow_run["workflow"]["name"]} / '
+        return f"{workflow_run['workflow']['name']} / "


 def is_passing_status(status: Optional[str]) -> bool:
@ -545,7 +545,7 @@ def add_workflow_conclusions(
                    if not isinstance(checkrun_node, dict):
                        warn(f"Expected dictionary, but got {type(checkrun_node)}")
                        continue
-                    checkrun_name = f'{get_check_run_name_prefix(workflow_run)}{checkrun_node["name"]}'
+                    checkrun_name = f"{get_check_run_name_prefix(workflow_run)}{checkrun_node['name']}"
                    existing_checkrun = workflow_obj.jobs.get(checkrun_name)
                    if existing_checkrun is None or not is_passing_status(
                        existing_checkrun.status
@ -1224,9 +1224,17 @@ class GitHubPR:
        if not self.is_ghstack_pr():
            msg = self.gen_commit_message()
            pr_branch_name = f"__pull-request-{self.pr_num}__init__"
-            repo.fetch(f"pull/{self.pr_num}/head", pr_branch_name)
+            repo.fetch(self.last_commit()["oid"], pr_branch_name)
            repo._run_git("merge", "--squash", pr_branch_name)
            repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
+
+            # Did the PR change since we started the merge?
+            pulled_sha = repo.show_ref(pr_branch_name)
+            latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
+            if pulled_sha != latest_pr_status.last_commit()["oid"]:
+                raise RuntimeError(
+                    "PR has been updated since CI checks last passed. Please rerun the merge command."
+                )
            return []
        else:
            return self.merge_ghstack_into(
@ -1507,6 +1515,36 @@ def checks_to_markdown_bullets(
    ]


+def post_starting_merge_comment(
+    repo: GitRepo,
+    pr: GitHubPR,
+    explainer: TryMergeExplainer,
+    dry_run: bool,
+    ignore_current_checks_info: Optional[
+        list[tuple[str, Optional[str], Optional[int]]]
+    ] = None,
+) -> None:
+    """Post the initial merge starting message on the PR. Also post a short
+    message on all PRs in the stack."""
+    gh_post_pr_comment(
+        pr.org,
+        pr.project,
+        pr.pr_num,
+        explainer.get_merge_message(ignore_current_checks_info),
+        dry_run=dry_run,
+    )
+    if pr.is_ghstack_pr():
+        for additional_prs, _ in get_ghstack_prs(repo, pr):
+            if additional_prs.pr_num != pr.pr_num:
+                gh_post_pr_comment(
+                    additional_prs.org,
+                    additional_prs.project,
+                    additional_prs.pr_num,
+                    f"Starting merge as part of PR stack under #{pr.pr_num}",
+                    dry_run=dry_run,
+                )
+
+
 def manually_close_merged_pr(
    pr: GitHubPR,
    additional_merged_prs: list[GitHubPR],
@ -2130,13 +2168,7 @@ def merge(
    check_for_sev(pr.org, pr.project, skip_mandatory_checks)

    if skip_mandatory_checks:
-        gh_post_pr_comment(
-            pr.org,
-            pr.project,
-            pr.pr_num,
-            explainer.get_merge_message(),
-            dry_run=dry_run,
-        )
+        post_starting_merge_comment(repo, pr, explainer, dry_run)
        return pr.merge_into(
            repo,
            dry_run=dry_run,
@ -2159,12 +2191,12 @@ def merge(
        )
        ignore_current_checks_info = failing

-    gh_post_pr_comment(
-        pr.org,
-        pr.project,
-        pr.pr_num,
-        explainer.get_merge_message(ignore_current_checks_info),
-        dry_run=dry_run,
+    post_starting_merge_comment(
+        repo,
+        pr,
+        explainer,
+        dry_run,
+        ignore_current_checks_info=ignore_current_checks_info,
    )

    start_time = time.time()
--- a/.github/scripts/trymerge_explainer.py
+++ b/.github/scripts/trymerge_explainer.py
@ -79,7 +79,7 @@ class TryMergeExplainer:
            (
                "<details><summary>Advanced Debugging</summary>",
                "Check the merge workflow status ",
-                f"<a href=\"{os.getenv('GH_RUN_URL')}\">here</a>",
+                f'<a href="{os.getenv("GH_RUN_URL")}">here</a>',
                "</details>",
            )
        )
--- a/.github/scripts/windows/build_triton.bat
+++ b/.github/scripts/windows/build_triton.bat
@ -0,0 +1,18 @@
+@echo on
+
+set PYTHON_PREFIX=%PY_VERS:.=%
+set PYTHON_PREFIX=py%PYTHON_PREFIX:;=;py%
+call .ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
+:: Create a new conda environment
+if "%PY_VERS%" == "3.13t" (
+    call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python-freethreading python=3.13
+) else (
+    call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS%
+)
+:: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480
+call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja
+
+dir "%VC_INSTALL_PATH%"
+
+call "%VC_INSTALL_PATH%\VC\Auxiliary\Build\vcvarsall.bat" x64
+call conda run -n %PYTHON_PREFIX% python .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE%
--- a/.github/scripts/windows/install_vs2022.ps1
+++ b/.github/scripts/windows/install_vs2022.ps1
@ -0,0 +1,35 @@
+#Requires -RunAsAdministrator
+
+# Enable long paths on Windows
+Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
+
+$VC_VERSION_major = [int] ${env:VC_VERSION}.split(".")[0]
+$VC_DOWNLOAD_LINK = "https://aka.ms/vs/$VC_VERSION_major/release/vs_BuildTools.exe"
+$VC_INSTALL_ARGS = @("--nocache","--quiet","--norestart","--wait", "--add Microsoft.VisualStudio.Workload.VCTools",
+                                                    "--add Microsoft.Component.MSBuild",
+                                                    "--add Microsoft.VisualStudio.Component.Roslyn.Compiler",
+                                                    "--add Microsoft.VisualStudio.Component.TextTemplating",
+                                                    "--add Microsoft.VisualStudio.Component.VC.CoreBuildTools",
+                                                    "--add Microsoft.VisualStudio.Component.VC.CoreIde",
+                                                    "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest",
+                                                    "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core",
+                                                    "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64",
+                                                    "--add Microsoft.VisualStudio.Component.Windows11SDK.22621")
+
+
+echo "Downloading Visual Studio installer from $VC_DOWNLOAD_LINK."
+curl.exe --retry 3 -kL $VC_DOWNLOAD_LINK --output vs_installer.exe
+if ($LASTEXITCODE -ne 0) {
+    echo "Download of the VS ${env:VC_YEAR} Version ${env:VC_VERSION} installer failed"
+    exit 1
+}
+$InstallationPath = ${env:VC_INSTALL_PATH}
+$VC_INSTALL_ARGS = "--installPath `"$InstallationPath`"" + " " + $VC_INSTALL_ARGS
+echo "Installing Visual Studio version ${env:VC_VERSION} in $InstallationPath."
+$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VC_INSTALL_ARGS -NoNewWindow -Wait -PassThru
+Remove-Item -Path vs_installer.exe -Force
+$exitCode = $process.ExitCode
+if (($exitCode -ne 0) -and ($exitCode -ne 3010)) {
+    echo "VS ${env:VC_YEAR} installer exited with code $exitCode, which should be one of [0, 3010]."
+    exit 1
+}
--- a/.github/templates/common.yml.j2
+++ b/.github/templates/common.yml.j2
@ -4,6 +4,7 @@
 {%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%}

 {%- set timeout_minutes = 240 -%}
+{%- set timeout_minutes_windows_binary = 300 -%}

 {%- macro concurrency(build_environment) -%}
 concurrency:
@ -31,7 +32,7 @@ concurrency:
 {%- macro setup_ec2_windows() -%}
      !{{ display_ec2_information() }}
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@ -53,7 +53,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -111,7 +111,10 @@ jobs:
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      {%- elif config["gpu_arch_type"] == "rocm" %}
      runs_on: linux.rocm.gpu
-      {%- elif config["gpu_arch_type"] == "cuda" %}
+      {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] == "12.8" %}
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu  # 12.8 build needs sm_70+ runner
+      {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] != "12.8"%}
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.4xlarge.nvidia.gpu
      {%- else %}
@ -144,9 +147,9 @@ jobs:
        with:
          name: !{{ config["build_name"] }}
          path: "${{ runner.temp }}/artifacts/"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
          docker-image: !{{ config["container_image"] }}
      - name: Test Pytorch binary
@ -165,12 +168,12 @@ jobs:
        with:
          name: !{{ config["build_name"] }}
          path: "${{ runner.temp }}/artifacts/"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      - name: ROCm set GPU_FLAG
        run: |
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
          docker-image: !{{ config["container_image"] }}
      - name: Test Pytorch binary
--- a/.github/templates/macos_binary_build_workflow.yml.j2
+++ b/.github/templates/macos_binary_build_workflow.yml.j2
@ -76,7 +76,7 @@ jobs:
          elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
            echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
          fi
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      - name: Populate binary env
        run: |
          # shellcheck disable=SC1091
--- a/.github/templates/windows_arm64_binary_build_workflow.yml.j2
+++ b/.github/templates/windows_arm64_binary_build_workflow.yml.j2
@ -0,0 +1,197 @@
+{% import 'common.yml.j2' as common %}
+{% import 'upload.yml.j2' as upload %}
+
+{%- block name -%}
+# Template is at:    .github/templates/windows_arm64_binary_build_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: !{{ build_environment }}
+{%- endblock %}
+
+{%- macro set_runner_specific_vars() -%}
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: cmd
+        run: |
+          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
+          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
+          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
+{%- endmacro %}
+
+on:
+  push:
+    branches:
+      - !{{ branches }}
+    {%- if branches == "nightly" %}
+    tags:
+      # NOTE: Binary build pipelines should only get triggered on release candidate builds
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+    {%- endif %}
+{%- for label in ciflow_config.labels | sort %}
+    {%- if loop.first and branches != "nightly" %}
+    tags:
+    {%- endif %}
+      - '!{{ label }}/*'
+{%- endfor %}
+  workflow_dispatch:
+
+env:
+  BUILD_ENVIRONMENT: !{{ build_environment }}
+  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  PR_NUMBER: ${{ github.event.pull_request.number }}
+  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+  SKIP_ALL_TESTS: 1
+  PYTORCH_ROOT: /pytorch
+  DOWNLOADS_DIR: c:\temp\downloads
+  DEPENDENCIES_DIR: c:\temp\dependencies
+  ENABLE_APL: 1
+  ENABLE_OPENBLAS: 0
+  MSVC_VERSION : 14.42
+  AWS_DEFAULT_REGION: us-east-1
+
+jobs:
+  get-label-type:
+    if: github.repository_owner == 'pytorch'
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
+{%- for config in build_configs %}
+  !{{ config["build_name"] }}-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: get-label-type
+    runs-on: "windows-11-arm64"
+    timeout-minutes: !{{ common.timeout_minutes }}
+    !{{ upload.binary_env(config, True) }}
+    {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0  %}
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
+    {%- endif %}
+    steps:
+      !{{ set_runner_specific_vars() }}
+      - name: Bootstrap folders
+        shell: cmd
+        run: |
+          mkdir "%NIGHTLIES_PYTORCH_ROOT%"
+          mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
+      - name: Git checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          path: "pytorch"
+      - name: Bootstrap Build Tools
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
+      - name: Bootstrap Git
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
+      - name: Remove Pytorch folder
+        shell: cmd
+        run: |
+          rmdir /s /q "pytorch"
+      - name: Git checkout PyTorch - recursive
+        uses: actions/checkout@v4
+        with:
+          path: "pytorch"
+          submodules: recursive
+      - name: Bootstrap Python
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
+      - name: Bootstrap APL
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
+      - name: Bootstrap Rust
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
+      - name: Bootstrap sccache
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
+      - name: Bootstrap Libuv
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
+      - name: Populate binary env
+        shell: bash
+        run: |
+          "pytorch/.circleci/scripts/binary_populate_env.sh"
+      - name: Build PyTorch binary
+        shell: bash
+        run: |
+          "pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
+      - uses: !{{ common.upload_artifact_action }}
+        if: always()
+        with:
+          name: !{{ config["build_name"] }}
+          retention-days: 14
+          if-no-files-found: error
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+  !{{ config["build_name"] }}-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - !{{ config["build_name"] }}-build
+      - get-label-type
+    runs-on: "windows-11-arm64"
+    timeout-minutes: !{{ common.timeout_minutes }}
+    !{{ upload.binary_env(config, True) }}
+    steps:
+      !{{ set_runner_specific_vars() }}
+      - uses: !{{ common.download_artifact_action }}
+        name: Download Build Artifacts
+        with:
+          name: !{{ config["build_name"] }}
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+      - name: Git checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          path: "pytorch"
+      - name: Bootstrap Git
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
+      - name: Remove Pytorch folder
+        shell: cmd
+        run: |
+          rmdir /s /q "pytorch"
+      - name: Git checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          path: "pytorch"
+          submodules: recursive
+      - name: Bootstrap APL
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
+      - name: Bootstrap Python
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
+      - name: Bootstrap Build Tools
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
+      - name: Bootstrap Rust
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
+      - name: Populate binary env
+        shell: bash
+        run: |
+          "pytorch/.circleci/scripts/binary_populate_env.sh"
+      - name: Test PyTorch binary
+        shell: bash
+        run: |
+          "pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
+  {%- if branches == "nightly" %}
+  !{{ upload.upload_binaries(config, True) }}
+  {%- endif %}
+{%- endfor %}
--- a/.github/templates/windows_binary_build_workflow.yml.j2
+++ b/.github/templates/windows_binary_build_workflow.yml.j2
@ -55,7 +55,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -71,7 +71,7 @@ jobs:
    {%- else %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
    {%- endif %}
-    timeout-minutes: !{{ common.timeout_minutes }}
+    timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
    !{{ upload.binary_env(config, True) }}
    {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0  %}
      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
@ -79,7 +79,7 @@ jobs:
    steps:
      !{{ common.setup_ec2_windows() }}
      !{{ set_runner_specific_vars() }}
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      - name: Populate binary env
        shell: bash
        run: |
@ -107,10 +107,14 @@ jobs:
 {%- else %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge.nonephemeral"
 {%- endif %}
+{%- else %}
+{%- if branches == "nightly" %}
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
 {%- else %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
 {%- endif %}
-    timeout-minutes: !{{ common.timeout_minutes }}
+{%- endif %}
+    timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
    !{{ upload.binary_env(config, True) }}
    steps:
      !{{ common.setup_ec2_windows() }}
@ -120,7 +124,7 @@ jobs:
        with:
          name: !{{ config["build_name"] }}
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      - name: Populate binary env
        shell: bash
        run: |
--- a/.github/workflows/_bazel-build-test.yml
+++ b/.github/workflows/_bazel-build-test.yml
@ -47,7 +47,7 @@ jobs:
      reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }}
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          fetch-depth: 1
          submodules: false
@ -69,25 +69,25 @@ jobs:
    runs-on: ${{ matrix.runner }}
    steps:
      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
          docker-image-name: ${{ inputs.docker-image-name }}

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

@ -97,7 +97,7 @@ jobs:
        run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7
        if: ${{ inputs.cuda-version != 'cpu' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

      - name: Output disk space left
@ -209,5 +209,5 @@ jobs:
          file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }}

      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7
        if: always()
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@ -18,7 +18,7 @@ on:
        description: prefix for runner label
      runs_on:
        required: false
-        default: linux.12xlarge.ephemeral
+        default: linux.12xlarge.memory.ephemeral
        type: string
        description: Hardware to run this "build" job on, linux.12xlarge or linux.arm64.2xlarge.
      timeout-minutes:
@ -150,13 +150,13 @@ jobs:

      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        if: inputs.build_environment != 'linux-s390x-binary-manywheel'
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
        continue-on-error: true
        with:
          github-secret: ${{ secrets.github-token }}

      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }}

@ -186,7 +186,6 @@ jobs:
      - name: Checkout PyTorch to pytorch dir
        uses: actions/checkout@v4
        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          path: pytorch
          show-progress: false
@ -211,7 +210,7 @@ jobs:

      - name: Pull Docker image
        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
          docker-image: ${{ inputs.DOCKER_IMAGE }}

@ -267,7 +266,7 @@ jobs:

      - name: Teardown Linux
        if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7

      - name: Chown workspace
        if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
--- a/.github/workflows/_binary-test-linux.yml
+++ b/.github/workflows/_binary-test-linux.yml
@ -133,14 +133,14 @@ jobs:

      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
        if: inputs.build_environment != 'linux-s390x-binary-manywheel'
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
        continue-on-error: true
        with:
          github-secret: ${{ secrets.github-token }}

        # Setup the environment
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          no-sudo: ${{ inputs.build_environment == 'linux-aarch64-binary-manywheel' || inputs.build_environment == 'linux-s390x-binary-manywheel' }}

@ -163,7 +163,6 @@ jobs:
      - name: Checkout PyTorch to pytorch dir
        uses: actions/checkout@v4
        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          show-progress: false
          path: pytorch
@ -194,12 +193,12 @@ jobs:
          path: "${{ runner.temp }}/artifacts/"

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7
        if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}

      - name: Pull Docker image
        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
          docker-image: ${{ inputs.DOCKER_IMAGE }}

@ -209,7 +208,7 @@ jobs:

      - name: Teardown Linux
        if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7

      - name: Chown workspace
        if: always() && inputs.build_environment != 'linux-s390x-binary-manywheel'
--- a/.github/workflows/_binary-upload.yml
+++ b/.github/workflows/_binary-upload.yml
@ -66,7 +66,6 @@ on:
 jobs:
  upload:
    runs-on: ubuntu-22.04
-    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
    container:
      image: continuumio/miniconda3:4.12.0
    env:
@ -91,7 +90,7 @@ jobs:
      USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          no-sudo: true

--- a/.github/workflows/_docs.yml
+++ b/.github/workflows/_docs.yml
@ -84,7 +84,7 @@ jobs:
    name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}
    steps:
      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
@ -95,7 +95,7 @@ jobs:

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7

      - name: Setup Linux
        uses: ./.github/actions/setup-linux
@ -110,12 +110,12 @@ jobs:

      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
          docker-image-name: ${{ inputs.docker-image }}

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

@ -222,5 +222,5 @@ jobs:
          s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }}/functorchdocs

      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7
        if: always()
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@ -69,13 +69,11 @@ on:
        required: false
        type: string
        default: ""
-      use_split_build:
+      max-jobs:
        description: |
-          [Experimental] Build a libtorch only wheel and build pytorch such that
-          are built from the libtorch wheel.
+          Overwrite the number of jobs to use for the build
        required: false
-        type: boolean
-        default: false
+        type: string

    secrets:
      HUGGING_FACE_HUB_TOKEN:
@ -108,7 +106,7 @@ jobs:
      test-matrix: ${{ steps.filter.outputs.test-matrix }}
    steps:
      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
@ -118,7 +116,7 @@ jobs:
      # checkout because when we run this action we don't *have* a local
      # checkout. In other cases you should prefer a local checkout.
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          no-sudo: true

@ -136,7 +134,7 @@ jobs:

      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image-name: ${{ inputs.docker-image-name }}
@ -152,7 +150,7 @@ jobs:
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
@ -210,7 +208,7 @@ jobs:
          OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
-          USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
+          MAX_JOBS_OVERRIDE: ${{ inputs.max-jobs }}
        run: |
          START_TIME=$(date +%s)
          if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
@ -230,6 +228,12 @@ jobs:
            DOCKER_SHELL_CMD=
          fi

+          if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then
+            MAX_JOBS="$(nproc --ignore=2)"
+          else
+            MAX_JOBS="${MAX_JOBS_OVERRIDE}"
+          fi
+
          # Leaving 1GB for the runner and other things
          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
          # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
@ -241,7 +245,8 @@ jobs:
          # shellcheck disable=SC2086
          container_name=$(docker run \
            -e BUILD_ENVIRONMENT \
-            -e MAX_JOBS="$(nproc --ignore=2)" \
+            -e MAX_JOBS=${MAX_JOBS} \
+            -e MAX_JOBS_OVERRIDE \
            -e AWS_DEFAULT_REGION \
            -e PR_NUMBER \
            -e SHA1 \
@ -282,7 +287,7 @@ jobs:

      - name: Store PyTorch Build Artifacts on S3
        uses: seemethere/upload-artifact-s3@v5
-        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
+        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}
          retention-days: 14
@ -290,34 +295,15 @@ jobs:
          path: artifacts.zip
          s3-bucket: ${{ inputs.s3-bucket }}

-      - name: Store PyTorch Build Artifacts on S3 for split build
-        uses: seemethere/upload-artifact-s3@v5
-        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
-        with:
-          name: ${{ inputs.build-environment }}-experimental-split-build
-          retention-days: 14
-          if-no-files-found: error
-          path: artifacts.zip
-          s3-bucket: ${{ inputs.s3-bucket }}
-
      - name: Store PyTorch Build Artifacts for s390x
        uses: actions/upload-artifact@v4
-        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
+        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment == 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}
          retention-days: 14
          if-no-files-found: error
          path: artifacts.zip

-      - name: Store PyTorch Build Artifacts for s390x for split build
-        uses: actions/upload-artifact@v4
-        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
-        with:
-          name: ${{ inputs.build-environment }}-experimental-split-build
-          retention-days: 14
-          if-no-files-found: error
-          path: artifacts.zip
-
      - name: Upload sccache stats
        if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
        uses: ./.github/actions/upload-sccache-stats
@ -326,7 +312,7 @@ jobs:
          build-time: ${{ steps.build.outputs.build_time }}

      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7
        if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'

      - name: Cleanup docker
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@ -80,7 +80,7 @@ jobs:
    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
    steps:
      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
        if: ${{ !contains(matrix.runner, 'gcp.a100') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
@ -89,7 +89,7 @@ jobs:
              docker exec -it $(docker container ps --format '{{.ID}}') bash

      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          no-sudo: true

@ -107,7 +107,7 @@ jobs:

      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image-name: ${{ inputs.docker-image }}
@ -123,7 +123,7 @@ jobs:
          echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
@ -135,7 +135,7 @@ jobs:

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        id: install-nvidia-driver
-        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7
        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

      - name: Setup GPU_FLAG for docker run
@ -371,7 +371,7 @@ jobs:
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

      - name: Upload the benchmark results
-        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.7
        with:
          benchmark-results-dir: test/test-reports
          dry-run: false
@ -428,7 +428,7 @@ jobs:
          workflow_attempt: ${{github.run_attempt}}

      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7
        if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'

      # NB: We are currently having an intermittent GPU-related issue on G5 runners with
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@ -71,11 +71,11 @@ jobs:
      test-matrix: ${{ steps.filter.outputs.test-matrix }}
    steps:
      - name: Clean up disk space before running MacOS workflow
-        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+        uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7

      - name: Set xcode version
        env:
@ -87,7 +87,7 @@ jobs:

      - name: Setup miniconda
        if: inputs.environment-file == ''
-        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7
        with:
          python-version: ${{ inputs.python-version }}
          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
@ -97,7 +97,7 @@ jobs:
      # environment even though the arch is x86-64
      - name: Setup miniconda using the provided environment file
        if: inputs.environment-file != ''
-        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7
        with:
          python-version: ${{ inputs.python-version }}
          environment-file: ${{ inputs.environment-file }}
@ -207,4 +207,4 @@ jobs:
      - name: Clean up disk space
        if: always()
        continue-on-error: true
-        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+        uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7
--- a/.github/workflows/_mac-test-mps.yml
+++ b/.github/workflows/_mac-test-mps.yml
@ -41,7 +41,7 @@ jobs:
      reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }}
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false

@ -82,7 +82,7 @@ jobs:
          use-gha: true

      - name: Setup miniconda
-        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7
        with:
          python-version: ${{ inputs.python-version }}
          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
@ -170,4 +170,4 @@ jobs:
      - name: Clean up disk space
        if: always()
        continue-on-error: true
-        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+        uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@ -82,11 +82,11 @@ jobs:
          done

      - name: Clean up disk space before running MacOS workflow
-        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+        uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7

      - name: Start monitoring script
        id: monitor-script
@ -109,7 +109,7 @@ jobs:
          use-gha: true

      - name: Setup miniconda
-        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@release/2.7
        with:
          python-version: ${{ inputs.python-version }}
          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
@ -224,7 +224,7 @@ jobs:
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}

      - name: Upload the benchmark results
-        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.7
        with:
          benchmark-results-dir: test/test-reports
          dry-run: false
@ -234,4 +234,4 @@ jobs:
      - name: Clean up disk space
        if: always()
        continue-on-error: true
-        uses: pytorch/test-infra/.github/actions/check-disk-space@main
+        uses: pytorch/test-infra/.github/actions/check-disk-space@release/2.7
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@ -70,7 +70,7 @@ jobs:
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          no-sudo: true

@ -92,12 +92,12 @@ jobs:

      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
          docker-image-name: ${{ inputs.docker-image }}

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

@ -251,6 +251,11 @@ jobs:
          # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"

+      - name: Change permissions (only needed for MI300 runners for now)
+        if: ${{ always() && steps.test.conclusion && contains(matrix.runner, 'mi300') }}
+        run: |
+          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"
+
      - name: Print remaining test logs
        shell: bash
        if: always() && steps.test.conclusion
@ -297,7 +302,7 @@ jobs:
          aws-region: us-east-1

      - name: Upload the benchmark results
-        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@release/2.7
        with:
          benchmark-results-dir: test/test-reports
          dry-run: false
--- a/.github/workflows/_runner-determinator.yml
+++ b/.github/workflows/_runner-determinator.yml
@ -54,7 +54,7 @@ jobs:
      PR_NUMBER: ${{ github.event.pull_request.number }}
    steps:
      # - name: Checkout PyTorch
-      #   uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+      #   uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
      #   with:
      #     fetch-depth: 1
      #     submodules: true
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@ -84,10 +84,10 @@ jobs:
          git config --global core.fsmonitor false

      - name: Clean up leftover processes on non-ephemeral Windows runner
-        uses: pytorch/test-infra/.github/actions/cleanup-runner@main
+        uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.7

      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
@ -102,7 +102,7 @@ jobs:

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          no-sudo: true

--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@ -66,10 +66,10 @@ jobs:
          git config --global core.fsmonitor false

      - name: Clean up leftover processes on non-ephemeral Windows runner
-        uses: pytorch/test-infra/.github/actions/cleanup-runner@main
+        uses: pytorch/test-infra/.github/actions/cleanup-runner@release/2.7

      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
@ -85,7 +85,7 @@ jobs:

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          no-sudo: true

--- a/.github/workflows/_xpu-test.yml
+++ b/.github/workflows/_xpu-test.yml
@ -62,7 +62,7 @@ jobs:
    steps:
      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7

      - name: Setup XPU
        uses: ./.github/actions/setup-xpu
@ -80,12 +80,12 @@ jobs:

      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
          docker-image-name: ${{ inputs.docker-image }}

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

--- a/.github/workflows/build-almalinux-images.yml
+++ b/.github/workflows/build-almalinux-images.yml
@ -41,12 +41,12 @@ jobs:
      CUDA_VERSION: ${{ matrix.cuda_version }}
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
      - name: Calculate docker image
        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
            docker-image-name: almalinux-builder${{ matrix.cuda_version == 'cpu' && '-' || '-cuda' }}${{matrix.cuda_version}}
            docker-build-dir:  .ci/docker/almalinux
--- a/.github/workflows/build-libtorch-images.yml
+++ b/.github/workflows/build-libtorch-images.yml
@ -32,7 +32,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -51,12 +51,12 @@ jobs:
      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
      - name: Calculate docker image
        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
            docker-image-name: libtorch-cxx11-builder-cuda${{matrix.cuda_version}}
            docker-build-dir:  .ci/docker/libtorch
@ -93,12 +93,12 @@ jobs:
      GPU_ARCH_VERSION: ${{ matrix.rocm_version }}
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
      - name: Calculate docker image
        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
            docker-image-name: libtorch-cxx11-builder-rocm${{matrix.rocm_version}}
            docker-build-dir:  .ci/docker/libtorch
@ -129,12 +129,12 @@ jobs:
    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
      - name: Calculate docker image
        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
            docker-image-name: libtorch-cxx11-builder-cpu
            docker-build-dir:  .ci/docker/libtorch
--- a/.github/workflows/build-manywheel-images-s390x.yml
+++ b/.github/workflows/build-manywheel-images-s390x.yml
@ -41,7 +41,7 @@ jobs:
      GPU_ARCH_TYPE: cpu-s390x
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
          no-sudo: true
--- a/.github/workflows/build-manywheel-images.yml
+++ b/.github/workflows/build-manywheel-images.yml
@ -36,58 +36,13 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}

-  build-docker-cuda:
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
-    strategy:
-      matrix:
-        cuda_version: ["12.8", "12.6", "12.4", "11.8"]
-    env:
-      GPU_ARCH_TYPE: cuda
-      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
-    steps:
-      - name: Purge tools folder (free space for build)
-        run: rm -rf /opt/hostedtoolcache
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          submodules: false
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: manylinux-builder-cuda${{matrix.cuda_version}}
-            docker-build-dir:  .ci/docker/manywheel
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
-          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
-          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/manywheel/build.sh manylinux-builder:cuda${{matrix.cuda_version}}
-  # NOTE: manylinux_2_28 are still experimental, see https://github.com/pytorch/pytorch/issues/123649
  build-docker-cuda-manylinux_2_28:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
@ -102,12 +57,12 @@ jobs:
      - name: Purge tools folder (free space for build)
        run: rm -rf /opt/hostedtoolcache
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
      - name: Calculate docker image
        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
            docker-image-name: manylinux2_28-builder-cuda${{matrix.cuda_version}}
            docker-build-dir:  .ci/docker/manywheel
@ -138,7 +93,7 @@ jobs:
    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
    strategy:
      matrix:
-        cuda_version: ["12.8", "12.6"]
+        cuda_version: ["12.8"]
    env:
      GPU_ARCH_TYPE: cuda-aarch64
      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
@ -147,7 +102,7 @@ jobs:
        uses: actions/checkout@v3
      - name: Calculate docker image
        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
            docker-image-name: manylinuxaarch64-builder-cuda${{matrix.cuda_version}}
            docker-build-dir:  .ci/docker/manywheel
@ -184,12 +139,12 @@ jobs:
      GPU_ARCH_VERSION: ${{ matrix.rocm_version }}
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
      - name: Calculate docker image
        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
            docker-image-name: manylinux2_28-builder-rocm${{matrix.rocm_version}}
            docker-build-dir:  .ci/docker/manywheel
@ -214,42 +169,6 @@ jobs:
          retry_wait_seconds: 90
          command: |
            .ci/docker/manywheel/build.sh manylinux2_28-builder:rocm${{matrix.rocm_version}}
-  build-docker-cpu:
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          submodules: false
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: manylinux-builder-cpu
-            docker-build-dir:  .ci/docker/manywheel
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
-          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
-          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/manywheel/build.sh manylinux-builder:cpu
  build-docker-cpu-manylinux_2_28:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
@ -258,12 +177,12 @@ jobs:
      GPU_ARCH_TYPE: cpu-manylinux_2_28
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
      - name: Calculate docker image
        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
            docker-image-name: manylinux2_28-builder-cpu
            docker-build-dir:  .ci/docker/manywheel
@ -296,12 +215,12 @@ jobs:
      GPU_ARCH_TYPE: cpu-aarch64
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
      - name: Calculate docker image
        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
            docker-image-name: manylinuxaarch64-builder-cpu-aarch64
            docker-build-dir:  .ci/docker/manywheel
@ -334,12 +253,12 @@ jobs:
      GPU_ARCH_TYPE: cpu-aarch64-2_28
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
      - name: Calculate docker image
        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
            docker-image-name: manylinux2_28_aarch64-builder-cpu-aarch64
            docker-build-dir:  .ci/docker/manywheel
@ -375,12 +294,12 @@ jobs:
      GPU_ARCH_TYPE: cpu-cxx11-abi
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
      - name: Calculate docker image
        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
            docker-image-name: manylinuxcxx11-abi-builder-cpu-cxx11-abi
            docker-build-dir:  .ci/docker/manywheel
@ -413,12 +332,12 @@ jobs:
      GPU_ARCH_TYPE: xpu
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
      - name: Calculate docker image
        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
            docker-image-name: manylinux2_28-builder-xpu
            docker-build-dir:  .ci/docker/manywheel
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@ -3,7 +3,7 @@ name: Build Triton wheels
 on:
  push:
    branches:
-      - main
+      - release/2.7
    tags:
      # NOTE: Binary build pipelines should only get triggered on release candidate builds
      # Release candidate tags look like: v1.11.0-rc1
@ -12,6 +12,8 @@ on:
      - .github/workflows/build-triton-wheel.yml
      - .github/scripts/build_triton_wheel.py
      - .github/ci_commit_pins/triton.txt
+      - .github/scripts/windows/install_vs2022.ps1
+      - .github/scripts/windows/build_triton.bat
      - .ci/docker/ci_commit_pins/triton.txt
      - .ci/docker/ci_commit_pins/triton-xpu.txt
  pull_request:
@ -19,6 +21,8 @@ on:
      - .github/workflows/build-triton-wheel.yml
      - .github/scripts/build_triton_wheel.py
      - .github/ci_commit_pins/triton.txt
+      - .github/scripts/windows/install_vs2022.ps1
+      - .github/scripts/windows/build_triton.bat
      - .ci/docker/ci_commit_pins/triton.txt
      - .ci/docker/ci_commit_pins/triton-xpu.txt

@ -30,7 +34,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -40,37 +44,40 @@ jobs:
  build-wheel:
    name: "Build Triton Wheel"
    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
+    runs-on: ${{ matrix.runs_on }}
    strategy:
      fail-fast: false
      matrix:
        py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ]
-        device: ["cuda", "rocm", "xpu"]
-        docker-image: ["pytorch/manylinux-builder:cpu", "pytorch/manylinux2_28-builder:cpu"]
-        exclude:
-          - device: "rocm"
-            docker-image: "pytorch/manylinux-builder:cpu"
-          - device: "xpu"
-            docker-image: "pytorch/manylinux2_28-builder:cpu"
+        device: ["cuda", "rocm", "xpu", "aarch64"]
+        docker-image: ["pytorch/manylinux2_28-builder:cpu"]
        include:
          - device: "rocm"
            rocm_version: "6.3"
+            runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
          - device: "cuda"
            rocm_version: ""
+            runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
+          - device: "xpu"
+            rocm_version: ""
+            runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
+          - device: "aarch64"
+            rocm_version: ""
+            runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge"
    timeout-minutes: 40
    env:
-      DOCKER_IMAGE: ${{ matrix.device == 'rocm' && format('pytorch/manylinux2_28-builder:rocm{0}', matrix.rocm_version) || matrix.docker-image }}
+      DOCKER_IMAGE: ${{ matrix.device == 'rocm' && format('pytorch/manylinux2_28-builder:rocm{0}', matrix.rocm_version) || matrix.device == 'aarch64' && 'pytorch/manylinux2_28_aarch64-builder:cpu-aarch64' || matrix.docker-image }}
      PY_VERS: ${{ matrix.py_vers }}
      BUILD_DEVICE: ${{ matrix.device }}
-      PLATFORM: ${{ contains(matrix.docker-image, '2_28') && 'manylinux_2_28_x86_64' || 'manylinux2014_x86_64' }}
+      PLATFORM: 'manylinux_2_28_x86_64'
    steps:
      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}

      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false

@ -78,7 +85,7 @@ jobs:
        uses: ./.github/actions/setup-linux

      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
          docker-image: ${{ env.DOCKER_IMAGE }}

@ -130,19 +137,22 @@ jobs:
          fi

          docker exec -t "${container_name}" yum install -y zlib-devel zip
-          docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}"  -m pip install -U setuptools==67.4.0 pybind11==2.13.1 auditwheel
-          if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "rocm") && "${PLATFORM}" == "manylinux_2_28_x86_64" ]]; then
+          docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}"  -m pip install -U setuptools==67.4.0 pybind11==2.13.1 auditwheel wheel
+
+          if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "rocm" || "${{ matrix.device }}" == "aarch64" ) ]]; then
            # With this install, it gets clang 16.0.6.
            docker exec -t "${container_name}" dnf install clang lld -y
            WITH_CLANG_LDD="--with-clang-ldd"
          fi
+
          if [[ "${BUILD_DEVICE}" == xpu ]]; then
-            docker exec -t "${container_name}" yum install -y devtoolset-11-gcc-c++
-            docker exec -t "${container_name}" bash -c "source /opt/rh/devtoolset-11/enable && ${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE"
+            docker exec -t "${container_name}" bash -c "dnf install -y gcc-toolset-13-gcc-c++"
+            docker exec -t "${container_name}" bash -c "source /opt/rh/gcc-toolset-13/enable && ${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE"
          else
            docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"
          fi
-          if [[ "${{ matrix.device }}" == "cuda" ]]; then
+
+          if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "xpu") ]]; then
            docker exec -t "${container_name}"  bash -c "auditwheel repair --plat ${PLATFORM} //artifacts/*.whl"
          else
            docker exec -t "${container_name}"  bash -c "mkdir //artifacts/wheelhouse"
@ -157,18 +167,104 @@ jobs:
          path: ${{ runner.temp }}/artifacts/wheelhouse/*

      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7
        if: always()

+  build-wheel-win:
+    name: "Build Triton Windows Wheel"
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
+    strategy:
+      fail-fast: false
+      matrix:
+        py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ]
+        device: ["xpu"]
+    timeout-minutes: 40
+    env:
+      PY_VERS: ${{ matrix.py_vers }}
+      BUILD_DEVICE: ${{ matrix.device }}
+      VC_INSTALL_PATH: "C:\\MSVC-BuildTools-2022"
+    steps:
+      - name: Display EC2 information
+        shell: bash
+        run: |
+          set -euo pipefail
+          function get_ec2_metadata() {
+            # Pulled from instance metadata endpoint for EC2
+            # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
+            category=$1
+            curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
+          }
+          echo "ami-id: $(get_ec2_metadata ami-id)"
+          echo "instance-id: $(get_ec2_metadata instance-id)"
+          echo "instance-type: $(get_ec2_metadata instance-type)"
+          echo "system info $(uname -a)"
+      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
+        continue-on-error: true
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+      - name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
+        shell: bash
+        run: |
+          git config --global core.longpaths true
+          git config --global core.symlinks true
+          # https://git-scm.com/docs/git-fsmonitor--daemon.  The daemon could lock
+          # the directory on Windows and prevent GHA from checking out as reported
+          # in https://github.com/actions/checkout/issues/1018
+          git config --global core.fsmonitor false
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: false
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: Enable long paths on Windows and install VS2022 17.13.2
+        env:
+          VC_YEAR: 2022
+          VC_VERSION: 17.13.2
+        shell: bash
+        working-directory: pytorch
+        run: |
+          powershell .github/scripts/windows/install_vs2022.ps1
+      - name: Build Triton wheel
+        env:
+          IS_RELEASE_TAG: ${{ startsWith(github.event.ref, 'refs/tags/v') }}
+        working-directory: pytorch
+        shell: bash
+        run: |
+          set -x
+          export RELEASE=""
+          if [[ "${IS_RELEASE_TAG}" == true ]]; then
+            export RELEASE="--release"
+          fi
+          .github/scripts/windows/build_triton.bat
+          mkdir -p "${RUNNER_TEMP}/artifacts/"
+          mv ./*.whl "${RUNNER_TEMP}/artifacts/"
+      - uses: actions/upload-artifact@v4.4.0
+        with:
+          name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }}
+          if-no-files-found: error
+          path: ${{ runner.temp }}/artifacts/*
+
+
  upload-wheel:
    runs-on: ubuntu-22.04
-    needs: build-wheel
+    needs:
+      - build-wheel
+      - build-wheel-win
    permissions:
      id-token: write
      contents: read
    container:
      image: continuumio/miniconda3:4.12.0
-    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
+    environment: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') && 'nightly-wheel-upload' || '' }}
    steps:
      - uses: actions/checkout@v3

--- a/.github/workflows/check-labels.yml
+++ b/.github/workflows/check-labels.yml
@ -38,7 +38,7 @@ jobs:
    runs-on: linux.20_04.4x
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
          fetch-depth: 1
--- a/.github/workflows/close-nonexistent-disable-issues.yml
+++ b/.github/workflows/close-nonexistent-disable-issues.yml
@ -13,7 +13,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          submodules: false
          fetch-depth: 1
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@ -19,7 +19,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -33,7 +33,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -49,10 +49,13 @@ jobs:
      matrix:
        runner: [linux.12xlarge]
        docker-image-name: [
-          pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9,
          pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks,
          pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks,
          pytorch-linux-focal-cuda12.4-cudnn9-py3.13-gcc9-inductor-benchmarks,
+          pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11,
+          pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks,
+          pytorch-linux-focal-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks,
+          pytorch-linux-focal-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks,
          pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9,
          pytorch-linux-focal-py3.9-clang10,
          pytorch-linux-focal-py3.11-clang10,
@ -96,21 +99,21 @@ jobs:
      # [see note: pytorch repo ref]
      # deep clone (fetch-depth 0) required for git merge-base
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

      - name: Build docker image
        id: build-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
          docker-image-name: ${{ matrix.docker-image-name }}
          always-rebuild: true
          push: true

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
          docker-image: ${{ steps.build-docker-image.outputs.docker-image }}

@ -142,5 +145,5 @@ jobs:
        if: always()

      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7
        if: always()
--- a/.github/workflows/docker-release.yml
+++ b/.github/workflows/docker-release.yml
@ -37,7 +37,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -52,7 +52,7 @@ jobs:
      matrix: ${{ steps.generate-matrix.outputs.matrix }}
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          fetch-depth: 1
          submodules: true
@ -82,7 +82,7 @@ jobs:
      CUDNN_VERSION: ${{ matrix.cudnn_version }}
    steps:
      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
      # [see note: pytorch repo ref]
@ -103,20 +103,24 @@ jobs:
          password: ${{ secrets.GHCR_PAT }}
      # Setup multi-arch image builds
      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+        uses: docker/setup-qemu-action@v3
        env:
          QEMU_BINARY_PATH: ${{ runner.temp }}/bin
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3
        with:
-          version: v0.10.0
+          version: latest
+          driver-opts: image=moby/buildkit:v0.19.0
      - name: Setup job specific variables
        run: |
          set -eou pipefail
          # To get QEMU binaries in our PATH
          echo "${RUNNER_TEMP}/bin" >> "${GITHUB_PATH}"
          # Generate PyTorch version to use
-          echo "PYTORCH_VERSION=$(python3 .github/scripts/generate_pytorch_version.py --no-build-suffix)" >> "${GITHUB_ENV}"
+          {
+            echo "PYTORCH_VERSION=$(python3 .github/scripts/generate_pytorch_version.py --no-build-suffix)";
+            echo "STABLE_CUDA_VERSION=$(python3 .github/scripts/get_ci_variable.py --stable-cuda-version)"
+          } >> "${GITHUB_ENV}"
      - name: Setup test specific variables
        if: ${{ startsWith(github.event.ref, 'refs/tags/v') }}
        run: |
@ -153,19 +157,19 @@ jobs:
          docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}"

          # Please note, here we ned to pin specific verison of CUDA as with latest label
-          if [[ ${CUDA_VERSION_SHORT} == "12.4" ]]; then
+          if [[ ${CUDA_VERSION_SHORT} == "${STABLE_CUDA_VERSION}" ]]; then
            docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" \
                    ghcr.io/pytorch/pytorch-nightly:latest
            docker push ghcr.io/pytorch/pytorch-nightly:latest
          fi

      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7
        if: always()

  validate:
    needs: build
-    uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@main
+    uses: pytorch/test-infra/.github/workflows/validate-docker-images.yml@release/2.7
    with:
-      channel: nightly
+      channel: test
      ref: main
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -38,7 +38,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -55,7 +55,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.9"
@ -64,7 +64,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_9-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cpu-aarch64-test:  # Testing
@ -80,7 +80,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.9"
@ -104,7 +104,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.9"
@ -113,53 +113,6 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

-  manywheel-py3_9-cuda-aarch64-12_6-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6-aarch64
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.6-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.9"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
-      ALPINE_IMAGE: "arm64v8/alpine"
-      build_name: manywheel-py3_9-cuda-aarch64-12_6
-      build_environment: linux-aarch64-binary-manywheel
-      timeout-minutes: 420
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_9-cuda-aarch64-12_6-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_9-cuda-aarch64-12_6-build
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6-aarch64
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.6-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-cuda-aarch64-12_6
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
  manywheel-py3_9-cuda-aarch64-12_8-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -172,7 +125,7 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.9"
@ -181,6 +134,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_9-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -198,7 +152,7 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.9"
@ -218,7 +172,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.10"
@ -227,7 +181,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cpu-aarch64-test:  # Testing
@ -243,7 +197,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.10"
@ -267,7 +221,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.10"
@ -276,53 +230,6 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

-  manywheel-py3_10-cuda-aarch64-12_6-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6-aarch64
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.6-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.10"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
-      ALPINE_IMAGE: "arm64v8/alpine"
-      build_name: manywheel-py3_10-cuda-aarch64-12_6
-      build_environment: linux-aarch64-binary-manywheel
-      timeout-minutes: 420
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_10-cuda-aarch64-12_6-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_10-cuda-aarch64-12_6-build
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6-aarch64
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.6-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.10"
-      build_name: manywheel-py3_10-cuda-aarch64-12_6
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
  manywheel-py3_10-cuda-aarch64-12_8-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -335,7 +242,7 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.10"
@ -344,6 +251,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -361,7 +269,7 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.10"
@ -381,7 +289,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.11"
@ -390,7 +298,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cpu-aarch64-test:  # Testing
@ -406,7 +314,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.11"
@ -430,7 +338,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.11"
@ -439,53 +347,6 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

-  manywheel-py3_11-cuda-aarch64-12_6-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6-aarch64
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.6-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.11"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
-      ALPINE_IMAGE: "arm64v8/alpine"
-      build_name: manywheel-py3_11-cuda-aarch64-12_6
-      build_environment: linux-aarch64-binary-manywheel
-      timeout-minutes: 420
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_11-cuda-aarch64-12_6-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_11-cuda-aarch64-12_6-build
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6-aarch64
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.6-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.11"
-      build_name: manywheel-py3_11-cuda-aarch64-12_6
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
  manywheel-py3_11-cuda-aarch64-12_8-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -498,7 +359,7 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.11"
@ -507,6 +368,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -524,7 +386,7 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.11"
@ -544,7 +406,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.12"
@ -553,7 +415,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cpu-aarch64-test:  # Testing
@ -569,7 +431,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.12"
@ -593,7 +455,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.12"
@ -602,53 +464,6 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

-  manywheel-py3_12-cuda-aarch64-12_6-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6-aarch64
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.6-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.12"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
-      ALPINE_IMAGE: "arm64v8/alpine"
-      build_name: manywheel-py3_12-cuda-aarch64-12_6
-      build_environment: linux-aarch64-binary-manywheel
-      timeout-minutes: 420
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_12-cuda-aarch64-12_6-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_12-cuda-aarch64-12_6-build
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6-aarch64
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.6-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.12"
-      build_name: manywheel-py3_12-cuda-aarch64-12_6
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
  manywheel-py3_12-cuda-aarch64-12_8-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -661,7 +476,7 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.12"
@ -670,6 +485,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -687,7 +503,7 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.12"
@ -707,7 +523,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.13"
@ -716,7 +532,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cpu-aarch64-test:  # Testing
@ -732,7 +548,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.13"
@ -756,7 +572,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.13"
@ -765,53 +581,6 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

-  manywheel-py3_13-cuda-aarch64-12_6-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6-aarch64
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.6-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.13"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
-      ALPINE_IMAGE: "arm64v8/alpine"
-      build_name: manywheel-py3_13-cuda-aarch64-12_6
-      build_environment: linux-aarch64-binary-manywheel
-      timeout-minutes: 420
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_13-cuda-aarch64-12_6-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_13-cuda-aarch64-12_6-build
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6-aarch64
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.6-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.13"
-      build_name: manywheel-py3_13-cuda-aarch64-12_6
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
  manywheel-py3_13-cuda-aarch64-12_8-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -824,7 +593,7 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.13"
@ -833,6 +602,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -850,7 +620,7 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.13"
@ -870,7 +640,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.13t"
@ -879,7 +649,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cpu-aarch64-test:  # Testing
@ -895,7 +665,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.13t"
@ -919,7 +689,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-main
+      DOCKER_IMAGE: pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.13t"
@ -928,53 +698,6 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

-  manywheel-py3_13t-cuda-aarch64-12_6-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6-aarch64
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.6-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.13t"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
-      ALPINE_IMAGE: "arm64v8/alpine"
-      build_name: manywheel-py3_13t-cuda-aarch64-12_6
-      build_environment: linux-aarch64-binary-manywheel
-      timeout-minutes: 420
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_13t-cuda-aarch64-12_6-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_13t-cuda-aarch64-12_6-build
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6-aarch64
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.6-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.13t"
-      build_name: manywheel-py3_13t-cuda-aarch64-12_6
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
  manywheel-py3_13t-cuda-aarch64-12_8-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -987,7 +710,7 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.13t"
@ -996,6 +719,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1013,7 +737,7 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.13t"
--- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-main.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-main.yml
@ -33,7 +33,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -50,7 +50,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
@ -71,7 +71,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      build_name: libtorch-cpu-shared-with-deps-cxx11-abi
--- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml
@ -38,7 +38,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -55,7 +55,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
@ -76,7 +76,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      build_name: libtorch-cpu-shared-with-deps-cxx11-abi
@ -98,7 +98,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      build_name: libtorch-cpu-shared-with-deps-cxx11-abi
@ -118,7 +118,7 @@ jobs:
      DESIRED_CUDA: cu118
      GPU_ARCH_VERSION: 11.8
      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
@ -140,7 +140,7 @@ jobs:
      DESIRED_CUDA: cu118
      GPU_ARCH_VERSION: 11.8
      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      build_name: libtorch-cuda11_8-shared-with-deps-cxx11-abi
@ -163,7 +163,7 @@ jobs:
      DESIRED_CUDA: cu118
      GPU_ARCH_VERSION: 11.8
      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda11.8-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      build_name: libtorch-cuda11_8-shared-with-deps-cxx11-abi
@ -171,71 +171,6 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

-  libtorch-cuda12_4-shared-with-deps-cxx11-abi-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu124
-      GPU_ARCH_VERSION: 12.4
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.4-main
-      LIBTORCH_VARIANT: shared-with-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: libtorch-cuda12_4-shared-with-deps-cxx11-abi
-      build_environment: linux-binary-libtorch-cxx11-abi
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-cuda12_4-shared-with-deps-cxx11-abi-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - libtorch-cuda12_4-shared-with-deps-cxx11-abi-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu124
-      GPU_ARCH_VERSION: 12.4
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.4-main
-      LIBTORCH_VARIANT: shared-with-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda12_4-shared-with-deps-cxx11-abi
-      build_environment: linux-binary-libtorch-cxx11-abi
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-cuda12_4-shared-with-deps-cxx11-abi-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: libtorch-cuda12_4-shared-with-deps-cxx11-abi-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu124
-      GPU_ARCH_VERSION: 12.4
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.4-main
-      LIBTORCH_VARIANT: shared-with-deps
-      DESIRED_DEVTOOLSET: cxx11-abi
-      build_name: libtorch-cuda12_4-shared-with-deps-cxx11-abi
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
  libtorch-cuda12_6-shared-with-deps-cxx11-abi-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -248,7 +183,7 @@ jobs:
      DESIRED_CUDA: cu126
      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.6-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.6-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
@ -270,7 +205,7 @@ jobs:
      DESIRED_CUDA: cu126
      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.6-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.6-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      build_name: libtorch-cuda12_6-shared-with-deps-cxx11-abi
@ -293,7 +228,7 @@ jobs:
      DESIRED_CUDA: cu126
      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.6-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.6-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      build_name: libtorch-cuda12_6-shared-with-deps-cxx11-abi
@ -301,6 +236,71 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  libtorch-cuda12_8-shared-with-deps-cxx11-abi-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.8-2.7
+      LIBTORCH_VARIANT: shared-with-deps
+      DESIRED_DEVTOOLSET: cxx11-abi
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: libtorch-cuda12_8-shared-with-deps-cxx11-abi
+      build_environment: linux-binary-libtorch-cxx11-abi
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  libtorch-cuda12_8-shared-with-deps-cxx11-abi-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - libtorch-cuda12_8-shared-with-deps-cxx11-abi-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.8-2.7
+      LIBTORCH_VARIANT: shared-with-deps
+      DESIRED_DEVTOOLSET: cxx11-abi
+      build_name: libtorch-cuda12_8-shared-with-deps-cxx11-abi
+      build_environment: linux-binary-libtorch-cxx11-abi
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu  # 12.8 build needs sm_70+ runner
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  libtorch-cuda12_8-shared-with-deps-cxx11-abi-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: libtorch-cuda12_8-shared-with-deps-cxx11-abi-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.8-2.7
+      LIBTORCH_VARIANT: shared-with-deps
+      DESIRED_DEVTOOLSET: cxx11-abi
+      build_name: libtorch-cuda12_8-shared-with-deps-cxx11-abi
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  libtorch-rocm6_2_4-shared-with-deps-cxx11-abi-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -313,7 +313,7 @@ jobs:
      DESIRED_CUDA: rocm6.2.4
      GPU_ARCH_VERSION: 6.2.4
      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
@ -337,7 +337,7 @@ jobs:
      GPU_ARCH_VERSION: 6.2.4
      GPU_ARCH_TYPE: rocm
      SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
    steps:
@ -351,7 +351,6 @@ jobs:
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          path: pytorch
          show-progress: false
@ -364,9 +363,9 @@ jobs:
        run: |
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
-          docker-image: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
+          docker-image: pytorch/libtorch-cxx11-builder:rocm6.2.4-2.7
      - name: Test Pytorch binary
        uses: ./pytorch/.github/actions/test-pytorch-binary
      - name: Teardown ROCm
@ -385,7 +384,7 @@ jobs:
      DESIRED_CUDA: rocm6.2.4
      GPU_ARCH_VERSION: 6.2.4
      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      build_name: libtorch-rocm6_2_4-shared-with-deps-cxx11-abi
@ -405,7 +404,7 @@ jobs:
      DESIRED_CUDA: rocm6.3
      GPU_ARCH_VERSION: 6.3
      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.3-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.3-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
@ -429,7 +428,7 @@ jobs:
      GPU_ARCH_VERSION: 6.3
      GPU_ARCH_TYPE: rocm
      SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.3-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.3-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
    steps:
@ -443,7 +442,6 @@ jobs:
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          submodules: recursive
          path: pytorch
          show-progress: false
@ -456,9 +454,9 @@ jobs:
        run: |
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
-          docker-image: pytorch/libtorch-cxx11-builder:rocm6.3-main
+          docker-image: pytorch/libtorch-cxx11-builder:rocm6.3-2.7
      - name: Test Pytorch binary
        uses: ./pytorch/.github/actions/test-pytorch-binary
      - name: Teardown ROCm
@ -477,7 +475,7 @@ jobs:
      DESIRED_CUDA: rocm6.3
      GPU_ARCH_VERSION: 6.3
      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.3-main
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.3-2.7
      LIBTORCH_VARIANT: shared-with-deps
      DESIRED_DEVTOOLSET: cxx11-abi
      build_name: libtorch-rocm6_3-shared-with-deps-cxx11-abi
--- a/.github/workflows/generated-linux-binary-manywheel-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-main.yml
@ -33,7 +33,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -51,7 +51,7 @@ jobs:
      DESIRED_CUDA: cu118
      GPU_ARCH_VERSION: 11.8
      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda11.8-main
+      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda11.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.9"
@ -75,7 +75,7 @@ jobs:
      DESIRED_CUDA: cu118
      GPU_ARCH_VERSION: 11.8
      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda11.8-main
+      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda11.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.9"
@ -86,53 +86,6 @@ jobs:
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}

-  manywheel-py3_9-cuda12_4-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu124
-      GPU_ARCH_VERSION: 12.4
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.4-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.9"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_9-cuda12_4
-      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_9-cuda12_4-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_9-cuda12_4-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu124
-      GPU_ARCH_VERSION: 12.4
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.4-main
-      DESIRED_DEVTOOLSET: cxx11-abi
-      use_split_build: False
-      DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-cuda12_4
-      build_environment: linux-binary-manywheel
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
  manywheel-py3_9-cuda12_6-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -145,14 +98,14 @@ jobs:
      DESIRED_CUDA: cu126
      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-main
+      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.9"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_6-test:  # Testing
@ -169,7 +122,7 @@ jobs:
      DESIRED_CUDA: cu126
      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-main
+      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.6-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.9"
@ -192,14 +145,14 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8
      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.9"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_8-test:  # Testing
@ -216,13 +169,13 @@ jobs:
      DESIRED_CUDA: cu128
      GPU_ARCH_VERSION: 12.8
      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-main
+      DOCKER_IMAGE: pytorch/manylinux2_28-builder:cuda12.8-2.7
      DESIRED_DEVTOOLSET: cxx11-abi
      use_split_build: False
      DESIRED_PYTHON: "3.9"
      build_name: manywheel-py3_9-cuda12_8
      build_environment: linux-binary-manywheel
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu  # 12.8 build needs sm_70+ runner
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .2.0
 .3.0