Compare commits

1 commit

Commit 2d757f6517 ("apply"), 2025-02-18 11:24:34 -08:00

1825 changed files with 37236 additions and 94611 deletions

View File

@ -20,7 +20,7 @@ cd /
# on the mounted pytorch repo
git config --global --add safe.directory /pytorch
pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0
pip install auditwheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
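Note on the flag referenced in the comment above: a minimal sketch of enabling it, assuming USE_PRIORITIZED_TEXT_FOR_LD is consumed from the environment by the build scripts in the linked PR (not shown in this diff):

    export USE_PRIORITIZED_TEXT_FOR_LD=1   # assumption: read by the wheel build scripts
    python setup.py bdist_wheel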

View File

@ -39,7 +39,7 @@ def build_ArmComputeLibrary() -> None:
"clone",
"https://github.com/ARM-software/ComputeLibrary.git",
"-b",
"v25.02",
"v24.09",
"--depth",
"1",
"--shallow-submodules",
@ -99,14 +99,10 @@ def update_wheel(wheel_path, desired_cuda) -> None:
if "126" in desired_cuda:
libs_to_copy += [
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.6",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
]
elif "128" in desired_cuda:
libs_to_copy += [
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
]
else:
libs_to_copy += [
@ -208,7 +204,7 @@ if __name__ == "__main__":
else:
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
elif branch.startswith(("v1.", "v2.")):
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
if enable_mkldnn:
build_ArmComputeLibrary()
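A worked example of the version-extraction expression changed above (the branch name is hypothetical):

    # branch[1:branch.find('-')] drops the leading 'v' and the '-rcN' suffix
    branch="v2.6.0-rc1"
    python -c "branch='$branch'; print(branch[1:branch.find('-')])"   # prints 2.6.0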

View File

@ -19,11 +19,13 @@ import boto3
# AMI images for us-east-1, change the following based on your ~/.aws/config
os_amis = {
"ubuntu18_04": "ami-078eece1d8119409f", # login_name: ubuntu
"ubuntu20_04": "ami-052eac90edaa9d08f", # login_name: ubuntu
"ubuntu22_04": "ami-0c6c29c5125214c77", # login_name: ubuntu
"redhat8": "ami-0698b90665a2ddcf1", # login_name: ec2-user
}
ubuntu18_04_ami = os_amis["ubuntu18_04"]
ubuntu20_04_ami = os_amis["ubuntu20_04"]
@ -327,7 +329,7 @@ def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None
]
)
host.run_cmd(
f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}"
f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v24.09 {git_clone_flags}"
)
host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")
@ -657,6 +659,18 @@ def configure_system(
"sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
)
host.run_cmd("pip3 install dataclasses typing-extensions")
# Install and switch to gcc-8 on Ubuntu-18.04
if not host.using_docker() and host.ami == ubuntu18_04_ami and compiler == "gcc-8":
host.run_cmd("sudo apt-get install -y g++-8 gfortran-8")
host.run_cmd(
"sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 100"
)
host.run_cmd(
"sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 100"
)
host.run_cmd(
"sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-8 100"
)
if not use_conda:
print("Installing Cython + numpy from PyPy")
host.run_cmd("sudo pip3 install Cython")
@ -747,7 +761,7 @@ def start_build(
version = host.check_output("cat pytorch/version.txt").strip()[:-2]
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
if branch.startswith(("v1.", "v2.")):
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
if enable_mkldnn:

View File

@ -1,8 +1,4 @@
#!/bin/bash
# The purpose of this script is to:
# 1. Extract the set of parameters to be used for a docker build based on the provided image name.
# 2. Run docker build with the parameters found in step 1.
# 3. Run the built image and print out the expected and actual versions of packages installed.
set -ex
@ -99,12 +95,13 @@ fi
# configuration, so we hardcode everything here rather than do it
# from scratch
case "$image" in
pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11)
CUDA_VERSION=12.6.3
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
CUDA_VERSION=12.4.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -118,6 +115,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -132,6 +130,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -146,61 +145,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
PROTOBUF=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9)
CUDA_VERSION=12.6.3
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6.3
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6.3
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
CUDA_VERSION=12.6.3
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -215,6 +160,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -226,6 +172,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=10
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
ONNX=yes
@ -234,7 +181,10 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=10
PROTOBUF=yes
DB=yes
VISION=yes
VULKAN_SDK_VERSION=1.2.162.1
SWIFTSHADER=yes
CONDA_CMAKE=yes
TRITON=yes
;;
@ -242,7 +192,10 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=10
PROTOBUF=yes
DB=yes
VISION=yes
VULKAN_SDK_VERSION=1.2.162.1
SWIFTSHADER=yes
CONDA_CMAKE=yes
TRITON=yes
;;
@ -250,6 +203,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
TRITON=yes
@ -258,6 +212,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.2.4
NINJA_VERSION=1.9.0
@ -272,6 +227,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.3
NINJA_VERSION=1.9.0
@ -286,6 +242,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
XPU_VERSION=0.5
NINJA_VERSION=1.9.0
@ -296,6 +253,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
XPU_VERSION=2025.0
NINJA_VERSION=1.9.0
@ -306,6 +264,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
CONDA_CMAKE=yes
@ -319,6 +278,7 @@ case "$image" in
CUDNN_VERSION=9
CLANG_VERSION=12
PROTOBUF=yes
DB=yes
VISION=yes
TRITON=yes
;;
@ -326,6 +286,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
TRITON=yes
@ -346,6 +307,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
CONDA_CMAKE=yes
@ -360,7 +322,7 @@ case "$image" in
EXECUTORCH=yes
;;
pytorch-linux-jammy-py3.12-halide)
CUDA_VERSION=12.6
CUDA_VERSION=12.4
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
CONDA_CMAKE=yes
@ -368,7 +330,7 @@ case "$image" in
TRITON=yes
;;
pytorch-linux-jammy-py3.12-triton-cpu)
CUDA_VERSION=12.6
CUDA_VERSION=12.4
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
CONDA_CMAKE=yes
@ -378,19 +340,20 @@ case "$image" in
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
# would be to upgrade mypy to 1.0.0 with Python 3.11
PYTHON_VERSION=3.9
PIP_CMAKE=yes
ANACONDA_PYTHON_VERSION=3.9
CONDA_CMAKE=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
PYTHON_VERSION=3.9
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=11.8
PIP_CMAKE=yes
CONDA_CMAKE=yes
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
ACL=yes
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping llvm src build install because the current version
@ -402,6 +365,7 @@ case "$image" in
GCC_VERSION=11
ACL=yes
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping llvm src build install because the current version
@ -412,6 +376,7 @@ case "$image" in
*)
# Catch-all for builds that are not hardcoded.
PROTOBUF=yes
DB=yes
VISION=yes
echo "image '$image' did not match an existing build configuration"
if [[ "$image" == *py* ]]; then
@ -467,6 +432,7 @@ docker build \
--build-arg "BUILD_ENVIRONMENT=${image}" \
--build-arg "PROTOBUF=${PROTOBUF:-}" \
--build-arg "LLVMDEV=${LLVMDEV:-}" \
--build-arg "DB=${DB:-}" \
--build-arg "VISION=${VISION:-}" \
--build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \
--build-arg "CENTOS_VERSION=${CENTOS_VERSION}" \
@ -474,12 +440,13 @@ docker build \
--build-arg "GLIBC_VERSION=${GLIBC_VERSION}" \
--build-arg "CLANG_VERSION=${CLANG_VERSION}" \
--build-arg "ANACONDA_PYTHON_VERSION=${ANACONDA_PYTHON_VERSION}" \
--build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
--build-arg "GCC_VERSION=${GCC_VERSION}" \
--build-arg "CUDA_VERSION=${CUDA_VERSION}" \
--build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
--build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
--build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
--build-arg "VULKAN_SDK_VERSION=${VULKAN_SDK_VERSION}" \
--build-arg "SWIFTSHADER=${SWIFTSHADER}" \
--build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
--build-arg "KATEX=${KATEX:-}" \
@ -489,7 +456,6 @@ docker build \
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
--build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
--build-arg "PIP_CMAKE=${PIP_CMAKE}" \
--build-arg "TRITON=${TRITON}" \
--build-arg "TRITON_CPU=${TRITON_CPU}" \
--build-arg "ONNX=${ONNX}" \

View File

@ -55,6 +55,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./

View File

@ -1 +1 @@
01a22b6f16d117454b7d21ebdc691b0785b84a7f
5e4d6b6380d575e48e37e9d987fded4ec588e7bc

View File

@ -1 +1 @@
v2.26.2-1
v2.25.1-1

View File

@ -1 +1 @@
83111ab22be6e4a588d184ac45175986a7dde9fc
e98b6fcb8df5b44eb0d0addb6767c573d37ba024

View File

@ -1 +1 @@
96316ce50fade7e209553aba4898cd9b82aab83b
4b3bb1f8da0ded6ccd572dd1358ef45af5a1befe

View File

@ -1,6 +1,6 @@
set -euo pipefail
readonly version=v25.02
readonly version=v24.04
readonly src_host=https://github.com/ARM-software
readonly src_repo=ComputeLibrary

View File

@ -4,10 +4,16 @@ set -ex
if [ -n "$CLANG_VERSION" ]; then
if [[ $UBUNTU_VERSION == 22.04 ]]; then
if [[ $CLANG_VERSION == 9 && $UBUNTU_VERSION == 18.04 ]]; then
sudo apt-get update
# gpg-agent is not available by default on 18.04
sudo apt-get install -y --no-install-recommends gpg-agent
wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
apt-add-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-${CLANG_VERSION} main"
elif [[ $UBUNTU_VERSION == 22.04 ]]; then
# work around ubuntu apt-get conflicts
sudo apt-get -y -f install
wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
if [[ $CLANG_VERSION == 18 ]]; then
apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main"
fi
@ -35,7 +41,7 @@ if [ -n "$CLANG_VERSION" ]; then
# clang's packaging is a little messed up (the runtime libs aren't
# added into the linker path), so give it a little help
clang_lib=("/usr/lib/llvm-$CLANG_VERSION/lib/clang/"*"/lib/linux")
echo "$clang_lib" >/etc/ld.so.conf.d/clang.conf
echo "$clang_lib" > /etc/ld.so.conf.d/clang.conf
ldconfig
# Cleanup package manager

View File

@ -66,7 +66,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
# Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
if [[ $(uname -m) == "aarch64" ]]; then
conda_install "openblas==0.3.29=*openmp*"
conda_install "openblas==0.3.28=*openmp*"
else
conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
fi

View File

@ -240,7 +240,7 @@ function prune_126 {
}
function install_128 {
CUDNN_VERSION=9.8.0.87
CUDNN_VERSION=9.7.1.26
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
rm -rf /usr/local/cuda-12.8 /usr/local/cuda
# install CUDA 12.8.0 in the same container

View File

@ -3,8 +3,19 @@
set -ex
NCCL_VERSION=v2.26.2-1
CUDNN_VERSION=9.8.0.87
NCCL_VERSION=v2.21.5-1
CUDNN_VERSION=9.5.1.17
function install_cusparselt_062 {
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && pushd tmp_cusparselt
wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.6.2.3-archive.tar.xz
tar xf libcusparse_lt-linux-sbsa-0.6.2.3-archive.tar.xz
cp -a libcusparse_lt-linux-sbsa-0.6.2.3-archive/include/* /usr/local/cuda/include/
cp -a libcusparse_lt-linux-sbsa-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/
popd
rm -rf tmp_cusparselt
}
function install_cusparselt_063 {
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
@ -17,7 +28,140 @@ function install_cusparselt_063 {
rm -rf tmp_cusparselt
}
function install_124 {
CUDNN_VERSION=9.1.0.70
echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
rm -rf /usr/local/cuda-12.4 /usr/local/cuda
# install CUDA 12.4.1 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux_sbsa.run
chmod +x cuda_12.4.1_550.54.15_linux_sbsa.run
./cuda_12.4.1_550.54.15_linux_sbsa.run --toolkit --silent
rm -f cuda_12.4.1_550.54.15_linux_sbsa.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
install_cusparselt_063
ldconfig
}
function prune_124 {
echo "Pruning CUDA 12.4"
#####################################################################################
# CUDA 12.4 prune static libs
#####################################################################################
export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi
# all CUDA libs except CuDNN and CuBLAS
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
| xargs -I {} bash -c \
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
# prune CuDNN and CuBLAS
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
#####################################################################################
# CUDA 12.4 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-12.4/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
}
function install_126 {
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
rm -rf /usr/local/cuda-12.6 /usr/local/cuda
# install CUDA 12.6.3 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux_sbsa.run
chmod +x cuda_12.6.3_560.35.05_linux_sbsa.run
./cuda_12.6.3_560.35.05_linux_sbsa.run --toolkit --silent
rm -f cuda_12.6.3_560.35.05_linux_sbsa.run
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn && cd tmp_cudnn
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf tmp_cudnn
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
cd nccl && make -j src.build
cp -a build/include/* /usr/local/cuda/include/
cp -a build/lib/* /usr/local/cuda/lib64/
cd ..
rm -rf nccl
install_cusparselt_063
ldconfig
}
function prune_126 {
echo "Pruning CUDA 12.6"
#####################################################################################
# CUDA 12.6 prune static libs
#####################################################################################
export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi
if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
fi
# all CUDA libs except CuDNN and CuBLAS
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
| xargs -I {} bash -c \
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
# prune CuDNN and CuBLAS
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
#####################################################################################
# CUDA 12.6 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-12.6/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
}
function install_128 {
CUDNN_VERSION=9.7.1.26
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
rm -rf /usr/local/cuda-12.8 /usr/local/cuda
# install CUDA 12.8.0 in the same container
@ -54,6 +198,10 @@ function install_128 {
while test $# -gt 0
do
case "$1" in
12.4) install_124; prune_124
;;
12.6) install_126; prune_126
;;
12.8) install_128;
;;
*) echo "bad argument $1"; exit 1

View File

@ -5,7 +5,7 @@ if [[ -n "${CUDNN_VERSION}" ]]; then
mkdir tmp_cudnn
pushd tmp_cudnn
if [[ ${CUDA_VERSION:0:4} == "12.8" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.8.0.87_cuda12-archive"
CUDNN_NAME="cudnn-linux-x86_64-9.7.1.26_cuda12-archive"
elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive"
elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then

.ci/docker/common/install_db.sh (new executable file, +38 lines)
View File

@ -0,0 +1,38 @@
#!/bin/bash
set -ex
install_ubuntu() {
apt-get update
# Cleanup
apt-get autoclean && apt-get clean
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
}
install_centos() {
# Need EPEL for many packages we depend on.
# See http://fedoraproject.org/wiki/EPEL
yum --enablerepo=extras install -y epel-release
# Cleanup
yum clean all
rm -rf /var/cache/yum
rm -rf /var/lib/yum/yumdb
rm -rf /var/lib/yum/history
}
# Install base packages depending on the base OS
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
ubuntu)
install_ubuntu
;;
centos)
install_centos
;;
*)
echo "Unable to determine OS..."
exit 1
;;
esac
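The OS detection used by the new script can be exercised on its own; the same grep works on any distro that ships /etc/os-release:

    grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"'   # prints e.g. ubuntu or centos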

View File

@ -53,7 +53,7 @@ setup_executorch() {
export EXECUTORCH_BUILD_PYBIND=ON
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
as_jenkins .ci/scripts/setup-linux.sh cmake || true
popd
}

View File

@ -2,6 +2,8 @@
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
if [ -n "${UBUNTU_VERSION}" ]; then
apt update
apt-get install -y clang doxygen git graphviz nodejs npm libtinfo5
@ -13,8 +15,8 @@ chown -R jenkins pytorch
pushd pytorch
# Install all linter dependencies
pip install -r requirements.txt
lintrunner init
pip_install -r requirements.txt
conda_run lintrunner init
# Cache .lintbin directory as part of the Docker image
cp -r .lintbin /tmp

View File

@ -4,15 +4,10 @@ set -ex
[ -n "$NINJA_VERSION" ]
arch=$(uname -m)
if [ "$arch" == "aarch64" ]; then
url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux-aarch64.zip"
else
url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip"
fi
url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip"
pushd /tmp
wget --no-verbose --output-document=ninja-linux.zip "$url"
unzip ninja-linux.zip -d /usr/local/bin
rm -f ninja-linux.zip
popd
popd

View File

@ -32,7 +32,7 @@ pip_install coloredlogs packaging
pip_install onnxruntime==1.18.1
pip_install onnx==1.17.0
pip_install onnxscript==0.2.2 --no-deps
pip_install onnxscript==0.1.0 --no-deps
# required by onnxscript
pip_install ml_dtypes

View File

@ -4,7 +4,7 @@
set -ex
cd /
git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.29 --depth 1 --shallow-submodules
git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.28 --depth 1 --shallow-submodules
OPENBLAS_BUILD_FLAGS="

View File

@ -1,18 +0,0 @@
#!/bin/bash
set -ex
apt-get update
# Use deadsnakes in case we need an older python version
sudo add-apt-repository ppa:deadsnakes/ppa
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-pip python${PYTHON_VERSION}-venv
# Use a venv because uv and some other package managers don't support --user install
ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
python -m venv /var/lib/jenkins/ci_env
source /var/lib/jenkins/ci_env/bin/activate
python -mpip install --upgrade pip
python -mpip install -r /opt/requirements-ci.txt
if [ -n "${PIP_CMAKE}" ]; then
python -mpip install cmake==3.31.6
fi

View File

@ -8,6 +8,10 @@ ver() {
install_ubuntu() {
apt-get update
if [[ $UBUNTU_VERSION == 18.04 ]]; then
# gpg-agent is not available by default on 18.04
apt-get install -y --no-install-recommends gpg-agent
fi
if [[ $UBUNTU_VERSION == 20.04 ]]; then
# gpg-agent is not available by default on 20.04
apt-get install -y --no-install-recommends gpg-agent

View File

@ -25,9 +25,7 @@ python3 -m pip install meson ninja
###########################
### clone repo
###########################
# TEMPORARY FIX: https://gitlab.freedesktop.org/mesa/drm.git is down until 2025/03/22
# GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
GIT_SSL_NO_VERIFY=true git clone git://anongit.freedesktop.org/mesa/drm
GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
pushd drm
###########################
@ -117,7 +115,7 @@ index a5007ffc..13fa07fc 100644
if (!fp) {
- fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE,
- strerror(errno));
+ //fprintf(stderr, "amdgpu.ids: No such file or directory\n");
+ fprintf(stderr, "amdgpu.ids: No such file or directory\n");
return;
}

View File

@ -0,0 +1,24 @@
#!/bin/bash
set -ex
[ -n "${SWIFTSHADER}" ]
retry () {
$* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}
_https_amazon_aws=https://ossci-android.s3.amazonaws.com
# SwiftShader
_swiftshader_dir=/var/lib/jenkins/swiftshader
_swiftshader_file_targz=swiftshader-abe07b943-prebuilt.tar.gz
mkdir -p $_swiftshader_dir
_tmp_swiftshader_targz="/tmp/${_swiftshader_file_targz}"
curl --silent --show-error --location --fail --retry 3 \
--output "${_tmp_swiftshader_targz}" "$_https_amazon_aws/${_swiftshader_file_targz}"
tar -C "${_swiftshader_dir}" -xzf "${_tmp_swiftshader_targz}"
export VK_ICD_FILENAMES="${_swiftshader_dir}/build/Linux/vk_swiftshader_icd.json"
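The retry helper defined in this script is not exercised below (the download relies on curl's built-in --retry instead); a sketch of how it would be used:

    retry wget -q "${_https_amazon_aws}/${_swiftshader_file_targz}"   # sleeps 1/2/4/8s between attempts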

View File

@ -60,15 +60,15 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}"
# Triton needs at least gcc-9 to build
apt-get install -y g++-9
CXX=g++-9 pip_install .
CXX=g++-9 pip_install -e .
elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
# Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get install -y g++-9
CXX=g++-9 pip_install .
CXX=g++-9 pip_install -e .
else
pip_install .
pip_install -e .
fi
if [ -n "${CONDA_CMAKE}" ]; then

View File

@ -0,0 +1,24 @@
#!/bin/bash
set -ex
[ -n "${VULKAN_SDK_VERSION}" ]
retry () {
$* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}
_vulkansdk_dir=/var/lib/jenkins/vulkansdk
_tmp_vulkansdk_targz=/tmp/vulkansdk.tar.gz
curl \
--silent \
--show-error \
--location \
--fail \
--retry 3 \
--output "${_tmp_vulkansdk_targz}" "https://ossci-android.s3.amazonaws.com/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.gz"
mkdir -p "${_vulkansdk_dir}"
tar -C "${_vulkansdk_dir}" -xzf "${_tmp_vulkansdk_targz}" --strip-components 1
rm -rf "${_tmp_vulkansdk_targz}"

View File

@ -39,7 +39,7 @@ case ${GPU_ARCH_TYPE} in
BASE_TARGET=rocm
DOCKER_TAG=rocm${GPU_ARCH_VERSION}
GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx942"
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
;;
*)

View File

@ -18,14 +18,15 @@ COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh
# Install conda and other packages (e.g., numpy, pytest)
ARG PYTHON_VERSION
ARG PIP_CMAKE
# Put venv into the env vars so users don't need to activate it
ENV PATH /var/lib/jenkins/ci_env/bin:$PATH
ENV VIRTUAL_ENV /var/lib/jenkins/ci_env
COPY requirements-ci.txt /opt/requirements-ci.txt
COPY ./common/install_python.sh install_python.sh
RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt
ARG ANACONDA_PYTHON_VERSION
ARG CONDA_CMAKE
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
COPY requirements-ci.txt /opt/conda/requirements-ci.txt
COPY ./common/install_conda.sh install_conda.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_magma_conda.sh install_magma_conda.sh
RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
# Install cuda and cudnn
ARG CUDA_VERSION
@ -36,10 +37,9 @@ ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
# Note that Docker build forbids copying file outside the build context
COPY ./common/install_linter.sh install_linter.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_linter.sh
RUN rm install_linter.sh
RUN chown -R jenkins:jenkins /var/lib/jenkins/ci_env
RUN rm install_linter.sh common_utils.sh
USER jenkins
CMD ["bash"]

View File

@ -15,18 +15,20 @@ COPY ./common/install_user.sh install_user.sh
RUN bash ./install_user.sh && rm install_user.sh
# Install conda and other packages (e.g., numpy, pytest)
ARG PYTHON_VERSION
ARG PIP_CMAKE
ENV PATH /var/lib/jenkins/ci_env/bin:$PATH
ENV VIRTUAL_ENV /var/lib/jenkins/ci_env
COPY requirements-ci.txt /opt/requirements-ci.txt
COPY ./common/install_python.sh install_python.sh
RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt
ARG ANACONDA_PYTHON_VERSION
ARG CONDA_CMAKE
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
COPY requirements-ci.txt /opt/conda/requirements-ci.txt
COPY ./common/install_conda.sh install_conda.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
# Note that Docker build forbids copying file outside the build context
COPY ./common/install_linter.sh install_linter.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_linter.sh
RUN rm install_linter.sh
RUN rm install_linter.sh common_utils.sh
USER jenkins
CMD ["bash"]

View File

@ -0,0 +1,153 @@
# syntax = docker/dockerfile:experimental
ARG ROCM_VERSION=3.7
ARG BASE_CUDA_VERSION=10.2
ARG GPU_IMAGE=nvidia/cuda:${BASE_CUDA_VERSION}-devel-centos7
FROM quay.io/pypa/manylinux2014_x86_64 as base
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
RUN yum install -y yum-utils centos-release-scl sudo
RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
# cmake
RUN yum install -y cmake3 && \
ln -s /usr/bin/cmake3 /usr/bin/cmake
FROM base as openssl
# Install openssl (this must precede `build python` step)
# (In order to have a proper SSL module, Python is compiled
# against a recent openssl [see env vars above], which is linked
# statically. We delete openssl afterwards.)
ADD ./common/install_openssl.sh install_openssl.sh
RUN bash ./install_openssl.sh && rm install_openssl.sh
# remove unncessary python versions
RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
FROM base as cuda
ARG BASE_CUDA_VERSION=10.2
# Install CUDA
ADD ./common/install_cuda.sh install_cuda.sh
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
FROM base as intel
# MKL
ADD ./common/install_mkl.sh install_mkl.sh
RUN bash ./install_mkl.sh && rm install_mkl.sh
FROM base as magma
ARG BASE_CUDA_VERSION=10.2
# Install magma
ADD ./common/install_magma.sh install_magma.sh
RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
FROM base as jni
# Install java jni header
ADD ./common/install_jni.sh install_jni.sh
ADD ./java/jni.h jni.h
RUN bash ./install_jni.sh && rm install_jni.sh
FROM base as libpng
# Install libpng
ADD ./common/install_libpng.sh install_libpng.sh
RUN bash ./install_libpng.sh && rm install_libpng.sh
FROM ${GPU_IMAGE} as common
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
RUN yum install -y \
aclocal \
autoconf \
automake \
bison \
bzip2 \
curl \
diffutils \
file \
git \
make \
patch \
perl \
unzip \
util-linux \
wget \
which \
xz \
yasm
RUN yum install -y \
https://repo.ius.io/ius-release-el7.rpm \
https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
RUN yum swap -y git git236-core
# git236+ would refuse to run git commands in repos owned by other users
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
# Override this behaviour by treating every folder as safe
# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
RUN git config --global --add safe.directory "*"
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
# Install LLVM version
COPY --from=openssl /opt/openssl /opt/openssl
COPY --from=base /opt/python /opt/python
COPY --from=base /opt/_internal /opt/_internal
COPY --from=base /usr/local/bin/auditwheel /usr/local/bin/auditwheel
COPY --from=intel /opt/intel /opt/intel
COPY --from=base /usr/local/bin/patchelf /usr/local/bin/patchelf
COPY --from=libpng /usr/local/bin/png* /usr/local/bin/
COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/
COPY --from=libpng /usr/local/include/png* /usr/local/include/
COPY --from=libpng /usr/local/include/libpng* /usr/local/include/
COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/
COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig
COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h
FROM common as cpu_final
ARG BASE_CUDA_VERSION=10.2
RUN yum install -y yum-utils centos-release-scl
RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
# cmake
RUN yum install -y cmake3 && \
ln -s /usr/bin/cmake3 /usr/bin/cmake
# ninja
RUN yum install -y http://repo.okay.com.mx/centos/7/x86_64/release/okay-release-1-1.noarch.rpm
RUN yum install -y ninja-build
FROM cpu_final as cuda_final
RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
FROM common as rocm_final
ARG ROCM_VERSION=3.7
# Install ROCm
ADD ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
# cmake is already installed inside the rocm base image, but both 2 and 3 exist
# cmake3 is needed for the later MIOpen custom build, so that step is last.
RUN yum install -y cmake3 && \
rm -f /usr/bin/cmake && \
ln -s /usr/bin/cmake3 /usr/bin/cmake
ADD ./common/install_miopen.sh install_miopen.sh
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh

View File

@ -38,12 +38,6 @@ RUN yum install -y \
sudo \
gcc-toolset-${GCCTOOLSET_VERSION}-toolchain
# (optional) Install non-default Ninja version
ARG NINJA_VERSION
COPY ./common/install_ninja.sh install_ninja.sh
RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
RUN rm install_ninja.sh
# Ensure the expected devtoolset is used
ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH

View File

@ -42,7 +42,6 @@ RUN yum install -y \
llvm-devel \
libzstd-devel \
python3.12-devel \
python3.12-test \
python3.12-setuptools \
python3.12-pip \
python3-virtualenv \
@ -102,33 +101,24 @@ CMD ["/bin/bash"]
# install test dependencies:
# - grpcio requires system openssl, bundled crypto fails to build
# - ml_dtypes 0.4.0 requires some fixes provided in later commits to build
RUN dnf install -y \
protobuf-devel \
protobuf-c-devel \
protobuf-lite-devel \
hdf5-devel \
python3-h5py \
git
wget \
patch
RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
# cmake-3.28.0 from pip for onnxruntime
RUN python3 -mpip install cmake==3.28.0
# build onnxruntime 1.21.0 from sources.
# it is not possible to build it from sources using pip,
# so just build it from upstream repository.
# h5py is dependency of onnxruntime_training.
# h5py==3.11.0 builds with hdf5-devel 1.10.5 from repository.
# install newest flatbuffers version first:
# for some reason old version is getting pulled in otherwise.
# packaging package is required for onnxruntime wheel build.
RUN pip3 install flatbuffers && \
pip3 install h5py==3.11.0 && \
pip3 install packaging && \
git clone https://github.com/microsoft/onnxruntime && \
cd onnxruntime && git checkout v1.21.0 && \
RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio==1.65.4
RUN cd ~ && \
git clone https://github.com/jax-ml/ml_dtypes && \
cd ml_dtypes && \
git checkout v0.4.0 && \
git submodule update --init --recursive && \
./build.sh --config Release --parallel 0 --enable_pybind --build_wheel --enable_training --enable_training_apis --enable_training_ops --skip_tests --allow_running_as_root && \
pip3 install ./build/Linux/Release/dist/onnxruntime_training-*.whl && \
cd .. && /bin/rm -rf ./onnxruntime
wget https://github.com/jax-ml/ml_dtypes/commit/b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
wget https://github.com/jax-ml/ml_dtypes/commit/d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
patch -p1 < b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
patch -p1 < d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
python3 setup.py bdist_wheel && \
pip3 install dist/*.whl && \
rm -rf ml_dtypes

View File

@ -48,7 +48,7 @@ case ${GPU_ARCH_TYPE} in
TARGET=final
DOCKER_TAG=cpu-aarch64
GPU_IMAGE=arm64v8/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11 --build-arg NINJA_VERSION=1.12.1"
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
MANY_LINUX_VERSION="2_28_aarch64"
;;
cpu-cxx11-abi)
@ -97,7 +97,7 @@ case ${GPU_ARCH_TYPE} in
DEVTOOLSET_VERSION="11"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
fi
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101"
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
;;
xpu)
@ -121,8 +121,7 @@ fi
(
set -x
# Only activate this if in CI
if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
if [ "$(uname -m)" != "s390x" ]; then
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
@ -140,7 +139,7 @@ fi
"${TOPDIR}/.ci/docker/"
)
GITHUB_REF=${GITHUB_REF:-"dev")}
GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
GIT_BRANCH_NAME=${GITHUB_REF##*/}
GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
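The ${GITHUB_REF##*/} expansion above is plain bash suffix-pattern stripping, keeping only the text after the last '/':

    GITHUB_REF="refs/heads/main"
    echo "${GITHUB_REF##*/}"   # prints: main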

View File

@ -3,7 +3,7 @@
# Script used only in CD pipeline
OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/
CURL_DOWNLOAD_URL=https://curl.se/download
CURL_DOWNLOAD_URL=https://curl.askapache.com/download
AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf

View File

@ -41,14 +41,11 @@ fbscribelogger==0.1.7
#Pinned versions: 0.1.6
#test that import:
flatbuffers==2.0 ; platform_machine != "s390x"
flatbuffers==2.0
#Description: cross platform serialization library
#Pinned versions: 2.0
#test that import:
flatbuffers ; platform_machine == "s390x"
#Description: cross platform serialization library; Newer version is required on s390x for new python version
hypothesis==5.35.1
# Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
#Description: advanced library for generating parametrized tests
@ -93,10 +90,10 @@ librosa>=0.6.2 ; python_version < "3.11"
#Pinned versions:
#test that import:
mypy==1.14.0
mypy==1.13.0
# Pin MyPy version because new errors are likely to appear with each release
#Description: linter
#Pinned versions: 1.14.0
#Pinned versions: 1.10.0
#test that import: test_typing.py, test_type_hints.py
networkx==2.8.8
@ -105,10 +102,10 @@ networkx==2.8.8
#Pinned versions: 2.8.8
#test that import: functorch
ninja==1.11.1.3
#Description: build system. Used in some tests. Used in build to generate build
#time tracing information
#Pinned versions: 1.11.1.3
#ninja
#Description: build system. Note that it install from
#here breaks things so it is commented out
#Pinned versions: 1.10.0.post1
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
numba==0.49.0 ; python_version < "3.9"
@ -297,7 +294,7 @@ ghstack==0.8.0
#Pinned versions: 0.8.0
#test that import:
jinja2==3.1.6
jinja2==3.1.5
#Description: jinja2 template engine
#Pinned versions: 3.1.4
#test that import:
@ -342,7 +339,7 @@ onnx==1.17.0
#Pinned versions:
#test that import:
onnxscript==0.2.2
onnxscript==0.1.0
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
#Pinned versions:
#test that import:
@ -368,6 +365,7 @@ PyYAML
pyzstd
setuptools
ninja==1.11.1 ; platform_machine == "aarch64"
scons==4.5.2 ; platform_machine == "aarch64"
pulp==2.9.0 ; python_version >= "3.8"
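The `; platform_machine == "aarch64"` suffixes in this file are PEP 508 environment markers, which pip evaluates against the running interpreter; a quick sketch:

    python -c "import platform; print(platform.machine())"        # e.g. x86_64
    pip install 'ninja==1.11.1 ; platform_machine == "aarch64"'   # skipped on x86_64 with an "Ignoring" notice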

View File

@ -1 +1 @@
3.3.0
3.2.0

View File

@ -50,6 +50,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./

View File

@ -50,6 +50,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./

View File

@ -77,6 +77,13 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./

View File

@ -74,6 +74,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -81,6 +88,18 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
ENV INSTALLED_VISION ${VISION}
# (optional) Install Vulkan SDK
ARG VULKAN_SDK_VERSION
COPY ./common/install_vulkan_sdk.sh install_vulkan_sdk.sh
RUN if [ -n "${VULKAN_SDK_VERSION}" ]; then bash ./install_vulkan_sdk.sh; fi
RUN rm install_vulkan_sdk.sh
# (optional) Install swiftshader
ARG SWIFTSHADER
COPY ./common/install_swiftshader.sh install_swiftshader.sh
RUN if [ -n "${SWIFTSHADER}" ]; then bash ./install_swiftshader.sh; fi
RUN rm install_swiftshader.sh
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh

View File

@ -12,7 +12,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \
-e DESIRED_CUDA=${DESIRED_CUDA} \
-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \
"pytorch/manylinux2_28-builder:cuda${DESIRED_CUDA}-main" \
"pytorch/manylinux-builder:cuda${DESIRED_CUDA}-main" \
magma/build_magma.sh
.PHONY: all

View File

@ -111,6 +111,12 @@ case ${DESIRED_PYTHON} in
;;
esac
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
export _GLIBCXX_USE_CXX11_ABI=1
else
export _GLIBCXX_USE_CXX11_ABI=0
fi
if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
echo "Calling build_amd.py at $(date)"
python tools/amd_build/build_amd.py
@ -203,6 +209,12 @@ if [[ -n "$BUILD_PYTHONLESS" ]]; then
mkdir -p /tmp/$LIBTORCH_HOUSE_DIR
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
LIBTORCH_ABI="cxx11-abi-"
else
LIBTORCH_ABI=
fi
zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch
cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \
/tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip

View File

@ -54,11 +54,11 @@ cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
case ${CUDA_VERSION} in
12.8)
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" #removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8 and will be removed in future releases
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0;10.0;12.0+PTX" #Ripping out 5.0 and 6.0 due to ld error
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
;;
12.6)
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0+PTX"
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
;;
12.4)

View File

@ -95,6 +95,12 @@ python setup.py clean
retry pip install -qr requirements.txt
retry pip install -q numpy==2.0.1
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
export _GLIBCXX_USE_CXX11_ABI=1
else
export _GLIBCXX_USE_CXX11_ABI=0
fi
if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
echo "Calling build_amd.py at $(date)"
python tools/amd_build/build_amd.py
@ -163,6 +169,12 @@ fi
)
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
LIBTORCH_ABI="cxx11-abi-"
else
LIBTORCH_ABI=
fi
(
set -x

View File

@ -173,7 +173,6 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
source /opt/intel/oneapi/compiler/latest/env/vars.sh
# XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
export USE_KINETO=0
export TORCH_XPU_ARCH_LIST=pvc
fi
# sccache will fail for CUDA builds if all cores are used for compiling
@ -192,7 +191,7 @@ fi
# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
# memory to build and will OOM
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]] && [ -z "$MAX_JOBS_OVERRIDE" ]; then
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
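For concreteness, on a hypothetical 16-CPU runner the formula above gives (16 - 2) / 3 = 4 parallel jobs:

    echo $(( $(nproc --ignore=2) / 3 ))   # nproc --ignore=2 reports the CPU count minus two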
@ -378,10 +377,8 @@ else
# This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
# is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
# 16 CPUs
if [ -z "$MAX_JOBS_OVERRIDE" ]; then
MAX_JOBS=$(nproc --ignore=4)
export MAX_JOBS
fi
MAX_JOBS=$(nproc --ignore=4)
export MAX_JOBS
# NB: Install outside of source directory (at the same level as the root
# pytorch folder) so that it doesn't get cleaned away prior to docker push.

View File

@ -73,14 +73,26 @@ fi
# Check GCC ABI
###############################################################################
# NOTE: As of https://github.com/pytorch/pytorch/issues/126551 we only produce
# wheels with cxx11-abi
# NOTE [ Building libtorch with old vs. new gcc ABI ]
#
# Packages built with one version of ABI could not be linked against by client
# C++ libraries that were compiled using the other version of ABI. Since both
# gcc ABIs are still common in the wild, we need to support both ABIs. Currently:
#
# - All the nightlies built on CentOS 7 + devtoolset7 use the old gcc ABI.
# - All the nightlies built on Ubuntu 16.04 + gcc 5.4 use the new gcc ABI.
echo "Checking that the gcc ABI is what we expect"
if [[ "$(uname)" != 'Darwin' ]]; then
function is_expected() {
if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
echo 1
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* || "$DESIRED_CUDA" == *"rocm"* ]]; then
if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
echo 1
fi
else
if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then
echo 1
fi
fi
}
@ -196,11 +208,35 @@ setup_link_flags () {
TEST_CODE_DIR="$(dirname $(realpath ${BASH_SOURCE[0]}))/test_example_code"
build_and_run_example_cpp () {
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
GLIBCXX_USE_CXX11_ABI=1
else
GLIBCXX_USE_CXX11_ABI=0
fi
setup_link_flags
g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1
g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1
./$1
}
build_example_cpp_with_incorrect_abi () {
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
GLIBCXX_USE_CXX11_ABI=0
else
GLIBCXX_USE_CXX11_ABI=1
fi
set +e
setup_link_flags
g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1
ERRCODE=$?
set -e
if [ "$ERRCODE" -eq "0" ]; then
echo "Building example with incorrect ABI didn't throw error. Aborting."
exit 1
else
echo "Building example with incorrect ABI throws expected error. Proceeding."
fi
}
###############################################################################
# Check simple Python/C++ calls
###############################################################################
@ -210,6 +246,11 @@ if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
export LD_LIBRARY_PATH=/usr/local/cuda/lib64
fi
build_and_run_example_cpp simple-torch-test
# `_GLIBCXX_USE_CXX11_ABI` is always ignored by gcc in devtoolset7, so we test
# the expected failure case for Ubuntu 16.04 + gcc 5.4 only.
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
build_example_cpp_with_incorrect_abi simple-torch-test
fi
else
pushd /tmp
python -c 'import torch'
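A standalone sketch of the dual-ABI behavior these checks exercise (file names are made up): the same source compiled with the two _GLIBCXX_USE_CXX11_ABI values yields differently mangled std::string symbols, which is why the incorrect-ABI build above is expected to fail:

    printf '#include <string>\nstd::string greet() { return "hi"; }\n' > abi_probe.cpp
    g++ -c -D_GLIBCXX_USE_CXX11_ABI=1 abi_probe.cpp -o new_abi.o
    g++ -c -D_GLIBCXX_USE_CXX11_ABI=0 abi_probe.cpp -o old_abi.o
    nm new_abi.o | grep __cxx11   # mangled name contains std::__cxx11::basic_string
    nm old_abi.o | grep __cxx11 || echo "no __cxx11 symbols (pre-cxx11 ABI)"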

View File

@ -121,9 +121,9 @@ def main() -> None:
else:
install_root = Path(distutils.sysconfig.get_python_lib()) / "torch"
libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so")
# NOTE: All binaries are built with cxx11abi now
check_lib_symbols_for_abi_correctness(libtorch_cpu_path, False)
libtorch_cpu_path = install_root / "lib" / "libtorch_cpu.so"
pre_cxx11_abi = "cxx11-abi" not in os.getenv("DESIRED_DEVTOOLSET", "")
check_lib_symbols_for_abi_correctness(libtorch_cpu_path, pre_cxx11_abi)
if __name__ == "__main__":
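The same property can be checked by hand on a built library with nm (library path assumed):

    nm -D libtorch_cpu.so | grep -c '__cxx11'   # a non-zero count indicates cxx11-abi symbols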

View File

@ -46,9 +46,7 @@ def train(args, model, device, train_loader, optimizer, epoch):
optimizer.step()
if batch_idx % args.log_interval == 0:
print(
f"Train Epoch: {epoch} "
f"[{batch_idx * len(data)}/{len(train_loader.dataset)} "
f"({100.0 * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}"
f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}" # noqa: B950
)
if args.dry_run:
break
@ -73,9 +71,7 @@ def test(model, device, test_loader):
test_loss /= len(test_loader.dataset)
print(
f"\nTest set: Average loss: {test_loss:.4f}, "
f"Accuracy: {correct}/{len(test_loader.dataset)} "
f"({100.0 * correct / len(test_loader.dataset):.0f}%)\n"
f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n" # noqa: B950
)

View File

@ -76,13 +76,10 @@ def read_release_matrix():
def test_numpy():
try:
import numpy as np
import numpy as np
x = np.arange(5)
torch.tensor(x)
except ImportError:
print("Numpy check skipped. Numpy is not installed.")
x = np.arange(5)
torch.tensor(x)
def check_version(package: str) -> None:
@ -169,10 +166,6 @@ def test_cuda_gds_errors_captured() -> None:
major_version = int(torch.version.cuda.split(".")[0])
minor_version = int(torch.version.cuda.split(".")[1])
if target_os == "windows":
print(f"{target_os} is not supported for GDS smoke test")
return
if major_version < 12 or (major_version == 12 and minor_version < 6):
print("CUDA version is not supported for GDS smoke test")
return
@ -413,7 +406,6 @@ def main() -> None:
smoke_test_conv2d()
test_linalg()
test_numpy()
if is_cuda_system:
test_linalg("cuda")
test_cuda_gds_errors_captured()

View File

@ -314,13 +314,6 @@ test_python() {
assert_git_not_dirty
}
test_lazy_tensor_meta_reference_disabled() {
export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1
echo "Testing lazy tensor operations without meta reference"
time python test/run_test.py --include lazy/test_ts_opinfo.py --verbose
export -n TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE
}
test_dynamo_wrapped_shard() {
if [[ -z "$NUM_TEST_SHARDS" ]]; then
@ -483,8 +476,6 @@ elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--backend aot_eager)
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--export-aot-inductor)
elif [[ "${TEST_CONFIG}" == *max_autotune_inductor* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--inductor --inductor-compile-mode max-autotune)
elif [[ "${TEST_CONFIG}" == *inductor* && "${TEST_CONFIG}" != *perf* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--inductor)
fi
@ -499,59 +490,6 @@ else
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
fi
test_cachebench() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
local BENCHMARK
if [[ "${SHARD_NUMBER}" == 1 ]]; then
local BENCHMARK=torchbench
elif [[ "${SHARD_NUMBER}" == 2 ]]; then
local BENCHMARK=huggingface
else
echo "invalid SHARD_NUMBER: ${SHARD_NUMBER}"
exit 1
fi
local mode_options=("training" "inference")
for mode in "${mode_options[@]}"; do
$TASKSET python "benchmarks/dynamo/cachebench.py" \
--mode "$mode" \
--device cuda \
--benchmark "$BENCHMARK" \
--repeat 3 \
--output "$TEST_REPORTS_DIR/cachebench_${BENCHMARK}_${mode}.json"
$TASKSET python "benchmarks/dynamo/cachebench.py" \
--mode "$mode" \
--dynamic \
--device cuda \
--benchmark "$BENCHMARK" \
--repeat 3 \
--output "$TEST_REPORTS_DIR/cachebench_${BENCHMARK}_${mode}_dynamic.json"
done
}
test_verify_cachebench() {
TMP_TEST_REPORTS_DIR=$(mktemp -d)
TEST_OUTPUT="$TMP_TEST_REPORTS_DIR/test.json"
$TASKSET python "benchmarks/dynamo/cachebench.py" \
--mode training \
--device cpu \
--model nanogpt \
--benchmark torchbench \
--output "$TEST_OUTPUT"
    # -s checks that the file exists and is non-empty
if [[ ! -s "$TEST_OUTPUT" ]]; then
echo "Cachebench failed to produce an output."
echo "Run 'python benchmarks/dynamo/cachebench.py' to make sure it works"
exit 1
fi
}
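The shard-to-benchmark mapping and the mode/dynamic matrix above fit in a few lines of Python; a hedged sketch, not the CI function itself:

import subprocess

BENCHMARK_BY_SHARD = {1: "torchbench", 2: "huggingface"}

def run_cachebench(shard: int, reports_dir: str) -> None:
    benchmark = BENCHMARK_BY_SHARD[shard]  # a missing shard mirrors the "invalid SHARD_NUMBER" exit
    for mode in ("training", "inference"):
        for dynamic in (False, True):
            suffix = "_dynamic" if dynamic else ""
            cmd = ["python", "benchmarks/dynamo/cachebench.py", "--mode", mode]
            if dynamic:
                cmd.append("--dynamic")
            cmd += ["--device", "cuda", "--benchmark", benchmark, "--repeat", "3",
                    "--output", f"{reports_dir}/cachebench_{benchmark}_{mode}{suffix}.json"]
            subprocess.run(cmd, check=True)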
test_perf_for_dashboard() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
@ -580,8 +518,6 @@ test_perf_for_dashboard() {
test_inductor_set_cpu_affinity
elif [[ "${TEST_CONFIG}" == *cuda_a10g* ]]; then
device=cuda_a10g
elif [[ "${TEST_CONFIG}" == *h100* ]]; then
device=cuda_h100
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
device=rocm
fi
@ -762,8 +698,6 @@ test_dynamo_benchmark() {
fi
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
elif [[ "${TEST_CONFIG}" == *max_autotune_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"
@ -1481,7 +1415,7 @@ test_executorch() {
bash examples/models/llama3_2_vision/install_requirements.sh
# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
# from the PR
bash .ci/scripts/setup-linux.sh --build-tool cmake
bash .ci/scripts/setup-linux.sh cmake
echo "Run ExecuTorch unit tests"
pytest -v -n auto
@ -1505,7 +1439,7 @@ test_executorch() {
test_linux_aarch64() {
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops test_ops \
test_foreach test_reductions test_unary_ufuncs test_tensor_creation_ops \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
# Dynamo tests
@ -1573,16 +1507,6 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
install_torchvision
id=$((SHARD_NUMBER-1))
test_dynamo_benchmark timm_models "$id"
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
install_torchaudio cuda
install_torchvision
checkout_install_torchbench nanogpt BERT_pytorch resnet50 hf_T5 llama moco
PYTHONPATH=$(pwd)/torchbench test_cachebench
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
install_torchaudio cpu
install_torchvision
checkout_install_torchbench nanogpt
PYTHONPATH=$(pwd)/torchbench test_verify_cachebench
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
install_torchaudio cpu
@ -1619,7 +1543,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
checkout_install_torchbench hf_T5 llama moco
PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
test_inductor_aoti
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
install_torchvision
test_inductor_shard "${SHARD_NUMBER}"
@ -1639,7 +1562,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
test_python_shard "$SHARD_NUMBER"
test_aten
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
test_lazy_tensor_meta_reference_disabled
test_without_numpy
install_torchvision
test_python_shard 1

View File

@ -17,24 +17,32 @@ curl -L -o "%INSTALLER_FILE%" %DOWNLOAD_URL%
:: Install the Visual Studio Build Tools with C++ components
echo Installing Visual Studio Build Tools with C++ components...
echo Installing MSVC %MSVC_VERSION%
"%INSTALLER_FILE%" --norestart --quiet --wait --installPath "%DEPENDENCIES_DIR%\VSBuildTools" ^
--add Microsoft.VisualStudio.Workload.VCTools ^
--add Microsoft.VisualStudio.Component.Windows10SDK ^
--add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
--add Microsoft.VisualStudio.Component.VC.ASAN ^
--add Microsoft.VisualStudio.Component.VC.CMake.Project ^
--add Microsoft.VisualStudio.Component.VC.CoreBuildTools ^
--add Microsoft.VisualStudio.Component.VC.CoreIde ^
--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest ^
--add Microsoft.VisualStudio.Component.VC.Tools.ARM64EC ^
--add Microsoft.VisualStudio.Component.VC.Tools.ARM64 ^
--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64
echo exitcode = %errorlevel%
if "%MSVC_VERSION%" == "latest" (
"%INSTALLER_FILE%" --norestart --nocache --quiet --wait --installPath "%DEPENDENCIES_DIR%\VSBuildTools" ^
--add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
--add Microsoft.VisualStudio.Component.VC.ASAN ^
--add Microsoft.VisualStudio.Component.VC.CMake.Project ^
--add Microsoft.VisualStudio.Component.VC.Tools.ARM64 ^
--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64
) else if "%MSVC_VERSION%" == "14.40" (
"%INSTALLER_FILE%" --norestart --nocache --quiet --wait --installPath "%DEPENDENCIES_DIR%\VSBuildTools" ^
--add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
--add Microsoft.VisualStudio.Component.VC.ASAN ^
--add Microsoft.VisualStudio.Component.VC.CMake.Project ^
--add Microsoft.VisualStudio.Component.VC.14.40.17.10.ARM64 ^
--add Microsoft.VisualStudio.Component.VC.14.40.17.10.x86.x64
) else if "%MSVC_VERSION%" == "14.36" (
"%INSTALLER_FILE%" --norestart --nocache --quiet --wait --installPath "%DEPENDENCIES_DIR%\VSBuildTools" ^
--add Microsoft.VisualStudio.Component.Windows11SDK.22621 ^
--add Microsoft.VisualStudio.Component.VC.ASAN ^
--add Microsoft.VisualStudio.Component.VC.CMake.Project ^
--add Microsoft.VisualStudio.Component.VC.14.36.17.6.ARM64 ^
--add Microsoft.VisualStudio.Component.VC.14.36.17.6.x86.x64
)
:: Check if installation was successful
if %errorlevel% neq 0 (
echo Failed to install Visual Studio Build Tools with C++ components.
echo "Failed to install Visual Studio Build Tools with C++ components. (exitcode = %errorlevel%)"
exit /b 1
)
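The branching above is essentially a version-to-component-list table; an illustrative Python rendering, using only component IDs that appear in the script:

COMMON_COMPONENTS = [
    "Microsoft.VisualStudio.Component.Windows11SDK.22621",
    "Microsoft.VisualStudio.Component.VC.ASAN",
    "Microsoft.VisualStudio.Component.VC.CMake.Project",
]
TOOLCHAIN_COMPONENTS = {
    "latest": [
        "Microsoft.VisualStudio.Component.VC.Tools.ARM64",
        "Microsoft.VisualStudio.Component.VC.Tools.x86.x64",
    ],
    "14.40": [
        "Microsoft.VisualStudio.Component.VC.14.40.17.10.ARM64",
        "Microsoft.VisualStudio.Component.VC.14.40.17.10.x86.x64",
    ],
    "14.36": [
        "Microsoft.VisualStudio.Component.VC.14.36.17.6.ARM64",
        "Microsoft.VisualStudio.Component.VC.14.36.17.6.x86.x64",
    ],
}

def installer_args(msvc_version: str, install_path: str) -> list[str]:
    args = ["--norestart", "--nocache", "--quiet", "--wait", "--installPath", install_path]
    for component in COMMON_COMPONENTS + TOOLCHAIN_COMPONENTS[msvc_version]:
        args += ["--add", component]
    return args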

View File

@ -6,25 +6,22 @@ echo Dependency Python installation started.
if not exist "%DOWNLOADS_DIR%" mkdir %DOWNLOADS_DIR%
if not exist "%DEPENDENCIES_DIR%" mkdir %DEPENDENCIES_DIR%
if "%DESIRED_PYTHON%" == "3.13" (
echo Python version is set to 3.13
set DOWNLOAD_URL=https://www.python.org/ftp/python/3.13.2/python-3.13.2-arm64.exe
) else if "%DESIRED_PYTHON%" == "3.12" (
echo Python version is set to 3.12
set DOWNLOAD_URL=https://www.python.org/ftp/python/3.12.7/python-3.12.7-arm64.exe
) else if "%DESIRED_PYTHON%" == "3.11" (
echo Python version is set to 3.11
set DOWNLOAD_URL=https://www.python.org/ftp/python/3.11.9/python-3.11.9-arm64.exe
if "%PYTHON_VERSION%"=="Python312" (
echo Python version is set to Python312
set DOWNLOAD_URL="https://www.python.org/ftp/python/3.12.7/python-3.12.7-arm64.exe"
) else if "%PYTHON_VERSION%"=="Python311" (
echo Python version is set to Python311
set DOWNLOAD_URL="https://www.python.org/ftp/python/3.11.9/python-3.11.9-arm64.exe"
) else (
echo DESIRED_PYTHON not defined, Python version is set to 3.12
set DOWNLOAD_URL=https://www.python.org/ftp/python/3.12.7/python-3.12.7-arm64.exe
echo PYTHON_VERSION not defined, Python version is set to Python312
set DOWNLOAD_URL="https://www.python.org/ftp/python/3.12.7/python-3.12.7-arm64.exe"
)
set INSTALLER_FILE=%DOWNLOADS_DIR%\python-installer.exe
:: Download installer
echo Downloading Python...
curl -L -o "%INSTALLER_FILE%" "%DOWNLOAD_URL%"
curl -L -o "%INSTALLER_FILE%" %DOWNLOAD_URL%
:: Install Python
echo Installing Python...
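The equivalent lookup for the Python-installer branching, with the same 3.12 fallback as the `else` arm (URLs copied from the script):

PYTHON_INSTALLERS = {
    "Python312": "https://www.python.org/ftp/python/3.12.7/python-3.12.7-arm64.exe",
    "Python311": "https://www.python.org/ftp/python/3.11.9/python-3.11.9-arm64.exe",
}

def installer_url(python_version: str) -> str:
    # Unset or unknown versions fall back to 3.12, like the batch script's else branch.
    return PYTHON_INSTALLERS.get(python_version, PYTHON_INSTALLERS["Python312"])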

View File

@ -14,7 +14,7 @@ where python
:: install dependencies
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install pytest numpy protobuf expecttest hypothesis
pip install pytest numpy
:: find file name for pytorch wheel
for /f "delims=" %%f in ('dir /b "%PYTORCH_FINAL_PACKAGE_DIR%" ^| findstr "torch-"') do set "TORCH_WHEEL_FILENAME=%PYTORCH_FINAL_PACKAGE_DIR%\%%f"

View File

@ -1,6 +1,8 @@
@echo off
setlocal
set "ORIG_PATH=%PATH%"
if "%PACKAGE_TYPE%" == "wheel" goto wheel
if "%PACKAGE_TYPE%" == "libtorch" goto libtorch
@ -8,7 +10,21 @@ echo "unknown package type"
exit /b 1
:wheel
call %PYTORCH_ROOT%\.ci\pytorch\windows\arm64\bootstrap_tests.bat
echo "install wheel package"
echo Running pip install...
pip install -q --pre numpy protobuf
echo Error level after pip install: %ERRORLEVEL%
if errorlevel 1 exit /b 1
for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do pip install "%%i"
if errorlevel 1 exit /b 1
goto smoke_test
:smoke_test
python -c "import torch"
if ERRORLEVEL 1 exit /b 1
echo Running python rnn_smoke_win_arm64.py...
python %PYTORCH_ROOT%\.ci\pytorch\test_example_code\rnn_smoke_win_arm64.py
@ -23,12 +39,10 @@ goto end
:libtorch
echo "install and test libtorch"
if not exist tmp mkdir tmp
for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *-latest.zip') do C:\Windows\System32\tar.exe -xf "%%i" -C tmp
for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *-latest.zip') do tar -xf "%%i" -C tmp
if ERRORLEVEL 1 exit /b 1
pushd tmp
pushd tmp\libtorch
set VC_VERSION_LOWER=14
set VC_VERSION_UPPER=36
@ -46,4 +60,6 @@ if ERRORLEVEL 1 exit /b 1
.\simple-torch-test.exe
if ERRORLEVEL 1 exit /b 1
:end
:end
set "PATH=%ORIG_PATH%"
popd

View File

@ -71,20 +71,11 @@ if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install --pre numpy==2.1.2
if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install --pre numpy==2.0.2 protobuf
if "%DESIRED_PYTHON%" == "3.11" %PYTHON_EXEC% -m pip install --pre numpy==2.0.2 protobuf
if "%DESIRED_PYTHON%" == "3.10" %PYTHON_EXEC% -m pip install --pre numpy==2.0.2 protobuf
if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install --pre numpy==2.0.2 protobuf networkx
if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install --pre numpy==2.0.2 protobuf
if errorlevel 1 exit /b 1
if "%PYTORCH_BUILD_VERSION:dev=%" NEQ "%PYTORCH_BUILD_VERSION%" (
set "CHANNEL=nightly"
) else (
set "CHANNEL=test"
)
set "EXTRA_INDEX= "
if "%CUDA_VERSION%" == "xpu" set "EXTRA_INDEX=--index-url https://download.pytorch.org/whl/%CHANNEL%/xpu"
for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do %PYTHON_EXEC% -m pip install "%%i" %EXTRA_INDEX%
for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do %PYTHON_EXEC% -m pip install "%%i"
if errorlevel 1 exit /b 1
goto smoke_test
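The channel selection above keys off a "dev" marker in the build version; a compact sketch of the same logic:

def pip_extra_index(build_version: str, cuda_version: str) -> list[str]:
    # "dev" in the build version marks a nightly; anything else goes to "test".
    channel = "nightly" if "dev" in build_version else "test"
    if cuda_version == "xpu":
        return ["--index-url", f"https://download.pytorch.org/whl/{channel}/xpu"]
    return []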

View File

@ -47,9 +47,9 @@ set XPU_EXTRA_INSTALLED=0
set XPU_EXTRA_UNINSTALL=0
if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.0] (
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/efc86abd-cb77-452e-a03f-a741895b8ece/intel-deep-learning-essentials-2025.0.0.336_offline.exe
set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
set XPU_BUNDLE_VERSION=2025.0.1+20
set XPU_BUNDLE_VERSION=2025.0.0+335
set XPU_BUNDLE_INSTALLED=0
set XPU_BUNDLE_UNINSTALL=0
set XPU_EXTRA_URL=NULL

View File

@ -31,9 +31,9 @@ fi
export DOCKER_IMAGE=${DOCKER_IMAGE:-}
if [[ -z "$DOCKER_IMAGE" ]]; then
if [[ "$DESIRED_CUDA" == cpu ]]; then
export DOCKER_IMAGE="pytorch/manylinux2_28:cpu"
export DOCKER_IMAGE="pytorch/manylinux:cpu"
else
export DOCKER_IMAGE="pytorch/manylinux2_28-builder:${DESIRED_CUDA:2}"
export DOCKER_IMAGE="pytorch/manylinux-builder:${DESIRED_CUDA:2}"
fi
fi
@ -74,12 +74,6 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for all the wheel builds, hence append TRITON_CONSTRAINT
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
# CUDA 12.8 builds have triton for Linux and Linux aarch64 binaries.
if [[ "$DESIRED_CUDA" == cu128 ]]; then
TRITON_CONSTRAINT="platform_system == 'Linux'"
fi
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
@ -104,11 +98,11 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
fi
# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}"
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* && $(uname) == "Linux" ]]; then
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-8 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt)
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+git${TRITON_SHORTHASH}"
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+git${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
fi
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
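Putting the pieces above together: the triton requirement string is a version, an optional +git shorthash for dev builds, and an optional environment-marker constraint. A sketch, with an illustrative version number:

def triton_requirement(version: str, shorthash: str, constraint: str) -> str:
    req = f"pytorch-triton-xpu=={version}"
    if shorthash:                      # dev/nightly builds pin to a commit
        req += f"+git{shorthash[:8]}"  # cut -c1-8 equivalent
    if constraint:
        req += f"; {constraint}"
    return req

# triton_requirement("3.2.0", "f084f34bbb74", "platform_system == 'Linux'")
# -> "pytorch-triton-xpu==3.2.0+gitf084f34b; platform_system == 'Linux'"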

View File

@ -55,16 +55,12 @@ s3_upload() {
s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/"
fi
(
cache_control_flag=""
if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then
cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'"
fi
for pkg in ${PKG_DIR}/*.${extension}; do
(
set -x
shm_id=$(sha256sum "${pkg}" | awk '{print $1}')
${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \
--metadata "checksum-sha256=${shm_id}" ${cache_control_flag}
--metadata "checksum-sha256=${shm_id}"
)
done
)
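The upload path computes a sha256 checksum per package and, on the test channel only, disables caching; the same logic as a Python sketch:

import hashlib

def upload_args(pkg_path: str, upload_channel: str) -> list[str]:
    with open(pkg_path, "rb") as f:
        shm_id = hashlib.sha256(f.read()).hexdigest()
    args = ["--no-progress", "--acl", "public-read",
            "--metadata", f"checksum-sha256={shm_id}"]
    if upload_channel == "test":
        args += ["--cache-control", "no-cache,no-store,must-revalidate"]
    return args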

View File

@ -1,22 +0,0 @@
#!/bin/bash
set -eux -o pipefail
source "${BINARY_ENV_FILE:-/c/w/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
export USE_SCCACHE=1
export SCCACHE_IGNORE_SERVER_IO_ERROR=1
echo "Free space on filesystem before build:"
df -h
export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
pytorch/.ci/pytorch/windows/arm64/build_libtorch.bat
elif [[ "$PACKAGE_TYPE" == 'wheel' ]]; then
pytorch/.ci/pytorch/windows/arm64/build_pytorch.bat
fi
echo "Free space on filesystem after build:"
df -h

View File

@ -1,6 +0,0 @@
#!/bin/bash
set -eux -o pipefail
source "${BINARY_ENV_FILE:-/c/w/env}"
pytorch/.ci/pytorch/windows/arm64/smoke_test.bat

View File

@ -13,7 +13,6 @@ export VC_YEAR=2022
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
export USE_SCCACHE=0
export XPU_VERSION=2025.0
export XPU_ENABLE_KINETO=1
fi
echo "Free space on filesystem before build:"

View File

@ -12,7 +12,6 @@ bugprone-*,
-bugprone-macro-parentheses,
-bugprone-lambda-function-name,
-bugprone-reserved-identifier,
-bugprone-return-const-ref-from-parameter,
-bugprone-swapped-arguments,
clang-analyzer-core.*,
clang-analyzer-cplusplus.*,
@ -25,7 +24,6 @@ cppcoreguidelines-*,
-cppcoreguidelines-avoid-non-const-global-variables,
-cppcoreguidelines-interfaces-global-init,
-cppcoreguidelines-macro-usage,
-cppcoreguidelines-macro-to-enum,
-cppcoreguidelines-owning-memory,
-cppcoreguidelines-pro-bounds-array-to-pointer-decay,
-cppcoreguidelines-pro-bounds-constant-array-index,
@ -57,7 +55,6 @@ modernize-*,
-modernize-use-trailing-return-type,
-modernize-use-nodiscard,
performance-*,
-performance-enum-size,
readability-container-size-empty,
readability-delete-null-pointer,
readability-duplicate-include

View File

@ -38,7 +38,6 @@ per-file-ignores =
torchgen/api/types/__init__.py: F401,F403
torchgen/executorch/api/types/__init__.py: F401,F403
test/dynamo/test_higher_order_ops.py: B950
test/dynamo/test_error_messages.py: B950
torch/testing/_internal/dynamo_test_failures.py: B950
# TOR901 is only for test, we want to ignore it for everything else.
# It's not easy to configure this without affecting other per-file-ignores,

View File

@ -1,13 +1,8 @@
self-hosted-runner:
labels:
# GitHub hosted runner that actionlint doesn't recognize because actionlint version (1.6.21) is too old
- ubuntu-24.04
# GitHub hosted x86 Linux runners
# TODO: Clean up mentions of linux.20_04 when the upgrade to linux.24_04 is complete
- linux.20_04.4x
- linux.20_04.16x
- linux.24_04.4x
- linux.24_04.16x
# Organization-wide AWS Linux Runners
- linux.large
- linux.2xlarge
@ -15,6 +10,7 @@ self-hosted-runner:
- linux.9xlarge.ephemeral
- am2.linux.9xlarge.ephemeral
- linux.12xlarge
- linux.12xlarge.ephemeral
- linux.24xlarge
- linux.24xlarge.ephemeral
- linux.arm64.2xlarge
@ -46,13 +42,10 @@ self-hosted-runner:
- windows.8xlarge.nvidia.gpu
- windows.8xlarge.nvidia.gpu.nonephemeral
- windows.g5.4xlarge.nvidia.gpu
# Windows ARM64 runners
- windows-11-arm64
# Organization-wide AMD hosted runners
- linux.rocm.gpu
- linux.rocm.gpu.2
- linux.rocm.gpu.4
- rocm-docker
# Repo-specific Apple hosted runners
- macos-m1-ultra
- macos-m2-14

View File

@ -23,44 +23,9 @@ runs:
id: check_container_runner
run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
- name: Set up parallel fetch and clean workspace
id: first-clean
continue-on-error: true
- name: Clean workspace
shell: bash
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
env:
NO_SUDO: ${{ inputs.no-sudo }}
run: |
# Use all available CPUs for fetching
cd "${GITHUB_WORKSPACE}"
git config --global fetch.parallel 0
git config --global submodule.fetchJobs 0
# Clean workspace. The default checkout action should also do this, but
# do it here as well just in case
if [[ -d .git ]]; then
if [ -z "${NO_SUDO}" ]; then
sudo git clean -ffdx
else
git clean -ffdx
fi
fi
- name: Checkout PyTorch
id: first-checkout-attempt
continue-on-error: true
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
# --depth=1 for speed, manually fetch history and other refs as necessary
fetch-depth: ${{ inputs.fetch-depth }}
submodules: ${{ inputs.submodules }}
show-progress: false
- name: Clean workspace (try again)
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' &&
(steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }}
shell: bash
env:
NO_SUDO: ${{ inputs.no-sudo }}
run: |
@ -75,11 +40,11 @@ runs:
fi
mkdir "${GITHUB_WORKSPACE}"
- name: Checkout PyTorch (try again)
- name: Checkout PyTorch
uses: actions/checkout@v4
if: ${{ steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success' }}
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
# --depth=1 for speed, manually fetch history and other refs as necessary
fetch-depth: ${{ inputs.fetch-depth }}
submodules: ${{ inputs.submodules }}
show-progress: false
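The two-phase pattern above (optimistic clean and checkout with failures tolerated, then retry both only if either first step failed) is easier to see stripped of YAML; a sketch:

def checkout_with_retry(clean, checkout) -> None:
    ok = True
    for step in (clean, checkout):  # first attempts, continue-on-error
        try:
            step()
        except Exception:
            ok = False
    if not ok:                      # the "try again" steps
        clean()
        checkout()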

View File

@ -15,6 +15,7 @@ runs:
-e BINARY_ENV_FILE \
-e BUILD_ENVIRONMENT \
-e DESIRED_CUDA \
-e DESIRED_DEVTOOLSET \
-e DESIRED_PYTHON \
-e GITHUB_ACTIONS \
-e GPU_ARCH_TYPE \

View File

@ -1 +1 @@
318bace01aebc1f82ae13d0d133fcf9fede73383
f084f34bbb743fada85f66b0ed8041387565e69c

.github/labeler.yml
View File

@ -98,7 +98,7 @@
- test/distributed/**
- torch/testing/_internal/distributed/**
"release notes: distributed (checkpoint)":
"module: distributed_checkpoint":
- torch/distributed/checkpoint/**
- test/distributed/checkpoint/**

View File

@ -334,7 +334,6 @@
- XiaobingSuper
- jgong5
- mingfeima
- EikanWang
mandatory_checks_name:
- EasyCLA
- Lint
@ -367,7 +366,6 @@
- jgong5
- vfdev-5
- leslie-fang-intel
- EikanWang
mandatory_checks_name:
- EasyCLA
- Lint
@ -381,7 +379,6 @@
approved_by:
- leslie-fang-intel
- jgong5
- EikanWang
mandatory_checks_name:
- EasyCLA
- Lint

View File

@ -7,7 +7,6 @@ ciflow_push_tags:
- ciflow/inductor
- ciflow/inductor-periodic
- ciflow/inductor-rocm
- ciflow/inductor-perf-test-nightly-rocm
- ciflow/inductor-perf-compare
- ciflow/inductor-micro-benchmark
- ciflow/inductor-micro-benchmark-cpu-x86
@ -17,7 +16,6 @@ ciflow_push_tags:
- ciflow/nightly
- ciflow/periodic
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/s390
- ciflow/slow
- ciflow/trunk

View File

@ -5,7 +5,7 @@
# functorch/docs/requirements.txt
# .ci/docker/requirements-ci.txt
boto3==1.35.42
jinja2==3.1.6
jinja2==3.1.5
lintrunner==0.10.7
ninja==1.10.0.post1
nvidia-ml-py==11.525.84

View File

@ -123,7 +123,7 @@ def main() -> None:
parser = ArgumentParser("Build Triton binaries")
parser.add_argument("--release", action="store_true")
parser.add_argument(
"--device", type=str, default="cuda", choices=["cuda", "rocm", "xpu", "aarch64"]
"--device", type=str, default="cuda", choices=["cuda", "rocm", "xpu"]
)
parser.add_argument("--py-version", type=str)
parser.add_argument("--commit-hash", type=str)

View File

@ -16,15 +16,16 @@ from typing import Optional
# NOTE: Also update the CUDA sources in tools/nightly.py when changing this list
CUDA_ARCHES = ["11.8", "12.6", "12.8"]
CUDA_STABLE = "12.6"
CUDA_ARCHES = ["11.8", "12.4", "12.6", "12.8"]
CUDA_ARCHES_FULL_VERSION = {
"11.8": "11.8.0",
"12.4": "12.4.1",
"12.6": "12.6.3",
"12.8": "12.8.0",
}
CUDA_ARCHES_CUDNN_VERSION = {
"11.8": "9",
"12.4": "9",
"12.6": "9",
"12.8": "9",
}
@ -34,11 +35,13 @@ ROCM_ARCHES = ["6.2.4", "6.3"]
XPU_ARCHES = ["xpu"]
CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
CPU_AARCH64_ARCH = ["cpu-aarch64"]
CPU_S390X_ARCH = ["cpu-s390x"]
CUDA_AARCH64_ARCHES = ["12.8-aarch64"]
CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
@ -55,6 +58,21 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"12.4": (
"nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"12.6": (
"nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@ -66,7 +84,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -75,30 +93,26 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"xpu": (
"intel-cmplr-lib-rt==2025.0.4; platform_system == 'Linux' | "
"intel-cmplr-lib-ur==2025.0.4; platform_system == 'Linux' | "
"intel-cmplr-lic-rt==2025.0.4; platform_system == 'Linux' | "
"intel-sycl-rt==2025.0.4; platform_system == 'Linux' | "
"intel-cmplr-lib-rt==2025.0.5; platform_system == 'Windows' | "
"intel-cmplr-lib-ur==2025.0.5; platform_system == 'Windows' | "
"intel-cmplr-lic-rt==2025.0.5; platform_system == 'Windows' | "
"intel-sycl-rt==2025.0.5; platform_system == 'Windows' | "
"intel-cmplr-lib-rt==2025.0.2 | "
"intel-cmplr-lib-ur==2025.0.2 | "
"intel-cmplr-lic-rt==2025.0.2 | "
"intel-sycl-rt==2025.0.2 | "
"tcmlib==1.2.0 | "
"umf==0.9.1 | "
"intel-pti==0.10.1"
"intel-pti==0.10.0"
),
}
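Each entry above is a single '|'-joined string of PEP 508 requirements; consumers presumably split it back apart, along these lines:

def split_extra_install_requirements(spec: str) -> list[str]:
    # One "pkg==ver; marker" requirement per '|'-separated segment.
    return [req.strip() for req in spec.split("|")]

# e.g. split_extra_install_requirements(PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.6"])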
@ -144,6 +158,8 @@ def arch_type(arch_version: str) -> str:
return "rocm"
elif arch_version in XPU_ARCHES:
return "xpu"
elif arch_version in CPU_CXX11_ABI_ARCH:
return "cpu-cxx11-abi"
elif arch_version in CPU_AARCH64_ARCH:
return "cpu-aarch64"
elif arch_version in CPU_S390X_ARCH:
@ -172,23 +188,31 @@ WHEEL_CONTAINER_IMAGES = {
},
"xpu": f"pytorch/manylinux2_28-builder:xpu-{DEFAULT_TAG}",
"cpu": f"pytorch/manylinux2_28-builder:cpu-{DEFAULT_TAG}",
"cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
"cpu-aarch64": f"pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
"cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}",
}
CXX11_ABI = "cxx11-abi"
RELEASE = "release"
DEBUG = "debug"
LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
LIBTORCH_CONTAINER_IMAGES: dict[tuple[str, str], str] = {
**{
gpu_arch: f"pytorch/libtorch-cxx11-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
(
gpu_arch,
CXX11_ABI,
): f"pytorch/libtorch-cxx11-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in CUDA_ARCHES
},
**{
gpu_arch: f"pytorch/libtorch-cxx11-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
(
gpu_arch,
CXX11_ABI,
): f"pytorch/libtorch-cxx11-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in ROCM_ARCHES
},
"cpu": f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}",
("cpu", CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}",
}
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
@ -198,6 +222,7 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
return {
"cpu": "cpu",
"cpu-aarch64": "cpu",
"cpu-cxx11-abi": "cpu-cxx11-abi",
"cpu-s390x": "cpu",
"cuda": f"cu{gpu_arch_version.replace('.', '')}",
"cuda-aarch64": f"cu{gpu_arch_version.replace('-aarch64', '').replace('.', '')}",
@ -212,7 +237,7 @@ def list_without(in_list: list[str], without: list[str]) -> list[str]:
def generate_libtorch_matrix(
os: str,
release_type: str,
abi_version: str,
arches: Optional[list[str]] = None,
libtorch_variants: Optional[list[str]] = None,
) -> list[dict[str, str]]:
@ -221,8 +246,14 @@ def generate_libtorch_matrix(
if os == "linux":
arches += CUDA_ARCHES
arches += ROCM_ARCHES
# skip CUDA 12.8 builds for libtorch
if "12.8" in arches:
arches.remove("12.8")
elif os == "windows":
arches += CUDA_ARCHES
# skip CUDA 12.8 builds on Windows
if "12.8" in arches:
arches.remove("12.8")
if libtorch_variants is None:
libtorch_variants = [
"shared-with-deps",
@ -234,6 +265,9 @@ def generate_libtorch_matrix(
ret: list[dict[str, str]] = []
for arch_version in arches:
for libtorch_variant in libtorch_variants:
# one of the values in the following list must be exactly
# CXX11_ABI, but the precise value of the other one doesn't
# matter
gpu_arch_type = arch_type(arch_version)
gpu_arch_version = "" if arch_version == "cpu" else arch_version
# ROCm builds without-deps failed even in ROCm runners; skip for now
@ -246,15 +280,16 @@ def generate_libtorch_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"libtorch_config": release_type,
"libtorch_variant": libtorch_variant,
"libtorch_config": abi_version if os == "windows" else "",
"devtoolset": abi_version if os != "windows" else "",
"container_image": (
LIBTORCH_CONTAINER_IMAGES[arch_version]
if os not in ("windows", "windows-arm64")
LIBTORCH_CONTAINER_IMAGES[(arch_version, abi_version)]
if os != "windows"
else ""
),
"package_type": "libtorch",
"build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{release_type}".replace(
"build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{abi_version}".replace(
".", "_"
),
}
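Condensed shape of one matrix entry produced above (illustrative only; cpu/cuda handled, rocm elided):

def libtorch_entry(arch_version: str, abi_version: str, variant: str) -> dict[str, str]:
    gpu_arch_type = "cpu" if arch_version == "cpu" else "cuda"
    gpu_arch_version = "" if arch_version == "cpu" else arch_version
    return {
        "gpu_arch_type": gpu_arch_type,
        "gpu_arch_version": gpu_arch_version,
        "libtorch_variant": variant,
        "devtoolset": abi_version,
        "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{variant}-{abi_version}".replace(".", "_"),
    }

# libtorch_entry("cpu", "cxx11-abi", "shared-with-deps")["build_name"]
# -> "libtorch-cpu-shared-with-deps-cxx11-abi"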
@ -280,9 +315,12 @@ def generate_wheels_matrix(
# Define default compute architectures
arches = ["cpu"]
if os == "linux":
arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
elif os == "windows":
arches += CUDA_ARCHES + XPU_ARCHES
# skip CUDA 12.8 builds on Windows until available
if "12.8" in arches:
arches.remove("12.8")
elif os == "linux-aarch64":
# Separate new if as the CPU type is different and
# uses different build/test scripts
@ -299,6 +337,7 @@ def generate_wheels_matrix(
gpu_arch_version = (
""
if arch_version == "cpu"
or arch_version == "cpu-cxx11-abi"
or arch_version == "cpu-aarch64"
or arch_version == "cpu-s390x"
or arch_version == "xpu"
@ -310,7 +349,7 @@ def generate_wheels_matrix(
continue
if use_split_build and (
arch_version not in ["12.6", "12.8", "11.8", "cpu"] or os != "linux"
arch_version not in ["12.6", "12.4", "11.8", "cpu"] or os != "linux"
):
raise RuntimeError(
"Split build is only supported on linux with cuda 12*, 11.8, and cpu.\n"
@ -321,26 +360,26 @@ def generate_wheels_matrix(
# cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
if (
arch_version in ["12.8", "12.6", "11.8"]
arch_version in ["12.8", "12.6", "12.4", "11.8"]
and os == "linux"
or arch_version in CUDA_AARCH64_ARCHES
):
desired_cuda = translate_desired_cuda(gpu_arch_type, gpu_arch_version)
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": desired_cuda,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"devtoolset": "cxx11-abi",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": (
PYTORCH_EXTRA_INSTALL_REQUIREMENTS[
f"{desired_cuda[2:4]}.{desired_cuda[4:]}" # for cuda-aarch64: cu126 -> 12.6
]
if os == "linux-aarch64"
else PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]
if os != "linux-aarch64"
else ""
),
"build_name": (
f"{package_type}-py{python_version}-{gpu_arch_type}"
@ -350,8 +389,8 @@ def generate_wheels_matrix(
), # include special case for aarch64 build, remove the -aarch64 postfix
}
)
# Special build building to use on Colab. Python 3.11 for 12.6 CUDA
if python_version == "3.11" and arch_version == CUDA_STABLE:
# Special build building to use on Colab. Python 3.11 for 12.4 CUDA
if python_version == "3.11" and arch_version == "12.4":
ret.append(
{
"python_version": python_version,
@ -361,6 +400,7 @@ def generate_wheels_matrix(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": "",
@ -379,6 +419,12 @@ def generate_wheels_matrix(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True" if use_split_build else "False",
"devtoolset": (
"cxx11-abi"
if (arch_version in ["cpu-cxx11-abi", "cpu-aarch64"])
or os == "linux"
else ""
),
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
@ -387,7 +433,7 @@ def generate_wheels_matrix(
"pytorch_extra_install_requirements": (
PYTORCH_EXTRA_INSTALL_REQUIREMENTS["xpu"]
if gpu_arch_type == "xpu"
else PYTORCH_EXTRA_INSTALL_REQUIREMENTS[CUDA_STABLE]
else PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.4"]
if os != "linux"
else ""
),
@ -399,4 +445,5 @@ def generate_wheels_matrix(
validate_nccl_dep_consistency("12.8")
validate_nccl_dep_consistency("12.6")
validate_nccl_dep_consistency("12.4")
validate_nccl_dep_consistency("11.8")

View File

@ -54,6 +54,7 @@ class BinaryBuildWorkflow:
# Optional fields
build_environment: str = ""
abi_version: str = ""
ciflow_config: CIFlowConfig = field(default_factory=CIFlowConfig)
is_scheduled: str = ""
branches: str = "nightly"
@ -63,7 +64,12 @@ class BinaryBuildWorkflow:
use_split_build: bool = False
def __post_init__(self) -> None:
self.build_environment = f"{self.os}-binary-{self.package_type}"
if self.abi_version:
self.build_environment = (
f"{self.os}-binary-{self.package_type}-{self.abi_version}"
)
else:
self.build_environment = f"{self.os}-binary-{self.package_type}"
if self.use_split_build:
# added to distinguish concurrency groups
self.build_environment += "-split"
@ -90,7 +96,6 @@ class BinaryBuildWorkflow:
class OperatingSystem:
LINUX = "linux"
WINDOWS = "windows"
WINDOWS_ARM64 = "windows-arm64"
MACOS = "macos"
MACOS_ARM64 = "macos-arm64"
LINUX_AARCH64 = "linux-aarch64"
@ -127,9 +132,10 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="libtorch",
abi_version=generate_binary_build_matrix.CXX11_ABI,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.LINUX,
generate_binary_build_matrix.RELEASE,
generate_binary_build_matrix.CXX11_ABI,
libtorch_variants=["shared-with-deps"],
),
ciflow_config=CIFlowConfig(
@ -145,7 +151,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
package_type="manywheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.LINUX,
arches=["11.8", "12.6", "12.8"],
arches=["11.8", "12.4", "12.6", "12.8"],
python_versions=["3.9"],
),
branches="main",
@ -169,9 +175,10 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.LINUX,
package_type="libtorch",
abi_version=generate_binary_build_matrix.CXX11_ABI,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.LINUX,
generate_binary_build_matrix.RELEASE,
generate_binary_build_matrix.CXX11_ABI,
arches=["cpu"],
libtorch_variants=["shared-with-deps"],
),
@ -194,6 +201,7 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS,
package_type="libtorch",
abi_version=generate_binary_build_matrix.RELEASE,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.RELEASE,
@ -207,6 +215,7 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS,
package_type="libtorch",
abi_version=generate_binary_build_matrix.DEBUG,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.DEBUG,
@ -223,6 +232,7 @@ WINDOWS_BINARY_SMOKE_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS,
package_type="libtorch",
abi_version=generate_binary_build_matrix.RELEASE,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.RELEASE,
@ -237,6 +247,7 @@ WINDOWS_BINARY_SMOKE_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS,
package_type="libtorch",
abi_version=generate_binary_build_matrix.DEBUG,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.DEBUG,
@ -250,57 +261,14 @@ WINDOWS_BINARY_SMOKE_WORKFLOWS = [
),
]
WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS_ARM64,
package_type="wheel",
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
OperatingSystem.WINDOWS_ARM64,
arches=["cpu"],
python_versions=["3.12"],
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
isolated_workflow=True,
),
),
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS_ARM64,
package_type="libtorch",
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS_ARM64,
generate_binary_build_matrix.RELEASE,
arches=["cpu"],
libtorch_variants=["shared-with-deps"],
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
isolated_workflow=True,
),
),
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS_ARM64,
package_type="libtorch",
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS_ARM64,
generate_binary_build_matrix.DEBUG,
arches=["cpu"],
libtorch_variants=["shared-with-deps"],
),
ciflow_config=CIFlowConfig(
labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH},
isolated_workflow=True,
),
),
]
MACOS_BINARY_BUILD_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.MACOS_ARM64,
package_type="libtorch",
abi_version=generate_binary_build_matrix.CXX11_ABI,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.MACOS,
generate_binary_build_matrix.RELEASE,
generate_binary_build_matrix.CXX11_ABI,
libtorch_variants=["shared-with-deps"],
),
cross_compile_arm64=False,
@ -387,10 +355,6 @@ def main() -> None:
jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
WINDOWS_BINARY_SMOKE_WORKFLOWS,
),
(
jinja_env.get_template("windows_arm64_binary_build_workflow.yml.j2"),
WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS,
),
(
jinja_env.get_template("macos_binary_build_workflow.yml.j2"),
MACOS_BINARY_BUILD_WORKFLOWS,

View File

@ -1,30 +0,0 @@
#!/usr/bin/env python3
"""Helper script - Return CI variables such as stable cuda, min python version, etc."""
import argparse
import sys
def main(args: list[str]) -> None:
import generate_binary_build_matrix
parser = argparse.ArgumentParser()
parser.add_argument(
"--cuda-stable-version",
action="store_true",
help="get cuda stable version",
)
parser.add_argument(
"--min-python-version",
action="store_true",
help="get min supported python version",
)
options = parser.parse_args(args)
if options.cuda_stable_version:
return print(generate_binary_build_matrix.CUDA_STABLE)
if options.min_python_version:
return print(generate_binary_build_matrix.FULL_PYTHON_VERSIONS[0])
if __name__ == "__main__":
main(sys.argv[1:])

View File

@ -57,10 +57,10 @@ def gh_fetch_url_and_headers(
print(
f"""{url}
Rate limit exceeded:
Used: {err.headers["X-RateLimit-Used"]}
Limit: {err.headers["X-RateLimit-Limit"]}
Remaining: {err.headers["X-RateLimit-Remaining"]}
Resets at: {err.headers["x-RateLimit-Reset"]}"""
Used: {err.headers['X-RateLimit-Used']}
Limit: {err.headers['X-RateLimit-Limit']}
Remaining: {err.headers['X-RateLimit-Remaining']}
Resets at: {err.headers['x-RateLimit-Reset']}"""
)
else:
print(f"Error fetching {url} {err}")

View File

@ -63,9 +63,9 @@ def gh_get_labels(org: str, repo: str) -> list[str]:
update_labels(labels, info)
last_page = get_last_page_num_from_header(header)
assert last_page > 0, (
"Error reading header info to determine total number of pages of labels"
)
assert (
last_page > 0
), "Error reading header info to determine total number of pages of labels"
for page_number in range(2, last_page + 1): # skip page 1
_, info = request_for_labels(prefix + f"&page={page_number}")
update_labels(labels, info)

View File

@ -1,6 +1,11 @@
#!/usr/bin/env bash
set -ex
# The generic Linux job chooses to use base env, not the one setup by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)"
conda activate "${CONDA_ENV}"
# Use uv to speed up lintrunner init
python3 -m pip install uv==0.1.45

View File

@ -33,7 +33,7 @@ class PRIdentifier(str):
__slots__ = ()
def __new__(cls, value: str) -> "PRIdentifier":
md5 = hashlib.md5(value.encode("utf-8"), usedforsecurity=False).hexdigest()
md5 = hashlib.md5(value.encode("utf-8")).hexdigest()
return super().__new__(cls, md5)
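One detail in the changed line: the usedforsecurity=False variant (available since Python 3.9) marks the digest as non-cryptographic, which keeps FIPS-restricted interpreters from rejecting the md5 call. A standalone sketch:

import hashlib

def pr_identifier(value: str) -> str:
    # Deterministic, non-cryptographic identifier derived from the PR string.
    return hashlib.md5(value.encode("utf-8"), usedforsecurity=False).hexdigest()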

View File

@ -5,50 +5,6 @@ FROM --platform=linux/amd64 docker.io/ubuntu:24.04 as ld-prefix
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get -y install ca-certificates libicu74 libssl3
# Patched podman
FROM --platform=linux/s390x docker.io/ubuntu:24.04 as podman
ENV DEBIAN_FRONTEND=noninteractive
RUN sed -i 's/^Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/ubuntu.sources
RUN apt-get update && \
apt-get install -y \
cmake \
curl \
devscripts \
dpkg-dev \
gdb \
less \
make \
python3 \
python3-pip \
quilt \
rsync \
software-properties-common \
stress-ng \
vim \
nano \
wget && \
apt-get build-dep -y podman && \
apt-get source podman
COPY podman-patches/podman-25245.patch /tmp/podman-25245.patch
COPY podman-patches/podman-25102-backport.patch /tmp/podman-25102-backport.patch
# import and apply patches
# patches:
# https://github.com/containers/podman/pull/25102
# https://github.com/containers/podman/pull/25245
RUN cd /libpod-* && \
quilt import /tmp/podman-25245.patch && quilt push && \
quilt import /tmp/podman-25102-backport.patch && quilt push && \
dch -i "Fix podman deadlock and add option to clean up build leftovers" && \
/bin/rm /tmp/podman-25245.patch /tmp/podman-25102-backport.patch
# build patched podman
RUN cd /libpod-* && \
debuild -i -us -uc -b && \
/bin/rm /podman-remote_*.deb && \
mkdir /tmp/podman && cp -v /podman*.deb /tmp/podman
# Main image.
FROM --platform=linux/s390x docker.io/ubuntu:24.04
@ -89,11 +45,7 @@ COPY fs/ /
RUN chmod +x /usr/bin/actions-runner /usr/bin/entrypoint
# install podman
# RUN apt-get update && apt -y install podman podman-docker
# install patched podman
COPY --from=podman /tmp/podman /tmp/podman
RUN apt-get update && apt -y install /tmp/podman/*.deb && /bin/rm -rfv /tmp/podman
RUN apt -y install podman podman-docker
# amd64 Github Actions Runner.
RUN useradd -m actions-runner
@ -113,7 +65,7 @@ RUN virtualenv --system-site-packages venv
#
COPY --chown=actions-runner:actions-runner manywheel-s390x.tar /home/actions-runner/manywheel-s390x.tar
RUN curl -L https://github.com/actions/runner/releases/download/v2.322.0/actions-runner-linux-x64-2.322.0.tar.gz | tar -xz
RUN curl -L https://github.com/actions/runner/releases/download/v2.317.0/actions-runner-linux-x64-2.317.0.tar.gz | tar -xz
ENTRYPOINT ["/usr/bin/entrypoint"]
CMD ["/usr/bin/actions-runner"]

View File

@ -1,358 +0,0 @@
diff --git a/cmd/podman/system/prune.go b/cmd/podman/system/prune.go
index f7cf7b551..739f87cde 100644
--- a/cmd/podman/system/prune.go
+++ b/cmd/podman/system/prune.go
@@ -48,6 +48,7 @@ func init() {
flags.BoolVarP(&force, "force", "f", false, "Do not prompt for confirmation. The default is false")
flags.BoolVarP(&pruneOptions.All, "all", "a", false, "Remove all unused data")
flags.BoolVar(&pruneOptions.External, "external", false, "Remove container data in storage not controlled by podman")
+ flags.BoolVar(&pruneOptions.Build, "build", false, "Remove build containers")
flags.BoolVar(&pruneOptions.Volume, "volumes", false, "Prune volumes")
filterFlagName := "filter"
flags.StringArrayVar(&filters, filterFlagName, []string{}, "Provide filter values (e.g. 'label=<key>=<value>')")
@@ -64,8 +65,12 @@ func prune(cmd *cobra.Command, args []string) error {
volumeString = `
- all volumes not used by at least one container`
}
-
- fmt.Printf(createPruneWarningMessage(pruneOptions), volumeString, "Are you sure you want to continue? [y/N] ")
+ buildString := ""
+ if pruneOptions.Build {
+ buildString = `
+ - all build containers`
+ }
+ fmt.Printf(createPruneWarningMessage(pruneOptions), volumeString, buildString, "Are you sure you want to continue? [y/N] ")
answer, err := reader.ReadString('\n')
if err != nil {
@@ -124,7 +129,7 @@ func createPruneWarningMessage(pruneOpts entities.SystemPruneOptions) string {
if pruneOpts.All {
return `WARNING! This command removes:
- all stopped containers
- - all networks not used by at least one container%s
+ - all networks not used by at least one container%s%s
- all images without at least one container associated with them
- all build cache
@@ -132,7 +137,7 @@ func createPruneWarningMessage(pruneOpts entities.SystemPruneOptions) string {
}
return `WARNING! This command removes:
- all stopped containers
- - all networks not used by at least one container%s
+ - all networks not used by at least one container%s%s
- all dangling images
- all dangling build cache
diff --git a/docs/source/markdown/podman-system-prune.1.md b/docs/source/markdown/podman-system-prune.1.md
index 52f9ec1c7..95099d018 100644
--- a/docs/source/markdown/podman-system-prune.1.md
+++ b/docs/source/markdown/podman-system-prune.1.md
@@ -7,20 +7,28 @@ podman\-system\-prune - Remove all unused pods, containers, images, networks, an
**podman system prune** [*options*]
## DESCRIPTION
-**podman system prune** removes all unused containers (both dangling and unreferenced), pods, networks, and optionally, volumes from local storage.
+**podman system prune** removes all unused containers (both dangling and unreferenced), build containers, pods, networks, and optionally, volumes from local storage.
Use the **--all** option to delete all unused images. Unused images are dangling images as well as any image that does not have any containers based on it.
By default, volumes are not removed to prevent important data from being deleted if there is currently no container using the volume. Use the **--volumes** flag when running the command to prune volumes as well.
+By default, build containers are not removed to prevent interference with builds in progress. Use the **--build** flag when running the command to remove build containers as well.
+
## OPTIONS
#### **--all**, **-a**
Recursively remove all unused pods, containers, images, networks, and volume data. (Maximum 50 iterations.)
+#### **--build**
+
+Removes any build containers that were created during the build, but were not removed because the build was unexpectedly terminated.
+
+Note: **This is not a safe operation and should be executed only when no builds are in progress. It can interfere with builds in progress.**
+
#### **--external**
-Removes all leftover container storage files from local storage not managed by Podman. In normal circumstances, no such data exists, but in case of an unclean shutdown, the Podman database may be corrupted and cause this.
+Tries to clean up remainders of previous containers or layers that are not referenced in the storage json files. These can happen in the case of unclean shutdowns or regular restarts in transient storage mode.
However, when using transient storage mode, the Podman database does not persist. This means containers leave the writable layers on disk after a reboot. When using a transient store, it is recommended that the **podman system prune --external** command is run during boot.
diff --git a/libpod/runtime.go b/libpod/runtime.go
index 986e40f60..609fbba57 100644
--- a/libpod/runtime.go
+++ b/libpod/runtime.go
@@ -33,6 +33,7 @@ import (
"github.com/containers/podman/v4/libpod/lock"
"github.com/containers/podman/v4/libpod/plugin"
"github.com/containers/podman/v4/libpod/shutdown"
+ "github.com/containers/podman/v4/pkg/domain/entities/reports"
"github.com/containers/podman/v4/pkg/rootless"
"github.com/containers/podman/v4/pkg/systemd"
"github.com/containers/podman/v4/pkg/util"
@@ -1250,3 +1251,52 @@ func (r *Runtime) LockConflicts() (map[uint32][]string, []uint32, error) {
return toReturn, locksHeld, nil
}
+
+// Exists checks whether a file or directory exists at the given path.
+// If the path is a symlink, the symlink is followed.
+func Exists(path string) error {
+ // It uses unix.Faccessat which is a faster operation compared to os.Stat for
+ // simply checking the existence of a file.
+ err := unix.Faccessat(unix.AT_FDCWD, path, unix.F_OK, 0)
+ if err != nil {
+ return &os.PathError{Op: "faccessat", Path: path, Err: err}
+ }
+ return nil
+}
+
+// PruneBuildContainers removes any build containers that were created during the build,
+// but were not removed because the build was unexpectedly terminated.
+//
+// Note: This is not a safe operation and should be executed only when no builds are in progress. It can interfere with builds in progress.
+func (r *Runtime) PruneBuildContainers() ([]*reports.PruneReport, error) {
+ stageContainersPruneReports := []*reports.PruneReport{}
+
+ containers, err := r.store.Containers()
+ if err != nil {
+ return stageContainersPruneReports, err
+ }
+ for _, container := range containers {
+ path, err := r.store.ContainerDirectory(container.ID)
+ if err != nil {
+ return stageContainersPruneReports, err
+ }
+ if err := Exists(filepath.Join(path, "buildah.json")); err != nil {
+ continue
+ }
+
+ report := &reports.PruneReport{
+ Id: container.ID,
+ }
+ size, err := r.store.ContainerSize(container.ID)
+ if err != nil {
+ report.Err = err
+ }
+ report.Size = uint64(size)
+
+ if err := r.store.DeleteContainer(container.ID); err != nil {
+ report.Err = errors.Join(report.Err, err)
+ }
+ stageContainersPruneReports = append(stageContainersPruneReports, report)
+ }
+ return stageContainersPruneReports, nil
+}
diff --git a/pkg/api/handlers/libpod/system.go b/pkg/api/handlers/libpod/system.go
index 70d4493f8..7c129b1ba 100644
--- a/pkg/api/handlers/libpod/system.go
+++ b/pkg/api/handlers/libpod/system.go
@@ -22,6 +22,7 @@ func SystemPrune(w http.ResponseWriter, r *http.Request) {
All bool `schema:"all"`
Volumes bool `schema:"volumes"`
External bool `schema:"external"`
+ Build bool `schema:"build"`
}{}
if err := decoder.Decode(&query, r.URL.Query()); err != nil {
@@ -43,6 +44,7 @@ func SystemPrune(w http.ResponseWriter, r *http.Request) {
Volume: query.Volumes,
Filters: *filterMap,
External: query.External,
+ Build: query.Build,
}
report, err := containerEngine.SystemPrune(r.Context(), pruneOptions)
if err != nil {
diff --git a/pkg/bindings/system/types.go b/pkg/bindings/system/types.go
index 89e093f68..b4a4ff064 100644
--- a/pkg/bindings/system/types.go
+++ b/pkg/bindings/system/types.go
@@ -18,6 +18,7 @@ type PruneOptions struct {
Filters map[string][]string
Volumes *bool
External *bool
+ Build *bool
}
// VersionOptions are optional options for getting version info
diff --git a/pkg/bindings/system/types_prune_options.go b/pkg/bindings/system/types_prune_options.go
index d00498520..5f3bd652c 100644
--- a/pkg/bindings/system/types_prune_options.go
+++ b/pkg/bindings/system/types_prune_options.go
@@ -76,3 +76,18 @@ func (o *PruneOptions) GetExternal() bool {
}
return *o.External
}
+
+// WithBuild set field Build to given value
+func (o *PruneOptions) WithBuild(value bool) *PruneOptions {
+ o.Build = &value
+ return o
+}
+
+// GetBuild returns value of field Build
+func (o *PruneOptions) GetBuild() bool {
+ if o.Build == nil {
+ var z bool
+ return z
+ }
+ return *o.Build
+}
diff --git a/pkg/domain/entities/system.go b/pkg/domain/entities/system.go
index 473db3530..f6938652a 100644
--- a/pkg/domain/entities/system.go
+++ b/pkg/domain/entities/system.go
@@ -22,6 +22,7 @@ type SystemPruneOptions struct {
Volume bool
Filters map[string][]string `json:"filters" schema:"filters"`
External bool
+ Build bool
}
// SystemPruneReport provides report after system prune is executed.
diff --git a/pkg/domain/infra/abi/system.go b/pkg/domain/infra/abi/system.go
index 24ee64d29..ea3e5f203 100644
--- a/pkg/domain/infra/abi/system.go
+++ b/pkg/domain/infra/abi/system.go
@@ -150,16 +150,16 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool)
return nil
}
-// SystemPrune removes unused data from the system. Pruning pods, containers, networks, volumes and images.
+// SystemPrune removes unused data from the system. Pruning pods, containers, build container, networks, volumes and images.
func (ic *ContainerEngine) SystemPrune(ctx context.Context, options entities.SystemPruneOptions) (*entities.SystemPruneReport, error) {
var systemPruneReport = new(entities.SystemPruneReport)
if options.External {
- if options.All || options.Volume || len(options.Filters) > 0 {
+ if options.All || options.Volume || len(options.Filters) > 0 || options.Build {
return nil, fmt.Errorf("system prune --external cannot be combined with other options")
}
- err := ic.Libpod.GarbageCollect()
- if err != nil {
+
+ if err := ic.Libpod.GarbageCollect(); err != nil {
return nil, err
}
return systemPruneReport, nil
@@ -170,6 +170,17 @@ func (ic *ContainerEngine) SystemPrune(ctx context.Context, options entities.Sys
filters = append(filters, fmt.Sprintf("%s=%s", k, v[0]))
}
reclaimedSpace := (uint64)(0)
+
+ // Prune Build Containers
+ if options.Build {
+ stageContainersPruneReports, err := ic.Libpod.PruneBuildContainers()
+ if err != nil {
+ return nil, err
+ }
+ reclaimedSpace += reports.PruneReportsSize(stageContainersPruneReports)
+ systemPruneReport.ContainerPruneReports = append(systemPruneReport.ContainerPruneReports, stageContainersPruneReports...)
+ }
+
found := true
for found {
found = false
diff --git a/pkg/domain/infra/tunnel/system.go b/pkg/domain/infra/tunnel/system.go
index fc82e7b2b..142a9fa5c 100644
--- a/pkg/domain/infra/tunnel/system.go
+++ b/pkg/domain/infra/tunnel/system.go
@@ -19,7 +19,7 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool)
// SystemPrune prunes unused data from the system.
func (ic *ContainerEngine) SystemPrune(ctx context.Context, opts entities.SystemPruneOptions) (*entities.SystemPruneReport, error) {
- options := new(system.PruneOptions).WithAll(opts.All).WithVolumes(opts.Volume).WithFilters(opts.Filters).WithExternal(opts.External)
+ options := new(system.PruneOptions).WithAll(opts.All).WithVolumes(opts.Volume).WithFilters(opts.Filters).WithExternal(opts.External).WithBuild(opts.Build)
return system.Prune(ic.ClientCtx, options)
}
diff --git a/test/e2e/prune_test.go b/test/e2e/prune_test.go
index 01e848478..57bd5582d 100644
--- a/test/e2e/prune_test.go
+++ b/test/e2e/prune_test.go
@@ -4,6 +4,8 @@ import (
"fmt"
"os"
"path/filepath"
+ "syscall"
+ "time"
. "github.com/containers/podman/v4/test/utils"
. "github.com/onsi/ginkgo/v2"
@@ -22,6 +24,11 @@ FROM scratch
ENV test1=test1
ENV test2=test2`
+var longBuildImage = fmt.Sprintf(`
+FROM %s
+RUN echo "Hello, World!"
+RUN echo "Please use signal 9 this will never end" && sleep 10000s`, ALPINE)
+
var _ = Describe("Podman prune", func() {
It("podman container prune containers", func() {
@@ -593,4 +600,63 @@ var _ = Describe("Podman prune", func() {
Expect(err).ToNot(HaveOccurred())
Expect(dirents).To(HaveLen(3))
})
+
+ It("podman system prune --build clean up after terminated build", func() {
+ useCustomNetworkDir(podmanTest, tempdir)
+
+ podmanTest.BuildImage(pruneImage, "alpine_notleaker:latest", "false")
+
+ create := podmanTest.Podman([]string{"create", "--name", "test", BB, "sleep", "10000"})
+ create.WaitWithDefaultTimeout()
+ Expect(create).Should(ExitCleanly())
+
+ containerFilePath := filepath.Join(podmanTest.TempDir, "ContainerFile-podman-leaker")
+ err := os.WriteFile(containerFilePath, []byte(longBuildImage), 0755)
+ Expect(err).ToNot(HaveOccurred())
+
+ build := podmanTest.Podman([]string{"build", "-f", containerFilePath, "-t", "podmanleaker"})
+ // The build never finishes, so wait for the sentinel line and then send SIGKILL to simulate a failed build that leaves stage containers behind.
+ matchedOutput := false
+ for range 900 {
+ if build.LineInOutputContains("Please use signal 9") {
+ matchedOutput = true
+ build.Signal(syscall.SIGKILL)
+ break
+ }
+ time.Sleep(100 * time.Millisecond)
+ }
+ if !matchedOutput {
+ Fail("Did not match special string in podman build")
+ }
+
+ // Check for the intermediate image of the stage container
+ none := podmanTest.Podman([]string{"images", "-a"})
+ none.WaitWithDefaultTimeout()
+ Expect(none).Should(ExitCleanly())
+ Expect(none.OutputToString()).Should(ContainSubstring("none"))
+
+ // Check that the container and the stage containers exist
+ count := podmanTest.Podman([]string{"ps", "-aq", "--external"})
+ count.WaitWithDefaultTimeout()
+ Expect(count).Should(ExitCleanly())
+ Expect(count.OutputToStringArray()).To(HaveLen(3))
+
+ prune := podmanTest.Podman([]string{"system", "prune", "--build", "-f"})
+ prune.WaitWithDefaultTimeout()
+ Expect(prune).Should(ExitCleanly())
+
+ // Container should still exist, but no stage containers
+ count = podmanTest.Podman([]string{"ps", "-aq", "--external"})
+ count.WaitWithDefaultTimeout()
+ Expect(count).Should(ExitCleanly())
+ Expect(count.OutputToString()).To(BeEmpty())
+
+ Expect(podmanTest.NumberOfContainers()).To(Equal(0))
+
+ after := podmanTest.Podman([]string{"images", "-a"})
+ after.WaitWithDefaultTimeout()
+ Expect(after).Should(ExitCleanly())
+ Expect(after.OutputToString()).ShouldNot(ContainSubstring("none"))
+ Expect(after.OutputToString()).Should(ContainSubstring("notleaker"))
+ })
})
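The core trick in this test — watch a long-running build's output for a sentinel line, then SIGKILL it so its stage containers are left behind — carries over to other harnesses. A rough Python sketch of the same wait-then-kill pattern (the shell command is a stand-in, not the real `podman build` invocation):

```python
import signal
import subprocess

# Stand-in for a build that prints a sentinel and then hangs.
proc = subprocess.Popen(
    ["sh", "-c", 'echo "Please use signal 9"; sleep 10000'],
    stdout=subprocess.PIPE,
    text=True,
)

matched = False
for line in proc.stdout:  # blocks per line, like the 100ms polling loop above
    if "Please use signal 9" in line:
        matched = True
        # SIGKILL gives the process no chance to clean up, simulating a
        # failed build that leaks its intermediate state.
        proc.send_signal(signal.SIGKILL)
        break

assert matched, "did not see the sentinel line in the build output"
proc.wait()
```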

View File

@@ -1,21 +0,0 @@
diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c
index 4f71d49e5c..3d74af6a6c 100644
--- a/pkg/rootless/rootless_linux.c
+++ b/pkg/rootless/rootless_linux.c
@@ -658,7 +658,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv)
if (pipe (p) < 0)
return -1;
- pid = fork ();
+ pid = syscall_clone (SIGCHLD, NULL);
if (pid < 0)
{
close (p[0]);
@@ -689,7 +689,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv)
close (p[0]);
setsid ();
- pid = fork ();
+ pid = syscall_clone (SIGCHLD, NULL);
if (pid < 0)
_exit (EXIT_FAILURE);

View File

@@ -1,97 +0,0 @@
#!/usr/bin/env python3
import os
import re
import sys
sys.path.insert(1, os.path.join(sys.path[0], "..", "..", ".."))
from tools.testing.discover_tests import TESTS
skip_list = [
# these tests fail due to various reasons
"dynamo/test_misc",
"inductor/test_aot_inductor",
"inductor/test_cpu_repro",
"inductor/test_cpu_select_algorithm",
"inductor/test_aot_inductor_arrayref",
"inductor/test_torchinductor_codegen_dynamic_shapes",
"lazy/test_meta_kernel",
"onnx/test_utility_funs",
"profiler/test_profiler",
"test_ao_sparsity",
"test_cpp_extensions_open_device_registration",
"test_jit",
"test_metal",
"test_mps",
"dynamo/test_torchrec",
"inductor/test_aot_inductor_utils",
"inductor/test_coordinate_descent_tuner",
"test_jiterator",
# these tests run long and fail in addition to that
"dynamo/test_dynamic_shapes",
"test_quantization",
"inductor/test_torchinductor",
"inductor/test_torchinductor_dynamic_shapes",
"inductor/test_torchinductor_opinfo",
"test_binary_ufuncs",
"test_unary_ufuncs",
# these tests fail when cuda is not available
"inductor/test_cudacodecache",
"inductor/test_inductor_utils",
"inductor/test_inplacing_pass",
"inductor/test_kernel_benchmark",
"inductor/test_max_autotune",
"inductor/test_move_constructors_to_cuda",
"inductor/test_multi_kernel",
"inductor/test_pattern_matcher",
"inductor/test_perf",
"inductor/test_select_algorithm",
"inductor/test_snode_runtime",
"inductor/test_triton_wrapper",
# these tests fail when mkldnn is not available
"inductor/test_custom_post_grad_passes",
"inductor/test_mkldnn_pattern_matcher",
# lacks quantization support
"onnx/test_models_quantized_onnxruntime",
"onnx/test_pytorch_onnx_onnxruntime",
# https://github.com/pytorch/pytorch/issues/102078
"test_decomp",
# https://github.com/pytorch/pytorch/issues/146698
"test_model_exports_to_core_aten",
# runs very long, skip for now
"inductor/test_layout_optim",
"test_fx",
# some false errors
"doctests",
]
skip_list_regex = [
# distributed tests fail randomly
"distributed/.*",
]
all_testfiles = sorted(TESTS)
filtered_testfiles = []
for filename in all_testfiles:
if filename in skip_list:
continue
regex_filtered = False
for regex_string in skip_list_regex:
if re.fullmatch(regex_string, filename):
regex_filtered = True
break
if regex_filtered:
continue
filtered_testfiles.append(filename)
for filename in filtered_testfiles:
print(' "' + filename + '",')

View File

@@ -485,7 +485,7 @@ def get_check_run_name_prefix(workflow_run: Any) -> str:
    if workflow_run is None:
        return ""
    else:
-        return f"{workflow_run['workflow']['name']} / "
+        return f'{workflow_run["workflow"]["name"]} / '
def is_passing_status(status: Optional[str]) -> bool:
@@ -545,7 +545,7 @@ def add_workflow_conclusions(
        if not isinstance(checkrun_node, dict):
            warn(f"Expected dictionary, but got {type(checkrun_node)}")
            continue
-        checkrun_name = f"{get_check_run_name_prefix(workflow_run)}{checkrun_node['name']}"
+        checkrun_name = f'{get_check_run_name_prefix(workflow_run)}{checkrun_node["name"]}'
existing_checkrun = workflow_obj.jobs.get(checkrun_name)
if existing_checkrun is None or not is_passing_status(
existing_checkrun.status
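Both hunks in this file only flip which quote style wraps the f-string; the two spellings produce identical strings, so this is a formatting revert rather than a behavior change. A quick check:

```python
workflow_run = {"workflow": {"name": "pull"}}

# Double quotes outside with single-quoted keys, or single quotes outside
# with double-quoted keys: the rendered prefix is the same either way.
assert (
    f"{workflow_run['workflow']['name']} / "
    == f'{workflow_run["workflow"]["name"]} / '
    == "pull / "
)
```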
@@ -819,9 +819,10 @@ class GitHubPR:
                cursor=info["reviews"]["pageInfo"]["startCursor"],
            )
            info = rc["data"]["repository"]["pullRequest"]
-        reviews = {
-            author: state for author, state in self._reviews if state != "COMMENTED"
-        }
+        reviews = {}
+        for author, state in self._reviews:
+            if state != "COMMENTED":
+                reviews[author] = state
return list(reviews.items())
def get_approved_by(self) -> list[str]:
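The dict comprehension and the explicit loop above are behaviorally identical: iterating reviews in order and writing into a dict keeps the last non-COMMENTED state per author. A small demonstration with made-up data:

```python
reviews_in_order = [
    ("alice", "APPROVED"),
    ("bob", "COMMENTED"),
    ("alice", "CHANGES_REQUESTED"),
]

as_comprehension = {
    author: state for author, state in reviews_in_order if state != "COMMENTED"
}

as_loop = {}
for author, state in reviews_in_order:
    if state != "COMMENTED":
        as_loop[author] = state

# Later entries overwrite earlier ones, so alice's final state wins and
# bob's comment-only review is dropped entirely.
assert as_comprehension == as_loop == {"alice": "CHANGES_REQUESTED"}
```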
@@ -1223,17 +1224,9 @@
if not self.is_ghstack_pr():
msg = self.gen_commit_message()
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
-            repo.fetch(self.last_commit()["oid"], pr_branch_name)
+            repo.fetch(f"pull/{self.pr_num}/head", pr_branch_name)
repo._run_git("merge", "--squash", pr_branch_name)
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
-            # Did the PR change since we started the merge?
-            pulled_sha = repo.show_ref(pr_branch_name)
-            latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
-            if pulled_sha != latest_pr_status.last_commit()["oid"]:
-                raise RuntimeError(
-                    "PR has been updated since CI checks last passed. Please rerun the merge command."
-                )
return []
else:
return self.merge_ghstack_into(
@@ -1514,36 +1507,6 @@ def checks_to_markdown_bullets(
]
-def post_starting_merge_comment(
-    repo: GitRepo,
-    pr: GitHubPR,
-    explainer: TryMergeExplainer,
-    dry_run: bool,
-    ignore_current_checks_info: Optional[
-        list[tuple[str, Optional[str], Optional[int]]]
-    ] = None,
-) -> None:
-    """Post the initial merge starting message on the PR. Also post a short
-    message on all PRs in the stack."""
-    gh_post_pr_comment(
-        pr.org,
-        pr.project,
-        pr.pr_num,
-        explainer.get_merge_message(ignore_current_checks_info),
-        dry_run=dry_run,
-    )
-    if pr.is_ghstack_pr():
-        for additional_prs, _ in get_ghstack_prs(repo, pr):
-            if additional_prs.pr_num != pr.pr_num:
-                gh_post_pr_comment(
-                    additional_prs.org,
-                    additional_prs.project,
-                    additional_prs.pr_num,
-                    f"Starting merge as part of PR stack under #{pr.pr_num}",
-                    dry_run=dry_run,
-                )
def manually_close_merged_pr(
pr: GitHubPR,
additional_merged_prs: list[GitHubPR],
@@ -2167,7 +2130,13 @@ def merge(
check_for_sev(pr.org, pr.project, skip_mandatory_checks)
if skip_mandatory_checks:
-        post_starting_merge_comment(repo, pr, explainer, dry_run)
+        gh_post_pr_comment(
+            pr.org,
+            pr.project,
+            pr.pr_num,
+            explainer.get_merge_message(),
+            dry_run=dry_run,
+        )
return pr.merge_into(
repo,
dry_run=dry_run,
@@ -2190,12 +2159,12 @@
)
ignore_current_checks_info = failing
-        post_starting_merge_comment(
-            repo,
-            pr,
-            explainer,
-            dry_run,
-            ignore_current_checks_info=ignore_current_checks_info,
+        gh_post_pr_comment(
+            pr.org,
+            pr.project,
+            pr.pr_num,
+            explainer.get_merge_message(ignore_current_checks_info),
+            dry_run=dry_run,
)
start_time = time.time()
@@ -2281,8 +2250,7 @@
except MandatoryChecksMissingError as ex:
last_exception = str(ex)
print(
f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min",
flush=True,
f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min"
)
time.sleep(5 * 60)
# Finally report timeout back
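The dropped `flush=True` is worth a note: in CI, stdout is a pipe and therefore block-buffered, so without an explicit flush (or running under `python -u`) the retry notice can sit in the buffer for the entire five-minute sleep. A condensed illustration:

```python
import time

for attempt in range(3):
    # When stdout is a pipe (as in CI logs), this line may not appear
    # until the buffer fills unless it is flushed explicitly.
    print(f"attempt {attempt} failed, retrying in 5 min", flush=True)
    time.sleep(1)  # shortened stand-in for the real 5-minute backoff
```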

View File

@@ -79,7 +79,7 @@ class TryMergeExplainer:
            (
                "<details><summary>Advanced Debugging</summary>",
                "Check the merge workflow status ",
-                f'<a href="{os.getenv("GH_RUN_URL")}">here</a>',
+                f"<a href=\"{os.getenv('GH_RUN_URL')}\">here</a>",
"</details>",
)
)

View File

@@ -1,17 +0,0 @@
@echo on
set PYTHON_PREFIX=%PY_VERS:.=%
set PYTHON_PREFIX=py%PYTHON_PREFIX:;=;py%
call .ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
:: Create a new conda environment
if "%PY_VERS%" == "3.13t" (
call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python-freethreading python=3.13
) else (
call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS%
)
call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake setuptools==72.1.0 ninja
dir "%VC_INSTALL_PATH%"
call "%VC_INSTALL_PATH%\VC\Auxiliary\Build\vcvarsall.bat" x64
call conda run -n %PYTHON_PREFIX% python .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE%

View File

@@ -1,35 +0,0 @@
#Requires -RunAsAdministrator
# Enable long paths on Windows
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
$VC_VERSION_major = [int] ${env:VC_VERSION}.split(".")[0]
$VC_DOWNLOAD_LINK = "https://aka.ms/vs/$VC_VERSION_major/release/vs_BuildTools.exe"
$VC_INSTALL_ARGS = @("--nocache","--quiet","--norestart","--wait", "--add Microsoft.VisualStudio.Workload.VCTools",
"--add Microsoft.Component.MSBuild",
"--add Microsoft.VisualStudio.Component.Roslyn.Compiler",
"--add Microsoft.VisualStudio.Component.TextTemplating",
"--add Microsoft.VisualStudio.Component.VC.CoreBuildTools",
"--add Microsoft.VisualStudio.Component.VC.CoreIde",
"--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest",
"--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core",
"--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64",
"--add Microsoft.VisualStudio.Component.Windows11SDK.22621")
echo "Downloading Visual Studio installer from $VC_DOWNLOAD_LINK."
curl.exe --retry 3 -kL $VC_DOWNLOAD_LINK --output vs_installer.exe
if ($LASTEXITCODE -ne 0) {
echo "Download of the VS ${env:VC_YEAR} Version ${env:VC_VERSION} installer failed"
exit 1
}
$InstallationPath = ${env:VC_INSTALL_PATH}
$VC_INSTALL_ARGS = "--installPath `"$InstallationPath`"" + " " + $VC_INSTALL_ARGS
echo "Installing Visual Studio version ${env:VC_VERSION} in $InstallationPath."
$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VC_INSTALL_ARGS -NoNewWindow -Wait -PassThru
Remove-Item -Path vs_installer.exe -Force
$exitCode = $process.ExitCode
if (($exitCode -ne 0) -and ($exitCode -ne 3010)) {
echo "VS ${env:VC_YEAR} installer exited with code $exitCode, which should be one of [0, 3010]."
exit 1
}
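The deleted installer script accepts exit code 3010 alongside 0; on Windows, 3010 is ERROR_SUCCESS_REBOOT_REQUIRED, meaning the install succeeded but needs a reboot. A tiny Python analogue of that check (the command is a placeholder, not the real installer invocation):

```python
import subprocess

ACCEPTABLE = {0, 3010}  # 3010 = ERROR_SUCCESS_REBOOT_REQUIRED

result = subprocess.run(["true"])  # placeholder for vs_installer.exe
if result.returncode not in ACCEPTABLE:
    raise SystemExit(f"installer exited with unexpected code {result.returncode}")
```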

View File

@@ -4,7 +4,6 @@
{%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%}
{%- set timeout_minutes = 240 -%}
-{%- set timeout_minutes_windows_binary = 300 -%}
{%- macro concurrency(build_environment) -%}
concurrency:

View File

@@ -111,10 +111,7 @@ jobs:
      ALPINE_IMAGE: "docker.io/s390x/alpine"
{%- elif config["gpu_arch_type"] == "rocm" %}
      runs_on: linux.rocm.gpu
-{%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] == "12.8" %}
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 build needs sm_70+ runner
-{%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] != "12.8"%}
+{%- elif config["gpu_arch_type"] == "cuda" %}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
{%- else %}

View File

@@ -25,6 +25,9 @@
      DOCKER_IMAGE: !{{ config["container_image"] }}
{%- endif %}
{%- if config["package_type"] == "manywheel" %}
+{%- if config["devtoolset"] %}
+      DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }}
+{%- endif %}
{%- if config.use_split_build is defined %}
use_split_build: !{{ config["use_split_build"] }}
{%- endif %}
@@ -34,6 +37,9 @@
      LIBTORCH_CONFIG: !{{ config["libtorch_config"] }}
{%- endif %}
      LIBTORCH_VARIANT: !{{ config["libtorch_variant"] }}
+{%- if config["devtoolset"] %}
+      DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }}
+{%- endif %}
{%- if is_windows %}
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason

View File

@@ -1,197 +0,0 @@
{% import 'common.yml.j2' as common %}
{% import 'upload.yml.j2' as upload %}
{%- block name -%}
# Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: !{{ build_environment }}
{%- endblock %}
{%- macro set_runner_specific_vars() -%}
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: cmd
run: |
echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
{%- endmacro %}
on:
push:
branches:
- !{{ branches }}
{%- if branches == "nightly" %}
tags:
# NOTE: Binary build pipelines should only get triggered on release candidate builds
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
{%- endif %}
{%- for label in ciflow_config.labels | sort %}
{%- if loop.first and branches != "nightly" %}
tags:
{%- endif %}
- '!{{ label }}/*'
{%- endfor %}
workflow_dispatch:
env:
BUILD_ENVIRONMENT: !{{ build_environment }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
PYTORCH_ROOT: /pytorch
DOWNLOADS_DIR: c:\temp\downloads
DEPENDENCIES_DIR: c:\temp\dependencies
ENABLE_APL: 1
ENABLE_OPENBLAS: 0
MSVC_VERSION : 14.42
AWS_DEFAULT_REGION: us-east-1
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
{%- for config in build_configs %}
!{{ config["build_name"] }}-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64"
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config, True) }}
{%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
{%- endif %}
steps:
!{{ set_runner_specific_vars() }}
- name: Bootstrap folders
shell: cmd
run: |
mkdir "%NIGHTLIES_PYTORCH_ROOT%"
mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
- name: Bootstrap Build Tools
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
- name: Bootstrap Git
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
- name: Remove Pytorch folder
shell: cmd
run: |
rmdir /s /q "pytorch"
- name: Git checkout PyTorch - recursive
uses: actions/checkout@v4
with:
path: "pytorch"
submodules: recursive
- name: Bootstrap Python
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
- name: Bootstrap APL
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
- name: Bootstrap Rust
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- name: Bootstrap sccache
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
- name: Bootstrap Libuv
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
- name: Populate binary env
shell: bash
run: |
"pytorch/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
- uses: !{{ common.upload_artifact_action }}
if: always()
with:
name: !{{ config["build_name"] }}
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
!{{ config["build_name"] }}-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- !{{ config["build_name"] }}-build
- get-label-type
runs-on: "windows-11-arm64"
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config, True) }}
steps:
!{{ set_runner_specific_vars() }}
- uses: !{{ common.download_artifact_action }}
name: Download Build Artifacts
with:
name: !{{ config["build_name"] }}
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
- name: Bootstrap Git
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
- name: Remove Pytorch folder
shell: cmd
run: |
rmdir /s /q "pytorch"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
submodules: recursive
- name: Bootstrap APL
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
- name: Bootstrap Python
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
- name: Bootstrap Build Tools
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
- name: Bootstrap Rust
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- name: Populate binary env
shell: bash
run: |
"pytorch/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
{%- if branches == "nightly" %}
!{{ upload.upload_binaries(config, True) }}
{%- endif %}
{%- endfor %}

View File

@@ -71,7 +71,7 @@ jobs:
{%- else %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
{%- endif %}
-    timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
+    timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config, True) }}
{%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
@@ -107,14 +107,10 @@ jobs:
{%- else %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge.nonephemeral"
{%- endif %}
{%- else %}
-{%- if branches == "nightly" %}
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
-{%- else %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
-{%- endif %}
{%- endif %}
-    timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
+    timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config, True) }}
steps:
!{{ common.setup_ec2_windows() }}

View File

@@ -18,7 +18,7 @@ on:
      description: prefix for runner label
    runs_on:
      required: false
-      default: linux.12xlarge.memory.ephemeral
+      default: linux.12xlarge.ephemeral
type: string
description: Hardware to run this "build" job on, linux.12xlarge or linux.arm64.2xlarge.
timeout-minutes:
@@ -70,6 +70,10 @@ on:
      required: false
      type: string
      description: Desired libtorch variant (for libtorch builds only)
+    DESIRED_DEVTOOLSET:
+      required: false
+      type: string
+      description: Desired dev toolset
DESIRED_PYTHON:
required: false
type: string
@@ -100,6 +104,7 @@ jobs:
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: ${{ inputs.LIBTORCH_CONFIG }}
      LIBTORCH_VARIANT: ${{ inputs.LIBTORCH_VARIANT }}
+      DESIRED_DEVTOOLSET: ${{ inputs.DESIRED_DEVTOOLSET }}
DESIRED_PYTHON: ${{ inputs.DESIRED_PYTHON }}
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: ${{ inputs.PYTORCH_EXTRA_INSTALL_REQUIREMENTS }}
ALPINE_IMAGE: ${{ inputs.ALPINE_IMAGE }}
@@ -125,6 +130,7 @@ jobs:
          echo "SKIP_ALL_TESTS=${{ env.SKIP_ALL_TESTS }}"
          echo "LIBTORCH_CONFIG=${{ env.LIBTORCH_CONFIG }}"
          echo "LIBTORCH_VARIANT=${{ env.LIBTORCH_VARIANT }}"
+          echo "DESIRED_DEVTOOLSET=${{ env.DESIRED_DEVTOOLSET }}"
echo "DESIRED_PYTHON=${{ env.DESIRED_PYTHON }}"
echo "PYTORCH_EXTRA_INSTALL_REQUIREMENTS=${{ env.PYTORCH_EXTRA_INSTALL_REQUIREMENTS }}"
echo "ALPINE_IMAGE=${{ env.ALPINE_IMAGE }}"
@@ -218,6 +224,7 @@ jobs:
            -e BINARY_ENV_FILE \
            -e BUILD_ENVIRONMENT \
            -e DESIRED_CUDA \
+            -e DESIRED_DEVTOOLSET \
-e DESIRED_PYTHON \
-e GITHUB_ACTIONS \
-e GPU_ARCH_TYPE \

View File

@@ -47,6 +47,10 @@ on:
      required: false
      type: string
      description: Desired libtorch variant (for libtorch builds only)
+    DESIRED_DEVTOOLSET:
+      required: false
+      type: string
+      description: Desired dev toolset
DESIRED_PYTHON:
required: false
type: string
@@ -88,6 +92,7 @@ jobs:
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: ${{ inputs.LIBTORCH_CONFIG }}
      LIBTORCH_VARIANT: ${{ inputs.LIBTORCH_VARIANT }}
+      DESIRED_DEVTOOLSET: ${{ inputs.DESIRED_DEVTOOLSET }}
DESIRED_PYTHON: ${{ inputs.DESIRED_PYTHON }}
ALPINE_IMAGE: ${{ inputs.ALPINE_IMAGE }}
AWS_DEFAULT_REGION: us-east-1
@@ -113,6 +118,7 @@ jobs:
          echo "SKIP_ALL_TESTS=${{ env.SKIP_ALL_TESTS }}"
          echo "LIBTORCH_CONFIG=${{ env.LIBTORCH_CONFIG }}"
          echo "LIBTORCH_VARIANT=${{ env.LIBTORCH_VARIANT }}"
+          echo "DESIRED_DEVTOOLSET=${{ env.DESIRED_DEVTOOLSET }}"
echo "DESIRED_PYTHON=${{ env.DESIRED_PYTHON }}"
echo "ALPINE_IMAGE=${{ env.ALPINE_IMAGE }}"

View File

@@ -43,6 +43,10 @@ on:
      required: false
      type: string
      description: Desired libtorch variant (for libtorch builds only)
+    DESIRED_DEVTOOLSET:
+      required: false
+      type: string
+      description: Desired dev toolset
DESIRED_PYTHON:
required: false
type: string
@@ -62,6 +66,7 @@ on:
jobs:
  upload:
    runs-on: ubuntu-22.04
+    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
container:
image: continuumio/miniconda3:4.12.0
env:
@@ -76,6 +81,7 @@
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: ${{ inputs.LIBTORCH_CONFIG }}
      LIBTORCH_VARIANT: ${{ inputs.LIBTORCH_VARIANT }}
+      DESIRED_DEVTOOLSET: ${{ inputs.DESIRED_DEVTOOLSET }}
DESIRED_PYTHON: ${{ inputs.DESIRED_PYTHON }}
BINARY_ENV_FILE: /tmp/env
GITHUB_TOKEN: ${{ secrets.github-token }}

View File

@@ -69,11 +69,13 @@ on:
      required: false
      type: string
      default: ""
-    max-jobs:
+    use_split_build:
      description: |
-        Overwrite the number of jobs to use for the build
+        [Experimental] Build a libtorch only wheel and build pytorch such that
+        are built from the libtorch wheel.
      required: false
-      type: string
+      type: boolean
+      default: false
secrets:
HUGGING_FACE_HUB_TOKEN:
@@ -208,7 +210,7 @@
          OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
-          MAX_JOBS_OVERRIDE: ${{ inputs.max-jobs }}
+          USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
run: |
START_TIME=$(date +%s)
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
@@ -228,12 +230,6 @@
            DOCKER_SHELL_CMD=
          fi
-          if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then
-            MAX_JOBS="$(nproc --ignore=2)"
-          else
-            MAX_JOBS="${MAX_JOBS_OVERRIDE}"
-          fi
# Leaving 1GB for the runner and other things
TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
# https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
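With the override logic removed, MAX_JOBS always falls back to `nproc --ignore=2`, as the next hunk shows. A rough Python equivalent of that default (illustrative only):

```python
import os

# `nproc --ignore=2`: all CPUs minus two, keeping headroom for the runner
# agent and other host processes; never drop below one build job.
max_jobs = max(1, (os.cpu_count() or 3) - 2)
print(max_jobs)
```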
@@ -245,8 +241,7 @@
          # shellcheck disable=SC2086
          container_name=$(docker run \
            -e BUILD_ENVIRONMENT \
-            -e MAX_JOBS=${MAX_JOBS} \
-            -e MAX_JOBS_OVERRIDE \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e PR_NUMBER \
-e SHA1 \
@@ -287,7 +282,7 @@
      - name: Store PyTorch Build Artifacts on S3
        uses: seemethere/upload-artifact-s3@v5
-        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
+        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}
retention-days: 14
@@ -295,15 +290,34 @@
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}
+      - name: Store PyTorch Build Artifacts on S3 for split build
+        uses: seemethere/upload-artifact-s3@v5
+        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
+        with:
+          name: ${{ inputs.build-environment }}-experimental-split-build
+          retention-days: 14
+          if-no-files-found: error
+          path: artifacts.zip
+          s3-bucket: ${{ inputs.s3-bucket }}
- name: Store PyTorch Build Artifacts for s390x
uses: actions/upload-artifact@v4
-        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.build-environment == 'linux-s390x-binary-manywheel'
+        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}
retention-days: 14
if-no-files-found: error
path: artifacts.zip
+      - name: Store PyTorch Build Artifacts for s390x for split build
+        uses: actions/upload-artifact@v4
+        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
+        with:
+          name: ${{ inputs.build-environment }}-experimental-split-build
+          retention-days: 14
+          if-no-files-found: error
+          path: artifacts.zip
- name: Upload sccache stats
if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
uses: ./.github/actions/upload-sccache-stats

View File

@@ -33,6 +33,10 @@ on:
        default: "3.9"
        description: |
          The python version to be used. Will be 3.9 by default
+    environment-file:
+      required: false
+      type: string
+      description: Set the conda environment file used to setup macOS build.
test-matrix:
required: false
type: string
@@ -82,12 +86,23 @@ jobs:
fi
- name: Setup miniconda
+        if: inputs.environment-file == ''
uses: pytorch/test-infra/.github/actions/setup-miniconda@main
with:
python-version: ${{ inputs.python-version }}
environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
+      # This option is used when cross-compiling arm64 from x86-64. Specifically, we need arm64 conda
+      # environment even though the arch is x86-64
+      - name: Setup miniconda using the provided environment file
+        if: inputs.environment-file != ''
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          python-version: ${{ inputs.python-version }}
+          environment-file: ${{ inputs.environment-file }}
+          pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
uses: nick-fields/retry@v3.0.0
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}

View File

@@ -251,11 +251,6 @@ jobs:
          # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct
          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test"
-      - name: Change permissions (only needed for MI300 runners for now)
-        if: ${{ always() && steps.test.conclusion && contains(matrix.runner, 'mi300') }}
-        run: |
-          docker exec -t "${{ env.CONTAINER_NAME }}" sh -c "sudo chown -R 1001:1001 test"
- name: Print remaining test logs
shell: bash
if: always() && steps.test.conclusion

View File

@@ -62,12 +62,7 @@ jobs:
        if: cancelled()
        shell: bash
        run: |
-          # If podman build command is interrupted,
+          # if podman build command is interrupted,
          # it can leave a couple of processes still running.
-          # Order them to stop for clean shutdown.
-          # It looks like sometimes some processes remain
-          # after first cleanup.
-          # Wait a bit and do cleanup again. It looks like it helps.
-          docker system prune --build -f || true
-          sleep 60
+          # order them to stop for clean shutdown.
docker system prune --build -f || true

Some files were not shown because too many files have changed in this diff.