Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"

Fixes https://github.com/pytorch/pytorch/issues/161660 This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658 [ghstack-poisoned]
Update base for Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
2025-11-19 01:54:54 +08:00 · 2025-11-17 21:22:45 -08:00 · 2025-11-17 21:22:45 -08:00 · 2025-11-18 00:17:45 +00:00 · 2025-11-17 23:44:18 +00:00 · 2025-11-17 23:38:39 +00:00
26 changed files with 2027 additions and 409 deletions
--- a/.ci/aarch64_linux/README.md
+++ b/.ci/aarch64_linux/README.md
@ -0,0 +1,19 @@
+# Aarch64 (ARM/Graviton) Support Scripts
+Scripts for building aarch64 PyTorch PIP Wheels. These scripts build the following wheels:
+* torch
+* torchvision
+* torchaudio
+* torchtext
+* torchdata
+## Aarch64_ci_build.sh
+This script is designed to support CD operations within the PyPI manylinux aarch64 container, and to be executed inside that container. It prepares the container and then executes __aarch64_wheel_ci_build.py__ to build the wheels. The script assumes the PyTorch repo is located at ```/pytorch``` and will put the wheels into ```/artifacts```.
+### Usage
+```DESIRED_PYTHON=<PythonVersion> aarch64_ci_build.sh```
+
+__NOTE:__ CI build is currently __EXPERIMENTAL__
+
+## Build_aarch64_wheel.py
+This app allows a person to build using AWS EC2 resources and requires AWS-CLI and Boto3 with AWS credentials to support building EC2 instances for the wheel builds. It can be used in a CodeBuild CD pipeline or from a local system.
+
+### Usage
+```build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>```
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -0,0 +1,53 @@
+#!/bin/bash
+# Builds the aarch64 PyTorch wheel inside the build container: picks CUDA build
+# settings from the environment, sources aarch64_ci_setup.sh, then runs
+# aarch64_wheel_ci_build.py for either a CPU or a CUDA wheel.
+#
+# Environment read by this script:
+#   GPU_ARCH_VERSION                    optional, selects TORCH_CUDA_ARCH_LIST
+#   DESIRED_CUDA                        required ("cpu" selects the CPU build)
+#   PYTORCH_EXTRA_INSTALL_REQUIREMENTS  optional, non-empty => use NVIDIA libs from PyPI
+set -eux -o pipefail
+
+GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
+
+# Set CUDA architecture lists to match x86 build_cuda.sh
+if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
+    export TORCH_CUDA_ARCH_LIST="8.0;9.0"
+elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
+    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
+elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
+    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
+elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
+    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
+fi
+
+# Compress the fatbin with -compress-mode=size for CUDA 13
+if [[ "$DESIRED_CUDA" == *"13"* ]]; then
+    export TORCH_NVCC_FLAGS="-compress-mode=size"
+    # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
+    export BUILD_BUNDLE_PTXAS=1
+fi
+
+SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
+# Quoted so a checkout path containing spaces does not split the argument
+source "$SCRIPTPATH/aarch64_ci_setup.sh"
+
+###############################################################################
+# Run aarch64 builder python
+###############################################################################
+cd /
+# adding safe directory for git as the permissions will be
+# on the mounted pytorch repo
+git config --global --add safe.directory /pytorch
+pip install -r /pytorch/requirements.txt
+pip install auditwheel==6.2.0 wheel
+if [ "$DESIRED_CUDA" = "cpu" ]; then
+    echo "DESIRED_CUDA is cpu. Building cpu wheel."
+    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
+else
+    echo "DESIRED_CUDA is set to: $DESIRED_CUDA"
+    export USE_SYSTEM_NCCL=1
+
+    # Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic).
+    # The ":-" default keeps "set -u" from aborting when the variable is unset,
+    # so an unset variable takes the same "bundle" branch as an empty one.
+    if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
+        echo "Bundling CUDA libraries with wheel for aarch64."
+    else
+        echo "Using nvidia libs from pypi for aarch64."
+        echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
+        export USE_NVIDIA_PYPI_LIBS=1
+    fi
+
+    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+fi
--- a/.ci/aarch64_linux/aarch64_ci_setup.sh
+++ b/.ci/aarch64_linux/aarch64_ci_setup.sh
@ -0,0 +1,21 @@
+#!/bin/bash
+set -eux -o pipefail
+
+# This script is used to prepare the Docker container for the aarch64_wheel_ci_build.py python script
+# by creating symlinks from the desired /opt/python to /usr/local/bin/
+
+# Default numpy pin; Python 3.13 and 3.13t get a different pin below
+NUMPY_VERSION=2.0.2
+if [[ "$DESIRED_PYTHON"  == "3.13" || "$DESIRED_PYTHON" == "3.13t" ]]; then
+    NUMPY_VERSION=2.1.2
+fi
+
+# Directory of the invoking script ($0), used to locate set_desired_python.sh
+SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
+# presumably this defines DESIRED_PYTHON_BIN_DIR used in the loop below - TODO confirm
+source $SCRIPTPATH/../manywheel/set_desired_python.sh
+
+# Build-time Python dependencies, pinned to exact versions
+pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2
+
+# Symlink the selected interpreter and tools into /usr/local/bin (-f replaces existing links)
+for tool in python python3 pip pip3 ninja scons patchelf; do
+    ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
+done
+
+python --version
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -0,0 +1,333 @@
+#!/usr/bin/env python3
+# encoding: UTF-8
+
+import os
+import shutil
+from subprocess import check_call, check_output
+
+
+def list_dir(path: str) -> list[str]:
+    """
+    Return the entries of ``path`` as printed by ``ls -1``, one name per element.
+
+    Raises ``subprocess.CalledProcessError`` when ``ls`` exits non-zero.
+    """
+    # splitlines() drops the trailing empty entry that split("\n") produced
+    # for the final newline of the ``ls`` output.
+    return check_output(["ls", "-1", path]).decode().splitlines()
+
+
+def replace_tag(filename) -> None:
+    """
+    Rewrite the first ``Tag:`` line of a WHEEL metadata file in place,
+    replacing ``-linux_`` with ``-manylinux_2_28_``. All other lines are
+    written back unchanged.
+    """
+    with open(filename) as src:
+        content = src.readlines()
+
+    # Index of the first "Tag:" line, or None when the file has no such line
+    tag_idx = next(
+        (n for n, text in enumerate(content) if text.startswith("Tag:")), None
+    )
+    if tag_idx is not None:
+        previous = content[tag_idx]
+        content[tag_idx] = previous.replace("-linux_", "-manylinux_2_28_")
+        print(f"Updated tag from {previous} to {content[tag_idx]}")
+
+    with open(filename, "w") as dst:
+        dst.writelines(content)
+
+
+def patch_library_rpath(
+    folder: str,
+    lib_name: str,
+    use_nvidia_pypi_libs: bool = False,
+    desired_cuda: str = "",
+) -> None:
+    """
+    Apply patchelf to set RPATH for a library in torch/lib.
+
+    Args:
+        folder: directory that contains the unpacked wheel under ``tmp/``.
+        lib_name: file name of the library inside ``tmp/torch/lib``.
+        use_nvidia_pypi_libs: when true, the RPATH also lists the
+            ``nvidia/*/lib`` directories relative to ``$ORIGIN``.
+        desired_cuda: CUDA tag; a value containing ``"130"`` selects the
+            single ``nvidia/cu13/lib`` directory instead of per-library ones.
+
+    Does nothing when the library file does not exist. Raises
+    ``subprocess.CalledProcessError`` when patchelf exits non-zero.
+    """
+    lib_dir = f"{folder}/tmp/torch/lib"
+    if not os.path.exists(f"{lib_dir}/{lib_name}"):
+        return
+
+    if use_nvidia_pypi_libs:
+        # For PyPI NVIDIA libraries, construct CUDA RPATH
+        cuda_rpaths = [
+            "$ORIGIN/../../nvidia/cudnn/lib",
+            "$ORIGIN/../../nvidia/nvshmem/lib",
+            "$ORIGIN/../../nvidia/nccl/lib",
+            "$ORIGIN/../../nvidia/cusparselt/lib",
+        ]
+
+        if "130" in desired_cuda:
+            cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib")
+        else:
+            cuda_rpaths.extend(
+                [
+                    "$ORIGIN/../../nvidia/cublas/lib",
+                    "$ORIGIN/../../nvidia/cuda_cupti/lib",
+                    "$ORIGIN/../../nvidia/cuda_nvrtc/lib",
+                    "$ORIGIN/../../nvidia/cuda_runtime/lib",
+                    "$ORIGIN/../../nvidia/cufft/lib",
+                    "$ORIGIN/../../nvidia/curand/lib",
+                    "$ORIGIN/../../nvidia/cusolver/lib",
+                    "$ORIGIN/../../nvidia/cusparse/lib",
+                    "$ORIGIN/../../nvidia/nvtx/lib",
+                    "$ORIGIN/../../nvidia/cufile/lib",
+                ]
+            )
+
+        # Add $ORIGIN for local torch libs
+        rpath = ":".join(cuda_rpaths) + ":$ORIGIN"
+    else:
+        # For bundled libraries, just use $ORIGIN
+        rpath = "$ORIGIN"
+
+    # Argument list + cwd instead of a shell string: "$ORIGIN" reaches patchelf
+    # literally without quoting, and a patchelf failure is raised rather than
+    # silently producing a wheel with an unpatched library.
+    check_call(
+        ["patchelf", "--set-rpath", rpath, "--force-rpath", lib_name],
+        cwd=lib_dir,
+    )
+
+
+def copy_and_patch_library(
+    src_path: str,
+    folder: str,
+    use_nvidia_pypi_libs: bool = False,
+    desired_cuda: str = "",
+) -> None:
+    """
+    Copy ``src_path`` into ``{folder}/tmp/torch/lib`` and patch the copy's RPATH.
+
+    A source path that does not exist is skipped silently.
+    """
+    if not os.path.exists(src_path):
+        return
+
+    copied_name = os.path.basename(src_path)
+    destination = f"{folder}/tmp/torch/lib/{copied_name}"
+    shutil.copy2(src_path, destination)
+    patch_library_rpath(folder, copied_name, use_nvidia_pypi_libs, desired_cuda)
+
+
+def package_cuda_wheel(wheel_path, desired_cuda) -> None:
+    """
+    Unpack the wheel, add the runtime libraries it needs, and repack it.
+
+    The wheel at ``wheel_path`` is unzipped into ``<dir>/tmp`` and deleted.
+    When the ``USE_NVIDIA_PYPI_LIBS`` environment variable is ``"1"`` only the
+    non-NVIDIA libraries are copied and the torch libraries get their RPATH
+    patched; otherwise the system CUDA libraries for ``desired_cuda`` are
+    copied as well. The WHEEL tag is then rewritten and the wheel is repacked
+    into the original directory.
+
+    Raises ``ValueError`` in the bundling mode when ``desired_cuda`` contains
+    neither ``"13"`` nor ``"12"``.
+    """
+    folder = os.path.dirname(wheel_path)
+    staging = f"{folder}/tmp"
+    os.mkdir(staging)
+    os.system(f"unzip {wheel_path} -d {folder}/tmp")
+    # Delete original wheel since it will be repackaged
+    os.system(f"rm {wheel_path}")
+
+    # Check if we should use PyPI NVIDIA libraries or bundle system libraries
+    use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
+
+    # Libraries copied in both modes, in this order
+    host_libs = [
+        "/lib64/libgomp.so.1",
+        "/usr/lib64/libgfortran.so.5",
+        "/acl/build/libarm_compute.so",
+        "/acl/build/libarm_compute_graph.so",
+        "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
+        "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
+        "/usr/local/lib/libnvpl_lapack_core.so.0",
+        "/usr/local/lib/libnvpl_blas_core.so.0",
+    ]
+
+    if use_nvidia_pypi_libs:
+        print("Using nvidia libs from pypi - skipping CUDA library bundling")
+        # CUDA libraries come from PyPI packages, so only the host libraries
+        # are bundled and the torch libraries are re-pointed via RPATH.
+        libs_to_copy = host_libs
+        torch_libs_to_patch = [
+            "libtorch.so",
+            "libtorch_cpu.so",
+            "libtorch_cuda.so",
+            "libtorch_cuda_linalg.so",
+            "libtorch_global_deps.so",
+            "libtorch_python.so",
+            "libtorch_nvshmem.so",
+            "libc10.so",
+            "libc10_cuda.so",
+            "libcaffe2_nvrtc.so",
+            "libshm.so",
+        ]
+    else:
+        print("Bundling CUDA libraries with wheel")
+        # soname suffixes: (cupti/cublas/cudart/nvJitLink/nvrtc, cufft/cusolver)
+        if "13" in desired_cuda:
+            cuda_major, math_major = "13", "12"
+        elif "12" in desired_cuda:
+            cuda_major, math_major = "12", "11"
+        else:
+            raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")
+        # Last character gives the libnvrtc-builtins minor version (e.g., "129" -> "9")
+        minor_version = desired_cuda[-1]
+
+        cuda_root = "/usr/local/cuda/lib64"
+        cupti_root = "/usr/local/cuda/extras/CUPTI/lib64"
+        # CUDA libraries whose file name is the same for every CUDA version
+        shared_cuda_libs = [
+            f"{cupti_root}/libnvperf_host.so",
+            f"{cuda_root}/libcudnn.so.9",
+            f"{cuda_root}/libcusparseLt.so.0",
+            f"{cuda_root}/libcurand.so.10",
+            f"{cuda_root}/libnccl.so.2",
+            f"{cuda_root}/libnvshmem_host.so.3",
+            f"{cuda_root}/libcudnn_adv.so.9",
+            f"{cuda_root}/libcudnn_cnn.so.9",
+            f"{cuda_root}/libcudnn_graph.so.9",
+            f"{cuda_root}/libcudnn_ops.so.9",
+            f"{cuda_root}/libcudnn_engines_runtime_compiled.so.9",
+            f"{cuda_root}/libcudnn_engines_precompiled.so.9",
+            f"{cuda_root}/libcudnn_heuristic.so.9",
+            f"{cuda_root}/libcufile.so.0",
+            f"{cuda_root}/libcufile_rdma.so.1",
+            f"{cuda_root}/libcusparse.so.12",
+        ]
+        # CUDA version-specific libraries
+        version_specific_libs = [
+            f"{cupti_root}/libcupti.so.{cuda_major}",
+            f"{cuda_root}/libcublas.so.{cuda_major}",
+            f"{cuda_root}/libcublasLt.so.{cuda_major}",
+            f"{cuda_root}/libcudart.so.{cuda_major}",
+            f"{cuda_root}/libcufft.so.{math_major}",
+            f"{cuda_root}/libcusolver.so.{math_major}",
+            f"{cuda_root}/libnvJitLink.so.{cuda_major}",
+            f"{cuda_root}/libnvrtc.so.{cuda_major}",
+            f"{cuda_root}/libnvrtc-builtins.so.{cuda_major}.{minor_version}",
+        ]
+        libs_to_copy = host_libs + shared_cuda_libs + version_specific_libs
+        torch_libs_to_patch = []
+
+    # Copy libraries to unzipped_folder/torch/lib
+    for lib_path in libs_to_copy:
+        copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
+
+    # Patch torch libraries used for searching libraries (PyPI mode only)
+    for lib_name in torch_libs_to_patch:
+        patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
+
+    # Make sure the wheel is tagged with manylinux_2_28
+    dist_info = next(
+        (
+            entry
+            for entry in os.scandir(f"{folder}/tmp/")
+            if entry.is_dir() and entry.name.endswith(".dist-info")
+        ),
+        None,
+    )
+    if dist_info is not None:
+        replace_tag(f"{dist_info.path}/WHEEL")
+
+    os.system(f"wheel pack {folder}/tmp/ -d {folder}")
+    os.system(f"rm -rf {folder}/tmp/")
+
+
+def complete_wheel(folder: str, enable_cuda=None) -> str:
+    """
+    Complete wheel build and put in artifact location.
+
+    Args:
+        folder: checkout directory that contains ``dist/``.
+        enable_cuda: whether a CUDA wheel was built. When ``None`` (the
+            default) the module-level ``enable_cuda`` set by the entry point
+            is used if it exists, otherwise ``False``.
+
+    Returns:
+        File name of the wheel copied to ``/artifacts``.
+    """
+    if enable_cuda is None:
+        # The entry point stores the flag as a module global; looking it up
+        # here avoids a NameError when this function is used after an import.
+        enable_cuda = bool(globals().get("enable_cuda", False))
+
+    wheel_name = list_dir(f"/{folder}/dist")[0]
+
+    # Please note for cuda we don't run auditwheel since we use a custom script
+    # to package the cuda dependencies into the wheel (package_cuda_wheel()).
+    # However we need to make sure filename reflects the correct Manylinux platform.
+    if "pytorch" in folder and not enable_cuda:
+        print("Repairing Wheel with AuditWheel")
+        check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
+        repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0]
+
+        print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
+        os.rename(
+            f"/{folder}/wheelhouse/{repaired_wheel_name}",
+            f"/{folder}/dist/{repaired_wheel_name}",
+        )
+    else:
+        repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
+
+    print(f"Copying {repaired_wheel_name} to artifacts")
+    shutil.copy2(
+        f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}"
+    )
+
+    return repaired_wheel_name
+
+
+def parse_arguments():
+    """
+    Build the command-line parser and parse ``sys.argv``.
+
+    Returns the namespace with ``debug``, ``build_only``, ``test_only``,
+    ``enable_mkldnn`` and ``enable_cuda``.
+    """
+    import argparse
+
+    cli = argparse.ArgumentParser("AARCH64 wheels python CD")
+    for switch in ("--debug", "--build-only"):
+        cli.add_argument(switch, action="store_true")
+    cli.add_argument("--test-only", type=str)
+    for switch in ("--enable-mkldnn", "--enable-cuda"):
+        cli.add_argument(switch, action="store_true")
+    return cli.parse_args()
+
+
+if __name__ == "__main__":
+    # Entry point: assemble the build environment variables from the CLI flags,
+    # the current git branch and the environment, build the wheel in /pytorch,
+    # repackage it for CUDA when requested, and copy the result to /artifacts.
+    args = parse_arguments()
+    enable_mkldnn = args.enable_mkldnn
+    enable_cuda = args.enable_cuda
+    # strip() removes the trailing newline of the git output; without it the
+    # comparison against "nightly"/"main" below can never match.
+    branch = (
+        check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd="/pytorch")
+        .decode()
+        .strip()
+    )
+
+    print("Building PyTorch wheel")
+    build_vars = ""
+    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
+    if enable_cuda:
+        build_vars += "MAX_JOBS=5 "
+
+        # Handle PyPI NVIDIA libraries vs bundled libraries
+        use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
+        if use_nvidia_pypi_libs:
+            print("Configuring build for PyPI NVIDIA libraries")
+            # Configure for dynamic linking (matching x86 logic)
+            build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 "
+        else:
+            print("Configuring build for bundled NVIDIA libraries")
+            # Keep existing static linking approach - already configured above
+
+    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
+    desired_cuda = os.getenv("DESIRED_CUDA")
+    if override_package_version is not None:
+        version = override_package_version
+        build_vars += (
+            f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
+        )
+    elif branch in ["nightly", "main"]:
+        build_date = (
+            check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
+            .decode()
+            .replace("-", "")
+        )
+        # Drops the last two characters of version.txt
+        # (presumably an "a0"-style suffix - TODO confirm)
+        version = (
+            check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
+        )
+        if enable_cuda:
+            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
+        else:
+            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
+    elif branch.startswith(("v1.", "v2.")):
+        # "v2.0.1-rc3" -> "2.0.1"; a tag without "-" keeps its full version
+        release_version = branch[1:].split("-", maxsplit=1)[0]
+        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={release_version} PYTORCH_BUILD_NUMBER=1 "
+
+    if enable_mkldnn:
+        print("build pytorch with mkldnn+acl backend")
+        build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
+        build_vars += "ACL_ROOT_DIR=/acl "
+        if enable_cuda:
+            build_vars += "BLAS=NVPL "
+        else:
+            build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
+    else:
+        print("build pytorch without mkldnn backend")
+
+    # check_call raises when the build fails, instead of continuing to the
+    # packaging steps with a missing or stale dist/ directory.
+    check_call(
+        f"{build_vars} python3 -m build --wheel --no-isolation",
+        shell=True,
+        cwd="/pytorch",
+    )
+    if enable_cuda:
+        print("Updating Cuda Dependency")
+        filename = os.listdir("/pytorch/dist/")
+        wheel_path = f"/pytorch/dist/{filename[0]}"
+        package_cuda_wheel(wheel_path, desired_cuda)
+    pytorch_wheel_name = complete_wheel("/pytorch/")
+    print(f"Build Complete. Created {pytorch_wheel_name}..")
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
@ -0,0 +1,999 @@
+#!/usr/bin/env python3
+
+# This script is for building  AARCH64 wheels using AWS EC2 instances.
+# To generate binaries for the release follow these steps:
+# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this:
+#         "v1.11.0": ("0.11.0", "rc1"),
+# 2. Run script with following arguments for each of the supported python versions and required tag, for example:
+# build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch v1.11.0-rc3
+
+
+import os
+import subprocess
+import sys
+import time
+from typing import Optional, Union
+
+import boto3
+
+
+# AMI images for us-east-1, change the following based on your ~/.aws/config
+os_amis = {
+    "ubuntu20_04": "ami-052eac90edaa9d08f",  # login_name: ubuntu
+    "ubuntu22_04": "ami-0c6c29c5125214c77",  # login_name: ubuntu
+    "redhat8": "ami-0698b90665a2ddcf1",  # login_name: ec2-user
+}
+
+ubuntu20_04_ami = os_amis["ubuntu20_04"]
+
+
+def compute_keyfile_path(key_name: Optional[str] = None) -> tuple[str, str]:
+    """
+    Resolve the SSH key file path and the key name to use.
+
+    ``key_name`` falls back to the ``AWS_KEY_NAME`` environment variable. With
+    no key name at all the result is ``(SSH_KEY_PATH or "", "")``. Otherwise
+    the path is ``SSH_KEY_PATH`` if set, else ``~/.ssh/<key_name>.pem``.
+    """
+    resolved_name = os.getenv("AWS_KEY_NAME") if key_name is None else key_name
+    if resolved_name is None:
+        return os.getenv("SSH_KEY_PATH", ""), ""
+
+    fallback_path = os.path.join(
+        os.path.expanduser("~"), ".ssh", f"{resolved_name}.pem"
+    )
+    return os.getenv("SSH_KEY_PATH", fallback_path), resolved_name
+
+
+ec2 = boto3.resource("ec2")
+
+
+def ec2_get_instances(filter_name, filter_value):
+    """Return the EC2 instances matching a single name/value filter."""
+    single_filter = {"Name": filter_name, "Values": [filter_value]}
+    return ec2.instances.filter(Filters=[single_filter])
+
+
+def ec2_instances_of_type(instance_type="t4g.2xlarge"):
+    """Return the EC2 instances whose ``instance-type`` equals ``instance_type``."""
+    matching = ec2_get_instances("instance-type", instance_type)
+    return matching
+
+
+def ec2_instances_by_id(instance_id):
+    """Return the first instance with the given id, or ``None`` if none matches."""
+    found = list(ec2_get_instances("instance-id", instance_id))
+    if not found:
+        return None
+    return found[0]
+
+
+def start_instance(
+    key_name, ami=ubuntu20_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50
+):
+    """
+    Create one EC2 instance, wait until it is running, and return it.
+
+    The root device ``/dev/sda1`` is a ``standard`` EBS volume of ``ebs_size``
+    that is deleted on termination. The returned object is looked up again by
+    id after the instance reports running.
+    """
+    root_volume = {
+        "DeviceName": "/dev/sda1",
+        "Ebs": {
+            "DeleteOnTermination": True,
+            "VolumeSize": ebs_size,
+            "VolumeType": "standard",
+        },
+    }
+    created = ec2.create_instances(
+        ImageId=ami,
+        InstanceType=instance_type,
+        SecurityGroups=["ssh-allworld"],
+        KeyName=key_name,
+        MinCount=1,
+        MaxCount=1,
+        BlockDeviceMappings=[root_volume],
+    )
+    inst = created[0]
+    print(f"Create instance {inst.id}")
+    inst.wait_until_running()
+    running_inst = ec2_instances_by_id(inst.id)
+    print(f"Instance started at {running_inst.public_dns_name}")
+    return running_inst
+
+
+class RemoteHost:
+    """
+    Runs commands and transfers files on a remote machine over ssh/scp.
+
+    After ``start_docker()`` has set ``container_id``, ``run_cmd``,
+    ``check_output``, ``upload_file`` and ``download_file`` operate inside
+    that container instead of directly on the host.
+    """
+
+    addr: str
+    keyfile_path: str
+    login_name: str
+    container_id: Optional[str] = None
+    ami: Optional[str] = None
+
+    def __init__(self, addr: str, keyfile_path: str, login_name: str = "ubuntu"):
+        self.addr = addr
+        self.keyfile_path = keyfile_path
+        self.login_name = login_name
+
+    def _gen_ssh_prefix(self) -> list[str]:
+        """Return the ssh argv prefix; the remote command is appended after ``--``."""
+        return [
+            "ssh",
+            "-o",
+            "StrictHostKeyChecking=no",
+            "-i",
+            self.keyfile_path,
+            f"{self.login_name}@{self.addr}",
+            "--",
+        ]
+
+    @staticmethod
+    def _split_cmd(args: Union[str, list[str]]) -> list[str]:
+        """Split a command string on whitespace; a list is returned unchanged."""
+        return args.split() if isinstance(args, str) else args
+
+    def _docker_exec_cmd(self) -> list[str]:
+        """Return the ssh argv that starts ``bash`` inside the container, reading stdin."""
+        assert self.container_id is not None
+        return self._gen_ssh_prefix() + [
+            "docker",
+            "exec",
+            "-i",
+            self.container_id,
+            "bash",
+        ]
+
+    def _docker_script(self, args: Union[str, list[str]]) -> bytes:
+        """Return the bytes fed to the container's bash: source .bashrc, then the command."""
+        return " ".join(["source .bashrc && "] + self._split_cmd(args)).encode("utf-8")
+
+    def run_ssh_cmd(self, args: Union[str, list[str]]) -> None:
+        """Run a command on the host over ssh; raises ``CalledProcessError`` on failure."""
+        subprocess.check_call(self._gen_ssh_prefix() + self._split_cmd(args))
+
+    def check_ssh_output(self, args: Union[str, list[str]]) -> str:
+        """Run a command on the host over ssh and return its decoded stdout."""
+        return subprocess.check_output(
+            self._gen_ssh_prefix() + self._split_cmd(args)
+        ).decode("utf-8")
+
+    def scp_upload_file(self, local_file: str, remote_file: str) -> None:
+        """Copy ``local_file`` to ``remote_file`` on the host with scp."""
+        subprocess.check_call(
+            [
+                "scp",
+                "-i",
+                self.keyfile_path,
+                local_file,
+                f"{self.login_name}@{self.addr}:{remote_file}",
+            ]
+        )
+
+    def scp_download_file(
+        self, remote_file: str, local_file: Optional[str] = None
+    ) -> None:
+        """Copy ``remote_file`` from the host with scp (into ``.`` by default)."""
+        if local_file is None:
+            local_file = "."
+        subprocess.check_call(
+            [
+                "scp",
+                "-i",
+                self.keyfile_path,
+                f"{self.login_name}@{self.addr}:{remote_file}",
+                local_file,
+            ]
+        )
+
+    def start_docker(self, image="quay.io/pypa/manylinux2014_aarch64:latest") -> None:
+        """Install and start docker on the host, then start a detached container of ``image``."""
+        self.run_ssh_cmd("sudo apt-get install -y docker.io")
+        self.run_ssh_cmd(f"sudo usermod -a -G docker {self.login_name}")
+        self.run_ssh_cmd("sudo service docker start")
+        self.run_ssh_cmd(f"docker pull {image}")
+        self.container_id = self.check_ssh_output(
+            f"docker run -t -d -w /root {image}"
+        ).strip()
+
+    def using_docker(self) -> bool:
+        """Return True once a container id has been recorded."""
+        return self.container_id is not None
+
+    def run_cmd(self, args: Union[str, list[str]]) -> None:
+        """Run a command on the host, or inside the container when docker is in use."""
+        if not self.using_docker():
+            return self.run_ssh_cmd(args)
+        docker_cmd = self._docker_exec_cmd()
+        p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE)
+        # communicate() waits for exit, so returncode is populated afterwards
+        p.communicate(input=self._docker_script(args))
+        if p.returncode != 0:
+            raise subprocess.CalledProcessError(p.returncode, docker_cmd)
+
+    def check_output(self, args: Union[str, list[str]]) -> str:
+        """Like ``run_cmd`` but return the command's decoded stdout."""
+        if not self.using_docker():
+            return self.check_ssh_output(args)
+        docker_cmd = self._docker_exec_cmd()
+        p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        # stderr is not piped, so err is always None here
+        (out, err) = p.communicate(input=self._docker_script(args))
+        if p.returncode != 0:
+            raise subprocess.CalledProcessError(
+                p.returncode, docker_cmd, output=out, stderr=err
+            )
+        return out.decode("utf-8")
+
+    def upload_file(self, local_file: str, remote_file: str) -> None:
+        """Upload a file to the host, or into the container's /root when docker is in use."""
+        if not self.using_docker():
+            return self.scp_upload_file(local_file, remote_file)
+        # Stage through the host's /tmp, then docker cp into the container
+        tmp_file = os.path.join("/tmp", os.path.basename(local_file))
+        self.scp_upload_file(local_file, tmp_file)
+        self.run_ssh_cmd(
+            ["docker", "cp", tmp_file, f"{self.container_id}:/root/{remote_file}"]
+        )
+        self.run_ssh_cmd(["rm", tmp_file])
+
+    def download_file(self, remote_file: str, local_file: Optional[str] = None) -> None:
+        """Download a file from the host, or from the container's /root when docker is in use."""
+        if not self.using_docker():
+            return self.scp_download_file(remote_file, local_file)
+        # docker cp out to the host's /tmp, then scp from there
+        tmp_file = os.path.join("/tmp", os.path.basename(remote_file))
+        self.run_ssh_cmd(
+            ["docker", "cp", f"{self.container_id}:/root/{remote_file}", tmp_file]
+        )
+        self.scp_download_file(tmp_file, local_file)
+        self.run_ssh_cmd(["rm", tmp_file])
+
+    def download_wheel(
+        self, remote_file: str, local_file: Optional[str] = None
+    ) -> None:
+        """
+        Download a wheel; when docker is in use and no local name is given,
+        the local copy is renamed from ``-linux_aarch64`` to ``-manylinux2014_aarch64``.
+        """
+        if self.using_docker() and local_file is None:
+            basename = os.path.basename(remote_file)
+            local_file = basename.replace(
+                "-linux_aarch64.whl", "-manylinux2014_aarch64.whl"
+            )
+        self.download_file(remote_file, local_file)
+
+    def list_dir(self, path: str) -> list[str]:
+        """Return the names printed by ``ls -1 path`` on the remote side."""
+        # splitlines() avoids the trailing empty entry that split("\n")
+        # produced for the final newline of the output.
+        return self.check_output(["ls", "-1", path]).splitlines()
+
+
+def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
+    """
+    Block until a TCP connection to ``addr:port`` succeeds.
+
+    Makes up to ``attempt_cnt`` attempts, sleeping ``timeout`` seconds after
+    each refused or timed-out attempt; the error of the last attempt is
+    re-raised. ``timeout`` is also the per-attempt connect timeout.
+    """
+    import socket
+
+    final_attempt = attempt_cnt - 1
+    attempt = 0
+    while attempt < attempt_cnt:
+        try:
+            conn = socket.create_connection((addr, port), timeout=timeout)
+        except (ConnectionRefusedError, TimeoutError):  # noqa: PERF203
+            if attempt == final_attempt:
+                raise
+            time.sleep(timeout)
+        else:
+            # Only reachability matters; close the probe socket right away
+            conn.close()
+            return
+        attempt += 1
+
+
+def update_apt_repo(host: RemoteHost) -> None:
+    """
+    Stop the background apt services on ``host``, wait until they are
+    inactive, then run ``apt-get update`` twice with a short pause in between.
+    """
+    background_units = ("apt-daily.service", "unattended-upgrades.service")
+
+    time.sleep(5)
+    for unit in background_units:
+        host.run_cmd(f"sudo systemctl stop {unit} || true")
+    for unit in background_units:
+        host.run_cmd(f"while systemctl is-active --quiet {unit}; do sleep 1; done")
+
+    refresh = "sudo apt-get update"
+    host.run_cmd(refresh)
+    time.sleep(3)
+    host.run_cmd(refresh)
+
+
+def install_condaforge(
+    host: RemoteHost, suffix: str = "latest/download/Miniforge3-Linux-aarch64.sh"
+) -> None:
+    """
+    Download and run the Miniforge installer named by ``suffix`` on ``host``,
+    then put ``$HOME/miniforge3/bin`` on PATH through ``.bashrc``.
+    """
+    print("Install conda-forge")
+    installer = os.path.basename(suffix)
+    host.run_cmd(f"curl -OL https://github.com/conda-forge/miniforge/releases/{suffix}")
+    host.run_cmd(f"sh -f {installer} -b")
+    host.run_cmd(f"rm -f {installer}")
+
+    if not host.using_docker():
+        # Insert the PATH line before the "# If not running interactively" line
+        sed_cmd = [
+            "sed",
+            "-i",
+            "'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH'",
+            ".bashrc",
+        ]
+        host.run_cmd(sed_cmd)
+        return
+
+    # In the container the PATH line is simply appended to .bashrc
+    host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc")
+
+
+def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
+    """
+    Install Miniforge on ``host`` and then the requested Python plus the
+    build-time packages via conda.
+
+    Python 3.6 uses the 4.10.3-10 installer; every other version uses
+    4.11.0-4 and additionally installs setuptools with a version constraint.
+    """
+    if python_version == "3.6":
+        # Python-3.6 EOLed and not compatible with conda-4.11
+        install_condaforge(
+            host, suffix="download/4.10.3-10/Miniforge3-4.10.3-10-Linux-aarch64.sh"
+        )
+        host.run_cmd(f"conda install -y python={python_version} numpy pyyaml")
+        return
+
+    install_condaforge(
+        host, suffix="download/4.11.0-4/Miniforge3-4.11.0-4-Linux-aarch64.sh"
+    )
+    # Pytorch-1.10 or older are not compatible with setuptools=59.6 or newer
+    # NOTE(review): that remark suggests an upper bound, but the spec below is
+    # a lower bound (kept as written) - confirm which one is intended.
+    # The spec is single-quoted because the command is parsed by a shell on
+    # the remote side; unquoted, ">=59.5.0" is a redirect into a file named
+    # "=59.5.0" and setuptools is installed without any constraint.
+    host.run_cmd(
+        f"conda install -y python={python_version} numpy pyyaml 'setuptools>=59.5.0'"
+    )
+
+
+def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
+    """
+    Install auditwheel and patchelf on ``host``, upload the embedding script
+    as ``embed_library.py`` and run it on ``wheel_name``. ``--update-tag`` is
+    passed only when the host is using docker.
+    """
+    from tempfile import NamedTemporaryFile
+
+    host.run_cmd("pip3 install auditwheel")
+    if use_conda:
+        patchelf_install = "conda install -y patchelf"
+    else:
+        patchelf_install = "sudo apt-get install -y patchelf"
+    host.run_cmd(patchelf_install)
+
+    # The script text lives in this module; write it to a temp file to upload it
+    with NamedTemporaryFile() as script_file:
+        script_file.write(embed_library_script.encode("utf-8"))
+        script_file.flush()
+        host.upload_file(script_file.name, "embed_library.py")
+
+    print("Embedding libgomp into wheel")
+    embed_cmd = f"python3 embed_library.py {wheel_name}"
+    if host.using_docker():
+        embed_cmd = f"python3 embed_library.py {wheel_name} --update-tag"
+    host.run_cmd(embed_cmd)
+
+
+def checkout_repo(
+    host: RemoteHost,
+    *,
+    branch: str = "main",
+    url: str,
+    git_clone_flags: str,
+    mapping: dict[str, tuple[str, str]],
+) -> Optional[str]:
+    """
+    Clone ``url`` on ``host`` at the ref that corresponds to ``branch``.
+
+    ``mapping`` maps a branch prefix to ``(version, rc)``. For the first
+    prefix that ``branch`` starts with, tag ``v<version>-<rc>`` is cloned and
+    ``version`` is returned. Without a match ``branch`` itself is cloned and
+    ``None`` is returned.
+    """
+    matched_prefix = next(
+        (candidate for candidate in mapping if branch.startswith(candidate)), None
+    )
+    if matched_prefix is None:
+        host.run_cmd(f"git clone {url} -b {branch} {git_clone_flags}")
+        return None
+
+    release = mapping[matched_prefix]
+    tag = f"v{release[0]}-{release[1]}"
+    host.run_cmd(f"git clone {url} -b {tag} {git_clone_flags}")
+    return release[0]
+
+
+def build_torchvision(
+    host: RemoteHost,
+    *,
+    branch: str = "main",
+    use_conda: bool = True,
+    git_clone_flags: str,
+    run_smoke_tests: bool = True,
+) -> str:
+    """
+    Check out, build and download the TorchVision wheel on ``host``.
+
+    Args:
+        host: remote machine (or container) the commands run on.
+        branch: PyTorch branch/tag; mapped to a TorchVision tag when it starts
+            with one of the known release prefixes.
+        use_conda: install libpng/jpeg through conda and force static linking.
+        git_clone_flags: extra flags appended to ``git clone``.
+        run_smoke_tests: install the wheel and run ``vision/test/smoke_test.py``.
+
+    Returns:
+        File name of the built wheel (first entry of ``vision/dist``).
+    """
+    print("Checking out TorchVision repo")
+    build_version = checkout_repo(
+        host,
+        branch=branch,
+        url="https://github.com/pytorch/vision",
+        git_clone_flags=git_clone_flags,
+        mapping={
+            "v1.7.1": ("0.8.2", "rc2"),
+            "v1.8.0": ("0.9.0", "rc3"),
+            "v1.8.1": ("0.9.1", "rc1"),
+            "v1.9.0": ("0.10.0", "rc1"),
+            "v1.10.0": ("0.11.1", "rc1"),
+            "v1.10.1": ("0.11.2", "rc1"),
+            "v1.10.2": ("0.11.3", "rc1"),
+            "v1.11.0": ("0.12.0", "rc1"),
+            "v1.12.0": ("0.13.0", "rc4"),
+            "v1.12.1": ("0.13.1", "rc6"),
+            "v1.13.0": ("0.14.0", "rc4"),
+            "v1.13.1": ("0.14.1", "rc2"),
+            "v2.0.0": ("0.15.1", "rc2"),
+            "v2.0.1": ("0.15.2", "rc2"),
+        },
+    )
+    print("Building TorchVision wheel")
+
+    # Please note libpng and jpeg are required to build image.so extension
+    if use_conda:
+        host.run_cmd("conda install -y libpng jpeg")
+        # Remove .so files to force static linking
+        host.run_cmd(
+            "rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so"
+        )
+        # And patch setup.py to include libz dependency for libpng
+        host.run_cmd(
+            [
+                'sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py'
+            ]
+        )
+
+    build_vars = ""
+    if branch == "nightly":
+        # Prefer version.txt; the shell "if" yields empty output when it is absent
+        version = host.check_output(
+            ["if [ -f vision/version.txt ]; then cat vision/version.txt; fi"]
+        ).strip()
+        if len(version) == 0:
+            # In older revisions, version was embedded in setup.py
+            version = (
+                host.check_output(["grep", '"version = \'"', "vision/setup.py"])
+                .strip()
+                .split("'")[1][:-2]
+            )
+        # NOTE(review): %s is the commit subject, not a date; this presumably
+        # relies on nightly commit subjects starting with YYYY-MM-DD - confirm.
+        build_date = (
+            host.check_output("cd vision && git log --pretty=format:%s -1")
+            .strip()
+            .split()[0]
+            .replace("-", "")
+        )
+        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
+    elif build_version is not None:
+        # PYTORCH_VERSION is the branch without the leading "v" and any "-rcN" suffix
+        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
+    if host.using_docker():
+        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
+
+    host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
+    vision_wheel_name = host.list_dir("vision/dist")[0]
+    embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))
+
+    print("Copying TorchVision wheel")
+    host.download_wheel(os.path.join("vision", "dist", vision_wheel_name))
+    if run_smoke_tests:
+        host.run_cmd(
+            f"pip3 install {os.path.join('vision', 'dist', vision_wheel_name)}"
+        )
+        host.run_cmd("python3 vision/test/smoke_test.py")
+    print("Delete vision checkout")
+    host.run_cmd("rm -rf vision")
+
+    return vision_wheel_name
+
+
+def build_torchdata(
+    host: RemoteHost,
+    *,
+    branch: str = "main",
+    use_conda: bool = True,
+    git_clone_flags: str = "",
+) -> str:
+    """Build the TorchData wheel on ``host``, download it and return its name.
+
+    The repository is cloned with ``--recurse-submodules`` appended to
+    ``git_clone_flags``.  ``branch == "nightly"`` produces a ``.dev<date>``
+    build version; otherwise the version returned by ``checkout_repo`` (if
+    any) is used.
+    """
+    print("Checking out TorchData repo")
+    release_tags = {
+        "v1.13.1": ("0.5.1", ""),
+        "v2.0.0": ("0.6.0", "rc5"),
+        "v2.0.1": ("0.6.1", "rc1"),
+    }
+    pinned_version = checkout_repo(
+        host,
+        branch=branch,
+        url="https://github.com/pytorch/data",
+        git_clone_flags=git_clone_flags + " --recurse-submodules",
+        mapping=release_tags,
+    )
+
+    print("Building TorchData wheel")
+    if branch == "nightly":
+        nightly_version = host.check_output(
+            ["if [ -f data/version.txt ]; then cat data/version.txt; fi"]
+        ).strip()
+        last_subject = host.check_output("cd data && git log --pretty=format:%s -1")
+        date_stamp = last_subject.strip().split()[0].replace("-", "")
+        env = f"BUILD_VERSION={nightly_version}.dev{date_stamp}"
+    elif pinned_version is not None:
+        torch_version = branch[1:].split("-", maxsplit=1)[0]
+        env = f"BUILD_VERSION={pinned_version} PYTORCH_VERSION={torch_version}"
+    else:
+        env = ""
+    if host.using_docker():
+        env = env + " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
+
+    host.run_cmd(f"cd data && {env} python3 -m build --wheel --no-isolation")
+    wheel_name = host.list_dir("data/dist")[0]
+    wheel_path = os.path.join("data", "dist", wheel_name)
+    embed_libgomp(host, use_conda, wheel_path)
+
+    print("Copying TorchData wheel")
+    host.download_wheel(wheel_path)
+
+    return wheel_name
+
+
+def build_torchtext(
+    host: RemoteHost,
+    *,
+    branch: str = "main",
+    use_conda: bool = True,
+    git_clone_flags: str = "",
+) -> str:
+    """Build the TorchText wheel on ``host``, download it and return its name.
+
+    Clones pytorch/text (with submodules), derives the build environment
+    variables from ``branch`` and runs ``python3 -m build`` in the checkout.
+    """
+    print("Checking out TorchText repo")
+    clone_flags = git_clone_flags + " --recurse-submodules"
+    tag_versions = {
+        "v1.9.0": ("0.10.0", "rc1"),
+        "v1.10.0": ("0.11.0", "rc2"),
+        "v1.10.1": ("0.11.1", "rc1"),
+        "v1.10.2": ("0.11.2", "rc1"),
+        "v1.11.0": ("0.12.0", "rc1"),
+        "v1.12.0": ("0.13.0", "rc2"),
+        "v1.12.1": ("0.13.1", "rc5"),
+        "v1.13.0": ("0.14.0", "rc3"),
+        "v1.13.1": ("0.14.1", "rc1"),
+        "v2.0.0": ("0.15.1", "rc2"),
+        "v2.0.1": ("0.15.2", "rc2"),
+    }
+    release_version = checkout_repo(
+        host,
+        branch=branch,
+        url="https://github.com/pytorch/text",
+        git_clone_flags=clone_flags,
+        mapping=tag_versions,
+    )
+
+    print("Building TorchText wheel")
+    build_env = ""
+    is_nightly = branch == "nightly"
+    if is_nightly:
+        base_version = host.check_output(
+            ["if [ -f text/version.txt ]; then cat text/version.txt; fi"]
+        ).strip()
+        commit_subject = host.check_output(
+            "cd text && git log --pretty=format:%s -1"
+        ).strip()
+        compact_date = commit_subject.split()[0].replace("-", "")
+        build_env += f"BUILD_VERSION={base_version}.dev{compact_date}"
+    if not is_nightly and release_version is not None:
+        pytorch_version = branch[1:].split("-", maxsplit=1)[0]
+        build_env += (
+            f"BUILD_VERSION={release_version} PYTORCH_VERSION={pytorch_version}"
+        )
+    if host.using_docker():
+        build_env += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
+
+    host.run_cmd(f"cd text && {build_env} python3 -m build --wheel --no-isolation")
+    wheel_name = host.list_dir("text/dist")[0]
+    built_wheel = os.path.join("text", "dist", wheel_name)
+    embed_libgomp(host, use_conda, built_wheel)
+
+    print("Copying TorchText wheel")
+    host.download_wheel(built_wheel)
+
+    return wheel_name
+
+
+def build_torchaudio(
+    host: RemoteHost,
+    *,
+    branch: str = "main",
+    use_conda: bool = True,
+    git_clone_flags: str = "",
+) -> str:
+    """Build the TorchAudio wheel on ``host``, download it and return its name.
+
+    Besides the wheel build itself, the command exports ``FFMPEG_ROOT`` and
+    ``USE_FFMPEG=1`` and runs ``./packaging/ffmpeg/build.sh`` in the checkout.
+    """
+    print("Checking out TorchAudio repo")
+    audio_tags = {
+        "v1.9.0": ("0.9.0", "rc2"),
+        "v1.10.0": ("0.10.0", "rc5"),
+        "v1.10.1": ("0.10.1", "rc1"),
+        "v1.10.2": ("0.10.2", "rc1"),
+        "v1.11.0": ("0.11.0", "rc1"),
+        "v1.12.0": ("0.12.0", "rc3"),
+        "v1.12.1": ("0.12.1", "rc5"),
+        "v1.13.0": ("0.13.0", "rc4"),
+        "v1.13.1": ("0.13.1", "rc2"),
+        "v2.0.0": ("2.0.1", "rc3"),
+        "v2.0.1": ("2.0.2", "rc2"),
+    }
+    mapped_version = checkout_repo(
+        host,
+        branch=branch,
+        url="https://github.com/pytorch/audio",
+        git_clone_flags=git_clone_flags + " --recurse-submodules",
+        mapping=audio_tags,
+    )
+
+    print("Building TorchAudio wheel")
+    if branch == "nightly":
+        # The version is read from the quoted value on the "version = '" line
+        # of setup.py, minus its last two characters.
+        version_line = host.check_output(["grep", '"version = \'"', "audio/setup.py"])
+        setup_version = version_line.strip().split("'")[1][:-2]
+        subject = host.check_output("cd audio && git log --pretty=format:%s -1")
+        day = subject.strip().split()[0].replace("-", "")
+        env_vars = f"BUILD_VERSION={setup_version}.dev{day}"
+    elif mapped_version is not None:
+        torch_release = branch[1:].split("-", maxsplit=1)[0]
+        env_vars = f"BUILD_VERSION={mapped_version} PYTORCH_VERSION={torch_release}"
+    else:
+        env_vars = ""
+    if host.using_docker():
+        env_vars = env_vars + " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
+
+    host.run_cmd(
+        f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
+        && ./packaging/ffmpeg/build.sh \
+        && {env_vars} python3 -m build --wheel --no-isolation"
+    )
+
+    wheel_name = host.list_dir("audio/dist")[0]
+    artifact = os.path.join("audio", "dist", wheel_name)
+    embed_libgomp(host, use_conda, artifact)
+
+    print("Copying TorchAudio wheel")
+    host.download_wheel(artifact)
+
+    return wheel_name
+
+
+def configure_system(
+    host: RemoteHost,
+    *,
+    compiler: str = "gcc-8",
+    use_conda: bool = True,
+    python_version: str = "3.8",
+) -> None:
+    """Install the build prerequisites on ``host``.
+
+    With ``use_conda`` a conda-forge Python of ``python_version`` is installed
+    first; otherwise the distribution's Python packages plus Cython and numpy
+    are installed.  ``compiler`` is accepted but not used by this function.
+    """
+    system_python = not use_conda
+    if not system_python:
+        install_condaforge_python(host, python_version)
+
+    print("Configuring the system")
+    if host.using_docker():
+        for command in ("yum install -y sudo", "conda install -y ninja scons"):
+            host.run_cmd(command)
+    else:
+        update_apt_repo(host)
+        host.run_cmd("sudo apt-get install -y ninja-build g++ git cmake gfortran unzip")
+
+    if system_python:
+        host.run_cmd(
+            "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
+        )
+    host.run_cmd("pip3 install dataclasses typing-extensions")
+    if system_python:
+        print("Installing Cython + numpy from PyPy")
+        for package in ("Cython", "numpy"):
+            host.run_cmd(f"sudo pip3 install {package}")
+
+
+def build_domains(
+    host: RemoteHost,
+    *,
+    branch: str = "main",
+    use_conda: bool = True,
+    git_clone_flags: str = "",
+) -> tuple[str, str, str, str]:
+    """Build the domain-library wheels one after another.
+
+    Returns:
+        Wheel names in the order (vision, audio, data, text), which is also
+        the order in which they are built.
+    """
+    common = {
+        "branch": branch,
+        "use_conda": use_conda,
+        "git_clone_flags": git_clone_flags,
+    }
+    vision = build_torchvision(host, **common)
+    audio = build_torchaudio(host, **common)
+    data = build_torchdata(host, **common)
+    text = build_torchtext(host, **common)
+    return (vision, audio, data, text)
+
+
+def start_build(
+    host: RemoteHost,
+    *,
+    branch: str = "main",
+    compiler: str = "gcc-8",
+    use_conda: bool = True,
+    python_version: str = "3.8",
+    pytorch_only: bool = False,
+    pytorch_build_number: Optional[str] = None,
+    shallow_clone: bool = True,
+    enable_mkldnn: bool = False,
+) -> tuple[str, Optional[str], Optional[str], Optional[str], Optional[str]]:
+    """Build the PyTorch wheel on ``host`` and, unless told otherwise, the domain wheels.
+
+    Args:
+        host: Remote machine (or container) on which every command is run.
+        branch: PyTorch branch or tag to clone.  ``"nightly"`` and ``v1.*`` /
+            ``v2.*`` tags additionally set ``PYTORCH_BUILD_VERSION``.
+        compiler: Forwarded to ``configure_system``.
+        use_conda: Use a conda-forge Python; forced on for docker hosts.
+        python_version: Forwarded to ``configure_system``.
+        pytorch_only: Skip the domain libraries.
+        pytorch_build_number: Optional ``--build-number`` for the wheel build.
+        shallow_clone: Clone with ``--depth 1 --shallow-submodules``.
+        enable_mkldnn: Build with MKLDNN + ACL; forced off for non-docker hosts.
+
+    Returns:
+        ``(pytorch, vision, audio, data, text)`` wheel names.  The four domain
+        entries are ``None`` when ``pytorch_only`` is set.
+    """
+    git_clone_flags = " --depth 1 --shallow-submodules" if shallow_clone else ""
+    if host.using_docker() and not use_conda:
+        print("Auto-selecting conda option for docker images")
+        use_conda = True
+    if not host.using_docker():
+        print("Disable mkldnn for host builds")
+        enable_mkldnn = False
+
+    configure_system(
+        host, compiler=compiler, use_conda=use_conda, python_version=python_version
+    )
+
+    if host.using_docker():
+        print("Move libgfortant.a into a standard location")
+        # HACK: pypa gfortran.a is compiled without PIC, which leads to the following error
+        # libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17'  # noqa: E501, B950
+        # Workaround by copying gfortran library from the host
+        host.run_ssh_cmd("sudo apt-get install -y gfortran-8")
+        host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8")
+        host.run_ssh_cmd(
+            [
+                "docker",
+                "cp",
+                "/usr/lib/gcc/aarch64-linux-gnu/8/libgfortran.a",
+                f"{host.container_id}:/opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/",
+            ]
+        )
+
+    print("Checking out PyTorch repo")
+    host.run_cmd(
+        f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
+    )
+
+    host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")
+
+    print("Building PyTorch wheel")
+    build_opts = ""
+    if pytorch_build_number is not None:
+        build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
+    # Breakpad build fails on aarch64
+    build_vars = "USE_BREAKPAD=0 "
+    if branch == "nightly":
+        # First whitespace-separated token of the latest commit subject with
+        # dashes removed.  NOTE(review): presumably a YYYY-MM-DD date - confirm.
+        build_date = (
+            host.check_output("cd pytorch && git log --pretty=format:%s -1")
+            .strip()
+            .split()[0]
+            .replace("-", "")
+        )
+        version = host.check_output("cat pytorch/version.txt").strip()[:-2]
+        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
+    if branch.startswith(("v1.", "v2.")):
+        # Strip the leading "v" and anything from the first "-" on.  Splitting
+        # (rather than slicing up to ``branch.find('-')``) keeps the last
+        # character of tags without a suffix: "v2.0.1" -> "2.0.1", not "2.0.".
+        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:].split('-', maxsplit=1)[0]} PYTORCH_BUILD_NUMBER=1"
+    if host.using_docker():
+        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
+    if enable_mkldnn:
+        host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
+        print("build pytorch with mkldnn+acl backend")
+        build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
+        build_vars += " BLAS=OpenBLAS"
+        build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
+        build_vars += " ACL_ROOT_DIR=/acl"
+        host.run_cmd(
+            f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
+        )
+        print("Repair the wheel")
+        pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
+        ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
+        host.run_cmd(
+            f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
+        )
+        print("replace the original wheel with the repaired one")
+        pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0]
+        host.run_cmd(
+            f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}"
+        )
+    else:
+        print("build pytorch without mkldnn backend")
+        host.run_cmd(
+            f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
+        )
+
+    print("Deleting build folder")
+    host.run_cmd("cd pytorch && rm -rf build")
+    pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
+    embed_libgomp(host, use_conda, os.path.join("pytorch", "dist", pytorch_wheel_name))
+    print("Copying the wheel")
+    host.download_wheel(os.path.join("pytorch", "dist", pytorch_wheel_name))
+
+    print("Installing PyTorch wheel")
+    host.run_cmd(f"pip3 install pytorch/dist/{pytorch_wheel_name}")
+
+    if pytorch_only:
+        return (pytorch_wheel_name, None, None, None, None)
+    domain_wheels = build_domains(
+        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
+    )
+
+    return (pytorch_wheel_name, *domain_wheels)
+
+
+# Source text of a standalone Python script that embeds libgomp.so.1 into the
+# torch/lib directory of a wheel (optionally rewriting its platform tag when
+# "--update-tag" is passed as the second argument).
+# Because this is a regular (non-raw) string literal, "\\n" below ends up as
+# "\n" in the script text.
+# NOTE(review): presumably written to the remote host and executed there by
+# embed_libgomp - confirm against its definition.
+embed_library_script = """
+#!/usr/bin/env python3
+
+from auditwheel.patcher import Patchelf
+from auditwheel.wheeltools import InWheelCtx
+from auditwheel.elfutils import elf_file_filter
+from auditwheel.repair import copylib
+from auditwheel.lddtree import lddtree
+from subprocess import check_call
+import os
+import shutil
+import sys
+from tempfile import TemporaryDirectory
+
+
+def replace_tag(filename):
+   with open(filename, 'r') as f:
+     lines = f.read().split("\\n")
+   for i,line in enumerate(lines):
+       if not line.startswith("Tag: "):
+           continue
+       lines[i] = line.replace("-linux_", "-manylinux2014_")
+       print(f'Updated tag from {line} to {lines[i]}')
+
+   with open(filename, 'w') as f:
+       f.write("\\n".join(lines))
+
+
+class AlignedPatchelf(Patchelf):
+    def set_soname(self, file_name: str, new_soname: str) -> None:
+        check_call(['patchelf', '--page-size', '65536', '--set-soname', new_soname, file_name])
+
+    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
+        check_call(['patchelf', '--page-size', '65536', '--replace-needed', soname, new_soname, file_name])
+
+
+def embed_library(whl_path, lib_soname, update_tag=False):
+    patcher = AlignedPatchelf()
+    out_dir = TemporaryDirectory()
+    whl_name = os.path.basename(whl_path)
+    tmp_whl_name = os.path.join(out_dir.name, whl_name)
+    with InWheelCtx(whl_path) as ctx:
+        torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib')
+        ctx.out_wheel=tmp_whl_name
+        new_lib_path, new_lib_soname = None, None
+        for filename, elf in elf_file_filter(ctx.iter_files()):
+            if not filename.startswith('torch/lib'):
+                continue
+            libtree = lddtree(filename)
+            if lib_soname not in libtree['needed']:
+                continue
+            lib_path = libtree['libs'][lib_soname]['path']
+            if lib_path is None:
+                print(f"Can't embed {lib_soname} as it could not be found")
+                break
+            if lib_path.startswith(torchlib_path):
+                continue
+
+            if new_lib_path is None:
+                new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
+            patcher.replace_needed(filename, lib_soname, new_lib_soname)
+            print(f'Replacing {lib_soname} with {new_lib_soname} for {filename}')
+        if update_tag:
+            # Add manylinux2014 tag
+            for filename in ctx.iter_files():
+                if os.path.basename(filename) != 'WHEEL':
+                    continue
+                replace_tag(filename)
+    shutil.move(tmp_whl_name, whl_path)
+
+
+if __name__ == '__main__':
+    embed_library(sys.argv[1], 'libgomp.so.1', len(sys.argv) > 2 and sys.argv[2] == '--update-tag')
+"""
+
+
+def run_tests(host: RemoteHost, whl: str, branch: str = "main") -> None:
+    """Install a wheel on ``host`` and run ``test_torch.py`` against it.
+
+    Args:
+        host: Remote machine on which every command is run.
+        whl: Local path of the wheel; it is uploaded to ``.`` on the host and
+            installed with ``sudo pip3``.
+        branch: PyTorch branch cloned to obtain the test suite.
+    """
+    print("Configuring the system")
+    update_apt_repo(host)
+    host.run_cmd("sudo apt-get install -y python3-pip git")
+    host.run_cmd("sudo pip3 install Cython")
+    host.run_cmd("sudo pip3 install numpy")
+    host.upload_file(whl, ".")
+    host.run_cmd(f"sudo pip3 install {whl}")
+    # Quick import check.  The print(...) call must be closed, otherwise
+    # "python3 -c" fails with a SyntaxError before torch is even imported.
+    host.run_cmd("python3 -c 'import torch;print(torch.rand((3,3)))'")
+    host.run_cmd(f"git clone -b {branch} https://github.com/pytorch/pytorch")
+    host.run_cmd("cd pytorch/test; python3 test_torch.py -v")
+
+
+def get_instance_name(instance) -> Optional[str]:
+    """Return the value of the instance's ``Name`` tag, or ``None`` if absent."""
+    tags = instance.tags
+    if tags is None:
+        return None
+    # The first tag whose key is "Name" wins.
+    return next((tag["Value"] for tag in tags if tag["Key"] == "Name"), None)
+
+
+def list_instances(instance_type: str) -> None:
+    """Print id, name, public DNS name, state and availability zone per instance."""
+    print(f"All instances of type {instance_type}")
+    for machine in ec2_instances_of_type(instance_type):
+        interfaces = machine.network_interfaces
+        # Instances without a network interface are reported with zone "None".
+        if len(interfaces) > 0:
+            zone = interfaces[0].subnet.availability_zone
+        else:
+            zone = None
+        print(
+            f"{machine.id} {get_instance_name(machine)} {machine.public_dns_name} {machine.state['Name']} {zone}"
+        )
+
+
+def terminate_instances(instance_type: str) -> None:
+    """Terminate every instance of ``instance_type`` and wait until all are gone."""
+    print(f"Terminating all instances of type {instance_type}")
+    # Materialise the listing once so the same instances are waited on below.
+    doomed = list(ec2_instances_of_type(instance_type))
+    for machine in doomed:
+        print(f"Terminating {machine.id}")
+        machine.terminate()
+
+    # Termination is requested for all instances first, then awaited.
+    print("Waiting for termination to complete")
+    for machine in doomed:
+        machine.wait_until_terminated()
+
+
+def parse_arguments():
+    """Parse the command-line options of this build/test driver.
+
+    Returns:
+        The namespace produced by ``ArgumentParser.parse_args()``.
+    """
+    import argparse
+
+    cli = argparse.ArgumentParser("Build and test AARCH64 wheels using EC2")
+    cli.add_argument("--key-name", type=str)
+    cli.add_argument("--debug", action="store_true")
+    cli.add_argument("--build-only", action="store_true")
+    cli.add_argument("--test-only", type=str)
+
+    # --os and --ami are alternative ways to pick the image; only one may be given.
+    image_source = cli.add_mutually_exclusive_group()
+    image_source.add_argument("--os", type=str, choices=list(os_amis))
+    image_source.add_argument("--ami", type=str)
+
+    supported_pythons = [f"3.{minor}" for minor in range(6, 12)]
+    cli.add_argument(
+        "--python-version", type=str, choices=supported_pythons, default=None
+    )
+
+    for switch in (
+        "--alloc-instance",
+        "--list-instances",
+        "--pytorch-only",
+        "--keep-running",
+        "--terminate-instances",
+    ):
+        cli.add_argument(switch, action="store_true")
+
+    cli.add_argument("--instance-type", type=str, default="t4g.2xlarge")
+    cli.add_argument("--ebs-size", type=int, default=50)
+    cli.add_argument("--branch", type=str, default="main")
+    cli.add_argument("--use-docker", action="store_true")
+    cli.add_argument(
+        "--compiler",
+        type=str,
+        choices=["gcc-7", "gcc-8", "gcc-9", "clang"],
+        default="gcc-8",
+    )
+    cli.add_argument("--use-torch-from-pypi", action="store_true")
+    cli.add_argument("--pytorch-build-number", type=str, default=None)
+    cli.add_argument("--disable-mkldnn", action="store_true")
+    return cli.parse_args()
+
+
+# Script entry point: allocate an EC2 instance, then either run tests, only
+# prepare the instance, or build the wheels, and finally terminate the
+# instance unless --keep-running was given.
+if __name__ == "__main__":
+    args = parse_arguments()
+    # AMI precedence: explicit --ami, then the AMI mapped from --os, then the
+    # Ubuntu 20.04 default.
+    ami = (
+        args.ami
+        if args.ami is not None
+        else os_amis[args.os]
+        if args.os is not None
+        else ubuntu20_04_ami
+    )
+    keyfile_path, key_name = compute_keyfile_path(args.key_name)
+
+    # Listing and terminating instances need no SSH key, so they are handled
+    # before the key checks below.
+    if args.list_instances:
+        list_instances(args.instance_type)
+        sys.exit(0)
+
+    if args.terminate_instances:
+        terminate_instances(args.instance_type)
+        sys.exit(0)
+
+    if len(key_name) == 0:
+        raise RuntimeError("""
+            Cannot start build without key_name, please specify
+            --key-name argument or AWS_KEY_NAME environment variable.""")
+    if len(keyfile_path) == 0 or not os.path.exists(keyfile_path):
+        raise RuntimeError(f"""
+            Cannot find keyfile with name: [{key_name}] in path: [{keyfile_path}], please
+            check `~/.ssh/` folder or manually set SSH_KEY_PATH environment variable.""")
+
+    # Starting the instance
+    inst = start_instance(
+        key_name, ami=ami, instance_type=args.instance_type, ebs_size=args.ebs_size
+    )
+    # NOTE(review): args.key_name and args.os are None when the options are
+    # not passed, so the tag can read "None-None"; consider using key_name.
+    instance_name = f"{args.key_name}-{args.os}"
+    if args.python_version is not None:
+        instance_name += f"-py{args.python_version}"
+    inst.create_tags(
+        DryRun=False,
+        Tags=[
+            {
+                "Key": "Name",
+                "Value": instance_name,
+            }
+        ],
+    )
+    addr = inst.public_dns_name
+    # Wait for port 22 (SSH) before creating the remote host wrapper.
+    wait_for_connection(addr, 22)
+    host = RemoteHost(addr, keyfile_path)
+    host.ami = ami
+    if args.use_docker:
+        update_apt_repo(host)
+        host.start_docker()
+
+    # NOTE(review): the exits below for --test-only and --alloc-instance leave
+    # the instance running, because termination only happens at the very end.
+    if args.test_only:
+        # --branch is not forwarded, so run_tests uses its default branch.
+        run_tests(host, args.test_only)
+        sys.exit(0)
+
+    if args.alloc_instance:
+        if args.python_version is None:
+            sys.exit(0)
+        install_condaforge_python(host, args.python_version)
+        sys.exit(0)
+
+    python_version = args.python_version if args.python_version is not None else "3.10"
+
+    if args.use_torch_from_pypi:
+        # Skip building PyTorch itself: install it from PyPI and build only
+        # the domain libraries.
+        configure_system(host, compiler=args.compiler, python_version=python_version)
+        print("Installing PyTorch wheel")
+        host.run_cmd("pip3 install torch")
+        build_domains(
+            host, branch=args.branch, git_clone_flags=" --depth 1 --shallow-submodules"
+        )
+    else:
+        start_build(
+            host,
+            branch=args.branch,
+            compiler=args.compiler,
+            python_version=python_version,
+            pytorch_only=args.pytorch_only,
+            pytorch_build_number=args.pytorch_build_number,
+            enable_mkldnn=not args.disable_mkldnn,
+        )
+    # Reached only when the build above did not raise.
+    if not args.keep_running:
+        print(f"Waiting for instance {inst.id} to terminate")
+        inst.terminate()
+        inst.wait_until_terminated()
--- a/.ci/aarch64_linux/embed_library.py
+++ b/.ci/aarch64_linux/embed_library.py
@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+
+import os
+import shutil
+import sys
+from subprocess import check_call
+from tempfile import TemporaryDirectory
+
+from auditwheel.elfutils import elf_file_filter
+from auditwheel.lddtree import lddtree
+from auditwheel.patcher import Patchelf
+from auditwheel.repair import copylib
+from auditwheel.wheeltools import InWheelCtx
+
+
+def replace_tag(filename):
+    """Rewrite the ``Tag:`` lines of a wheel ``WHEEL`` file in place.
+
+    Every line starting with ``"Tag: "`` has ``-linux_`` replaced by
+    ``-manylinux2014_``; all other lines are written back unchanged.
+
+    Args:
+        filename: Path of the ``WHEEL`` metadata file to update.
+    """
+    with open(filename) as f:
+        # Split on real newlines.  The doubled backslash is only needed in the
+        # copy of this script that lives inside a string literal; here it would
+        # match a literal backslash followed by "n" and never split the file.
+        lines = f.read().split("\n")
+    for i, line in enumerate(lines):
+        if not line.startswith("Tag: "):
+            continue
+        lines[i] = line.replace("-linux_", "-manylinux2014_")
+        print(f"Updated tag from {line} to {lines[i]}")
+
+    with open(filename, "w") as f:
+        f.write("\n".join(lines))
+
+
+class AlignedPatchelf(Patchelf):
+    """``Patchelf`` variant whose edits always pass ``--page-size 65536``."""
+
+    def set_soname(self, file_name: str, new_soname: str) -> None:
+        """Set the SONAME of ``file_name`` to ``new_soname``."""
+        command = ["patchelf", "--page-size", "65536"]
+        command += ["--set-soname", new_soname, file_name]
+        check_call(command)
+
+    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
+        """Replace the needed entry ``soname`` of ``file_name`` with ``new_soname``."""
+        command = ["patchelf", "--page-size", "65536"]
+        command += ["--replace-needed", soname, new_soname, file_name]
+        check_call(command)
+
+
+def embed_library(whl_path, lib_soname, update_tag=False):
+    """Bundle ``lib_soname`` into the ``torch/lib`` directory of a wheel, in place.
+
+    Every ELF file under ``torch/lib`` that needs ``lib_soname`` from outside
+    the unpacked wheel is re-pointed at a copy placed in ``torch/lib``.  The
+    rewritten wheel then replaces ``whl_path``.
+
+    Args:
+        whl_path: Path of the wheel to rewrite.
+        lib_soname: SONAME of the shared library to embed.
+        update_tag: When true, also rewrite the ``Tag:`` lines of the wheel's
+            ``WHEEL`` file via ``replace_tag``.
+    """
+    patcher = AlignedPatchelf()
+    whl_name = os.path.basename(whl_path)
+    # Manage the scratch directory with ``with`` so it is removed as soon as
+    # the rewritten wheel has been moved out, even if patching fails.
+    with TemporaryDirectory() as out_dir:
+        tmp_whl_name = os.path.join(out_dir, whl_name)
+        with InWheelCtx(whl_path) as ctx:
+            torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib")
+            # NOTE(review): presumably InWheelCtx writes the repacked wheel to
+            # ``out_wheel`` when the context exits - confirm against auditwheel.
+            ctx.out_wheel = tmp_whl_name
+            new_lib_path, new_lib_soname = None, None
+            for filename, _ in elf_file_filter(ctx.iter_files()):
+                if not filename.startswith("torch/lib"):
+                    continue
+                libtree = lddtree(filename)
+                if lib_soname not in libtree["needed"]:
+                    continue
+                lib_path = libtree["libs"][lib_soname]["path"]
+                if lib_path is None:
+                    print(f"Can't embed {lib_soname} as it could not be found")
+                    break
+                # Already resolved to a copy inside the wheel: nothing to do.
+                if lib_path.startswith(torchlib_path):
+                    continue
+
+                # Copy the library into torch/lib only once; later files reuse
+                # the same embedded copy.
+                if new_lib_path is None:
+                    new_lib_soname, new_lib_path = copylib(
+                        lib_path, torchlib_path, patcher
+                    )
+                patcher.replace_needed(filename, lib_soname, new_lib_soname)
+                print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}")
+            if update_tag:
+                # Add manylinux2014 tag
+                for filename in ctx.iter_files():
+                    if os.path.basename(filename) != "WHEEL":
+                        continue
+                    replace_tag(filename)
+        shutil.move(tmp_whl_name, whl_path)
+
+
+if __name__ == "__main__":
+    # Usage: embed_library.py <wheel> [--update-tag]
+    wheel_path = sys.argv[1]
+    wants_tag_update = len(sys.argv) > 2 and sys.argv[2] == "--update-tag"
+    embed_library(wheel_path, "libgomp.so.1", wants_tag_update)
--- a/.ci/manywheel/build.sh
+++ b/.ci/manywheel/build.sh
@ -4,17 +4,14 @@ set -ex

 SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

-# Source the common build script for architecture-specific configurations (MKLDNN, ACL, etc.)
-source "${SCRIPTPATH}/../pytorch/build.sh" || true
-
 case "${GPU_ARCH_TYPE:-BLANK}" in
-    cuda | cuda-aarch64)
+    cuda)
        bash "${SCRIPTPATH}/build_cuda.sh"
        ;;
    rocm)
        bash "${SCRIPTPATH}/build_rocm.sh"
        ;;
-    cpu | cpu-cxx11-abi | cpu-aarch64 | cpu-s390x)
+    cpu | cpu-cxx11-abi | cpu-s390x)
        bash "${SCRIPTPATH}/build_cpu.sh"
        ;;
    xpu)
--- a/.ci/manywheel/build_common.sh
+++ b/.ci/manywheel/build_common.sh
@ -18,31 +18,12 @@ retry () {
    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
 }

-# Detect architecture first
-ARCH=$(uname -m)
-echo "Detected architecture: $ARCH"
-
 PLATFORM=""
 # TODO move this into the Docker images
 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
 if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    retry yum install -q -y zip openssl
-    # Set platform based on architecture
-    case $ARCH in
-        x86_64)
-            PLATFORM="manylinux_2_28_x86_64"
-            ;;
-        aarch64)
-            PLATFORM="manylinux_2_28_aarch64"
-            ;;
-        s390x)
-            PLATFORM="manylinux_2_28_s390x"
-            ;;
-        *)
-            echo "Unsupported architecture: $ARCH"
-            exit 1
-            ;;
-    esac
+    PLATFORM="manylinux_2_28_x86_64"
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    retry dnf install -q -y zip openssl
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
@ -57,8 +38,6 @@ else
    exit 1
 fi

-echo "Platform set to: $PLATFORM"
-
 # We use the package name to test the package by passing this to 'pip install'
 # This is the env variable that setup.py uses to name the package. Note that
 # pip 'normalizes' the name first by changing all - to _
@ -320,8 +299,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
            # ROCm workaround for roctracer dlopens
            if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
                patchedpath=$(fname_without_so_number $destpath)
-            # Keep the so number for XPU dependencies, libgomp.so.1, ACL libraries, and NVPL libraries to avoid twice load
-            elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" || "$filename" == libarm_compute* || "$filename" == libnvpl* || "$filename" == "libgfortran.so.5" ]]; then
+            # Keep the so number for XPU dependencies and libgomp.so.1 to avoid loading them twice
+            elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
                patchedpath=$destpath
            else
                patchedpath=$(fname_with_sha256 $destpath)
@ -367,22 +346,9 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
    done

    # create Manylinux 2_28 tag this needs to happen before regenerate the RECORD
-    # Support all architectures (x86_64, aarch64, s390x)
-    if [[ "$IS_MANYLINUX2_28" == "1" && $GPU_ARCH_TYPE != "xpu" ]]; then
+    if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
        wheel_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/WHEEL/g')
-        echo "Updating wheel tag for $ARCH architecture"
-        # Replace linux_* with manylinux_2_28_* based on architecture
-        case $ARCH in
-            x86_64)
-                sed -i -e 's#linux_x86_64#manylinux_2_28_x86_64#g' $wheel_file
-                ;;
-            aarch64)
-                sed -i -e 's#linux_aarch64#manylinux_2_28_aarch64#g' $wheel_file
-                ;;
-            s390x)
-                sed -i -e 's#linux_s390x#manylinux_2_28_s390x#g' $wheel_file
-                ;;
-        esac
+        sed -i -e s#linux_x86_64#"${PLATFORM}"# $wheel_file;
    fi

    # regenerate the RECORD file with new hashes
--- a/.ci/manywheel/build_cpu.sh
+++ b/.ci/manywheel/build_cpu.sh
@ -15,10 +15,6 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    EXTRA_CAFFE2_CMAKE_FLAGS=()
 fi

-# Detect architecture
-ARCH=$(uname -m)
-echo "Building CPU wheel for architecture: $ARCH"
-
 WHEELHOUSE_DIR="wheelhousecpu"
 LIBTORCH_HOUSE_DIR="libtorch_housecpu"
 if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
@ -38,10 +34,8 @@ elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
 elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
-    if [[ "$ARCH" == "s390x" ]]; then
+    if [[ "$(uname -m)" == "s390x" ]]; then
        LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
-    elif [[ "$ARCH" == "aarch64" ]]; then
-        LIBGOMP_PATH="/usr/lib/aarch64-linux-gnu/libgomp.so.1"
    else
        LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
    fi
@ -55,34 +49,6 @@ DEPS_SONAME=(
    "libgomp.so.1"
 )

-# Add ARM-specific library dependencies for CPU builds
-if [[ "$ARCH" == "aarch64" ]]; then
-    echo "Adding ARM-specific CPU library dependencies"
-
-    # ARM Compute Library (if available)
-    if [[ -d "/acl/build" ]]; then
-        echo "Adding ARM Compute Library for CPU"
-        DEPS_LIST+=(
-            "/acl/build/libarm_compute.so"
-            "/acl/build/libarm_compute_graph.so"
-        )
-        DEPS_SONAME+=(
-            "libarm_compute.so"
-            "libarm_compute_graph.so"
-        )
-    fi
-
-    # ARM system libraries
-    DEPS_LIST+=(
-        "/usr/lib64/libgfortran.so.5"
-        "/opt/OpenBLAS/lib/libopenblas.so.0"
-    )
-    DEPS_SONAME+=(
-        "libgfortran.so.5"
-        "libopenblas.so.0"
-    )
-fi
-
 rm -rf /usr/local/cuda*

 SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -29,10 +29,6 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    EXTRA_CAFFE2_CMAKE_FLAGS=()
 fi

-# Detect architecture
-ARCH=$(uname -m)
-echo "Building for architecture: $ARCH"
-
 # Determine CUDA version and architectures to build for
 #
 # NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
@ -57,60 +53,34 @@ fi
 cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
 EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")

-# Function to remove architectures from a list
-remove_archs() {
-    local result="$1"
-    shift
-    for arch in "$@"; do
-        result="${result//${arch};/}"
-    done
-    echo "$result"
-}
-
-# Function to filter CUDA architectures for aarch64
-# aarch64 ARM GPUs only support certain compute capabilities
-# Keep: 8.0 (A100), 9.0+ (Hopper, Grace Hopper, newer)
-# Remove: < 8.0 (no ARM GPUs), 8.6 (x86_64 RTX 3090/A6000 only)
-filter_aarch64_archs() {
-    local arch_list="$1"
-    # Explicitly remove architectures not needed on aarch64
-    arch_list=$(remove_archs "$arch_list" "5.0" "6.0" "7.0" "7.5" "8.6")
-    echo "$arch_list"
-}
-
-# Base: Common architectures across all modern CUDA versions
-TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0"
-
 case ${CUDA_VERSION} in
-    12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;${TORCH_CUDA_ARCH_LIST}" ;;  # Only 12.6 includes Legacy Maxwell/Pascal that will be removed in future releases
-    12.8) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0" ;;  # +Hopper/Blackwell support
-    12.9) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0+PTX" # +Hopper/Blackwell support + PTX for forward compatibility
+    # Remove sm_50-sm_60: these architectures are deprecated in CUDA 12.8/12.9 and will be removed in future releases.
+    # However, we would like to keep the sm_70 architecture; see: https://github.com/pytorch/pytorch/issues/157517
+    12.8)
+        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0"
+        ;;
+    12.9)
+        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX"
+        # Workaround for the ld error in the libtorch build with CUDA 12.9: drop 7.0 and 8.6
        if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
-            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//7.0;/}"  # Remove 7.0 to resolve the ld error
-            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//8.6;/}"  # Remove 8.6 for libtorch
+            TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
        fi
        ;;
    13.0)
-        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0+PTX"
-        export TORCH_NVCC_FLAGS="-compress-mode=size"
-        export BUILD_BUNDLE_PTXAS=1
+        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
+        ;;
+    12.6)
+        TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
+        ;;
+    *)
+        echo "unknown cuda version $CUDA_VERSION"
+        exit 1
        ;;
-    *) echo "unknown cuda version $CUDA_VERSION"; exit 1 ;;
 esac

-# Filter for aarch64: Remove < 8.0 and 8.6
-[[ "$ARCH" == "aarch64" ]] && TORCH_CUDA_ARCH_LIST=$(filter_aarch64_archs "$TORCH_CUDA_ARCH_LIST")
-
-echo "TORCH_CUDA_ARCH_LIST set to: $TORCH_CUDA_ARCH_LIST"
 export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 echo "${TORCH_CUDA_ARCH_LIST}"

-# Disable MAGMA for aarch64 as pre-built libraries are x86-64 only
-if [[ "$ARCH" == "aarch64" ]]; then
-    echo "Disabling MAGMA for aarch64 architecture"
-    export USE_MAGMA=0
-fi
-
 # Package directories
 WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot"
 LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot"
@ -274,51 +244,6 @@ else
    exit 1
 fi

-# Add ARM-specific library dependencies
-if [[ "$ARCH" == "aarch64" ]]; then
-    echo "Adding ARM-specific library dependencies"
-
-    # ARM Compute Library (if available)
-    if [[ -d "/acl/build" ]]; then
-        echo "Adding ARM Compute Library"
-        DEPS_LIST+=(
-            "/acl/build/libarm_compute.so"
-            "/acl/build/libarm_compute_graph.so"
-        )
-        DEPS_SONAME+=(
-            "libarm_compute.so"
-            "libarm_compute_graph.so"
-        )
-    fi
-
-    # ARM system libraries
-    DEPS_LIST+=(
-        "/lib64/libgomp.so.1"
-        "/usr/lib64/libgfortran.so.5"
-    )
-    DEPS_SONAME+=(
-        "libgomp.so.1"
-        "libgfortran.so.5"
-    )
-
-    # NVPL libraries (ARM optimized BLAS/LAPACK)
-    if [[ -d "/usr/local/lib" && -f "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0" ]]; then
-        echo "Adding NVPL libraries for ARM"
-        DEPS_LIST+=(
-            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0"
-            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0"
-            "/usr/local/lib/libnvpl_lapack_core.so.0"
-            "/usr/local/lib/libnvpl_blas_core.so.0"
-        )
-        DEPS_SONAME+=(
-            "libnvpl_lapack_lp64_gomp.so.0"
-            "libnvpl_blas_lp64_gomp.so.0"
-            "libnvpl_lapack_core.so.0"
-            "libnvpl_blas_core.so.0"
-        )
-    fi
-fi
-
 # run_tests.sh requires DESIRED_CUDA to know what tests to exclude
 export DESIRED_CUDA="$cuda_version_nodot"

@ -326,11 +251,9 @@ export DESIRED_CUDA="$cuda_version_nodot"
 rm -rf /usr/local/cuda || true
 ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda

-# Switch `/usr/local/magma` to the desired CUDA version (skip for aarch64)
-if [[ "$ARCH" != "aarch64" ]]; then
-    rm -rf /usr/local/magma || true
-    ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
-fi
+# Switch `/usr/local/magma` to the desired CUDA version
+rm -rf /usr/local/magma || true
+ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma

 export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
 export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -86,20 +86,10 @@ else
  fi
 fi

-# Enable MKLDNN with ARM Compute Library for ARM builds
 if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
  export USE_MKLDNN=1
-
-  # ACL is required for aarch64 builds
-  if [[ ! -d "/acl" ]]; then
-    echo "ERROR: ARM Compute Library not found at /acl"
-    echo "ACL is required for aarch64 builds. Check Docker image setup."
-    exit 1
-  fi
-
  export USE_MKLDNN_ACL=1
  export ACL_ROOT_DIR=/acl
-  echo "ARM Compute Library enabled for MKLDNN: ACL_ROOT_DIR=/acl"
 fi

 if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then
--- a/.ci/pytorch/smoke_test/check_binary_symbols.py
+++ b/.ci/pytorch/smoke_test/check_binary_symbols.py
@ -100,6 +100,337 @@ def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None:
        )


+def _compile_and_extract_symbols(
+    cpp_content: str, compile_flags: list[str], exclude_list: list[str] | None = None
+) -> list[str]:
+    """
+    Helper to compile a C++ file and extract all symbols.
+
+    Args:
+        cpp_content: C++ source code to compile
+        compile_flags: Compilation flags
+        exclude_list: List of symbol names to exclude. Defaults to ["main"].
+
+    Returns:
+        List of all symbols found in the object file (excluding those in exclude_list).
+    """
+    import subprocess
+    import tempfile
+
+    if exclude_list is None:
+        exclude_list = ["main"]
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        tmppath = Path(tmpdir)
+        cpp_file = tmppath / "test.cpp"
+        obj_file = tmppath / "test.o"
+
+        cpp_file.write_text(cpp_content)
+
+        result = subprocess.run(
+            compile_flags + [str(cpp_file), "-o", str(obj_file)],
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+
+        if result.returncode != 0:
+            raise RuntimeError(f"Compilation failed: {result.stderr}")
+
+        symbols = get_symbols(str(obj_file))
+
+        # Return all symbol names, excluding those in the exclude list
+        return [name for _addr, _stype, name in symbols if name not in exclude_list]
+
+
+def check_stable_only_symbols(install_root: Path) -> None:
+    """
+    Test TORCH_STABLE_ONLY and TORCH_TARGET_VERSION by compiling test code and comparing symbol counts.
+
+    This approach tests:
+    1. WITHOUT macros -> many torch symbols exposed
+    2. WITH TORCH_STABLE_ONLY -> zero torch symbols (all hidden)
+    3. WITH TORCH_TARGET_VERSION -> zero torch symbols (all hidden)
+    4. WITH both macros -> zero torch symbols (all hidden)
+    """
+    include_dir = install_root / "include"
+    assert include_dir.exists(), f"Expected {include_dir} to be present"
+
+    test_cpp_content = """
+// Main torch C++ API headers
+#include <torch/torch.h>
+#include <torch/all.h>
+
+// ATen tensor library
+#include <ATen/ATen.h>
+
+// Core c10 headers (commonly used)
+#include <c10/core/Device.h>
+#include <c10/core/DeviceType.h>
+#include <c10/core/ScalarType.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Optional.h>
+
+int main() { return 0; }
+"""
+
+    base_compile_flags = [
+        "g++",
+        "-std=c++17",
+        f"-I{include_dir}",
+        f"-I{include_dir}/torch/csrc/api/include",
+        "-c",  # Compile only, don't link
+    ]
+
+    # Compile WITHOUT any macros
+    symbols_without = _compile_and_extract_symbols(
+        cpp_content=test_cpp_content,
+        compile_flags=base_compile_flags,
+    )
+
+    # We expect constexpr symbols, inline functions used by other headers etc.
+    # to produce symbols
+    num_symbols_without = len(symbols_without)
+    print(f"Found {num_symbols_without} symbols without any macros defined")
+    assert num_symbols_without != 0, (
+        "Expected a non-zero number of symbols without any macros"
+    )
+
+    # Compile WITH TORCH_STABLE_ONLY (expect 0 symbols)
+    compile_flags_with_stable_only = base_compile_flags + ["-DTORCH_STABLE_ONLY"]
+
+    symbols_with_stable_only = _compile_and_extract_symbols(
+        cpp_content=test_cpp_content,
+        compile_flags=compile_flags_with_stable_only,
+    )
+
+    num_symbols_with_stable_only = len(symbols_with_stable_only)
+    assert num_symbols_with_stable_only == 0, (
+        f"Expected no symbols with TORCH_STABLE_ONLY macro, but found {num_symbols_with_stable_only}"
+    )
+
+    # Compile WITH TORCH_TARGET_VERSION (expect 0 symbols)
+    compile_flags_with_target_version = base_compile_flags + [
+        "-DTORCH_TARGET_VERSION=1"
+    ]
+
+    symbols_with_target_version = _compile_and_extract_symbols(
+        cpp_content=test_cpp_content,
+        compile_flags=compile_flags_with_target_version,
+    )
+
+    num_symbols_with_target_version = len(symbols_with_target_version)
+    assert num_symbols_with_target_version == 0, (
+        f"Expected no symbols with TORCH_TARGET_VERSION macro, but found {num_symbols_with_target_version}"
+    )
+
+    # Compile WITH both macros (expect 0 symbols)
+    compile_flags_with_both = base_compile_flags + [
+        "-DTORCH_STABLE_ONLY",
+        "-DTORCH_TARGET_VERSION=1",
+    ]
+
+    symbols_with_both = _compile_and_extract_symbols(
+        cpp_content=test_cpp_content,
+        compile_flags=compile_flags_with_both,
+    )
+
+    num_symbols_with_both = len(symbols_with_both)
+    assert num_symbols_with_both == 0, (
+        f"Expected no symbols with both macros, but found {num_symbols_with_both}"
+    )
+
+
+def check_stable_api_symbols(install_root: Path) -> None:
+    """
+    Test that the headers under torch/csrc/stable still expose symbols with TORCH_STABLE_ONLY.
+    c/shim.h is included by the recursive glob; check_stable_c_shim_symbols references its functions explicitly.
+    """
+    include_dir = install_root / "include"
+    assert include_dir.exists(), f"Expected {include_dir} to be present"
+
+    stable_dir = include_dir / "torch" / "csrc" / "stable"
+    assert stable_dir.exists(), f"Expected {stable_dir} to be present"
+
+    stable_headers = list(stable_dir.rglob("*.h"))
+    if not stable_headers:
+        raise RuntimeError("Could not find any stable headers")
+
+    includes = []
+    for header in stable_headers:
+        rel_path = header.relative_to(include_dir)
+        includes.append(f"#include <{rel_path.as_posix()}>")
+
+    includes_str = "\n".join(includes)
+    test_stable_content = f"""
+{includes_str}
+int main() {{ return 0; }}
+"""
+
+    compile_flags = [
+        "g++",
+        "-std=c++17",
+        f"-I{include_dir}",
+        f"-I{include_dir}/torch/csrc/api/include",
+        "-c",
+        "-DTORCH_STABLE_ONLY",
+    ]
+
+    symbols_stable = _compile_and_extract_symbols(
+        cpp_content=test_stable_content,
+        compile_flags=compile_flags,
+    )
+    num_symbols_stable = len(symbols_stable)
+    print(f"Found {num_symbols_stable} symbols in torch/csrc/stable")
+    assert num_symbols_stable > 0, (
+        f"Expected stable headers to expose symbols with TORCH_STABLE_ONLY, "
+        f"but found {num_symbols_stable} symbols"
+    )
+
+
+def check_headeronly_symbols(install_root: Path) -> None:
+    """
+    Test that header-only utility headers still expose symbols with TORCH_STABLE_ONLY.
+    """
+    include_dir = install_root / "include"
+    assert include_dir.exists(), f"Expected {include_dir} to be present"
+
+    # Find all headers in torch/headeronly
+    headeronly_dir = include_dir / "torch" / "headeronly"
+    assert headeronly_dir.exists(), f"Expected {headeronly_dir} to be present"
+    headeronly_headers = list(headeronly_dir.rglob("*.h"))
+    if not headeronly_headers:
+        raise RuntimeError("Could not find any headeronly headers")
+
+    # Filter out platform-specific headers that may not compile everywhere
+    platform_specific_keywords = [
+        "cpu/vec",
+    ]
+
+    filtered_headers = []
+    for header in headeronly_headers:
+        rel_path = header.relative_to(include_dir).as_posix()
+        if not any(
+            keyword in rel_path.lower() for keyword in platform_specific_keywords
+        ):
+            filtered_headers.append(header)
+
+    includes = []
+    for header in filtered_headers:
+        rel_path = header.relative_to(include_dir)
+        includes.append(f"#include <{rel_path.as_posix()}>")
+
+    includes_str = "\n".join(includes)
+    test_headeronly_content = f"""
+{includes_str}
+int main() {{ return 0; }}
+"""
+
+    compile_flags = [
+        "g++",
+        "-std=c++17",
+        f"-I{include_dir}",
+        f"-I{include_dir}/torch/csrc/api/include",
+        "-c",
+        "-DTORCH_STABLE_ONLY",
+    ]
+
+    symbols_headeronly = _compile_and_extract_symbols(
+        cpp_content=test_headeronly_content,
+        compile_flags=compile_flags,
+    )
+    num_symbols_headeronly = len(symbols_headeronly)
+    print(f"Found {num_symbols_headeronly} symbols in torch/headeronly")
+    assert num_symbols_headeronly > 0, (
+        f"Expected headeronly headers to expose symbols with TORCH_STABLE_ONLY, "
+        f"but found {num_symbols_headeronly} symbols"
+    )
+
+
+def check_aoti_shim_symbols(install_root: Path) -> None:
+    """
+    Test that AOTI shim headers still expose symbols with TORCH_STABLE_ONLY.
+    """
+    include_dir = install_root / "include"
+    assert include_dir.exists(), f"Expected {include_dir} to be present"
+
+    # There are no constexpr symbols etc., so we need to actually use functions
+    # so that some symbols are found.
+    test_shim_content = """
+#include <torch/csrc/inductor/aoti_torch/c/shim.h>
+int main() {
+    int32_t (*fp1)() = &aoti_torch_device_type_cpu;
+    int32_t (*fp2)() = &aoti_torch_dtype_float32;
+    (void)fp1; (void)fp2;
+    return 0;
+}
+"""
+
+    compile_flags = [
+        "g++",
+        "-std=c++17",
+        f"-I{include_dir}",
+        f"-I{include_dir}/torch/csrc/api/include",
+        "-c",
+        "-DTORCH_STABLE_ONLY",
+    ]
+
+    symbols_shim = _compile_and_extract_symbols(
+        cpp_content=test_shim_content,
+        compile_flags=compile_flags,
+    )
+    num_symbols_shim = len(symbols_shim)
+    assert num_symbols_shim > 0, (
+        f"Expected shim headers to expose symbols with TORCH_STABLE_ONLY, "
+        f"but found {num_symbols_shim} symbols"
+    )
+
+
+def check_stable_c_shim_symbols(install_root: Path) -> None:
+    """
+    Test that stable C shim headers still expose symbols with TORCH_STABLE_ONLY.
+    """
+    include_dir = install_root / "include"
+    assert include_dir.exists(), f"Expected {include_dir} to be present"
+
+    # Check if the stable C shim exists
+    stable_shim = include_dir / "torch" / "csrc" / "stable" / "c" / "shim.h"
+    if not stable_shim.exists():
+        raise RuntimeError("Could not find stable c shim")
+
+    # There are no constexpr symbols etc., so we need to actually use functions
+    # so that some symbols are found.
+    test_stable_shim_content = """
+#include <torch/csrc/stable/c/shim.h>
+int main() {
+    // Reference stable C API functions to create undefined symbols
+    AOTITorchError (*fp1)(const char*, uint32_t*, int32_t*) = &torch_parse_device_string;
+    AOTITorchError (*fp2)(uint32_t*) = &torch_get_num_threads;
+    (void)fp1; (void)fp2;
+    return 0;
+}
+"""
+
+    compile_flags = [
+        "g++",
+        "-std=c++17",
+        f"-I{include_dir}",
+        f"-I{include_dir}/torch/csrc/api/include",
+        "-c",
+        "-DTORCH_STABLE_ONLY",
+    ]
+
+    symbols_stable_shim = _compile_and_extract_symbols(
+        cpp_content=test_stable_shim_content,
+        compile_flags=compile_flags,
+    )
+    num_symbols_stable_shim = len(symbols_stable_shim)
+    assert num_symbols_stable_shim > 0, (
+        f"Expected stable C shim headers to expose symbols with TORCH_STABLE_ONLY, "
+        f"but found {num_symbols_stable_shim} symbols"
+    )
+
+
 def check_lib_symbols_for_abi_correctness(lib: str) -> None:
    print(f"lib: {lib}")
    cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS)
@ -129,6 +460,13 @@ def main() -> None:
    check_lib_symbols_for_abi_correctness(libtorch_cpu_path)
    check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path)

+    # Check symbols when TORCH_STABLE_ONLY is defined
+    check_stable_only_symbols(install_root)
+    check_stable_api_symbols(install_root)
+    check_headeronly_symbols(install_root)
+    check_aoti_shim_symbols(install_root)
+    check_stable_c_shim_symbols(install_root)
+

 if __name__ == "__main__":
    main()
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@ -260,8 +260,11 @@ jobs:
            "${DOCKER_IMAGE}"
          )
          docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
-          # Unified build script for all architectures (x86_64, aarch64, s390x)
-          docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
+          if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
+            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
+          else
+            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
+          fi

      - name: Chown artifacts
        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
--- a/aten/src/ATen/mps/MPSDevice.h
+++ b/aten/src/ATen/mps/MPSDevice.h
@ -22,7 +22,6 @@ enum class MacOSVersion : uint32_t {
  MACOS_VER_15_0_PLUS,
  MACOS_VER_15_1_PLUS,
  MACOS_VER_15_2_PLUS,
-  MACOS_VER_26_0_PLUS,
 };

 //-----------------------------------------------------------------
--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@ -65,7 +65,6 @@ bool MPSDevice::isMacOS13Plus(MacOSVersion version) const {
  static bool _macos_15_0_plus = is_os_version_at_least(15, 0);
  static bool _macos_15_1_plus = is_os_version_at_least(15, 1);
  static bool _macos_15_2_plus = is_os_version_at_least(15, 2);
-  static bool _macos_26_0_plus = is_os_version_at_least(26, 0);

  switch (version) {
    case MacOSVersion::MACOS_VER_14_4_PLUS:
@ -76,8 +75,6 @@ bool MPSDevice::isMacOS13Plus(MacOSVersion version) const {
      return _macos_15_1_plus;
    case MacOSVersion::MACOS_VER_15_2_PLUS:
      return _macos_15_2_plus;
-    case MacOSVersion::MACOS_VER_26_0_PLUS:
-      return _macos_26_0_plus;
    default:
      return false;
  }
--- a/aten/src/ATen/native/mps/operations/Attention.mm
+++ b/aten/src/ATen/native/mps/operations/Attention.mm
@ -69,139 +69,75 @@ static std::tuple<Tensor, Tensor> sdpa_general_mps(const Tensor& query,
  auto out = at::empty({batchSize, num_head, qSize, headSize}, query.options());
  auto attn = at::empty({batchSize, num_head, qSize, maxSeqLength}, query.options());
  auto scale_factor = sdp::calculate_scale(query, scale).expect_float();
-  static const bool is_macOS_26_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_26_0_PLUS);
  @autoreleasepool {
    auto mkey = __func__ + getTensorsStringKey({query, key, value}) + ":" + std::to_string(is_causal) + ":" +
        std::to_string(attn_mask.has_value());
+    auto cachedGraph =
+        LookUpOrCreateCachedGraph<CachedGraph>(mkey, [&, q_ = query, k_ = key, v_ = value](auto mpsGraph, auto graph) {
+          auto qTensor = mpsGraphRankedPlaceHolder(mpsGraph, q_);
+          auto kTensor = mpsGraphRankedPlaceHolder(mpsGraph, k_);
+          auto vTensor = mpsGraphRankedPlaceHolder(mpsGraph, v_);
+          auto kT = [mpsGraph transposeTensor:kTensor dimension:2 withDimension:3 name:nil];
+          auto scaleTensor = [mpsGraph constantWithScalar:scale_factor
+                                                    shape:getMPSShape({1})
+                                                 dataType:MPSDataTypeFloat32];

-    CachedGraph* cachedGraph;
-    //if(is_macOS_26_0_or_newer) {
-    if(true) {
-        cachedGraph =
-            LookUpOrCreateCachedGraph<CachedGraph>(mkey, [&, q_ = query, k_ = key, v_ = value](auto mpsGraph, auto graph) {
-              auto qTensor = mpsGraphRankedPlaceHolder(mpsGraph, q_);
-              auto kTensor = mpsGraphRankedPlaceHolder(mpsGraph, k_);
-              auto vTensor = mpsGraphRankedPlaceHolder(mpsGraph, v_);
+          auto maskedMM = [mpsGraph matrixMultiplicationWithPrimaryTensor:qTensor secondaryTensor:kT name:nil];

-              if (is_causal) {
-                MPSShape* maskShape = @[@(qSize), @(maxSeqLength)];
-                auto x = [mpsGraph coordinateAlongAxis:-1 withShape:@[@(qSize), @1] name:nil];
-                auto y = [mpsGraph coordinateAlongAxis:-2 withShape:@[@1, @(maxSeqLength)] name:nil];
-                auto isLess = [mpsGraph lessThanOrEqualToWithPrimaryTensor:x secondaryTensor:y name:nil];
-                auto causalMask = [mpsGraph selectWithPredicateTensor:isLess 
-                                            truePredicateTensor:[mpsGraph constantWithScalar:0 dataType:qTensor.dataType] 
-                                            falsePredicateTensor:[mpsGraph constantWithScalar:-INFINITY dataType:qTensor.dataType] 
-                                            name:nil];
-                graph->maskTensor = causalMask;
-              } else if (attn_mask) {
-                graph->maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *attn_mask);
-              }
+          if (macOS15_0_plus && [maskedMM dataType] == MPSDataTypeFloat32) {
+            // Bug in macOS 15: without this trick SDPA leaks memory. Adding 0.0f gets ignored (it still takes the
+            // SDPA sequence path, which leaks), so a tiny non-zero constant (1e-20f) is added instead.
+            auto oneTensor = [mpsGraph constantWithScalar:1e-20f shape:getMPSShape({1}) dataType:MPSDataTypeFloat32];
+            maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM secondaryTensor:oneTensor name:nil];
+          }

-              // Account for case where all values were masked causing division by 0 in softmax (issue:#156707)
-              // Overwrites expected NANs in sm with zeros.
-//              auto negInfTensor = [mpsGraph constantWithScalar:-INFINITY shape:maskedMM.shape dataType:maskedMM.dataType];
-//              auto elem_neg_inf = [mpsGraph equalWithPrimaryTensor:maskedMM secondaryTensor:negInfTensor name:nil];
-//              auto all_neg_infs_along_axis = [mpsGraph reductionAndWithTensor:elem_neg_inf axis:3 name:nil];
-//              auto zero_mask = [mpsGraph broadcastTensor:all_neg_infs_along_axis toShape:maskedMM.shape name:nil];
-//              auto zeroTensor = [mpsGraph constantWithScalar:0.0 shape:maskedMM.shape dataType:maskedMM.dataType];
-//
-//              auto sm = [mpsGraph softMaxWithTensor:maskedMM axis:3 name:nil];
-//              MPSGraphTensor* correctedSM = [mpsGraph selectWithPredicateTensor:zero_mask
-//                                                            truePredicateTensor:zeroTensor
-//                                                           falsePredicateTensor:sm
-//                                                                           name:nil];
-//
-//              auto output = [mpsGraph matrixMultiplicationWithPrimaryTensor:correctedSM secondaryTensor:vTensor name:nil];
+          // upcasting to float32 if needed to improve precision when multiplying by the scale factor
+          maskedMM = castMPSTensor(mpsGraph, maskedMM, MPSDataTypeFloat32);
+          maskedMM = [mpsGraph multiplicationWithPrimaryTensor:maskedMM secondaryTensor:scaleTensor name:nil];

-              MPSGraphTensor* output;
-              if(graph->maskTensor != nil) {
-                output = [mpsGraph scaledDotProductAttentionWithQueryTensor:qTensor 
-                                                          keyTensor:kTensor 
-                                                        valueTensor:vTensor
-                                                         maskTensor:graph->maskTensor
-                                                              scale:scale_factor
-                                                               name:@"MPSGraph SDPA"];
-              } else {
-                output = [mpsGraph scaledDotProductAttentionWithQueryTensor:qTensor 
-                                                          keyTensor:kTensor 
-                                                        valueTensor:vTensor
-                                                              scale:scale_factor
-                                                               name:@"MPSGraph SDPA"];
-              }
-              graph->qTensor = qTensor;
-              graph->kTensor = kTensor;
-              graph->vTensor = vTensor;
-              graph->outputTensor = castMPSTensor(mpsGraph, output, qTensor.dataType);
-//              graph->attnTensor = castMPSTensor(mpsGraph, sm, qTensor.dataType);
-            });
-    } else {
-        cachedGraph =
-            LookUpOrCreateCachedGraph<CachedGraph>(mkey, [&, q_ = query, k_ = key, v_ = value](auto mpsGraph, auto graph) {
-              auto qTensor = mpsGraphRankedPlaceHolder(mpsGraph, q_);
-              auto kTensor = mpsGraphRankedPlaceHolder(mpsGraph, k_);
-              auto vTensor = mpsGraphRankedPlaceHolder(mpsGraph, v_);
-              auto kT = [mpsGraph transposeTensor:kTensor dimension:2 withDimension:3 name:nil];
-              auto scaleTensor = [mpsGraph constantWithScalar:scale_factor
-                                                        shape:getMPSShape({1})
-                                                     dataType:MPSDataTypeFloat32];
+          if (is_causal) {
+            auto causalMask = [mpsGraph constantWithScalar:1.0f
+                                                     shape:getMPSShape({qSize, maxSeqLength})
+                                                  dataType:MPSDataTypeBool];
+            causalMask = [mpsGraph bandPartWithTensor:causalMask numLower:-1 numUpper:0 name:nil];
+            auto minusInf = [mpsGraph constantWithScalar:-1e20 shape:maskedMM.shape dataType:maskedMM.dataType];
+            maskedMM = [mpsGraph selectWithPredicateTensor:causalMask
+                                       truePredicateTensor:maskedMM
+                                      falsePredicateTensor:minusInf
+                                                      name:nil];
+          } else if (attn_mask) {
+            graph->maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *attn_mask);
+            maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM
+                                           secondaryTensor:castMPSTensor(mpsGraph, graph->maskTensor, maskedMM.dataType)
+                                                      name:nil];
+          }

-              auto maskedMM = [mpsGraph matrixMultiplicationWithPrimaryTensor:qTensor secondaryTensor:kT name:nil];
+          // Account for the case where all values were masked, causing division by 0 in softmax (issue #156707).
+          // Overwrites the expected NaNs in sm with zeros.
+          auto negInfTensor = [mpsGraph constantWithScalar:-INFINITY shape:maskedMM.shape dataType:maskedMM.dataType];
+          auto elem_neg_inf = [mpsGraph equalWithPrimaryTensor:maskedMM secondaryTensor:negInfTensor name:nil];
+          auto all_neg_infs_along_axis = [mpsGraph reductionAndWithTensor:elem_neg_inf axis:3 name:nil];
+          auto zero_mask = [mpsGraph broadcastTensor:all_neg_infs_along_axis toShape:maskedMM.shape name:nil];
+          auto zeroTensor = [mpsGraph constantWithScalar:0.0 shape:maskedMM.shape dataType:maskedMM.dataType];

-              if (macOS15_0_plus && [maskedMM dataType] == MPSDataTypeFloat32) {
-                // bug in MacOS15, without this trick SDPA leaks memory, adding 0.0f gets ignored(still takes SDPA sequence
-                // path which leaks)
-                auto oneTensor = [mpsGraph constantWithScalar:1e-20f shape:getMPSShape({1}) dataType:MPSDataTypeFloat32];
-                maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM secondaryTensor:oneTensor name:nil];
-              }
+          auto sm = [mpsGraph softMaxWithTensor:maskedMM axis:3 name:nil];
+          MPSGraphTensor* correctedSM = [mpsGraph selectWithPredicateTensor:zero_mask
+                                                        truePredicateTensor:zeroTensor
+                                                       falsePredicateTensor:sm
+                                                                       name:nil];

-              // upcasting to float32 if needed to improve precision when multiplying by the scale factor
-              maskedMM = castMPSTensor(mpsGraph, maskedMM, MPSDataTypeFloat32);
-              maskedMM = [mpsGraph multiplicationWithPrimaryTensor:maskedMM secondaryTensor:scaleTensor name:nil];
-
-              if (is_causal) {
-                auto causalMask = [mpsGraph constantWithScalar:1.0f
-                                                         shape:getMPSShape({qSize, maxSeqLength})
-                                                      dataType:MPSDataTypeBool];
-                causalMask = [mpsGraph bandPartWithTensor:causalMask numLower:-1 numUpper:0 name:nil];
-                auto minusInf = [mpsGraph constantWithScalar:-1e20 shape:maskedMM.shape dataType:maskedMM.dataType];
-                maskedMM = [mpsGraph selectWithPredicateTensor:causalMask
-                                           truePredicateTensor:maskedMM
-                                          falsePredicateTensor:minusInf
-                                                          name:nil];
-              } else if (attn_mask) {
-                graph->maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *attn_mask);
-                maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM
-                                               secondaryTensor:castMPSTensor(mpsGraph, graph->maskTensor, maskedMM.dataType)
-                                                          name:nil];
-              }
-
-              // Account for case where all values were masked causing division by 0 in softmax (issue:#156707)
-              // Overwrites expected NANs in sm with zeros.
-              auto negInfTensor = [mpsGraph constantWithScalar:-INFINITY shape:maskedMM.shape dataType:maskedMM.dataType];
-              auto elem_neg_inf = [mpsGraph equalWithPrimaryTensor:maskedMM secondaryTensor:negInfTensor name:nil];
-              auto all_neg_infs_along_axis = [mpsGraph reductionAndWithTensor:elem_neg_inf axis:3 name:nil];
-              auto zero_mask = [mpsGraph broadcastTensor:all_neg_infs_along_axis toShape:maskedMM.shape name:nil];
-              auto zeroTensor = [mpsGraph constantWithScalar:0.0 shape:maskedMM.shape dataType:maskedMM.dataType];
-
-              auto sm = [mpsGraph softMaxWithTensor:maskedMM axis:3 name:nil];
-              MPSGraphTensor* correctedSM = [mpsGraph selectWithPredicateTensor:zero_mask
-                                                            truePredicateTensor:zeroTensor
-                                                           falsePredicateTensor:sm
-                                                                           name:nil];
-
-              auto output = [mpsGraph matrixMultiplicationWithPrimaryTensor:correctedSM secondaryTensor:vTensor name:nil];
-              graph->qTensor = qTensor;
-              graph->kTensor = kTensor;
-              graph->vTensor = vTensor;
-              graph->outputTensor = castMPSTensor(mpsGraph, output, qTensor.dataType);
-              graph->attnTensor = castMPSTensor(mpsGraph, sm, qTensor.dataType);
-            });
-    }
+          auto output = [mpsGraph matrixMultiplicationWithPrimaryTensor:correctedSM secondaryTensor:vTensor name:nil];
+          graph->qTensor = qTensor;
+          graph->kTensor = kTensor;
+          graph->vTensor = vTensor;
+          graph->outputTensor = castMPSTensor(mpsGraph, output, qTensor.dataType);
+          graph->attnTensor = castMPSTensor(mpsGraph, sm, qTensor.dataType);
+        });
    auto qPlaceholder = Placeholder(cachedGraph->qTensor, query);
    auto kPlaceholder = Placeholder(cachedGraph->kTensor, key);
    auto vPlaceholder = Placeholder(cachedGraph->vTensor, value);
    auto outputPlaceholder = Placeholder(cachedGraph->outputTensor, out);
-//    auto attnPlaceholder = Placeholder(cachedGraph->attnTensor, attn);
+    auto attnPlaceholder = Placeholder(cachedGraph->attnTensor, attn);
    NSDictionary* feeds = nil;
    if (!attn_mask) {
      feeds = dictionaryFromPlaceholders(qPlaceholder, kPlaceholder, vPlaceholder);
@ -209,8 +145,7 @@ static std::tuple<Tensor, Tensor> sdpa_general_mps(const Tensor& query,
      auto mPlaceholder = Placeholder(cachedGraph->maskTensor, *attn_mask);
      feeds = dictionaryFromPlaceholders(qPlaceholder, kPlaceholder, vPlaceholder, mPlaceholder);
    }
-//    NSDictionary* outs = dictionaryFromPlaceholders(outputPlaceholder, attnPlaceholder);
-    NSDictionary* outs = dictionaryFromPlaceholders(outputPlaceholder);
+    NSDictionary* outs = dictionaryFromPlaceholders(outputPlaceholder, attnPlaceholder);
    runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outs);
  }

--- a/setup.py
+++ b/setup.py
@ -1358,6 +1358,45 @@ class concat_license_files:

 # Need to create the proper LICENSE.txt for the wheel
 class bdist_wheel(setuptools.command.bdist_wheel.bdist_wheel):
+    def _wrap_headers_with_macro(self, bdist_dir: Path) -> None:
+        """Guard packaged headers behind the TORCH_STABLE_ONLY opt-out.
+
+        Every ``.h``, ``.hpp`` and ``.cuh`` file found under ``bdist_dir`` is
+        rewritten in place so that its whole content sits between
+        ``#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)``
+        and the matching ``#endif``.
+
+        Headers whose path (relative to ``bdist_dir``) starts with one of the
+        following prefixes are reported and left untouched:
+        - torch/include/torch/headeronly/
+        - torch/include/torch/csrc/stable/
+        - torch/include/torch/csrc/inductor/aoti_torch/c/ (only shim headers)
+        - torch/include/torch/csrc/inductor/aoti_torch/generated/
+        """
+        guard_open = (
+            "#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)\n"
+        )
+        guard_close = (
+            "\n#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)\n"
+        )
+
+        # A tuple lets a single str.startswith() call test every prefix.
+        untouched_prefixes = (
+            "torch/include/torch/headeronly/",
+            "torch/include/torch/csrc/stable/",
+            "torch/include/torch/csrc/inductor/aoti_torch/c/",
+            "torch/include/torch/csrc/inductor/aoti_torch/generated/",
+        )
+
+        # Gather the complete list, grouped by extension, before any file is
+        # rewritten.
+        candidates: list[Path] = []
+        for suffix in (".h", ".hpp", ".cuh"):
+            candidates.extend(bdist_dir.rglob(f"*{suffix}"))
+
+        for path in candidates:
+            rel_path = path.relative_to(bdist_dir).as_posix()
+
+            if rel_path.startswith(untouched_prefixes):
+                report(f"Skipping header: {rel_path}")
+                continue
+
+            body = path.read_text(encoding="utf-8")
+            path.write_text(guard_open + body + guard_close, encoding="utf-8")
+            report(f"Wrapped header: {rel_path}")
+
    def run(self) -> None:
        with concat_license_files(include_files=True):
            super().run()
@ -1380,6 +1419,14 @@ class bdist_wheel(setuptools.command.bdist_wheel.bdist_wheel):
            # need an __init__.py file otherwise we wouldn't have a package
            (bdist_dir / "torch" / "__init__.py").touch()

+        # Wrap all header files with TORCH_STABLE_ONLY macro
+        assert self.bdist_dir is not None, "bdist_dir should be set during wheel build"
+        bdist_dir = Path(self.bdist_dir)
+        report(
+            "-- Wrapping header files with if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)"
+        )
+        self._wrap_headers_with_macro(bdist_dir)
+

 class clean(Command):
    user_options: ClassVar[list[tuple[str, str | None, str]]] = []
--- a/test/cpp_extensions/libtorch_agnostic_extension/setup.py
+++ b/test/cpp_extensions/libtorch_agnostic_extension/setup.py
@ -33,7 +33,7 @@ class clean(distutils.command.clean.clean):

 def get_extension():
    extra_compile_args = {
-        "cxx": ["-fdiagnostics-color=always"],
+        "cxx": ["-fdiagnostics-color=always", "-DTORCH_STABLE_ONLY"],
    }
    sources = list(CSRC_DIR.glob("**/*.cpp"))

--- a/test/cpp_extensions/torch_stable_test_extension/setup.py
+++ b/test/cpp_extensions/torch_stable_test_extension/setup.py
@ -1,67 +0,0 @@
-import distutils.command.clean
-import shutil
-from pathlib import Path
-
-from setuptools import find_packages, setup
-
-from torch.utils.cpp_extension import BuildExtension, CppExtension
-
-
-ROOT_DIR = Path(__file__).parent
-CSRC_DIR = ROOT_DIR / "torch_stable_test" / "csrc"
-
-
-class clean(distutils.command.clean.clean):
-    def run(self):
-        # Run default behavior first
-        distutils.command.clean.clean.run(self)
-
-        # Remove extension
-        for path in (ROOT_DIR / "torch_stable_test").glob("**/*.so"):
-            path.unlink()
-        # Remove build and dist and egg-info directories
-        dirs = [
-            ROOT_DIR / "build",
-            ROOT_DIR / "dist",
-            ROOT_DIR / "torch_stable_test.egg-info",
-        ]
-        for path in dirs:
-            if path.exists():
-                shutil.rmtree(str(path), ignore_errors=True)
-
-
-def get_extension():
-    extra_compile_args = {
-        "cxx": ["-fdiagnostics-color=always", "-DTORCH_STABLE_ONLY"],
-    }
-
-    sources = list(CSRC_DIR.glob("**/*.cpp"))
-
-    return [
-        CppExtension(
-            "torch_stable_test._C",
-            sources=sorted(str(s) for s in sources),
-            py_limited_api=True,
-            extra_compile_args=extra_compile_args,
-            extra_link_args=[],
-        )
-    ]
-
-
-setup(
-    name="torch_stable_test",
-    version="0.0",
-    author="PyTorch Core Team",
-    description="Test extension to verify TORCH_STABLE_ONLY flag",
-    packages=find_packages(exclude=("test",)),
-    package_data={"torch_stable_test": ["*.dll", "*.dylib", "*.so"]},
-    install_requires=[
-        "torch",
-    ],
-    ext_modules=get_extension(),
-    cmdclass={
-        "build_ext": BuildExtension.with_options(no_python_abi_suffix=True),
-        "clean": clean,
-    },
-    options={"bdist_wheel": {"py_limited_api": "cp39"}},
-)
--- a/test/cpp_extensions/torch_stable_test_extension/torch_stable_test/init.py
+++ b/test/cpp_extensions/torch_stable_test_extension/torch_stable_test/init.py
--- a/test/cpp_extensions/torch_stable_test_extension/torch_stable_test/csrc/test_extension.cpp
+++ b/test/cpp_extensions/torch_stable_test_extension/torch_stable_test/csrc/test_extension.cpp
@ -1 +0,0 @@
-#include <ATen/core/TensorBase.h> // This should trigger the TORCH_STABLE_ONLY error
--- a/test/cpp_extensions/torch_stable_test_extension/torch_stable_test/test_torch_stable.py
+++ b/test/cpp_extensions/torch_stable_test_extension/torch_stable_test/test_torch_stable.py
@ -1,22 +0,0 @@
-# Owner(s): ["module: cpp"]
-
-from pathlib import Path
-
-from torch.testing._internal.common_utils import (
-    install_cpp_extension,
-    IS_WINDOWS,
-    run_tests,
-    TestCase,
-)
-
-
-if not IS_WINDOWS:
-
-    class TestTorchStable(TestCase):
-        def test_setup_fails(self):
-            with self.assertRaisesRegex(RuntimeError, "build failed for cpp extension"):
-                install_cpp_extension(extension_root=Path(__file__).parent.parent)
-
-
-if __name__ == "__main__":
-    run_tests()
--- a/test/export/test_experimental.py
+++ b/test/export/test_experimental.py
@ -456,6 +456,31 @@ def forward(self, x):
        test_inputs = make_inputs()
        self.assertEqual(gm(*test_inputs), foo(*test_inputs))

+    def test_dynamo_graph_capture_with_call_override(self):
+        """Capture a module that overrides ``__call__`` instead of ``forward``.
+
+        The wrapper defines no ``forward`` of its own; its ``__call__``
+        delegates to the wrapped module. The captured graph module must give
+        the same result and expose the same number of buffers and parameters.
+        """
+
+        class MyModel(torch.nn.Module):
+            def forward(self, x):
+                return x + 1
+
+        class _InterestingModule(torch.nn.Module):
+            def __init__(self, module):
+                super().__init__()
+                self._module = module
+
+            def __call__(self, *args, **kwargs):
+                # Bypass nn.Module.__call__ and hand off to the wrapped module.
+                return self._module(*args, **kwargs)
+
+        def fresh_inputs():
+            return (torch.randn(2, 3),)
+
+        wrapper = _InterestingModule(MyModel())
+
+        # Trace on one random sample, then compare on an independent one.
+        captured = dynamo_graph_capture_for_export(wrapper)(*fresh_inputs())
+        check_inputs = fresh_inputs()
+        self.assertEqual(captured(*check_inputs), wrapper(*check_inputs))
+
+        captured_buffers = list(captured.buffers())
+        wrapper_buffers = list(wrapper.buffers())
+        self.assertEqual(len(captured_buffers), len(wrapper_buffers))
+
+        captured_params = list(captured.parameters())
+        wrapper_params = list(wrapper.parameters())
+        self.assertEqual(len(captured_params), len(wrapper_params))
+
    def test_dynamo_graph_capture_custom_pytree_type(self):
        import torch.utils._pytree as pytree

--- a/test/test_sparse.py
+++ b/test/test_sparse.py
@ -630,7 +630,6 @@ class TestSparse(TestSparseBase):
        i[0][0] = 0
        self.assertEqual(torch.empty((3, 0), dtype=dtype, device=device), self.safeToDense(x))

-    @expectedFailureMPS
    @dtypes(torch.double, torch.cdouble)
    @dtypesIfMPS(torch.float32, torch.complex64)
    @unittest.skipIf(TEST_WITH_CROSSREF, "generator unsupported triggers assertion error")
@ -647,7 +646,8 @@ class TestSparse(TestSparseBase):
            def fn(x):
                return x.to_dense(masked_grad=gradcheck.masked)
            x.requires_grad_(True)
-            gradcheck(fn, (x,))
+            kwargs = {"eps": 1e-4} if device == "mps:0" else {}
+            gradcheck(fn, (x,), **kwargs)

        i = self.index_tensor([
            [0, 1, 2, 2],
--- a/torch/_dynamo/convert_frame.py
+++ b/torch/_dynamo/convert_frame.py
@ -1043,6 +1043,11 @@ def get_traced_fn(mod: Any) -> tuple[FunctionType, Optional[object]]:
    import inspect

    if isinstance(mod, torch.nn.Module):
+        resolved_forward = mod.forward
+        if hasattr(resolved_forward, "__self__"):
+            # pyrefly: ignore [missing-attribute]
+            resolved_forward = resolved_forward.__func__
+
        # Mirrored from NNModuleVariable.call_function:
        # https://github.com/pytorch/pytorch/blob/main/torch/_dynamo/variables/nn_module.py#L1035
        if (
@ -1054,7 +1059,12 @@ def get_traced_fn(mod: Any) -> tuple[FunctionType, Optional[object]]:
            and len(mod._backward_hooks) == 0
            and len(torch.nn.modules.module._global_backward_pre_hooks) == 0
            and len(torch.nn.modules.module._global_backward_hooks) == 0
+            and resolved_forward != torch.nn.Module.forward
        ):
+            # We cannot trace __call__ by default because it will break
+            # the legacy dynamo export. If we want to revisit this,
+            # feel free to remove this path and try unittests in
+            # test_strict_export_v2.py
            mod = mod.forward
        elif isinstance(mod, torch.fx.GraphModule):
            mod = mod._call_impl
--- a/torch/csrc/inductor/aoti_torch/c/shim.h
+++ b/torch/csrc/inductor/aoti_torch/c/shim.h
@ -38,9 +38,9 @@

 // The following files are implemented in a header-only way and are guarded by
 // test/cpp/aoti_abi_check
-#include <c10/util/BFloat16.h>
-#include <c10/util/Half.h>
-#include <c10/util/complex.h>
+#include <torch/headeronly/util/BFloat16.h>
+#include <torch/headeronly/util/Half.h>
+#include <torch/headeronly/util/complex.h>

 #ifdef __cplusplus
 extern "C" {
				`@ -1 +0,0 @@`
				`#include <ATen/core/TensorBase.h> // This should trigger the TORCH_STABLE_ONLY error`