Add check for MacOS26 to use different code path in SDPA

2025-11-19 18:14:54 +08:00 · 2025-11-17 15:38:19 -08:00
409 changed files with 3794 additions and 12524 deletions
--- a/.ci/aarch64_linux/README.md
+++ b/.ci/aarch64_linux/README.md
@ -1,19 +0,0 @@
-# Aarch64 (ARM/Graviton) Support Scripts
-Scripts for building aarch64 PyTorch PIP Wheels. These scripts build the following wheels:
-* torch
-* torchvision
-* torchaudio
-* torchtext
-* torchdata
-## Aarch64_ci_build.sh
-This script is designed to support CD operations within a PyPI manylinux aarch64 container, and to be executed inside that container. It prepares the container and then executes __aarch64_wheel_ci_build.py__ to build the wheels. The script "assumes" the PyTorch repo is located at: ```/pytorch``` and will put the wheels into ```/artifacts```.
-### Usage
-```DESIRED_PYTHON=<PythonVersion> aarch64_ci_build.sh```
-
-__NOTE:__ CI build is currently __EXPERIMENTAL__
-
-## Build_aarch64_wheel.py
-This app allows a person to build using AWS EC2 resources and requires AWS-CLI and Boto3 with AWS credentials to support building EC2 instances for the wheel builds. It can be used in a CodeBuild CD pipeline or from a local system.
-
-### Usage
-```build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>```
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -1,53 +0,0 @@
-#!/bin/bash
-set -eux -o pipefail
-
-GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
-
-# Set CUDA architecture lists to match x86 build_cuda.sh
-if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0"
-elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
-elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
-elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
-fi
-
-# Compress the fatbin with -compress-mode=size for CUDA 13
-if [[ "$DESIRED_CUDA" == *"13"* ]]; then
-    export TORCH_NVCC_FLAGS="-compress-mode=size"
-    # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
-    export BUILD_BUNDLE_PTXAS=1
-fi
-
-SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
-source $SCRIPTPATH/aarch64_ci_setup.sh
-
-###############################################################################
-# Run aarch64 builder python
-###############################################################################
-cd /
-# adding safe directory for git as the permissions will be
-# on the mounted pytorch repo
-git config --global --add safe.directory /pytorch
-pip install -r /pytorch/requirements.txt
-pip install auditwheel==6.2.0 wheel
-if [ "$DESIRED_CUDA" = "cpu" ]; then
-    echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
-    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
-else
-    echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
-    export USE_SYSTEM_NCCL=1
-
-    # Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic)
-    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
-        echo "Bundling CUDA libraries with wheel for aarch64."
-    else
-        echo "Using nvidia libs from pypi for aarch64."
-        echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
-        export USE_NVIDIA_PYPI_LIBS=1
-    fi
-
-    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
-fi
--- a/.ci/aarch64_linux/aarch64_ci_setup.sh
+++ b/.ci/aarch64_linux/aarch64_ci_setup.sh
@ -1,21 +0,0 @@
-#!/bin/bash
-set -eux -o pipefail
-
-# This script is used to prepare the Docker container for the aarch64_wheel_ci_build.py Python script
-# By creating symlinks from desired /opt/python to /usr/local/bin/
-
-NUMPY_VERSION=2.0.2
-if [[ "$DESIRED_PYTHON"  == "3.13" || "$DESIRED_PYTHON" == "3.13t" ]]; then
-    NUMPY_VERSION=2.1.2
-fi
-
-SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
-source $SCRIPTPATH/../manywheel/set_desired_python.sh
-
-pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2
-
-for tool in python python3 pip pip3 ninja scons patchelf; do
-    ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
-done
-
-python --version
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -1,333 +0,0 @@
-#!/usr/bin/env python3
-# encoding: UTF-8
-
-import os
-import shutil
-from subprocess import check_call, check_output
-
-
-def list_dir(path: str) -> list[str]:
-    """'
-    Helper for getting paths for Python
-    """
-    return check_output(["ls", "-1", path]).decode().split("\n")
-
-
-def replace_tag(filename) -> None:
-    with open(filename) as f:
-        lines = f.readlines()
-    for i, line in enumerate(lines):
-        if line.startswith("Tag:"):
-            lines[i] = line.replace("-linux_", "-manylinux_2_28_")
-            print(f"Updated tag from {line} to {lines[i]}")
-            break
-
-    with open(filename, "w") as f:
-        f.writelines(lines)
-
-
-def patch_library_rpath(
-    folder: str,
-    lib_name: str,
-    use_nvidia_pypi_libs: bool = False,
-    desired_cuda: str = "",
-) -> None:
-    """Apply patchelf to set RPATH for a library in torch/lib"""
-    lib_path = f"{folder}/tmp/torch/lib/{lib_name}"
-
-    if use_nvidia_pypi_libs:
-        # For PyPI NVIDIA libraries, construct CUDA RPATH
-        cuda_rpaths = [
-            "$ORIGIN/../../nvidia/cudnn/lib",
-            "$ORIGIN/../../nvidia/nvshmem/lib",
-            "$ORIGIN/../../nvidia/nccl/lib",
-            "$ORIGIN/../../nvidia/cusparselt/lib",
-        ]
-
-        if "130" in desired_cuda:
-            cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib")
-        else:
-            cuda_rpaths.extend(
-                [
-                    "$ORIGIN/../../nvidia/cublas/lib",
-                    "$ORIGIN/../../nvidia/cuda_cupti/lib",
-                    "$ORIGIN/../../nvidia/cuda_nvrtc/lib",
-                    "$ORIGIN/../../nvidia/cuda_runtime/lib",
-                    "$ORIGIN/../../nvidia/cufft/lib",
-                    "$ORIGIN/../../nvidia/curand/lib",
-                    "$ORIGIN/../../nvidia/cusolver/lib",
-                    "$ORIGIN/../../nvidia/cusparse/lib",
-                    "$ORIGIN/../../nvidia/nvtx/lib",
-                    "$ORIGIN/../../nvidia/cufile/lib",
-                ]
-            )
-
-        # Add $ORIGIN for local torch libs
-        rpath = ":".join(cuda_rpaths) + ":$ORIGIN"
-    else:
-        # For bundled libraries, just use $ORIGIN
-        rpath = "$ORIGIN"
-
-    if os.path.exists(lib_path):
-        os.system(
-            f"cd {folder}/tmp/torch/lib/; "
-            f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}"
-        )
-
-
-def copy_and_patch_library(
-    src_path: str,
-    folder: str,
-    use_nvidia_pypi_libs: bool = False,
-    desired_cuda: str = "",
-) -> None:
-    """Copy a library to torch/lib and patch its RPATH"""
-    if os.path.exists(src_path):
-        lib_name = os.path.basename(src_path)
-        shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}")
-        patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
-
-
-def package_cuda_wheel(wheel_path, desired_cuda) -> None:
-    """
-    Package the cuda wheel libraries
-    """
-    folder = os.path.dirname(wheel_path)
-    os.mkdir(f"{folder}/tmp")
-    os.system(f"unzip {wheel_path} -d {folder}/tmp")
-    # Delete original wheel since it will be repackaged
-    os.system(f"rm {wheel_path}")
-
-    # Check if we should use PyPI NVIDIA libraries or bundle system libraries
-    use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
-
-    if use_nvidia_pypi_libs:
-        print("Using nvidia libs from pypi - skipping CUDA library bundling")
-        # For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages
-        # We only need to bundle non-NVIDIA libraries
-        minimal_libs_to_copy = [
-            "/lib64/libgomp.so.1",
-            "/usr/lib64/libgfortran.so.5",
-            "/acl/build/libarm_compute.so",
-            "/acl/build/libarm_compute_graph.so",
-            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_lapack_core.so.0",
-            "/usr/local/lib/libnvpl_blas_core.so.0",
-        ]
-
-        # Copy minimal libraries to unzipped_folder/torch/lib
-        for lib_path in minimal_libs_to_copy:
-            copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
-
-        # Patch torch libraries used for searching libraries
-        torch_libs_to_patch = [
-            "libtorch.so",
-            "libtorch_cpu.so",
-            "libtorch_cuda.so",
-            "libtorch_cuda_linalg.so",
-            "libtorch_global_deps.so",
-            "libtorch_python.so",
-            "libtorch_nvshmem.so",
-            "libc10.so",
-            "libc10_cuda.so",
-            "libcaffe2_nvrtc.so",
-            "libshm.so",
-        ]
-        for lib_name in torch_libs_to_patch:
-            patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
-    else:
-        print("Bundling CUDA libraries with wheel")
-        # Original logic for bundling system CUDA libraries
-        # Common libraries for all CUDA versions
-        common_libs = [
-            # Non-NVIDIA system libraries
-            "/lib64/libgomp.so.1",
-            "/usr/lib64/libgfortran.so.5",
-            "/acl/build/libarm_compute.so",
-            "/acl/build/libarm_compute_graph.so",
-            # Common CUDA libraries (same for all versions)
-            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_lapack_core.so.0",
-            "/usr/local/lib/libnvpl_blas_core.so.0",
-            "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
-            "/usr/local/cuda/lib64/libcudnn.so.9",
-            "/usr/local/cuda/lib64/libcusparseLt.so.0",
-            "/usr/local/cuda/lib64/libcurand.so.10",
-            "/usr/local/cuda/lib64/libnccl.so.2",
-            "/usr/local/cuda/lib64/libnvshmem_host.so.3",
-            "/usr/local/cuda/lib64/libcudnn_adv.so.9",
-            "/usr/local/cuda/lib64/libcudnn_cnn.so.9",
-            "/usr/local/cuda/lib64/libcudnn_graph.so.9",
-            "/usr/local/cuda/lib64/libcudnn_ops.so.9",
-            "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
-            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
-            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
-            "/usr/local/cuda/lib64/libcufile.so.0",
-            "/usr/local/cuda/lib64/libcufile_rdma.so.1",
-            "/usr/local/cuda/lib64/libcusparse.so.12",
-        ]
-
-        # CUDA version-specific libraries
-        if "13" in desired_cuda:
-            minor_version = desired_cuda[-1]
-            version_specific_libs = [
-                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
-                "/usr/local/cuda/lib64/libcublas.so.13",
-                "/usr/local/cuda/lib64/libcublasLt.so.13",
-                "/usr/local/cuda/lib64/libcudart.so.13",
-                "/usr/local/cuda/lib64/libcufft.so.12",
-                "/usr/local/cuda/lib64/libcusolver.so.12",
-                "/usr/local/cuda/lib64/libnvJitLink.so.13",
-                "/usr/local/cuda/lib64/libnvrtc.so.13",
-                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
-            ]
-        elif "12" in desired_cuda:
-            # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
-            minor_version = desired_cuda[-1]
-            version_specific_libs = [
-                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
-                "/usr/local/cuda/lib64/libcublas.so.12",
-                "/usr/local/cuda/lib64/libcublasLt.so.12",
-                "/usr/local/cuda/lib64/libcudart.so.12",
-                "/usr/local/cuda/lib64/libcufft.so.11",
-                "/usr/local/cuda/lib64/libcusolver.so.11",
-                "/usr/local/cuda/lib64/libnvJitLink.so.12",
-                "/usr/local/cuda/lib64/libnvrtc.so.12",
-                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
-            ]
-        else:
-            raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")
-
-        # Combine all libraries
-        libs_to_copy = common_libs + version_specific_libs
-
-        # Copy libraries to unzipped_folder/torch/lib
-        for lib_path in libs_to_copy:
-            copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
-
-    # Make sure the wheel is tagged with manylinux_2_28
-    for f in os.scandir(f"{folder}/tmp/"):
-        if f.is_dir() and f.name.endswith(".dist-info"):
-            replace_tag(f"{f.path}/WHEEL")
-            break
-
-    os.system(f"wheel pack {folder}/tmp/ -d {folder}")
-    os.system(f"rm -rf {folder}/tmp/")
-
-
-def complete_wheel(folder: str) -> str:
-    """
-    Complete wheel build and put in artifact location
-    """
-    wheel_name = list_dir(f"/{folder}/dist")[0]
-
-    # Please note for cuda we don't run auditwheel since we use custom script to package
-    # the cuda dependencies to the wheel file using update_wheel() method.
-    # However we need to make sure filename reflects the correct Manylinux platform.
-    if "pytorch" in folder and not enable_cuda:
-        print("Repairing Wheel with AuditWheel")
-        check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
-        repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0]
-
-        print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
-        os.rename(
-            f"/{folder}/wheelhouse/{repaired_wheel_name}",
-            f"/{folder}/dist/{repaired_wheel_name}",
-        )
-    else:
-        repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
-
-    print(f"Copying {repaired_wheel_name} to artifacts")
-    shutil.copy2(
-        f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}"
-    )
-
-    return repaired_wheel_name
-
-
-def parse_arguments():
-    """
-    Parse inline arguments
-    """
-    from argparse import ArgumentParser
-
-    parser = ArgumentParser("AARCH64 wheels python CD")
-    parser.add_argument("--debug", action="store_true")
-    parser.add_argument("--build-only", action="store_true")
-    parser.add_argument("--test-only", type=str)
-    parser.add_argument("--enable-mkldnn", action="store_true")
-    parser.add_argument("--enable-cuda", action="store_true")
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    """
-    Entry Point
-    """
-    args = parse_arguments()
-    enable_mkldnn = args.enable_mkldnn
-    enable_cuda = args.enable_cuda
-    branch = check_output(
-        ["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd="/pytorch"
-    ).decode()
-
-    print("Building PyTorch wheel")
-    build_vars = ""
-    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
-    if enable_cuda:
-        build_vars += "MAX_JOBS=5 "
-
-        # Handle PyPI NVIDIA libraries vs bundled libraries
-        use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
-        if use_nvidia_pypi_libs:
-            print("Configuring build for PyPI NVIDIA libraries")
-            # Configure for dynamic linking (matching x86 logic)
-            build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 "
-        else:
-            print("Configuring build for bundled NVIDIA libraries")
-            # Keep existing static linking approach - already configured above
-
-    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
-    desired_cuda = os.getenv("DESIRED_CUDA")
-    if override_package_version is not None:
-        version = override_package_version
-        build_vars += (
-            f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
-        )
-    elif branch in ["nightly", "main"]:
-        build_date = (
-            check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
-            .decode()
-            .replace("-", "")
-        )
-        version = (
-            check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
-        )
-        if enable_cuda:
-            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
-        else:
-            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
-    elif branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
-
-    if enable_mkldnn:
-        print("build pytorch with mkldnn+acl backend")
-        build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
-        build_vars += "ACL_ROOT_DIR=/acl "
-        if enable_cuda:
-            build_vars += "BLAS=NVPL "
-        else:
-            build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
-    else:
-        print("build pytorch without mkldnn backend")
-
-    os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
-    if enable_cuda:
-        print("Updating Cuda Dependency")
-        filename = os.listdir("/pytorch/dist/")
-        wheel_path = f"/pytorch/dist/{filename[0]}"
-        package_cuda_wheel(wheel_path, desired_cuda)
-    pytorch_wheel_name = complete_wheel("/pytorch/")
-    print(f"Build Complete. Created {pytorch_wheel_name}..")
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
@ -1,999 +0,0 @@
-#!/usr/bin/env python3
-
-# This script is for building  AARCH64 wheels using AWS EC2 instances.
-# To generate binaries for the release follow these steps:
-# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this:
-#         "v1.11.0": ("0.11.0", "rc1"),
-# 2. Run script with following arguments for each of the supported python versions and required tag, for example:
-# build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch v1.11.0-rc3
-
-
-import os
-import subprocess
-import sys
-import time
-from typing import Optional, Union
-
-import boto3
-
-
-# AMI images for us-east-1, change the following based on your ~/.aws/config
-os_amis = {
-    "ubuntu20_04": "ami-052eac90edaa9d08f",  # login_name: ubuntu
-    "ubuntu22_04": "ami-0c6c29c5125214c77",  # login_name: ubuntu
-    "redhat8": "ami-0698b90665a2ddcf1",  # login_name: ec2-user
-}
-
-ubuntu20_04_ami = os_amis["ubuntu20_04"]
-
-
-def compute_keyfile_path(key_name: Optional[str] = None) -> tuple[str, str]:
-    if key_name is None:
-        key_name = os.getenv("AWS_KEY_NAME")
-        if key_name is None:
-            return os.getenv("SSH_KEY_PATH", ""), ""
-
-    homedir_path = os.path.expanduser("~")
-    default_path = os.path.join(homedir_path, ".ssh", f"{key_name}.pem")
-    return os.getenv("SSH_KEY_PATH", default_path), key_name
-
-
-ec2 = boto3.resource("ec2")
-
-
-def ec2_get_instances(filter_name, filter_value):
-    return ec2.instances.filter(
-        Filters=[{"Name": filter_name, "Values": [filter_value]}]
-    )
-
-
-def ec2_instances_of_type(instance_type="t4g.2xlarge"):
-    return ec2_get_instances("instance-type", instance_type)
-
-
-def ec2_instances_by_id(instance_id):
-    rc = list(ec2_get_instances("instance-id", instance_id))
-    return rc[0] if len(rc) > 0 else None
-
-
-def start_instance(
-    key_name, ami=ubuntu20_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50
-):
-    inst = ec2.create_instances(
-        ImageId=ami,
-        InstanceType=instance_type,
-        SecurityGroups=["ssh-allworld"],
-        KeyName=key_name,
-        MinCount=1,
-        MaxCount=1,
-        BlockDeviceMappings=[
-            {
-                "DeviceName": "/dev/sda1",
-                "Ebs": {
-                    "DeleteOnTermination": True,
-                    "VolumeSize": ebs_size,
-                    "VolumeType": "standard",
-                },
-            }
-        ],
-    )[0]
-    print(f"Create instance {inst.id}")
-    inst.wait_until_running()
-    running_inst = ec2_instances_by_id(inst.id)
-    print(f"Instance started at {running_inst.public_dns_name}")
-    return running_inst
-
-
-class RemoteHost:
-    addr: str
-    keyfile_path: str
-    login_name: str
-    container_id: Optional[str] = None
-    ami: Optional[str] = None
-
-    def __init__(self, addr: str, keyfile_path: str, login_name: str = "ubuntu"):
-        self.addr = addr
-        self.keyfile_path = keyfile_path
-        self.login_name = login_name
-
-    def _gen_ssh_prefix(self) -> list[str]:
-        return [
-            "ssh",
-            "-o",
-            "StrictHostKeyChecking=no",
-            "-i",
-            self.keyfile_path,
-            f"{self.login_name}@{self.addr}",
-            "--",
-        ]
-
-    @staticmethod
-    def _split_cmd(args: Union[str, list[str]]) -> list[str]:
-        return args.split() if isinstance(args, str) else args
-
-    def run_ssh_cmd(self, args: Union[str, list[str]]) -> None:
-        subprocess.check_call(self._gen_ssh_prefix() + self._split_cmd(args))
-
-    def check_ssh_output(self, args: Union[str, list[str]]) -> str:
-        return subprocess.check_output(
-            self._gen_ssh_prefix() + self._split_cmd(args)
-        ).decode("utf-8")
-
-    def scp_upload_file(self, local_file: str, remote_file: str) -> None:
-        subprocess.check_call(
-            [
-                "scp",
-                "-i",
-                self.keyfile_path,
-                local_file,
-                f"{self.login_name}@{self.addr}:{remote_file}",
-            ]
-        )
-
-    def scp_download_file(
-        self, remote_file: str, local_file: Optional[str] = None
-    ) -> None:
-        if local_file is None:
-            local_file = "."
-        subprocess.check_call(
-            [
-                "scp",
-                "-i",
-                self.keyfile_path,
-                f"{self.login_name}@{self.addr}:{remote_file}",
-                local_file,
-            ]
-        )
-
-    def start_docker(self, image="quay.io/pypa/manylinux2014_aarch64:latest") -> None:
-        self.run_ssh_cmd("sudo apt-get install -y docker.io")
-        self.run_ssh_cmd(f"sudo usermod -a -G docker {self.login_name}")
-        self.run_ssh_cmd("sudo service docker start")
-        self.run_ssh_cmd(f"docker pull {image}")
-        self.container_id = self.check_ssh_output(
-            f"docker run -t -d -w /root {image}"
-        ).strip()
-
-    def using_docker(self) -> bool:
-        return self.container_id is not None
-
-    def run_cmd(self, args: Union[str, list[str]]) -> None:
-        if not self.using_docker():
-            return self.run_ssh_cmd(args)
-        assert self.container_id is not None
-        docker_cmd = self._gen_ssh_prefix() + [
-            "docker",
-            "exec",
-            "-i",
-            self.container_id,
-            "bash",
-        ]
-        p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE)
-        p.communicate(
-            input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
-                "utf-8"
-            )
-        )
-        rc = p.wait()
-        if rc != 0:
-            raise subprocess.CalledProcessError(rc, docker_cmd)
-
-    def check_output(self, args: Union[str, list[str]]) -> str:
-        if not self.using_docker():
-            return self.check_ssh_output(args)
-        assert self.container_id is not None
-        docker_cmd = self._gen_ssh_prefix() + [
-            "docker",
-            "exec",
-            "-i",
-            self.container_id,
-            "bash",
-        ]
-        p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-        (out, err) = p.communicate(
-            input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
-                "utf-8"
-            )
-        )
-        rc = p.wait()
-        if rc != 0:
-            raise subprocess.CalledProcessError(rc, docker_cmd, output=out, stderr=err)
-        return out.decode("utf-8")
-
-    def upload_file(self, local_file: str, remote_file: str) -> None:
-        if not self.using_docker():
-            return self.scp_upload_file(local_file, remote_file)
-        tmp_file = os.path.join("/tmp", os.path.basename(local_file))
-        self.scp_upload_file(local_file, tmp_file)
-        self.run_ssh_cmd(
-            ["docker", "cp", tmp_file, f"{self.container_id}:/root/{remote_file}"]
-        )
-        self.run_ssh_cmd(["rm", tmp_file])
-
-    def download_file(self, remote_file: str, local_file: Optional[str] = None) -> None:
-        if not self.using_docker():
-            return self.scp_download_file(remote_file, local_file)
-        tmp_file = os.path.join("/tmp", os.path.basename(remote_file))
-        self.run_ssh_cmd(
-            ["docker", "cp", f"{self.container_id}:/root/{remote_file}", tmp_file]
-        )
-        self.scp_download_file(tmp_file, local_file)
-        self.run_ssh_cmd(["rm", tmp_file])
-
-    def download_wheel(
-        self, remote_file: str, local_file: Optional[str] = None
-    ) -> None:
-        if self.using_docker() and local_file is None:
-            basename = os.path.basename(remote_file)
-            local_file = basename.replace(
-                "-linux_aarch64.whl", "-manylinux2014_aarch64.whl"
-            )
-        self.download_file(remote_file, local_file)
-
-    def list_dir(self, path: str) -> list[str]:
-        return self.check_output(["ls", "-1", path]).split("\n")
-
-
-def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
-    import socket
-
-    for i in range(attempt_cnt):
-        try:
-            with socket.create_connection((addr, port), timeout=timeout):
-                return
-        except (ConnectionRefusedError, TimeoutError):  # noqa: PERF203
-            if i == attempt_cnt - 1:
-                raise
-            time.sleep(timeout)
-
-
-def update_apt_repo(host: RemoteHost) -> None:
-    time.sleep(5)
-    host.run_cmd("sudo systemctl stop apt-daily.service || true")
-    host.run_cmd("sudo systemctl stop unattended-upgrades.service || true")
-    host.run_cmd(
-        "while systemctl is-active --quiet apt-daily.service; do sleep 1; done"
-    )
-    host.run_cmd(
-        "while systemctl is-active --quiet unattended-upgrades.service; do sleep 1; done"
-    )
-    host.run_cmd("sudo apt-get update")
-    time.sleep(3)
-    host.run_cmd("sudo apt-get update")
-
-
-def install_condaforge(
-    host: RemoteHost, suffix: str = "latest/download/Miniforge3-Linux-aarch64.sh"
-) -> None:
-    print("Install conda-forge")
-    host.run_cmd(f"curl -OL https://github.com/conda-forge/miniforge/releases/{suffix}")
-    host.run_cmd(f"sh -f {os.path.basename(suffix)} -b")
-    host.run_cmd(f"rm -f {os.path.basename(suffix)}")
-    if host.using_docker():
-        host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc")
-    else:
-        host.run_cmd(
-            [
-                "sed",
-                "-i",
-                "'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH'",
-                ".bashrc",
-            ]
-        )
-
-
-def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
-    if python_version == "3.6":
-        # Python-3.6 EOLed and not compatible with conda-4.11
-        install_condaforge(
-            host, suffix="download/4.10.3-10/Miniforge3-4.10.3-10-Linux-aarch64.sh"
-        )
-        host.run_cmd(f"conda install -y python={python_version} numpy pyyaml")
-    else:
-        install_condaforge(
-            host, suffix="download/4.11.0-4/Miniforge3-4.11.0-4-Linux-aarch64.sh"
-        )
-        # Pytorch-1.10 or older are not compatible with setuptools=59.6 or newer
-        host.run_cmd(
-            f"conda install -y python={python_version} numpy pyyaml setuptools>=59.5.0"
-        )
-
-
-def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
-    host.run_cmd("pip3 install auditwheel")
-    host.run_cmd(
-        "conda install -y patchelf" if use_conda else "sudo apt-get install -y patchelf"
-    )
-    from tempfile import NamedTemporaryFile
-
-    with NamedTemporaryFile() as tmp:
-        tmp.write(embed_library_script.encode("utf-8"))
-        tmp.flush()
-        host.upload_file(tmp.name, "embed_library.py")
-
-    print("Embedding libgomp into wheel")
-    if host.using_docker():
-        host.run_cmd(f"python3 embed_library.py {wheel_name} --update-tag")
-    else:
-        host.run_cmd(f"python3 embed_library.py {wheel_name}")
-
-
-def checkout_repo(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    url: str,
-    git_clone_flags: str,
-    mapping: dict[str, tuple[str, str]],
-) -> Optional[str]:
-    for prefix in mapping:
-        if not branch.startswith(prefix):
-            continue
-        tag = f"v{mapping[prefix][0]}-{mapping[prefix][1]}"
-        host.run_cmd(f"git clone {url} -b {tag} {git_clone_flags}")
-        return mapping[prefix][0]
-
-    host.run_cmd(f"git clone {url} -b {branch} {git_clone_flags}")
-    return None
-
-
-def build_torchvision(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str,
-    run_smoke_tests: bool = True,
-) -> str:
-    print("Checking out TorchVision repo")
-    build_version = checkout_repo(
-        host,
-        branch=branch,
-        url="https://github.com/pytorch/vision",
-        git_clone_flags=git_clone_flags,
-        mapping={
-            "v1.7.1": ("0.8.2", "rc2"),
-            "v1.8.0": ("0.9.0", "rc3"),
-            "v1.8.1": ("0.9.1", "rc1"),
-            "v1.9.0": ("0.10.0", "rc1"),
-            "v1.10.0": ("0.11.1", "rc1"),
-            "v1.10.1": ("0.11.2", "rc1"),
-            "v1.10.2": ("0.11.3", "rc1"),
-            "v1.11.0": ("0.12.0", "rc1"),
-            "v1.12.0": ("0.13.0", "rc4"),
-            "v1.12.1": ("0.13.1", "rc6"),
-            "v1.13.0": ("0.14.0", "rc4"),
-            "v1.13.1": ("0.14.1", "rc2"),
-            "v2.0.0": ("0.15.1", "rc2"),
-            "v2.0.1": ("0.15.2", "rc2"),
-        },
-    )
-    print("Building TorchVision wheel")
-
-    # Please note libnpg and jpeg are required to build image.so extension
-    if use_conda:
-        host.run_cmd("conda install -y libpng jpeg")
-        # Remove .so files to force static linking
-        host.run_cmd(
-            "rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so"
-        )
-        # And patch setup.py to include libz dependency for libpng
-        host.run_cmd(
-            [
-                'sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py'
-            ]
-        )
-
-    build_vars = ""
-    if branch == "nightly":
-        version = host.check_output(
-            ["if [ -f vision/version.txt ]; then cat vision/version.txt; fi"]
-        ).strip()
-        if len(version) == 0:
-            # In older revisions, version was embedded in setup.py
-            version = (
-                host.check_output(["grep", '"version = \'"', "vision/setup.py"])
-                .strip()
-                .split("'")[1][:-2]
-            )
-        build_date = (
-            host.check_output("cd vision && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
-    elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-
-    host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
-    vision_wheel_name = host.list_dir("vision/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))
-
-    print("Copying TorchVision wheel")
-    host.download_wheel(os.path.join("vision", "dist", vision_wheel_name))
-    if run_smoke_tests:
-        host.run_cmd(
-            f"pip3 install {os.path.join('vision', 'dist', vision_wheel_name)}"
-        )
-        host.run_cmd("python3 vision/test/smoke_test.py")
-    print("Delete vision checkout")
-    host.run_cmd("rm -rf vision")
-
-    return vision_wheel_name
-
-
-def build_torchdata(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str = "",
-) -> str:
-    print("Checking out TorchData repo")
-    git_clone_flags += " --recurse-submodules"
-    build_version = checkout_repo(
-        host,
-        branch=branch,
-        url="https://github.com/pytorch/data",
-        git_clone_flags=git_clone_flags,
-        mapping={
-            "v1.13.1": ("0.5.1", ""),
-            "v2.0.0": ("0.6.0", "rc5"),
-            "v2.0.1": ("0.6.1", "rc1"),
-        },
-    )
-    print("Building TorchData wheel")
-    build_vars = ""
-    if branch == "nightly":
-        version = host.check_output(
-            ["if [ -f data/version.txt ]; then cat data/version.txt; fi"]
-        ).strip()
-        build_date = (
-            host.check_output("cd data && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
-    elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-
-    host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")
-    wheel_name = host.list_dir("data/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))
-
-    print("Copying TorchData wheel")
-    host.download_wheel(os.path.join("data", "dist", wheel_name))
-
-    return wheel_name
-
-
-def build_torchtext(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str = "",
-) -> str:
-    print("Checking out TorchText repo")
-    git_clone_flags += " --recurse-submodules"
-    build_version = checkout_repo(
-        host,
-        branch=branch,
-        url="https://github.com/pytorch/text",
-        git_clone_flags=git_clone_flags,
-        mapping={
-            "v1.9.0": ("0.10.0", "rc1"),
-            "v1.10.0": ("0.11.0", "rc2"),
-            "v1.10.1": ("0.11.1", "rc1"),
-            "v1.10.2": ("0.11.2", "rc1"),
-            "v1.11.0": ("0.12.0", "rc1"),
-            "v1.12.0": ("0.13.0", "rc2"),
-            "v1.12.1": ("0.13.1", "rc5"),
-            "v1.13.0": ("0.14.0", "rc3"),
-            "v1.13.1": ("0.14.1", "rc1"),
-            "v2.0.0": ("0.15.1", "rc2"),
-            "v2.0.1": ("0.15.2", "rc2"),
-        },
-    )
-    print("Building TorchText wheel")
-    build_vars = ""
-    if branch == "nightly":
-        version = host.check_output(
-            ["if [ -f text/version.txt ]; then cat text/version.txt; fi"]
-        ).strip()
-        build_date = (
-            host.check_output("cd text && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
-    elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-
-    host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")
-    wheel_name = host.list_dir("text/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))
-
-    print("Copying TorchText wheel")
-    host.download_wheel(os.path.join("text", "dist", wheel_name))
-
-    return wheel_name
-
-
-def build_torchaudio(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str = "",
-) -> str:
-    print("Checking out TorchAudio repo")
-    git_clone_flags += " --recurse-submodules"
-    build_version = checkout_repo(
-        host,
-        branch=branch,
-        url="https://github.com/pytorch/audio",
-        git_clone_flags=git_clone_flags,
-        mapping={
-            "v1.9.0": ("0.9.0", "rc2"),
-            "v1.10.0": ("0.10.0", "rc5"),
-            "v1.10.1": ("0.10.1", "rc1"),
-            "v1.10.2": ("0.10.2", "rc1"),
-            "v1.11.0": ("0.11.0", "rc1"),
-            "v1.12.0": ("0.12.0", "rc3"),
-            "v1.12.1": ("0.12.1", "rc5"),
-            "v1.13.0": ("0.13.0", "rc4"),
-            "v1.13.1": ("0.13.1", "rc2"),
-            "v2.0.0": ("2.0.1", "rc3"),
-            "v2.0.1": ("2.0.2", "rc2"),
-        },
-    )
-    print("Building TorchAudio wheel")
-    build_vars = ""
-    if branch == "nightly":
-        version = (
-            host.check_output(["grep", '"version = \'"', "audio/setup.py"])
-            .strip()
-            .split("'")[1][:-2]
-        )
-        build_date = (
-            host.check_output("cd audio && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
-    elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-
-    host.run_cmd(
-        f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
-        && ./packaging/ffmpeg/build.sh \
-        && {build_vars} python3 -m build --wheel --no-isolation"
-    )
-
-    wheel_name = host.list_dir("audio/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("audio", "dist", wheel_name))
-
-    print("Copying TorchAudio wheel")
-    host.download_wheel(os.path.join("audio", "dist", wheel_name))
-
-    return wheel_name
-
-
-def configure_system(
-    host: RemoteHost,
-    *,
-    compiler: str = "gcc-8",
-    use_conda: bool = True,
-    python_version: str = "3.8",
-) -> None:
-    if use_conda:
-        install_condaforge_python(host, python_version)
-
-    print("Configuring the system")
-    if not host.using_docker():
-        update_apt_repo(host)
-        host.run_cmd("sudo apt-get install -y ninja-build g++ git cmake gfortran unzip")
-    else:
-        host.run_cmd("yum install -y sudo")
-        host.run_cmd("conda install -y ninja scons")
-
-    if not use_conda:
-        host.run_cmd(
-            "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
-        )
-    host.run_cmd("pip3 install dataclasses typing-extensions")
-    if not use_conda:
-        print("Installing Cython + numpy from PyPy")
-        host.run_cmd("sudo pip3 install Cython")
-        host.run_cmd("sudo pip3 install numpy")
-
-
-def build_domains(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str = "",
-) -> tuple[str, str, str, str]:
-    vision_wheel_name = build_torchvision(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-    audio_wheel_name = build_torchaudio(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-    data_wheel_name = build_torchdata(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-    text_wheel_name = build_torchtext(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-    return (vision_wheel_name, audio_wheel_name, data_wheel_name, text_wheel_name)
-
-
-def start_build(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    compiler: str = "gcc-8",
-    use_conda: bool = True,
-    python_version: str = "3.8",
-    pytorch_only: bool = False,
-    pytorch_build_number: Optional[str] = None,
-    shallow_clone: bool = True,
-    enable_mkldnn: bool = False,
-) -> tuple[str, str, str, str, str]:
-    git_clone_flags = " --depth 1 --shallow-submodules" if shallow_clone else ""
-    if host.using_docker() and not use_conda:
-        print("Auto-selecting conda option for docker images")
-        use_conda = True
-    if not host.using_docker():
-        print("Disable mkldnn for host builds")
-        enable_mkldnn = False
-
-    configure_system(
-        host, compiler=compiler, use_conda=use_conda, python_version=python_version
-    )
-
-    if host.using_docker():
-        print("Move libgfortant.a into a standard location")
-        # HACK: pypa gforntran.a is compiled without PIC, which leads to the following error
-        # libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17'  # noqa: E501, B950
-        # Workaround by copying gfortran library from the host
-        host.run_ssh_cmd("sudo apt-get install -y gfortran-8")
-        host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8")
-        host.run_ssh_cmd(
-            [
-                "docker",
-                "cp",
-                "/usr/lib/gcc/aarch64-linux-gnu/8/libgfortran.a",
-                f"{host.container_id}:/opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/",
-            ]
-        )
-
-    print("Checking out PyTorch repo")
-    host.run_cmd(
-        f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
-    )
-
-    host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")
-
-    print("Building PyTorch wheel")
-    build_opts = ""
-    if pytorch_build_number is not None:
-        build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
-    # Breakpad build fails on aarch64
-    build_vars = "USE_BREAKPAD=0 "
-    if branch == "nightly":
-        build_date = (
-            host.check_output("cd pytorch && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        version = host.check_output("cat pytorch/version.txt").strip()[:-2]
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
-    if branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-    if enable_mkldnn:
-        host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
-        print("build pytorch with mkldnn+acl backend")
-        build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
-        build_vars += " BLAS=OpenBLAS"
-        build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
-        build_vars += " ACL_ROOT_DIR=/acl"
-        host.run_cmd(
-            f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
-        )
-        print("Repair the wheel")
-        pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
-        ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
-        host.run_cmd(
-            f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
-        )
-        print("replace the original wheel with the repaired one")
-        pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0]
-        host.run_cmd(
-            f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}"
-        )
-    else:
-        print("build pytorch without mkldnn backend")
-        host.run_cmd(
-            f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
-        )
-
-    print("Deleting build folder")
-    host.run_cmd("cd pytorch && rm -rf build")
-    pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("pytorch", "dist", pytorch_wheel_name))
-    print("Copying the wheel")
-    host.download_wheel(os.path.join("pytorch", "dist", pytorch_wheel_name))
-
-    print("Installing PyTorch wheel")
-    host.run_cmd(f"pip3 install pytorch/dist/{pytorch_wheel_name}")
-
-    if pytorch_only:
-        return (pytorch_wheel_name, None, None, None, None)
-    domain_wheels = build_domains(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-
-    return (pytorch_wheel_name, *domain_wheels)
-
-
-embed_library_script = """
-#!/usr/bin/env python3
-
-from auditwheel.patcher import Patchelf
-from auditwheel.wheeltools import InWheelCtx
-from auditwheel.elfutils import elf_file_filter
-from auditwheel.repair import copylib
-from auditwheel.lddtree import lddtree
-from subprocess import check_call
-import os
-import shutil
-import sys
-from tempfile import TemporaryDirectory
-
-
-def replace_tag(filename):
-   with open(filename, 'r') as f:
-     lines = f.read().split("\\n")
-   for i,line in enumerate(lines):
-       if not line.startswith("Tag: "):
-           continue
-       lines[i] = line.replace("-linux_", "-manylinux2014_")
-       print(f'Updated tag from {line} to {lines[i]}')
-
-   with open(filename, 'w') as f:
-       f.write("\\n".join(lines))
-
-
-class AlignedPatchelf(Patchelf):
-    def set_soname(self, file_name: str, new_soname: str) -> None:
-        check_call(['patchelf', '--page-size', '65536', '--set-soname', new_soname, file_name])
-
-    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
-        check_call(['patchelf', '--page-size', '65536', '--replace-needed', soname, new_soname, file_name])
-
-
-def embed_library(whl_path, lib_soname, update_tag=False):
-    patcher = AlignedPatchelf()
-    out_dir = TemporaryDirectory()
-    whl_name = os.path.basename(whl_path)
-    tmp_whl_name = os.path.join(out_dir.name, whl_name)
-    with InWheelCtx(whl_path) as ctx:
-        torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib')
-        ctx.out_wheel=tmp_whl_name
-        new_lib_path, new_lib_soname = None, None
-        for filename, elf in elf_file_filter(ctx.iter_files()):
-            if not filename.startswith('torch/lib'):
-                continue
-            libtree = lddtree(filename)
-            if lib_soname not in libtree['needed']:
-                continue
-            lib_path = libtree['libs'][lib_soname]['path']
-            if lib_path is None:
-                print(f"Can't embed {lib_soname} as it could not be found")
-                break
-            if lib_path.startswith(torchlib_path):
-                continue
-
-            if new_lib_path is None:
-                new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
-            patcher.replace_needed(filename, lib_soname, new_lib_soname)
-            print(f'Replacing {lib_soname} with {new_lib_soname} for {filename}')
-        if update_tag:
-            # Add manylinux2014 tag
-            for filename in ctx.iter_files():
-                if os.path.basename(filename) != 'WHEEL':
-                    continue
-                replace_tag(filename)
-    shutil.move(tmp_whl_name, whl_path)
-
-
-if __name__ == '__main__':
-    embed_library(sys.argv[1], 'libgomp.so.1', len(sys.argv) > 2 and sys.argv[2] == '--update-tag')
-"""
-
-
-def run_tests(host: RemoteHost, whl: str, branch="main") -> None:
-    print("Configuring the system")
-    update_apt_repo(host)
-    host.run_cmd("sudo apt-get install -y python3-pip git")
-    host.run_cmd("sudo pip3 install Cython")
-    host.run_cmd("sudo pip3 install numpy")
-    host.upload_file(whl, ".")
-    host.run_cmd(f"sudo pip3 install {whl}")
-    host.run_cmd("python3 -c 'import torch;print(torch.rand((3,3))'")
-    host.run_cmd(f"git clone -b {branch} https://github.com/pytorch/pytorch")
-    host.run_cmd("cd pytorch/test; python3 test_torch.py -v")
-
-
-def get_instance_name(instance) -> Optional[str]:
-    if instance.tags is None:
-        return None
-    for tag in instance.tags:
-        if tag["Key"] == "Name":
-            return tag["Value"]
-    return None
-
-
-def list_instances(instance_type: str) -> None:
-    print(f"All instances of type {instance_type}")
-    for instance in ec2_instances_of_type(instance_type):
-        ifaces = instance.network_interfaces
-        az = ifaces[0].subnet.availability_zone if len(ifaces) > 0 else None
-        print(
-            f"{instance.id} {get_instance_name(instance)} {instance.public_dns_name} {instance.state['Name']} {az}"
-        )
-
-
-def terminate_instances(instance_type: str) -> None:
-    print(f"Terminating all instances of type {instance_type}")
-    instances = list(ec2_instances_of_type(instance_type))
-    for instance in instances:
-        print(f"Terminating {instance.id}")
-        instance.terminate()
-    print("Waiting for termination to complete")
-    for instance in instances:
-        instance.wait_until_terminated()
-
-
-def parse_arguments():
-    from argparse import ArgumentParser
-
-    parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
-    parser.add_argument("--key-name", type=str)
-    parser.add_argument("--debug", action="store_true")
-    parser.add_argument("--build-only", action="store_true")
-    parser.add_argument("--test-only", type=str)
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument("--os", type=str, choices=list(os_amis.keys()))
-    group.add_argument("--ami", type=str)
-    parser.add_argument(
-        "--python-version",
-        type=str,
-        choices=[f"3.{d}" for d in range(6, 12)],
-        default=None,
-    )
-    parser.add_argument("--alloc-instance", action="store_true")
-    parser.add_argument("--list-instances", action="store_true")
-    parser.add_argument("--pytorch-only", action="store_true")
-    parser.add_argument("--keep-running", action="store_true")
-    parser.add_argument("--terminate-instances", action="store_true")
-    parser.add_argument("--instance-type", type=str, default="t4g.2xlarge")
-    parser.add_argument("--ebs-size", type=int, default=50)
-    parser.add_argument("--branch", type=str, default="main")
-    parser.add_argument("--use-docker", action="store_true")
-    parser.add_argument(
-        "--compiler",
-        type=str,
-        choices=["gcc-7", "gcc-8", "gcc-9", "clang"],
-        default="gcc-8",
-    )
-    parser.add_argument("--use-torch-from-pypi", action="store_true")
-    parser.add_argument("--pytorch-build-number", type=str, default=None)
-    parser.add_argument("--disable-mkldnn", action="store_true")
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_arguments()
-    ami = (
-        args.ami
-        if args.ami is not None
-        else os_amis[args.os]
-        if args.os is not None
-        else ubuntu20_04_ami
-    )
-    keyfile_path, key_name = compute_keyfile_path(args.key_name)
-
-    if args.list_instances:
-        list_instances(args.instance_type)
-        sys.exit(0)
-
-    if args.terminate_instances:
-        terminate_instances(args.instance_type)
-        sys.exit(0)
-
-    if len(key_name) == 0:
-        raise RuntimeError("""
-            Cannot start build without key_name, please specify
-            --key-name argument or AWS_KEY_NAME environment variable.""")
-    if len(keyfile_path) == 0 or not os.path.exists(keyfile_path):
-        raise RuntimeError(f"""
-            Cannot find keyfile with name: [{key_name}] in path: [{keyfile_path}], please
-            check `~/.ssh/` folder or manually set SSH_KEY_PATH environment variable.""")
-
-    # Starting the instance
-    inst = start_instance(
-        key_name, ami=ami, instance_type=args.instance_type, ebs_size=args.ebs_size
-    )
-    instance_name = f"{args.key_name}-{args.os}"
-    if args.python_version is not None:
-        instance_name += f"-py{args.python_version}"
-    inst.create_tags(
-        DryRun=False,
-        Tags=[
-            {
-                "Key": "Name",
-                "Value": instance_name,
-            }
-        ],
-    )
-    addr = inst.public_dns_name
-    wait_for_connection(addr, 22)
-    host = RemoteHost(addr, keyfile_path)
-    host.ami = ami
-    if args.use_docker:
-        update_apt_repo(host)
-        host.start_docker()
-
-    if args.test_only:
-        run_tests(host, args.test_only)
-        sys.exit(0)
-
-    if args.alloc_instance:
-        if args.python_version is None:
-            sys.exit(0)
-        install_condaforge_python(host, args.python_version)
-        sys.exit(0)
-
-    python_version = args.python_version if args.python_version is not None else "3.10"
-
-    if args.use_torch_from_pypi:
-        configure_system(host, compiler=args.compiler, python_version=python_version)
-        print("Installing PyTorch wheel")
-        host.run_cmd("pip3 install torch")
-        build_domains(
-            host, branch=args.branch, git_clone_flags=" --depth 1 --shallow-submodules"
-        )
-    else:
-        start_build(
-            host,
-            branch=args.branch,
-            compiler=args.compiler,
-            python_version=python_version,
-            pytorch_only=args.pytorch_only,
-            pytorch_build_number=args.pytorch_build_number,
-            enable_mkldnn=not args.disable_mkldnn,
-        )
-    if not args.keep_running:
-        print(f"Waiting for instance {inst.id} to terminate")
-        inst.terminate()
-        inst.wait_until_terminated()
--- a/.ci/aarch64_linux/embed_library.py
+++ b/.ci/aarch64_linux/embed_library.py
@ -1,87 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import shutil
-import sys
-from subprocess import check_call
-from tempfile import TemporaryDirectory
-
-from auditwheel.elfutils import elf_file_filter
-from auditwheel.lddtree import lddtree
-from auditwheel.patcher import Patchelf
-from auditwheel.repair import copylib
-from auditwheel.wheeltools import InWheelCtx
-
-
-def replace_tag(filename):
-    with open(filename) as f:
-        lines = f.read().split("\\n")
-    for i, line in enumerate(lines):
-        if not line.startswith("Tag: "):
-            continue
-        lines[i] = line.replace("-linux_", "-manylinux2014_")
-        print(f"Updated tag from {line} to {lines[i]}")
-
-    with open(filename, "w") as f:
-        f.write("\\n".join(lines))
-
-
-class AlignedPatchelf(Patchelf):
-    def set_soname(self, file_name: str, new_soname: str) -> None:
-        check_call(
-            ["patchelf", "--page-size", "65536", "--set-soname", new_soname, file_name]
-        )
-
-    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
-        check_call(
-            [
-                "patchelf",
-                "--page-size",
-                "65536",
-                "--replace-needed",
-                soname,
-                new_soname,
-                file_name,
-            ]
-        )
-
-
-def embed_library(whl_path, lib_soname, update_tag=False):
-    patcher = AlignedPatchelf()
-    out_dir = TemporaryDirectory()
-    whl_name = os.path.basename(whl_path)
-    tmp_whl_name = os.path.join(out_dir.name, whl_name)
-    with InWheelCtx(whl_path) as ctx:
-        torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib")
-        ctx.out_wheel = tmp_whl_name
-        new_lib_path, new_lib_soname = None, None
-        for filename, _ in elf_file_filter(ctx.iter_files()):
-            if not filename.startswith("torch/lib"):
-                continue
-            libtree = lddtree(filename)
-            if lib_soname not in libtree["needed"]:
-                continue
-            lib_path = libtree["libs"][lib_soname]["path"]
-            if lib_path is None:
-                print(f"Can't embed {lib_soname} as it could not be found")
-                break
-            if lib_path.startswith(torchlib_path):
-                continue
-
-            if new_lib_path is None:
-                new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
-            patcher.replace_needed(filename, lib_soname, new_lib_soname)
-            print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}")
-        if update_tag:
-            # Add manylinux2014 tag
-            for filename in ctx.iter_files():
-                if os.path.basename(filename) != "WHEEL":
-                    continue
-                replace_tag(filename)
-    shutil.move(tmp_whl_name, whl_path)
-
-
-if __name__ == "__main__":
-    embed_library(
-        sys.argv[1], "libgomp.so.1", len(sys.argv) > 2 and sys.argv[2] == "--update-tag"
-    )
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -188,7 +188,7 @@ case "$tag" in
    fi
    GCC_VERSION=11
    VISION=yes
-    ROCM_VERSION=7.1
+    ROCM_VERSION=7.0
    NINJA_VERSION=1.9.0
    TRITON=yes
    KATEX=yes
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -60,16 +60,14 @@ EOF
        DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
    fi

-    if [[ $(ver $ROCM_VERSION) -lt $(ver 7.1) ]]; then
-      # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5, removed in ROCm 7.1
-      # search for all unversioned packages
-      # if search fails it will abort this script; use true to avoid case where search fails
-      MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
-      if [[ "x${MIOPENHIPGFX}" = x ]]; then
-        echo "miopen-hip-gfx package not available" && exit 1
-      else
-        DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
-      fi
+    # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
+    # search for all unversioned packages
+    # if search fails it will abort this script; use true to avoid case where search fails
+    MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
+    if [[ "x${MIOPENHIPGFX}" = x ]]; then
+      echo "miopen-hip-gfx package not available" && exit 1
+    else
+      DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
    fi

    # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -12,8 +12,8 @@ function do_install() {

    rocm_version_nodot=${rocm_version//./}

-    # https://github.com/icl-utk-edu/magma/pull/65
-    MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+    # post merge of https://github.com/icl-utk-edu/magma/pull/65
+    MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
    magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"

    rocm_dir="/opt/rocm"
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -402,6 +402,3 @@ scikit-build==0.18.1
 pyre-extensions==0.0.32
 tabulate==0.9.0
 #Description: These package are needed to build FBGEMM and torchrec on PyTorch CI
-
-Jinja2==3.1.6
-#Description: required for torch.distributed.debug
--- a/.ci/manywheel/build.sh
+++ b/.ci/manywheel/build.sh
@ -4,14 +4,17 @@ set -ex

 SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

+# Source the common build script for architecture-specific configurations (MKLDNN, ACL, etc.)
+source "${SCRIPTPATH}/../pytorch/build.sh" || true
+
 case "${GPU_ARCH_TYPE:-BLANK}" in
-    cuda)
+    cuda | cuda-aarch64)
        bash "${SCRIPTPATH}/build_cuda.sh"
        ;;
    rocm)
        bash "${SCRIPTPATH}/build_rocm.sh"
        ;;
-    cpu | cpu-cxx11-abi | cpu-s390x)
+    cpu | cpu-cxx11-abi | cpu-aarch64 | cpu-s390x)
        bash "${SCRIPTPATH}/build_cpu.sh"
        ;;
    xpu)
--- a/.ci/manywheel/build_common.sh
+++ b/.ci/manywheel/build_common.sh
@ -18,12 +18,31 @@ retry () {
    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
 }

+# Detect architecture first
+ARCH=$(uname -m)
+echo "Detected architecture: $ARCH"
+
 PLATFORM=""
 # TODO move this into the Docker images
 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
 if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    retry yum install -q -y zip openssl
-    PLATFORM="manylinux_2_28_x86_64"
+    # Set platform based on architecture
+    case $ARCH in
+        x86_64)
+            PLATFORM="manylinux_2_28_x86_64"
+            ;;
+        aarch64)
+            PLATFORM="manylinux_2_28_aarch64"
+            ;;
+        s390x)
+            PLATFORM="manylinux_2_28_s390x"
+            ;;
+        *)
+            echo "Unsupported architecture: $ARCH"
+            exit 1
+            ;;
+    esac
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    retry dnf install -q -y zip openssl
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
@ -38,6 +57,8 @@ else
    exit 1
 fi

+echo "Platform set to: $PLATFORM"
+
 # We use the package name to test the package by passing this to 'pip install'
 # This is the env variable that setup.py uses to name the package. Note that
 # pip 'normalizes' the name first by changing all - to _
@ -299,8 +320,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
            # ROCm workaround for roctracer dlopens
            if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
                patchedpath=$(fname_without_so_number $destpath)
-            # Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load
-            elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
+            # Keep the so number for XPU dependencies, libgomp.so.1, ACL libraries, and NVPL libraries to avoid twice load
+            elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" || "$filename" == libarm_compute* || "$filename" == libnvpl* || "$filename" == "libgfortran.so.5" ]]; then
                patchedpath=$destpath
            else
                patchedpath=$(fname_with_sha256 $destpath)
@ -346,9 +367,22 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
    done

    # create Manylinux 2_28 tag this needs to happen before regenerate the RECORD
-    if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
+    # Support all architectures (x86_64, aarch64, s390x)
+    if [[ "$IS_MANYLINUX2_28" == "1" && $GPU_ARCH_TYPE != "xpu" ]]; then
        wheel_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/WHEEL/g')
-        sed -i -e s#linux_x86_64#"${PLATFORM}"# $wheel_file;
+        echo "Updating wheel tag for $ARCH architecture"
+        # Replace linux_* with manylinux_2_28_* based on architecture
+        case $ARCH in
+            x86_64)
+                sed -i -e 's#linux_x86_64#manylinux_2_28_x86_64#g' $wheel_file
+                ;;
+            aarch64)
+                sed -i -e 's#linux_aarch64#manylinux_2_28_aarch64#g' $wheel_file
+                ;;
+            s390x)
+                sed -i -e 's#linux_s390x#manylinux_2_28_s390x#g' $wheel_file
+                ;;
+        esac
    fi

    # regenerate the RECORD file with new hashes
--- a/.ci/manywheel/build_cpu.sh
+++ b/.ci/manywheel/build_cpu.sh
@ -15,6 +15,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    EXTRA_CAFFE2_CMAKE_FLAGS=()
 fi

+# Detect architecture
+ARCH=$(uname -m)
+echo "Building CPU wheel for architecture: $ARCH"
+
 WHEELHOUSE_DIR="wheelhousecpu"
 LIBTORCH_HOUSE_DIR="libtorch_housecpu"
 if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
@ -34,8 +38,10 @@ elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
 elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
-    if [[ "$(uname -m)" == "s390x" ]]; then
+    if [[ "$ARCH" == "s390x" ]]; then
        LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
+    elif [[ "$ARCH" == "aarch64" ]]; then
+        LIBGOMP_PATH="/usr/lib/aarch64-linux-gnu/libgomp.so.1"
    else
        LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
    fi
@ -49,6 +55,34 @@ DEPS_SONAME=(
    "libgomp.so.1"
 )

+# Add ARM-specific library dependencies for CPU builds
+if [[ "$ARCH" == "aarch64" ]]; then
+    echo "Adding ARM-specific CPU library dependencies"
+
+    # ARM Compute Library (if available)
+    if [[ -d "/acl/build" ]]; then
+        echo "Adding ARM Compute Library for CPU"
+        DEPS_LIST+=(
+            "/acl/build/libarm_compute.so"
+            "/acl/build/libarm_compute_graph.so"
+        )
+        DEPS_SONAME+=(
+            "libarm_compute.so"
+            "libarm_compute_graph.so"
+        )
+    fi
+
+    # ARM system libraries
+    DEPS_LIST+=(
+        "/usr/lib64/libgfortran.so.5"
+        "/opt/OpenBLAS/lib/libopenblas.so.0"
+    )
+    DEPS_SONAME+=(
+        "libgfortran.so.5"
+        "libopenblas.so.0"
+    )
+fi
+
 rm -rf /usr/local/cuda*

 SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -29,6 +29,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    EXTRA_CAFFE2_CMAKE_FLAGS=()
 fi

+# Detect architecture
+ARCH=$(uname -m)
+echo "Building for architecture: $ARCH"
+
 # Determine CUDA version and architectures to build for
 #
 # NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
@ -53,34 +57,60 @@ fi
 cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
 EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")

+# Remove whole entries from a ';'-separated arch list. Usage: remove_archs LIST ARCH...
+remove_archs() {
+    local result=";$1;"  # sentinel ';' on both ends so only complete entries match (5.0 must not hit 15.0)
+    shift
+    for arch in "$@"; do
+        result="${result//;${arch};/;}"
+    done
+    result="${result#;}"; echo "${result%;}"  # strip the two sentinels before printing
+}
+
+# Filter a CUDA architecture list down to the entries built for aarch64.
+# Retained: 8.0 (A100) and 9.0 or newer (Hopper, Grace Hopper, later parts)
+# Dropped: everything below 8.0 (no ARM GPUs) and 8.6 (x86_64-only RTX 3090/A6000)
+filter_aarch64_archs() {
+    # Architectures that are never built for aarch64
+    local -a unwanted=("5.0" "6.0" "7.0" "7.5" "8.6")
+    local filtered
+    filtered=$(remove_archs "$1" "${unwanted[@]}")
+    echo "$filtered"
+}
+
+# Base: Common architectures across all modern CUDA versions
+TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0"
+
 case ${CUDA_VERSION} in
-    #removing sm_50-sm_60 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
-    #however we would like to keep sm_70 architecture see: https://github.com/pytorch/pytorch/issues/157517
-    12.8)
-        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0"
-        ;;
-    12.9)
-        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX"
-        # WAR to resolve the ld error in libtorch build with CUDA 12.9
+    12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;${TORCH_CUDA_ARCH_LIST}" ;;  # Only 12.6 includes Legacy Maxwell/Pascal that will be removed in future releases
+    12.8) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0" ;;  # +Blackwell (10.0, 12.0); Hopper (9.0) is already in the base list
+    12.9) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0+PTX" # +Blackwell (10.0, 12.0) + PTX for forward compatibility
        if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
-            TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
+            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//7.0;/}"  # Remove 7.0 to resolve the ld error
+            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//8.6;/}"  # Remove 8.6 for libtorch
        fi
        ;;
    13.0)
-        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
-        ;;
-    12.6)
-        TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
-        ;;
-    *)
-        echo "unknown cuda version $CUDA_VERSION"
-        exit 1
+        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0+PTX"
+        export TORCH_NVCC_FLAGS="-compress-mode=size"
+        export BUILD_BUNDLE_PTXAS=1
        ;;
+    *) echo "unknown cuda version $CUDA_VERSION"; exit 1 ;;
 esac

+# Filter for aarch64: Remove < 8.0 and 8.6
+[[ "$ARCH" == "aarch64" ]] && TORCH_CUDA_ARCH_LIST=$(filter_aarch64_archs "$TORCH_CUDA_ARCH_LIST")
+
+echo "TORCH_CUDA_ARCH_LIST set to: $TORCH_CUDA_ARCH_LIST"
 export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 echo "${TORCH_CUDA_ARCH_LIST}"

+# Disable MAGMA for aarch64 as pre-built libraries are x86-64 only
+if [[ "$ARCH" == "aarch64" ]]; then
+    echo "Disabling MAGMA for aarch64 architecture"
+    export USE_MAGMA=0
+fi
+
 # Package directories
 WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot"
 LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot"
@ -244,6 +274,51 @@ else
    exit 1
 fi

+# Add ARM-specific library dependencies
+if [[ "$ARCH" == "aarch64" ]]; then
+    echo "Adding ARM-specific library dependencies"
+
+    # ARM Compute Library (if available)
+    if [[ -d "/acl/build" ]]; then
+        echo "Adding ARM Compute Library"
+        DEPS_LIST+=(
+            "/acl/build/libarm_compute.so"
+            "/acl/build/libarm_compute_graph.so"
+        )
+        DEPS_SONAME+=(
+            "libarm_compute.so"
+            "libarm_compute_graph.so"
+        )
+    fi
+
+    # ARM system libraries
+    DEPS_LIST+=(
+        "/lib64/libgomp.so.1"
+        "/usr/lib64/libgfortran.so.5"
+    )
+    DEPS_SONAME+=(
+        "libgomp.so.1"
+        "libgfortran.so.5"
+    )
+
+    # NVPL libraries (ARM optimized BLAS/LAPACK)
+    if [[ -d "/usr/local/lib" && -f "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0" ]]; then
+        echo "Adding NVPL libraries for ARM"
+        DEPS_LIST+=(
+            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0"
+            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0"
+            "/usr/local/lib/libnvpl_lapack_core.so.0"
+            "/usr/local/lib/libnvpl_blas_core.so.0"
+        )
+        DEPS_SONAME+=(
+            "libnvpl_lapack_lp64_gomp.so.0"
+            "libnvpl_blas_lp64_gomp.so.0"
+            "libnvpl_lapack_core.so.0"
+            "libnvpl_blas_core.so.0"
+        )
+    fi
+fi
+
 # run_tests.sh requires DESIRED_CUDA to know what tests to exclude
 export DESIRED_CUDA="$cuda_version_nodot"

@ -251,9 +326,11 @@ export DESIRED_CUDA="$cuda_version_nodot"
 rm -rf /usr/local/cuda || true
 ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda

-# Switch `/usr/local/magma` to the desired CUDA version
-rm -rf /usr/local/magma || true
-ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
+# Switch `/usr/local/magma` to the desired CUDA version (skip for aarch64)
+if [[ "$ARCH" != "aarch64" ]]; then
+    rm -rf /usr/local/magma || true
+    ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
+fi

 export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
 export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0
--- a/.ci/onnx/common.sh
+++ b/.ci/onnx/common.sh
@ -21,87 +21,3 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
 fi

 mkdir -p "$pytest_reports_dir" || true
-
-##########################################
-# copied from .ci/pytorch/common_utils.sh
-##########################################
-
-function get_pinned_commit() {
-  cat .github/ci_commit_pins/"${1}".txt
-}
-
-function pip_install_whl() {
-  # This is used to install PyTorch and other build artifacts wheel locally
-  # without using any network connection
-
-  # Convert the input arguments into an array
-  local args=("$@")
-
-  # Check if the first argument contains multiple paths separated by spaces
-  if [[ "${args[0]}" == *" "* ]]; then
-    # Split the string by spaces into an array
-    IFS=' ' read -r -a paths <<< "${args[0]}"
-    # Loop through each path and install individually
-    for path in "${paths[@]}"; do
-      echo "Installing $path"
-      python3 -mpip install --no-index --no-deps "$path"
-    done
-  else
-    # Loop through each argument and install individually
-    for path in "${args[@]}"; do
-      echo "Installing $path"
-      python3 -mpip install --no-index --no-deps "$path"
-    done
-  fi
-}
-
-function pip_build_and_install() {
-  local build_target=$1
-  local wheel_dir=$2
-
-  local found_whl=0
-  for file in "${wheel_dir}"/*.whl
-  do
-    if [[ -f "${file}" ]]; then
-      found_whl=1
-      break
-    fi
-  done
-
-  # Build the wheel if it doesn't exist
-  if [ "${found_whl}" == "0" ]; then
-    python3 -m pip wheel \
-      --no-build-isolation \
-      --no-deps \
-      -w "${wheel_dir}" \
-      "${build_target}"
-  fi
-
-  for file in "${wheel_dir}"/*.whl
-  do
-    pip_install_whl "${file}"
-  done
-}
-
-function install_torchvision() {
-  local orig_preload
-  local commit
-  commit=$(get_pinned_commit vision)
-  orig_preload=${LD_PRELOAD}
-  if [ -n "${LD_PRELOAD}" ]; then
-    # Silence dlerror to work-around glibc ASAN bug, see https://sourceware.org/bugzilla/show_bug.cgi?id=27653#c9
-    echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c -
-    LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so
-  fi
-
-  if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
-    # Not sure if both are needed, but why not
-    export FORCE_CUDA=1
-    export WITH_CUDA=1
-  fi
-  pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision
-
-  if [ -n "${LD_PRELOAD}" ]; then
-    LD_PRELOAD=${orig_preload}
-  fi
-}
--- a/.ci/onnx/test.sh
+++ b/.ci/onnx/test.sh
@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace

 if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
  # TODO: This can be removed later once vision is also part of the Docker image
-  install_torchvision
+  pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
  # JIT C++ extensions require ninja, so put it into PATH.
  export PATH="/var/lib/jenkins/.local/bin:$PATH"
  # NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -86,10 +86,20 @@ else
  fi
 fi

+# Enable MKLDNN with ARM Compute Library for ARM builds
 if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
  export USE_MKLDNN=1
+
+  # ACL is required for aarch64 builds
+  if [[ ! -d "/acl" ]]; then
+    echo "ERROR: ARM Compute Library not found at /acl"
+    echo "ACL is required for aarch64 builds. Check Docker image setup."
+    exit 1
+  fi
+
  export USE_MKLDNN_ACL=1
  export ACL_ROOT_DIR=/acl
+  echo "ARM Compute Library enabled for MKLDNN: ACL_ROOT_DIR=/acl"
 fi

 if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -1250,97 +1250,6 @@ test_custom_script_ops() {
  assert_git_not_dirty
 }

-test_libtorch_agnostic_targetting() {
-    echo "Testing libtorch_agnostic runs correctly on TORCH_TARGET_VERSION"
-
-    REPO_DIR=$(pwd)
-    WHEEL_DIR="${REPO_DIR}/test/cpp_extensions/.wheels"
-
-    # Build wheel with current PyTorch (this has TORCH_TARGET_VERSION 2_9_0)
-    echo "Building 2.9 extension wheel with current PyTorch..."
-    pushd test/cpp_extensions/libtorch_agnostic_2_9_extension
-    time python setup.py bdist_wheel
-
-    # Save the wheel
-    mkdir -p "$WHEEL_DIR"
-    cp dist/*.whl "$WHEEL_DIR/"
-    WHEEL_FILE=$(find "$WHEEL_DIR" -maxdepth 1 -name "*.whl" -type f | head -1)
-    echo "Built wheel: $(basename "$WHEEL_FILE")"
-    popd
-
-    # Create venv and install PyTorch 2.9
-    python -m venv venv_pytorch_2_9
-    # shellcheck disable=SC1091
-    . venv_pytorch_2_9/bin/activate
-
-    # Clear PYTHONPATH to avoid using the development PyTorch
-    echo "Clearing PYTHONPATH to use only venv packages..."
-    unset PYTHONPATH
-
-    # Upgrade pip to latest version
-    echo "Upgrading pip to latest version..."
-    pip install --upgrade pip
-    pip --version
-
-    echo "Installing PyTorch 2.9..."
-
-    # Install from release channel only
-    PYTORCH_VERSION="2.9.0"
-
-    # Extract CUDA version from BUILD_ENVIRONMENT (e.g., "cuda12.1" -> "cu121")
-    if [[ "$BUILD_ENVIRONMENT" =~ cuda([0-9]+)\.([0-9]+) ]]; then
-        CUDA_MAJOR="${BASH_REMATCH[1]}"
-        CUDA_MINOR="${BASH_REMATCH[2]}"
-        CUDA_VERSION="cu${CUDA_MAJOR}${CUDA_MINOR}"
-        echo "  Detected CUDA ${CUDA_MAJOR}.${CUDA_MINOR} from BUILD_ENVIRONMENT, using ${CUDA_VERSION}"
-    else
-        # Default to CPU build
-        CUDA_VERSION="cpu"
-        echo "  No CUDA detected in BUILD_ENVIRONMENT, using CPU build"
-    fi
-
-    if pip install torch=="${PYTORCH_VERSION}" --index-url https://download.pytorch.org/whl/${CUDA_VERSION}/; then
-        echo "Installed PyTorch ${PYTORCH_VERSION} from release channel (${CUDA_VERSION})"
-    else
-        echo "  FAILED to install PyTorch 2.9.0 from release channel"
-        echo "  URL: https://download.pytorch.org/whl/${CUDA_VERSION}/"
-        deactivate
-        rm -rf venv_pytorch_2_9
-        return 1
-    fi
-
-    INSTALLED_VERSION=$(python -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown")
-    echo "  Installed version: $INSTALLED_VERSION"
-
-    # Install test dependencies
-    echo "Installing test dependencies..."
-    pip install expecttest numpy unittest-xml-reporting
-
-    # Install the pre-built wheel
-    echo ""
-    echo "Installing pre-built 2.9 extension wheel (built with PyTorch 2.10)..."
-    pip install "$WHEEL_FILE"
-    echo "Installed $(basename "$WHEEL_FILE") into PyTorch 2.9 environment"
-
-    # Run tests with PyTorch 2.9 runtime (2.10 tests will be skipped automatically)
-    echo ""
-    echo "Running tests with PyTorch 2.9 runtime (using wheel built on PyTorch 2.10)..."
-    if time python test/cpp_extensions/test_libtorch_agnostic.py -v; then
-        echo ""
-        echo "  Wheel built with current torch and TORCH_TARGET_VERSION 2_9_0 works with PyTorch 2.9 runtime!"
-    else
-        echo "targeting test failed"
-        deactivate
-        rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
-        return 1
-    fi
-
-    deactivate
-    rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
-
-    assert_git_not_dirty
-}
-
 test_jit_hooks() {
  echo "Testing jit hooks in cpp"
  HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
@ -1813,8 +1722,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]];
 elif [[ "${TEST_CONFIG}" == *backward* ]]; then
  test_forward_backward_compatibility
  # Do NOT add tests after bc check tests, see its comment.
-elif [[ "${TEST_CONFIG}" == *libtorch_agnostic_targetting* ]]; then
-  test_libtorch_agnostic_targetting
 elif [[ "${TEST_CONFIG}" == *xla* ]]; then
  install_torchvision
  build_xla
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -91,6 +91,13 @@
 "ciflow/trunk":
 - .ci/docker/ci_commit_pins/triton.txt

+"oncall: distributed":
+- torch/csrc/distributed/**
+- torch/distributed/**
+- torch/nn/parallel/**
+- test/distributed/**
+- torch/testing/_internal/distributed/**
+
 "release notes: distributed (checkpoint)":
 - torch/distributed/checkpoint/**
 - test/distributed/checkpoint/**
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@ -260,11 +260,8 @@ jobs:
            "${DOCKER_IMAGE}"
          )
          docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
-          if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
-            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
-          else
-            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
-          fi
+          # Unified build script for all architectures (x86_64, aarch64, s390x)
+          docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"

      - name: Chown artifacts
        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@ -330,6 +330,8 @@ jobs:
            -e PR_NUMBER \
            -e SHA1 \
            -e BRANCH \
+            -e SCCACHE_BUCKET \
+            -e SCCACHE_REGION \
            -e XLA_CUDA \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -75,8 +75,7 @@ jobs:
          pytorch-linux-jammy-py3-clang12-onnx,
          pytorch-linux-jammy-linter,
          pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter,
-          # TODO: Re-enable me when docker pin update happens
-          # pytorch-linux-jammy-py3-clang12-executorch,
+          pytorch-linux-jammy-py3-clang12-executorch,
          pytorch-linux-jammy-py3.12-triton-cpu,
          pytorch-linux-noble-riscv64-py3.12-gcc14
        ]
--- a/.github/workflows/inductor-perf-test-nightly-xpu.yml
+++ b/.github/workflows/inductor-perf-test-nightly-xpu.yml
@ -85,7 +85,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-noble-xpu-n-py3.10
      docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3-inductor-benchmarks
-      runner: linux.c7i.2xlarge
+      runner: linux.c7i.12xlarge
      test-matrix: |
        { include: [
          { config: "inductor_huggingface_perf_xpu", shard: 1, num_shards: 5, runner: "linux.idc.xpu" },
@ -108,41 +108,41 @@ jobs:
        ]}
    secrets: inherit

-  # xpu-n-py3_10-inductor-benchmark-test-nightly:
-  #   permissions:
-  #     id-token: write
-  #     contents: read
-  #   if: github.event_name != 'workflow_dispatch'
-  #   name: xpu-n-py3.10-inductor-benchmark
-  #   uses: ./.github/workflows/_xpu-test.yml
-  #   needs: xpu-n-py3_10-inductor-benchmark-build
-  #   with:
-  #     build-environment: linux-noble-xpu-n-py3.10
-  #     dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
-  #     docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
-  #     timeout-minutes: 720
-  #     # Disable monitor in perf tests for more investigation
-  #     disable-monitor: true
-  #     monitor-log-interval: 10
-  #     monitor-data-collect-interval: 2
-  #   secrets: inherit
+  xpu-n-py3_10-inductor-benchmark-test-nightly:
+    permissions:
+      id-token: write
+      contents: read
+    if: github.event_name != 'workflow_dispatch'
+    name: xpu-n-py3.10-inductor-benchmark
+    uses: ./.github/workflows/_xpu-test.yml
+    needs: xpu-n-py3_10-inductor-benchmark-build
+    with:
+      build-environment: linux-noble-xpu-n-py3.10
+      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
+      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
+      timeout-minutes: 720
+      # Disable monitor in perf tests for more investigation
+      disable-monitor: true
+      monitor-log-interval: 10
+      monitor-data-collect-interval: 2
+    secrets: inherit

-  # xpu-n-py3_10-inductor-benchmark-test:
-  #   permissions:
-  #     id-token: write
-  #     contents: read
-  #   if: github.event_name == 'workflow_dispatch'
-  #   name: xpu-n-py3.10-inductor-test
-  #   uses: ./.github/workflows/_xpu-test.yml
-  #   needs: xpu-n-py3_10-inductor-benchmark-build
-  #   with:
-  #     build-environment: linux-noble-xpu-n-py3.10
-  #     dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
-  #     docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
-  #     timeout-minutes: 720
-  #     disable-monitor: false
-  #     monitor-log-interval: 15
-  #     monitor-data-collect-interval: 4
-  #   secrets: inherit
+  xpu-n-py3_10-inductor-benchmark-test:
+    permissions:
+      id-token: write
+      contents: read
+    if: github.event_name == 'workflow_dispatch'
+    name: xpu-n-py3.10-inductor-test
+    uses: ./.github/workflows/_xpu-test.yml
+    needs: xpu-n-py3_10-inductor-benchmark-build
+    with:
+      build-environment: linux-noble-xpu-n-py3.10
+      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
+      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
+      timeout-minutes: 720
+      disable-monitor: false
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
+    secrets: inherit
--- a/.github/workflows/operator_microbenchmark.yml
+++ b/.github/workflows/operator_microbenchmark.yml
@ -24,8 +24,7 @@ jobs:
    name: opmicrobenchmark-build
    uses: ./.github/workflows/_linux-build.yml
    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.c7i.2xlarge
+      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '8.0 9.0'
@ -36,16 +35,16 @@ jobs:
        ]}
    secrets: inherit

-  # opmicrobenchmark-test:
-  #   name: opmicrobenchmark-test
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs: opmicrobenchmark-build
-  #   with:
-  #     timeout-minutes: 500
-  #     build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-  #     docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
-  #   secrets: inherit
+  opmicrobenchmark-test:
+    name: opmicrobenchmark-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs: opmicrobenchmark-build
+    with:
+      timeout-minutes: 500
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
+    secrets: inherit

  # B200 runner
  opmicrobenchmark-build-b200:
@ -53,8 +52,7 @@ jobs:
    name: opmicrobenchmark-build-b200
    uses: ./.github/workflows/_linux-build.yml
    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.c7i.4xlarge
+      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '10.0'
@ -64,39 +62,39 @@ jobs:
        ]}
    secrets: inherit

-  # opmicrobenchmark-test-b200:
-  #   name: opmicrobenchmark-test-b200
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs: opmicrobenchmark-build-b200
-  #   with:
-  #     timeout-minutes: 500
-  #     build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
-  #     docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
-  #     test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
-  #     aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-  #   secrets: inherit
+  opmicrobenchmark-test-b200:
+    name: opmicrobenchmark-test-b200
+    uses: ./.github/workflows/_linux-test.yml
+    needs: opmicrobenchmark-build-b200
+    with:
+      timeout-minutes: 500
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
+      test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
+      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+    secrets: inherit

  # ROCM MI300 runner
-  # opmicrobenchmark-build-rocm:
-  #   if: github.repository_owner == 'pytorch'
-  #   name: opmicrobenchmark-build-rocm
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   with:
-  #     build-environment: linux-jammy-rocm-py3_10
-  #     docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
-  #       ]}
-  #   secrets: inherit
+  opmicrobenchmark-build-rocm:
+    if: github.repository_owner == 'pytorch'
+    name: opmicrobenchmark-build-rocm
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-jammy-rocm-py3_10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
+      test-matrix: |
+        { include: [
+          { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.1" },
+        ]}
+    secrets: inherit

-  # opmicrobenchmark-test-rocm:
-  #   name: opmicrobenchmark-test-rocm
-  #   uses: ./.github/workflows/_rocm-test.yml
-  #   needs: opmicrobenchmark-build-rocm
-  #   with:
-  #     timeout-minutes: 500
-  #     build-environment: linux-jammy-rocm-py3_10
-  #     docker-image: ${{ needs.opmicrobenchmark-build-rocm.outputs.docker-image }}
-  #     test-matrix: ${{ needs.opmicrobenchmark-build-rocm.outputs.test-matrix }}
-  #   secrets: inherit
+  opmicrobenchmark-test-rocm:
+    name: opmicrobenchmark-test-rocm
+    uses: ./.github/workflows/_rocm-test.yml
+    needs: opmicrobenchmark-build-rocm
+    with:
+      timeout-minutes: 500
+      build-environment: linux-jammy-rocm-py3_10
+      docker-image: ${{ needs.opmicrobenchmark-build-rocm.outputs.docker-image }}
+      test-matrix: ${{ needs.opmicrobenchmark-build-rocm.outputs.test-matrix }}
+    secrets: inherit
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -49,86 +49,85 @@ jobs:
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}

-  # linux-jammy-py3_10-gcc11-build:
-  #   name: linux-jammy-py3.10-gcc11
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-py3.10-gcc11
-  #     docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "docs_test", shard: 1, num_shards: 1,  runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "backwards_compat", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-  #         { config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-  #         { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-py3_10-gcc11-build:
+    name: linux-jammy-py3.10-gcc11
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-py3.10-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "docs_test", shard: 1, num_shards: 1,  runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "backwards_compat", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
+          { config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
+        ]}
+    secrets: inherit

-  # linux-jammy-py3_10-gcc11-test:
-  #   name: linux-jammy-py3.10-gcc11
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs:
-  #     - linux-jammy-py3_10-gcc11-build
-  #     - target-determination
-  #   with:
-  #     build-environment: linux-jammy-py3.10-gcc11
-  #     docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.test-matrix }}
-  #   secrets: inherit
+  linux-jammy-py3_10-gcc11-test:
+    name: linux-jammy-py3.10-gcc11
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-py3_10-gcc11-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-py3.10-gcc11
+      docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.test-matrix }}
+    secrets: inherit

-  # linux-docs:
-  #   name: linux-docs
-  #   uses: ./.github/workflows/_docs.yml
-  #   needs: linux-jammy-py3_10-gcc11-build
-  #   with:
-  #     build-environment: linux-jammy-py3.10-gcc11
-  #     docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
-  #   secrets: inherit
+  linux-docs:
+    name: linux-docs
+    uses: ./.github/workflows/_docs.yml
+    needs: linux-jammy-py3_10-gcc11-build
+    with:
+      build-environment: linux-jammy-py3.10-gcc11
+      docker-image: ${{ needs.linux-jammy-py3_10-gcc11-build.outputs.docker-image }}
+    secrets: inherit

-  # linux-jammy-py3_10-gcc11-no-ops:
-  #   name: linux-jammy-py3.10-gcc11-no-ops
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-py3.10-gcc11-no-ops
-  #     docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 1 },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-py3_10-gcc11-no-ops:
+    name: linux-jammy-py3.10-gcc11-no-ops
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-py3.10-gcc11-no-ops
+      docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1 },
+        ]}
+    secrets: inherit

-  # linux-jammy-py3_10-gcc11-pch:
-  #   name: linux-jammy-py3.10-gcc11-pch
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-py3.10-gcc11-pch
-  #     docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 1 },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-py3_10-gcc11-pch:
+    name: linux-jammy-py3.10-gcc11-pch
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-py3.10-gcc11-pch
+      docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1 },
+        ]}
+    secrets: inherit

  linux-jammy-py3_10-clang18-asan-build:
    name: linux-jammy-py3.10-clang18-asan
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      runner: linux.r7i.2xlarge
+      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
@ -145,219 +144,219 @@ jobs:
      sync-tag: asan-build
    secrets: inherit

-  # linux-jammy-py3_10-clang18-asan-test:
-  #   name: linux-jammy-py3.10-clang18-asan
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs:
-  #     - linux-jammy-py3_10-clang18-asan-build
-  #     - target-determination
-  #   with:
-  #     build-environment: linux-jammy-py3.10-clang18-asan
-  #     docker-image: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.test-matrix }}
-  #     sync-tag: asan-test
-  #   secrets: inherit
+  linux-jammy-py3_10-clang18-asan-test:
+    name: linux-jammy-py3.10-clang18-asan
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-py3_10-clang18-asan-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-py3.10-clang18-asan
+      docker-image: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.test-matrix }}
+      sync-tag: asan-test
+    secrets: inherit

-  # linux-jammy-py3_10-clang12-onnx-build:
-  #   name: linux-jammy-py3.10-clang12-onnx
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-py3.10-clang12-onnx
-  #     docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-  #         { config: "default", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-py3_10-clang12-onnx-build:
+    name: linux-jammy-py3.10-clang12-onnx
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-py3.10-clang12-onnx
+      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
+          { config: "default", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
+        ]}
+    secrets: inherit

-  # linux-jammy-py3_10-clang12-onnx-test:
-  #   name: linux-jammy-py3.10-clang12-onnx
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs:
-  #     - linux-jammy-py3_10-clang12-onnx-build
-  #     - target-determination
-  #   with:
-  #     build-environment: linux-jammy-py3.10-clang12-onnx
-  #     docker-image: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }}
-  #   secrets: inherit
+  linux-jammy-py3_10-clang12-onnx-test:
+    name: linux-jammy-py3.10-clang12-onnx
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-py3_10-clang12-onnx-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-py3.10-clang12-onnx
+      docker-image: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }}
+    secrets: inherit

-  # linux-jammy-py3_10-clang12-build:
-  #   name: linux-jammy-py3.10-clang12
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-py3.10-clang12
-  #     docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-  #         { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-  #         { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-  #         { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-  #         { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-  #         { config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-py3_10-clang12-build:
+    name: linux-jammy-py3.10-clang12
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-py3.10-clang12
+      docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }
+        ]}
+    secrets: inherit

-  # linux-jammy-py3_10-clang12-test:
-  #   name: linux-jammy-py3.10-clang12
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs:
-  #     - linux-jammy-py3_10-clang12-build
-  #     - target-determination
-  #   with:
-  #     build-environment: linux-jammy-py3.10-clang12
-  #     docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
-  #   secrets: inherit
+  linux-jammy-py3_10-clang12-test:
+    name: linux-jammy-py3.10-clang12
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-py3_10-clang12-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-py3.10-clang12
+      docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
+    secrets: inherit

-  # linux-jammy-py3_13-clang12-build:
-  #   name: linux-jammy-py3.13-clang12
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-py3.13-clang12
-  #     docker-image-name: ci-image:pytorch-linux-jammy-py3.13-clang12
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-  #         { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-  #         { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-  #         { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-  #         { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-  #         { config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #         { config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-py3_13-clang12-build:
+    name: linux-jammy-py3.13-clang12
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-py3.13-clang12
+      docker-image-name: ci-image:pytorch-linux-jammy-py3.13-clang12
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }
+        ]}
+    secrets: inherit

-  # linux-jammy-py3_13-clang12-test:
-  #   name: linux-jammy-py3.13-clang12
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs: linux-jammy-py3_13-clang12-build
-  #   with:
-  #     build-environment: linux-jammy-py3.13-clang12
-  #     docker-image: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }}
-  #   secrets: inherit
+  linux-jammy-py3_13-clang12-test:
+    name: linux-jammy-py3.13-clang12
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-jammy-py3_13-clang12-build
+    with:
+      build-environment: linux-jammy-py3.13-clang12
+      docker-image: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-py3_13-clang12-build.outputs.test-matrix }}
+    secrets: inherit

-  # linux-jammy-cuda12_8-cudnn9-py3_10-clang12-build:
-  #   name: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
-  #     docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 1 },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-cuda12_8-cudnn9-py3_10-clang12-build:
+    name: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda12.8-cudnn9-py3.10-clang12
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1 },
+        ]}
+    secrets: inherit

-  # linux-jammy-cpu-py3_10-gcc11-bazel-test:
-  #   name: linux-jammy-cpu-py3.10-gcc11-bazel-test
-  #   uses: ./.github/workflows/_bazel-build-test.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner: "${{ needs.get-label-type.outputs.label-type }}linux.large"
-  #     build-environment: linux-jammy-cuda12.8-py3.10-gcc11-bazel-test
-  #     docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-  #     cuda-version: cpu
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-cpu-py3_10-gcc11-bazel-test:
+    name: linux-jammy-cpu-py3.10-gcc11-bazel-test
+    uses: ./.github/workflows/_bazel-build-test.yml
+    needs: get-label-type
+    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.large"
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-bazel-test
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-version: cpu
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+        ]}
+    secrets: inherit

-  # linux-jammy-py3_10-gcc11-mobile-lightweight-dispatch-build:
-  #   name: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
-  #     docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
-  #     build-generates-artifacts: false
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 1 },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-py3_10-gcc11-mobile-lightweight-dispatch-build:
+    name: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-py3.10-gcc11-mobile-lightweight-dispatch-build
+      docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
+      build-generates-artifacts: false
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1 },
+        ]}
+    secrets: inherit

-  # linux-jammy-rocm-py3_10-build:
-  #   # don't run build twice on main
-  #   if: github.event_name == 'pull_request'
-  #   name: linux-jammy-rocm-py3.10
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-rocm-py3.10
-  #     docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
-  #     sync-tag: rocm-build
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.2" },
-  #         { config: "default", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.2" },
-  #         { config: "default", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.2" },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-rocm-py3_10-build:
+    # don't run build twice on main
+    if: github.event_name == 'pull_request'
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+      sync-tag: rocm-build
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.2" },
+          { config: "default", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.2" },
+          { config: "default", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.2" },
+        ]}
+    secrets: inherit

-  # linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
-  #   name: cuda12.8-py3.10-gcc9-sm75
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
-  #     docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
-  #     cuda-arch-list: '7.5'
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
+    name: cuda12.8-py3.10-gcc9-sm75
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      cuda-arch-list: '7.5'
+      test-matrix: |
+        { include: [
+          { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
+        ]}
+    secrets: inherit

-  # linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
-  #   name: cuda12.8-py3.10-gcc9-sm75
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
-  #   with:
-  #     build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
-  #     docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
-  #   secrets: inherit
+  linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
+    name: cuda12.8-py3.10-gcc9-sm75
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+    secrets: inherit

-  # linux-noble-xpu-n-py3_10-build:
-  #   name: linux-noble-xpu-n-py3.10
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     # This should sync with the build in xpu.yml but xpu uses a larger runner
-  #     # sync-tag: linux-xpu-n-build
-  #     runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
-  #     build-environment: linux-noble-xpu-n-py3.10
-  #     docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },
-  #         { config: "default", shard: 2, num_shards: 4, runner: "linux.idc.xpu" },
-  #         { config: "default", shard: 3, num_shards: 4, runner: "linux.idc.xpu" },
-  #         { config: "default", shard: 4, num_shards: 4, runner: "linux.idc.xpu" },
-  #       ]}
-  #   secrets: inherit
+  linux-noble-xpu-n-py3_10-build:
+    name: linux-noble-xpu-n-py3.10
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      # This should sync with the build in xpu.yml but xpu uses a larger runner
+      # sync-tag: linux-xpu-n-build
+      runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
+      build-environment: linux-noble-xpu-n-py3.10
+      docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },
+          { config: "default", shard: 2, num_shards: 4, runner: "linux.idc.xpu" },
+          { config: "default", shard: 3, num_shards: 4, runner: "linux.idc.xpu" },
+          { config: "default", shard: 4, num_shards: 4, runner: "linux.idc.xpu" },
+        ]}
+    secrets: inherit
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -49,68 +49,68 @@ jobs:
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}

-  # linux-jammy-cuda12_8-py3_10-gcc11-sm86-build:
-  #   name: linux-jammy-cuda12.8-py3.10-gcc11-sm86
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
-  #     docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-  #     cuda-arch-list: 8.6
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "slow", shard: 1, num_shards: 3, runner: "linux.g5.4xlarge.nvidia.gpu" },
-  #         { config: "slow", shard: 2, num_shards: 3, runner: "linux.g5.4xlarge.nvidia.gpu" },
-  #         { config: "slow", shard: 3, num_shards: 3, runner: "linux.g5.4xlarge.nvidia.gpu" },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-cuda12_8-py3_10-gcc11-sm86-build:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-sm86
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-arch-list: 8.6
+      test-matrix: |
+        { include: [
+          { config: "slow", shard: 1, num_shards: 3, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "slow", shard: 2, num_shards: 3, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "slow", shard: 3, num_shards: 3, runner: "linux.g5.4xlarge.nvidia.gpu" },
+        ]}
+    secrets: inherit

-  # linux-jammy-cuda12_8-py3_10-gcc11-sm86-test:
-  #   name: linux-jammy-cuda12.8-py3.10-gcc11-sm86
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs:
-  #     - linux-jammy-cuda12_8-py3_10-gcc11-sm86-build
-  #     - target-determination
-  #   with:
-  #     build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
-  #     docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }}
-  #   secrets: inherit
+  linux-jammy-cuda12_8-py3_10-gcc11-sm86-test:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-sm86
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda12_8-py3_10-gcc11-sm86-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm86-build.outputs.test-matrix }}
+    secrets: inherit

-  # linux-jammy-py3_10-clang12-build:
-  #   name: linux-jammy-py3.10-clang12
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-py3.10-clang12
-  #     docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-  #         { config: "slow", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-py3_10-clang12-build:
+    name: linux-jammy-py3.10-clang12
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-py3.10-clang12
+      docker-image-name: ci-image:pytorch-linux-jammy-py3.10-clang12
+      test-matrix: |
+        { include: [
+          { config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
+          { config: "slow", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
+        ]}
+    secrets: inherit

-  # linux-jammy-py3_10-clang12-test:
-  #   name: linux-jammy-py3.10-clang12
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs:
-  #     - linux-jammy-py3_10-clang12-build
-  #     - target-determination
-  #   with:
-  #     build-environment: linux-jammy-py3.10-clang12
-  #     docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
-  #   secrets: inherit
+  linux-jammy-py3_10-clang12-test:
+    name: linux-jammy-py3.10-clang12
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-py3_10-clang12-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-py3.10-clang12
+      docker-image: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
+    secrets: inherit

  linux-jammy-py3_10-clang18-asan-build:
    name: linux-jammy-py3.10-clang18-asan
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      runner: linux.r7i.2xlarge
+      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
@ -123,15 +123,15 @@ jobs:
      sync-tag: asan-build
    secrets: inherit

-  # linux-jammy-py3_10-clang18-asan-test:
-  #   name: linux-jammy-py3.10-clang18-asan
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs:
-  #     - linux-jammy-py3_10-clang18-asan-build
-  #     - target-determination
-  #   with:
-  #     build-environment: linux-jammy-py3.10-clang18-asan
-  #     docker-image: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.test-matrix }}
-  #     sync-tag: asan-test
-  #   secrets: inherit
+  linux-jammy-py3_10-clang18-asan-test:
+    name: linux-jammy-py3.10-clang18-asan
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-py3_10-clang18-asan-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-py3.10-clang18-asan
+      docker-image: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-py3_10-clang18-asan-build.outputs.test-matrix }}
+      sync-tag: asan-test
+    secrets: inherit
--- a/.github/workflows/test-b200.yml
+++ b/.github/workflows/test-b200.yml
@ -54,7 +54,7 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.r7i.4xlarge
+      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '10.0'
@ -65,14 +65,14 @@ jobs:
      # config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh
    secrets: inherit

-  # linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
-  #   name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs:
-  #     - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build
-  #   with:
-  #     build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
-  #     docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
-  #     aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-  #   secrets: inherit
+  linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda12_8-py3_10-gcc11-sm100-build
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
+      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+    secrets: inherit
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@ -56,252 +56,251 @@ jobs:
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      build-generates-artifacts: false
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: "linux.c7i.2xlarge"
+      runner: "linux.c7i.4xlarge"
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 1 },
        ]}
    secrets: inherit

-  # linux-jammy-cuda12_8-py3_10-gcc11-build:
-  #   name: linux-jammy-cuda12.8-py3.10-gcc11
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-cuda12.8-py3.10-gcc11
-  #     docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-  #     cuda-arch-list: '7.5 8.9'
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-  #         { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-  #         { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-  #         { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-  #         { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
-  #         { config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
-  #         { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
-  #         { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
-  #         { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
-  #         { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-cuda12_8-py3_10-gcc11-build:
+    name: linux-jammy-cuda12.8-py3.10-gcc11
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-arch-list: '7.5 8.9'
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
+          { config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
+          { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
+        ]}
+    secrets: inherit

-  # linux-jammy-cuda12_8-py3_10-gcc11-test:
-  #   name: linux-jammy-cuda12.8-py3.10-gcc11
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs:
-  #     - linux-jammy-cuda12_8-py3_10-gcc11-build
-  #     - target-determination
-  #   with:
-  #     timeout-minutes: 360
-  #     build-environment: linux-jammy-cuda12.8-py3.10-gcc11
-  #     docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
-  #   secrets: inherit
+  linux-jammy-cuda12_8-py3_10-gcc11-test:
+    name: linux-jammy-cuda12.8-py3.10-gcc11
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda12_8-py3_10-gcc11-build
+      - target-determination
+    with:
+      timeout-minutes: 360
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
+    secrets: inherit


  # no-ops builds test USE_PER_OPERATOR_HEADERS=0 where ATen/ops is not generated
-  # linux-jammy-cuda12_8-py3_10-gcc11-no-ops-build:
-  #   name: linux-jammy-cuda12.8-py3.10-gcc11-no-ops
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-cuda12.8-py3.10-gcc11-no-ops
-  #     docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 1 },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-cuda12_8-py3_10-gcc11-no-ops-build:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-no-ops
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-no-ops
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 1 },
+        ]}
+    secrets: inherit

-  # macos-py3-arm64-build:
-  #   if: github.repository_owner == 'pytorch'
-  #   name: macos-py3-arm64
-  #   uses: ./.github/workflows/_mac-build.yml
-  #   with:
-  #     sync-tag: macos-py3-arm64-build
-  #     build-environment: macos-py3-arm64
-  #     runner-type: macos-m1-stable
-  #     build-generates-artifacts: true
-  #     # To match the one pre-installed in the m1 runners
-  #     python-version: 3.12.7
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" },
-  #         { config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" },
-  #         { config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" },
-  #         { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
-  #         { config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-15" },
-  #       ]}
-  #   secrets: inherit
+  macos-py3-arm64-build:
+    if: github.repository_owner == 'pytorch'
+    name: macos-py3-arm64
+    uses: ./.github/workflows/_mac-build.yml
+    with:
+      sync-tag: macos-py3-arm64-build
+      build-environment: macos-py3-arm64
+      runner-type: macos-m1-stable
+      build-generates-artifacts: true
+      # To match the one pre-installed in the m1 runners
+      python-version: 3.12.7
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" },
+          { config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" },
+          { config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" },
+          { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" },
+          { config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-15" },
+        ]}
+    secrets: inherit

-  # macos-py3-arm64-test:
-  #   name: macos-py3-arm64
-  #   uses: ./.github/workflows/_mac-test.yml
-  #   needs:
-  #     - macos-py3-arm64-build
-  #     - target-determination
-  #   with:
-  #     build-environment: macos-py3-arm64
-  #     # Same as the build job
-  #     python-version: 3.12.7
-  #     test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }}
-  #     disable-monitor: false
-  #   secrets: inherit
+  macos-py3-arm64-test:
+    name: macos-py3-arm64
+    uses: ./.github/workflows/_mac-test.yml
+    needs:
+      - macos-py3-arm64-build
+      - target-determination
+    with:
+      build-environment: macos-py3-arm64
+      # Same as the build job
+      python-version: 3.12.7
+      test-matrix: ${{ needs.macos-py3-arm64-build.outputs.test-matrix }}
+      disable-monitor: false
+    secrets: inherit

-  # win-vs2022-cpu-py3-build:
-  #   name: win-vs2022-cpu-py3
-  #   uses: ./.github/workflows/_win-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     build-environment: win-vs2022-cpu-py3
-  #     cuda-version: cpu
-  #     runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
-  #         { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
-  #         { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
-  #         { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
-  #       ]}
-  #   secrets: inherit
+  win-vs2022-cpu-py3-build:
+    name: win-vs2022-cpu-py3
+    uses: ./.github/workflows/_win-build.yml
+    needs: get-label-type
+    with:
+      build-environment: win-vs2022-cpu-py3
+      cuda-version: cpu
+      runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
+          { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
+          { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
+          { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" },
+        ]}
+    secrets: inherit

-  # win-vs2022-cpu-py3-test:
-  #   name: win-vs2022-cpu-py3
-  #   uses: ./.github/workflows/_win-test.yml
-  #   needs:
-  #     - win-vs2022-cpu-py3-build
-  #     - target-determination
-  #   with:
-  #     build-environment: win-vs2022-cpu-py3
-  #     cuda-version: cpu
-  #     test-matrix: ${{ needs.win-vs2022-cpu-py3-build.outputs.test-matrix }}
-  #     disable-monitor: false
-  #   secrets: inherit
+  win-vs2022-cpu-py3-test:
+    name: win-vs2022-cpu-py3
+    uses: ./.github/workflows/_win-test.yml
+    needs:
+      - win-vs2022-cpu-py3-build
+      - target-determination
+    with:
+      build-environment: win-vs2022-cpu-py3
+      cuda-version: cpu
+      test-matrix: ${{ needs.win-vs2022-cpu-py3-build.outputs.test-matrix }}
+      disable-monitor: false
+    secrets: inherit

-  # win-vs2022-cuda12_8-py3-build:
-  #   name: win-vs2022-cuda12.8-py3
-  #   uses: ./.github/workflows/_win-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     build-environment: win-vs2022-cuda12.8-py3
-  #     cuda-version: "12.8"
-  #     runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
-  #   secrets: inherit
+  win-vs2022-cuda12_8-py3-build:
+    name: win-vs2022-cuda12.8-py3
+    uses: ./.github/workflows/_win-build.yml
+    needs: get-label-type
+    with:
+      build-environment: win-vs2022-cuda12.8-py3
+      cuda-version: "12.8"
+      runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
+    secrets: inherit

-  # linux-jammy-rocm-py3_10-build:
-  #   if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
-  #   name: linux-jammy-rocm-py3.10
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-rocm-py3.10
-  #     docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
-  #     sync-tag: rocm-build
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
-  #         { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
-  #         { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-rocm-py3_10-build:
+    if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+      sync-tag: rocm-build
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
+          { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" },
+        ]}
+    secrets: inherit

-  # linux-jammy-rocm-py3_10-test:
-  #   if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
-  #   permissions:
-  #     id-token: write
-  #     contents: read
-  #   name: linux-jammy-rocm-py3.10
-  #   uses: ./.github/workflows/_rocm-test.yml
-  #   needs:
-  #     - linux-jammy-rocm-py3_10-build
-  #     - target-determination
-  #   with:
-  #     build-environment: linux-jammy-rocm-py3.10
-  #     docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
-  #     tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
-  #   secrets: inherit
+  linux-jammy-rocm-py3_10-test:
+    if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
+    permissions:
+      id-token: write
+      contents: read
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_rocm-test.yml
+    needs:
+      - linux-jammy-rocm-py3_10-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
+      tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
+    secrets: inherit

-  # inductor-build:
-  #   name: inductor-build
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80
-  #     docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
-  #     cuda-arch-list: '8.0'
-  #   secrets: inherit
+  inductor-build:
+    name: inductor-build
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      cuda-arch-list: '8.0'
+    secrets: inherit

  # Test cross-compiled models with Windows libs extracted from wheel
-  # cross-compile-linux-test:
-  #   name: cross-compile-linux-test
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs:
-  #     - linux-jammy-cuda12_8-py3_10-gcc11-build
-  #     - get-label-type
-  #     - win-vs2022-cuda12_8-py3-build
-  #   with:
-  #     build-environment: linux-jammy-cuda12.8-py3.10-gcc11
-  #     docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }}
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "aoti_cross_compile_for_windows", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", win_torch_wheel_artifact: "win-vs2022-cuda12.8-py3" },
-  #       ]}
-  #   secrets: inherit
+  cross-compile-linux-test:
+    name: cross-compile-linux-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda12_8-py3_10-gcc11-build
+      - get-label-type
+      - win-vs2022-cuda12_8-py3-build
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }}
+      test-matrix: |
+        { include: [
+          { config: "aoti_cross_compile_for_windows", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", win_torch_wheel_artifact: "win-vs2022-cuda12.8-py3" },
+        ]}
+    secrets: inherit

-  # verify-cachebench-cpu-build:
-  #   name: verify-cachebench-cpu-build
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-py3.10-gcc11
-  #     docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #       ]}
-  #   secrets: inherit
+  verify-cachebench-cpu-build:
+    name: verify-cachebench-cpu-build
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-py3.10-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
+      test-matrix: |
+        { include: [
+          { config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+        ]}
+    secrets: inherit

-  # verify-cachebench-cpu-test:
-  #   name: verify-cachebench-cpu-test
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs:
-  #     - verify-cachebench-cpu-build
-  #     - target-determination
-  #   with:
-  #     build-environment: linux-jammy-py3.10-gcc11
-  #     docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
-  #   secrets: inherit
+  verify-cachebench-cpu-test:
+    name: verify-cachebench-cpu-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - verify-cachebench-cpu-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-py3.10-gcc11
+      docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
+      test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
+    secrets: inherit

-  # linux-jammy-py3-clang12-executorch-build:
-  #   name: linux-jammy-py3-clang12-executorch
-  #   uses: ./.github/workflows/_linux-build.yml
-  #   needs: get-label-type
-  #   with:
-  #     runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-  #     build-environment: linux-jammy-py3-clang12-executorch
-  #     docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
-  #     test-matrix: |
-  #       { include: [
-  #         { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
-  #       ]}
-  #   secrets: inherit
+  linux-jammy-py3-clang12-executorch-build:
+    name: linux-jammy-py3-clang12-executorch
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-py3-clang12-executorch
+      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
+      test-matrix: |
+        { include: [
+          { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+        ]}
+    secrets: inherit

-  # linux-jammy-py3-clang12-executorch-test:
-  #   name: linux-jammy-py3-clang12-executorch
-  #   uses: ./.github/workflows/_linux-test.yml
-  #   needs: linux-jammy-py3-clang12-executorch-build
-  #   with:
-  #     build-environment: linux-jammy-py3-clang12-executorch
-  #     docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
-  #   secrets: inherit
+  linux-jammy-py3-clang12-executorch-test:
+    name: linux-jammy-py3-clang12-executorch
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-jammy-py3-clang12-executorch-build
+    with:
+      build-environment: linux-jammy-py3-clang12-executorch
+      docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
+    secrets: inherit

  linux-jammy-py3_10-gcc11-full-debug-build-only:
    name: linux-jammy-py3.10-gcc11-full-debug-build-only
@ -309,7 +308,7 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.c7i.2xlarge
+      runner: linux.2xlarge.memory
      build-environment: linux-jammy-py3.10-gcc11-full-debug-build-only
      docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
    secrets: inherit
--- a/.github/workflows/vllm.yml
+++ b/.github/workflows/vllm.yml
@ -43,8 +43,7 @@ jobs:
      build-environment: linux-jammy-cuda12.8-py3.12-gcc11
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
      cuda-arch-list: '8.0 8.9 9.0'
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.r7i.4xlarge
+      runner: linux.24xlarge.memory
      test-matrix: |
        { include: [
          { config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
@ -66,14 +65,14 @@ jobs:
        ]}
    secrets: inherit

-  # vllm-test-sm89:
-  #     name: ci-vllm-test
-  #     uses: ./.github/workflows/_linux-test.yml
-  #     needs: [
-  #       torch-build,
-  #     ]
-  #     with:
-  #       build-environment: linux-jammy-cuda12.8-py3.12-gcc11
-  #       docker-image: ${{ needs.torch-build.outputs.docker-image }}
-  #       test-matrix: ${{ needs.torch-build.outputs.test-matrix }}
-  #     secrets: inherit
+  vllm-test-sm89:
+      name: ci-vllm-test
+      uses: ./.github/workflows/_linux-test.yml
+      needs: [
+        torch-build,
+      ]
+      with:
+        build-environment: linux-jammy-cuda12.8-py3.12-gcc11
+        docker-image: ${{ needs.torch-build.outputs.docker-image }}
+        test-matrix: ${{ needs.torch-build.outputs.test-matrix }}
+      secrets: inherit
--- a/.github/workflows/xpu.yml
+++ b/.github/workflows/xpu.yml
@ -35,7 +35,7 @@ jobs:
      runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
      build-environment: linux-jammy-xpu-n-1-py3.10
      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-1-py3
-      runner: linux.c7i.2xlarge
+      runner: linux.c7i.12xlarge
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
@ -56,7 +56,7 @@ jobs:
      runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
      build-environment: linux-noble-xpu-n-py3.10
      docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
-      runner: linux.c7i.2xlarge
+      runner: linux.c7i.12xlarge
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" },
@ -74,39 +74,39 @@ jobs:
        ]}
    secrets: inherit

-  # linux-noble-xpu-n-py3_10-test:
-  #   name: linux-noble-xpu-n-py3.10
-  #   uses: ./.github/workflows/_xpu-test.yml
-  #   needs: linux-noble-xpu-n-py3_10-build
-  #   permissions:
-  #     id-token: write
-  #     contents: read
-  #   with:
-  #     build-environment: linux-noble-xpu-n-py3.10
-  #     docker-image: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.docker-image }}
-  #     test-matrix: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.test-matrix }}
-  #   secrets: inherit
+  linux-noble-xpu-n-py3_10-test:
+    name: linux-noble-xpu-n-py3.10
+    uses: ./.github/workflows/_xpu-test.yml
+    needs: linux-noble-xpu-n-py3_10-build
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      build-environment: linux-noble-xpu-n-py3.10
+      docker-image: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.test-matrix }}
+    secrets: inherit

-  # windows-xpu-n-1-build:
-  #   if: github.repository_owner == 'pytorch'
-  #   name: win-vs2022-xpu-n-1-py3
-  #   uses: ./.github/workflows/_win-build.yml
-  #   with:
-  #     build-environment: win-vs2022-xpu-n-1-py3
-  #     cuda-version: cpu
-  #     use-xpu: true
-  #     xpu-version: '2025.1'
-  #     vc-year: '2022'
-  #   secrets: inherit
+  windows-xpu-n-1-build:
+    if: github.repository_owner == 'pytorch'
+    name: win-vs2022-xpu-n-1-py3
+    uses: ./.github/workflows/_win-build.yml
+    with:
+      build-environment: win-vs2022-xpu-n-1-py3
+      cuda-version: cpu
+      use-xpu: true
+      xpu-version: '2025.1'
+      vc-year: '2022'
+    secrets: inherit

-  # windows-xpu-n-build:
-  #   if: github.repository_owner == 'pytorch'
-  #   name: win-vs2022-xpu-n-py3
-  #   uses: ./.github/workflows/_win-build.yml
-  #   with:
-  #     build-environment: win-vs2022-xpu-n-py3
-  #     cuda-version: cpu
-  #     use-xpu: true
-  #     xpu-version: '2025.2'
-  #     vc-year: '2022'
-  #   secrets: inherit
+  windows-xpu-n-build:
+    if: github.repository_owner == 'pytorch'
+    name: win-vs2022-xpu-n-py3
+    uses: ./.github/workflows/_win-build.yml
+    with:
+      build-environment: win-vs2022-xpu-n-py3
+      cuda-version: cpu
+      use-xpu: true
+      xpu-version: '2025.2'
+      vc-year: '2022'
+    secrets: inherit
--- a/aten/src/ATen/LegacyBatchedTensorImpl.h
+++ b/aten/src/ATen/LegacyBatchedTensorImpl.h
@ -144,7 +144,7 @@ inline std::bitset<kVmapNumLevels> createVmapLevelsBitset(BatchDimsRef bdims) {
 }

 inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) {
-  out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ')';
+  out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ")";
  return out;
 }

--- a/aten/src/ATen/TensorIndexing.cpp
+++ b/aten/src/ATen/TensorIndexing.cpp
@ -9,7 +9,7 @@ namespace indexing {
 const EllipsisIndexType Ellipsis = EllipsisIndexType();

 std::ostream& operator<<(std::ostream& stream, const Slice& slice) {
-  stream << slice.start() << ':' << slice.stop() << ':' << slice.step();
+  stream << slice.start() << ":" << slice.stop() << ":" << slice.step();
  return stream;
 }

@ -31,12 +31,12 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index)
 }

 std::ostream& operator<<(std::ostream& stream, const std::vector<TensorIndex>& tensor_indices) {
-  stream << '(';
+  stream << "(";
  for (const auto i : c10::irange(tensor_indices.size())) {
    stream << tensor_indices[i];
    if (i < tensor_indices.size() - 1) stream << ", ";
  }
-  stream << ')';
+  stream << ")";
  return stream;
 }

--- a/aten/src/ATen/TensorNames.cpp
+++ b/aten/src/ATen/TensorNames.cpp
@ -113,7 +113,7 @@ void TensorNames::checkUnique(const char* op_name) const {
 std::ostream& operator<<(std::ostream& out, const TensorName& tensorname) {
  out << tensorname.name_ << " (index ";
  out << tensorname.origin_idx_ << " of ";
-  out << tensorname.origin_ << ')';
+  out << tensorname.origin_ << ")";
  return out;
 }

--- a/aten/src/ATen/TensorUtils.cpp
+++ b/aten/src/ATen/TensorUtils.cpp
@ -13,9 +13,9 @@ std::ostream& operator<<(std::ostream & out, const TensorGeometryArg& t) {
  if (t.pos == 0) {
    // 0 is distinguished; it usually indicates 'self' or the return
    // tensor
-    out << '\'' << t.name << '\'';
+    out << "'" << t.name << "'";
  } else {
-    out << "argument #" << t.pos << " '" << t.name << '\'';
+    out << "argument #" << t.pos << " '" << t.name << "'";
  }
  return out;
 }
@ -154,7 +154,7 @@ void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) {
      oss << "Tensor for " << t2 << " is on CPU, ";
    }
    oss << "but expected " << ((!t1->is_cpu() && !t2->is_cpu()) ? "them" : "it")
-        << " to be on GPU (while checking arguments for " << c << ')';
+        << " to be on GPU (while checking arguments for " << c << ")";
    TORCH_CHECK(false, oss.str());
  }
  TORCH_CHECK(
@ -199,7 +199,7 @@ void checkScalarTypes(CheckedFrom c, const TensorArg& t,
        i++;
      }
      oss << "; but got " << t->toString()
-          << " instead (while checking arguments for " << c << ')';
+          << " instead (while checking arguments for " << c << ")";
      TORCH_CHECK(false, oss.str());
    }
 }
--- a/aten/src/ATen/Version.cpp
+++ b/aten/src/ATen/Version.cpp
@ -43,8 +43,8 @@ std::string get_mkldnn_version() {
    // https://github.com/intel/ideep/issues/29
    {
      const dnnl_version_t* ver = dnnl_version();
-      ss << "Intel(R) MKL-DNN v" << ver->major << '.' << ver->minor << '.' << ver->patch
-         << " (Git Hash " << ver->hash << ')';
+      ss << "Intel(R) MKL-DNN v" << ver->major << "." << ver->minor << "." << ver->patch
+         << " (Git Hash " << ver->hash << ")";
    }
  #else
    ss << "MKLDNN not found";
@ -81,7 +81,7 @@ std::string get_openmp_version() {
          break;
      }
      if (ver_str) {
-        ss << " (a.k.a. OpenMP " << ver_str << ')';
+        ss << " (a.k.a. OpenMP " << ver_str << ")";
      }
    }
  #else
@ -135,38 +135,38 @@ std::string show_config() {

 #if defined(__GNUC__)
  {
-    ss << "  - GCC " << __GNUC__ << '.' << __GNUC_MINOR__ << '\n';
+    ss << "  - GCC " << __GNUC__ << "." << __GNUC_MINOR__ << "\n";
  }
 #endif

 #if defined(__cplusplus)
  {
-    ss << "  - C++ Version: " << __cplusplus << '\n';
+    ss << "  - C++ Version: " << __cplusplus << "\n";
  }
 #endif

 #if defined(__clang_major__)
  {
-    ss << "  - clang " << __clang_major__ << '.' << __clang_minor__ << '.' << __clang_patchlevel__ << '\n';
+    ss << "  - clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__ << "\n";
  }
 #endif

 #if defined(_MSC_VER)
  {
-    ss << "  - MSVC " << _MSC_FULL_VER << '\n';
+    ss << "  - MSVC " << _MSC_FULL_VER << "\n";
  }
 #endif

 #if AT_MKL_ENABLED()
-  ss << "  - " << get_mkl_version() << '\n';
+  ss << "  - " << get_mkl_version() << "\n";
 #endif

 #if AT_MKLDNN_ENABLED()
-  ss << "  - " << get_mkldnn_version() << '\n';
+  ss << "  - " << get_mkldnn_version() << "\n";
 #endif

 #ifdef _OPENMP
-  ss << "  - " << get_openmp_version() << '\n';
+  ss << "  - " << get_openmp_version() << "\n";
 #endif

 #if AT_BUILD_WITH_LAPACK()
@ -183,7 +183,7 @@ std::string show_config() {
  ss << "  - Cross compiling on MacOSX\n";
 #endif

-  ss << "  - "<< used_cpu_capability() << '\n';
+  ss << "  - "<< used_cpu_capability() << "\n";

  if (hasCUDA()) {
    ss << detail::getCUDAHooks().showConfig();
@ -200,10 +200,10 @@ std::string show_config() {
  ss << "  - Build settings: ";
  for (const auto& pair : caffe2::GetBuildOptions()) {
    if (!pair.second.empty()) {
-      ss << pair.first << '=' << pair.second << ", ";
+      ss << pair.first << "=" << pair.second << ", ";
    }
  }
-  ss << '\n';
+  ss << "\n";

  // TODO: do HIP
  // TODO: do XLA
--- a/aten/src/ATen/code_template.h
+++ b/aten/src/ATen/code_template.h
@ -209,7 +209,7 @@ struct CodeTemplate {
  // to indent correctly in the context.
  void emitIndent(std::ostream& out, size_t indent) const {
    for ([[maybe_unused]] const auto i : c10::irange(indent)) {
-      out << ' ';
+      out << " ";
    }
  }
  void emitStringWithIndents(
--- a/aten/src/ATen/core/Dimname.cpp
+++ b/aten/src/ATen/core/Dimname.cpp
@ -10,7 +10,7 @@ std::ostream& operator<<(std::ostream& out, const Dimname& dimname) {
  if (dimname.type() == NameType::WILDCARD) {
    out << "None";
  } else {
-    out << '\'' << dimname.symbol().toUnqualString() << '\'';
+    out << "'" << dimname.symbol().toUnqualString() << "'";
  }
  return out;
 }
--- a/aten/src/ATen/core/Range.cpp
+++ b/aten/src/ATen/core/Range.cpp
@ -5,7 +5,7 @@
 namespace at {

 std::ostream& operator<<(std::ostream& out, const Range& range) {
-  out << "Range[" << range.begin << ", " << range.end << ']';
+  out << "Range[" << range.begin << ", " << range.end << "]";
  return out;
 }

--- a/aten/src/ATen/core/Tensor.cpp
+++ b/aten/src/ATen/core/Tensor.cpp
@ -71,7 +71,7 @@ void TensorBase::enforce_invariants() {

 void TensorBase::print() const {
  if (defined()) {
-    std::cerr << '[' << toString() << ' ' << sizes() << ']' << '\n';
+    std::cerr << "[" << toString() << " " << sizes() << "]" << '\n';
  } else {
    std::cerr << "[UndefinedTensor]" << '\n';
  }
--- a/aten/src/ATen/core/Vitals.cpp
+++ b/aten/src/ATen/core/Vitals.cpp
@ -9,8 +9,8 @@ APIVitals VitalsAPI;

 std::ostream& operator<<(std::ostream& os, TorchVital const& tv) {
  for (const auto& m : tv.attrs) {
-    os << "[TORCH_VITAL] " << tv.name << '.' << m.first << "\t\t "
-       << m.second.value << '\n';
+    os << "[TORCH_VITAL] " << tv.name << "." << m.first << "\t\t "
+       << m.second.value << "\n";
  }
  return os;
 }
--- a/aten/src/ATen/core/alias_info.h
+++ b/aten/src/ATen/core/alias_info.h
@ -100,18 +100,18 @@ inline bool operator==(const AliasInfo& lhs, const AliasInfo& rhs) {

 // this does match the way things are represented in the schema
 inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
-  out << '(';
+  out << "(";
  bool first = true;
  for (const auto& set : aliasInfo.beforeSets()) {
    if (first) {
      first = false;
    } else {
-      out << '|';
+      out << "|";
    }
    out << set.toUnqualString();
  }
  if (aliasInfo.isWrite()) {
-    out << '!';
+    out << "!";
  }
  if (aliasInfo.beforeSets() != aliasInfo.afterSets()) {
    out << " -> ";
@ -120,12 +120,12 @@ inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
      if (first) {
        first = false;
      } else {
-        out << '|';
+        out << "|";
      }
      out << set.toUnqualString();
    }
  }
-  out << ')';
+  out << ")";
  return out;
 }
 } // namespace c10
--- a/aten/src/ATen/core/blob.h
+++ b/aten/src/ATen/core/blob.h
@ -198,7 +198,7 @@ inline void swap(Blob& lhs, Blob& rhs)  noexcept {
 }

 inline std::ostream& operator<<(std::ostream& out, const Blob& v) {
-  return out << "Blob[" << v.TypeName() << ']';
+  return out << "Blob[" << v.TypeName() << "]";
 }

 } // namespace caffe2
--- a/aten/src/ATen/core/class_type.cpp
+++ b/aten/src/ATen/core/class_type.cpp
@ -456,8 +456,8 @@ bool ClassType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const {
          *why_not << "Method on class '" << repr_str()
                   << "' (1) is not compatible with interface '"
                   << rhs.repr_str() << "' (2)\n"
-                   << "  (1) " << self_method->getSchema() << '\n'
-                   << "  (2) " << schema << '\n';
+                   << "  (1) " << self_method->getSchema() << "\n"
+                   << "  (2) " << schema << "\n";
        }
        return false;
      }
--- a/aten/src/ATen/core/class_type.h
+++ b/aten/src/ATen/core/class_type.h
@ -100,7 +100,7 @@ struct TORCH_API ClassType : public NamedType {
  std::string repr_str() const override {
    std::stringstream ss;
    ss << str()
-       << " (of Python compilation unit at: " << compilation_unit().get() << ')';
+       << " (of Python compilation unit at: " << compilation_unit().get() << ")";
    return ss.str();
  }

--- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp
+++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp
@ -58,12 +58,12 @@ std::string DispatchKeyExtractor::dumpState() const {
  std::ostringstream oss;
  for (const auto i : c10::irange(c10::utils::bitset::NUM_BITS())) {
    if (dispatch_arg_indices_reverse_.get(i)) {
-      oss << '1';
+      oss << "1";
    } else {
-      oss << '0';
+      oss << "0";
    }
  }
-  oss << ' ' << nonFallthroughKeys_ << '\n';
+  oss << " " << nonFallthroughKeys_ << "\n";
  return oss.str();
 }

--- a/aten/src/ATen/core/dispatch/Dispatcher.cpp
+++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp
@ -69,8 +69,8 @@ private:

 void _print_dispatch_trace(const std::string& label, const std::string& op_name, const DispatchKeySet& dispatchKeySet) {
  auto nesting_value = dispatch_trace_nesting_value();
-  for (int64_t i = 0; i < nesting_value; ++i) std::cerr << ' ';
-  std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << ']' << std::endl;
+  for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
+  std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
 }
 } // namespace detail

--- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp
+++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
@ -570,7 +570,7 @@ void OperatorEntry::checkInvariants() const {

 std::string OperatorEntry::listAllDispatchKeys() const {
  std::ostringstream str;
-  str << '[';
+  str << "[";

  bool has_kernels = false;
  for (auto k : allDispatchKeysInFullSet()) {
@ -584,7 +584,7 @@ std::string OperatorEntry::listAllDispatchKeys() const {
    str << k;
    has_kernels = true;
  }
-  str << ']';
+  str << "]";
  return str.str();
 }

@ -683,12 +683,12 @@ void OperatorEntry::setReportErrorCallback_(std::unique_ptr<c10::SafePyObject> c
 // This WON'T report backend fallbacks.
 std::string OperatorEntry::dumpState() const {
  std::ostringstream oss;
-  oss << "name: " << name_ << '\n';
+  oss << "name: " << name_ << "\n";
  if (schema_) {
-    oss << "schema: " << schema_->schema << '\n';
-    oss << "debug: " << schema_->debug << '\n';
+    oss << "schema: " << schema_->schema << "\n";
+    oss << "debug: " << schema_->debug << "\n";
    oss << "alias analysis kind: " << toString(schema_->schema.aliasAnalysis())
-        << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << '\n';
+        << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << "\n";
  } else {
    oss << "schema: (none)\n";
  }
--- a/aten/src/ATen/core/function_schema.cpp
+++ b/aten/src/ATen/core/function_schema.cpp
@ -7,7 +7,7 @@
 namespace c10 {

 void FunctionSchema::dump() const {
-  std::cout << *this << '\n';
+  std::cout << *this << "\n";
 }

 const std::vector<Argument>& FunctionSchema::getCorrectList(SchemaArgType type) const {
@ -210,9 +210,9 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {

  out << schema.name();
  if (!schema.overload_name().empty()) {
-    out << '.' << schema.overload_name();
+    out << "." << schema.overload_name();
  }
-  out << '(';
+  out << "(";

  bool seen_kwarg_only = false;
  for (const auto i : c10::irange(schema.arguments().size())) {
@ -273,7 +273,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
  }

  if (need_paren) {
-    out << '(';
+    out << "(";
  }
  for (const auto i : c10::irange(returns.size())) {
    if (i > 0) {
@ -288,7 +288,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
    out << "...";
  }
  if (need_paren) {
-    out << ')';
+    out << ")";
  }
  return out;
 }
@ -471,7 +471,7 @@ bool FunctionSchema::isForwardCompatibleWith(
    if (!arguments().at(i).isForwardCompatibleWith(old.arguments().at(i))) {
      if (why_not) {
        why_not
-            << '\'' << arguments().at(i).name() << '\''
+            << "'" << arguments().at(i).name() << "'"
            << " is not forward compatible with the older version of the schema";
      }
      return false;
@ -511,7 +511,7 @@ bool FunctionSchema::isForwardCompatibleWith(
             .isForwardCompatibleWith(old.arguments().at(i))) {
      if (why_not) {
        why_not << "Out argument '"
-                << '\'' << arguments().at(i).name()
+                << "'" << arguments().at(i).name()
                << " is not FC with the older version of the schema";
      }
      return false;
--- a/aten/src/ATen/core/function_schema.h
+++ b/aten/src/ATen/core/function_schema.h
@ -571,7 +571,7 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
    if (arg.N()) {
        N = std::to_string(*arg.N());
    }
-    out << '[' << N << ']';
+    out << "[" << N << "]";
  } else {
    out << unopt_type->str();
  }
@ -582,15 +582,15 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
  }

  if (is_opt) {
-    out << '?';
+    out << "?";
  }

  if (!arg.name().empty()) {
-    out << ' ' << arg.name();
+    out << " " << arg.name();
  }

  if (arg.default_value()) {
-    out << '=';
+    out << "=";
    if ((type->kind() == c10::TypeKind::StringType ||
        unopt_type->kind() == c10::TypeKind::StringType) &&
        arg.default_value().value().isString()) {
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@ -66,7 +66,7 @@ bool operator==(const ivalue::Tuple& lhs, const ivalue::Tuple& rhs) {
 }

 std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
-  out << v.qualifiedClassName() << '.' << v.name();
+  out << v.qualifiedClassName() << "." << v.name();
  return out;
 }

@ -526,7 +526,7 @@ std::ostream& printMaybeAnnotatedList(
      !elementTypeCanBeInferredFromMembers(list_elem_type)) {
    out << "annotate(" << the_list.type<c10::Type>()->annotation_str() << ", ";
    printList(out, the_list.toListRef(), "[", "]", formatter);
-    out << ')';
+    out << ")";
    return out;
  } else {
    return printList(out, the_list.toListRef(), "[", "]", formatter);
@ -538,7 +538,7 @@ std::ostream& printDict(
    std::ostream& out,
    const Dict& v,
    const IValueFormatter& formatter) {
-  out << '{';
+  out << "{";

  bool first = true;
  for (const auto& pair : v) {
@ -552,7 +552,7 @@ std::ostream& printDict(
    first = false;
  }

-  out << '}';
+  out << "}";
  return out;
 }
 }
@ -565,8 +565,8 @@ static std::ostream& printMaybeAnnotatedDict(
  auto value_type = the_dict.type()->castRaw<DictType>()->getValueType();
  if (the_dict.toGenericDict().empty() ||
      !elementTypeCanBeInferredFromMembers(value_type)) {
-    out << "annotate(" << the_dict.type<c10::Type>()->annotation_str() << ',';
-    printDict(out, the_dict.toGenericDict(), formatter) << ')';
+    out << "annotate(" << the_dict.type<c10::Type>()->annotation_str() << ",";
+    printDict(out, the_dict.toGenericDict(), formatter) << ")";
  } else {
    return printDict(out, the_dict.toGenericDict(), formatter);
  }
@ -577,7 +577,7 @@ static std::ostream& printComplex(std::ostream & out, const IValue & v) {
  c10::complex<double> d = v.toComplexDouble();
  IValue real(d.real()), imag(std::abs(d.imag()));
  auto sign = d.imag() >= 0 ? '+' : '-';
-  return out << real << sign << imag << 'j';
+  return out << real << sign << imag << "j";
 }

 std::ostream& IValue::repr(
@ -605,9 +605,9 @@ std::ostream& IValue::repr(
        if (static_cast<double>(i) == d) {
          // -0.0 (signed zero) needs to be parsed as -0.
          if (i == 0 && std::signbit(d)) {
-            return out << '-' << i << '.';
+            return out << "-" << i << ".";
          }
-          return out << i << '.';
+          return out << i << ".";
        }
      }
      auto orig_prec = out.precision();
@ -643,20 +643,20 @@ std::ostream& IValue::repr(
      device_stream << v.toDevice();
      out << "torch.device(";
      c10::printQuotedString(out, device_stream.str());
-      return out << ')';
+      return out << ")";
    }
    case IValue::Tag::Generator: {
      auto generator = v.toGenerator();
      out << "torch.Generator(device=";
      c10::printQuotedString(out, generator.device().str());
-      out << ", seed=" << generator.current_seed() << ')';
+      out << ", seed=" << generator.current_seed() << ")";
      return out;
    }
    case IValue::Tag::GenericDict:
      return printMaybeAnnotatedDict(out, v, formatter);
    case IValue::Tag::Enum: {
      auto enum_holder = v.toEnumHolder();
-      return out << enum_holder->qualifiedClassName() << '.' <<
+      return out << enum_holder->qualifiedClassName() << "." <<
          enum_holder->name();
    }
    case IValue::Tag::Object: {
@ -801,7 +801,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
      if (c == FP_NORMAL || c == FP_ZERO) {
        int64_t i = static_cast<int64_t>(d);
        if (static_cast<double>(i) == d) {
-          return out << i << '.';
+          return out << i << ".";
        }
      }
      auto orig_prec = out.precision();
@ -852,7 +852,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
      return printDict(out, v.toGenericDict(), formatter);
    case IValue::Tag::PyObject: {
      auto py_obj = v.toPyObject();
-      return out << "<PyObject at" << py_obj << '>';
+      return out << "<PyObject at" << py_obj << ">";
    }
    case IValue::Tag::Generator:
      return out << "Generator";
@ -862,22 +862,22 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
      // TODO we should attempt to call __str__ if the object defines it.
      auto obj = v.toObject();
      // print this out the way python would do it
-      return out << '<' << obj->name() << " object at " << obj.get() << '>';
+      return out << "<" << obj->name() << " object at " << obj.get() << ">";
    }
    case IValue::Tag::Enum: {
      auto enum_holder = v.toEnumHolder();
-      return out << "Enum<" << enum_holder->unqualifiedClassName() << '.' <<
-          enum_holder->name() << '>';
+      return out << "Enum<" << enum_holder->unqualifiedClassName() << "." <<
+          enum_holder->name() << ">";
    }

  }
-  return out << "<Invalid IValue tag=" << std::to_string(static_cast<uint32_t>(v.tag)) << '>';
+  return out << "<Invalid IValue tag=" << std::to_string(static_cast<uint32_t>(v.tag)) << ">";
 }

 #undef TORCH_FORALL_TAGS

 void IValue::dump() const {
-  std::cout << *this << '\n';
+  std::cout << *this << "\n";
 }

 std::shared_ptr<ClassType> ivalue::Object::type() const {
@ -1050,7 +1050,7 @@ c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy(
      std::stringstream err;
      err << "Cannot serialize custom bound C++ class";
      if (auto qualname = type()->name()) {
-        err << ' ' << qualname->qualifiedName();
+        err << " " << qualname->qualifiedName();
      }
      err << ". Please define serialization methods via def_pickle() for "
            "this class.";
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@ -211,7 +211,7 @@ struct TORCH_API OptionalType : public UnionType {

  std::string str() const override {
    std::stringstream ss;
-    ss << getElementType()->str() << '?';
+    ss << getElementType()->str() << "?";
    return ss.str();
  }

@ -240,7 +240,7 @@ struct TORCH_API OptionalType : public UnionType {

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "Optional[" << getElementType()->annotation_str(printer) << ']';
+    ss << "Optional[" << getElementType()->annotation_str(printer) << "]";
    return ss.str();
  }
 };
@ -906,7 +906,7 @@ struct TORCH_API ListType

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "List[" << getElementType()->annotation_str(printer) << ']';
+    ss << "List[" << getElementType()->annotation_str(printer) << "]";
    return ss.str();
  }
 };
@ -946,7 +946,7 @@ struct TORCH_API DictType : public SharedType {
  std::string str() const override {
    std::stringstream ss;
    ss << "Dict(" << getKeyType()->str() << ", " << getValueType()->str()
-       << ')';
+       << ")";
    return ss.str();
  }

@ -1018,7 +1018,7 @@ struct TORCH_API FutureType

  std::string str() const override {
    std::stringstream ss;
-    ss << "Future(" << getElementType()->str() << ')';
+    ss << "Future(" << getElementType()->str() << ")";
    return ss.str();
  }
  TypePtr createWithContained(
@ -1041,7 +1041,7 @@ struct TORCH_API FutureType

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "Future[" << getElementType()->annotation_str(printer) << ']';
+    ss << "Future[" << getElementType()->annotation_str(printer) << "]";
    return ss.str();
  }
 };
@ -1060,7 +1060,7 @@ struct TORCH_API AwaitType

  std::string str() const override {
    std::stringstream ss;
-    ss << "Await(" << getElementType()->str() << ')';
+    ss << "Await(" << getElementType()->str() << ")";
    return ss.str();
  }
  TypePtr createWithContained(
@ -1083,7 +1083,7 @@ struct TORCH_API AwaitType

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "Await[" << getElementType()->annotation_str(printer) << ']';
+    ss << "Await[" << getElementType()->annotation_str(printer) << "]";
    return ss.str();
  }
 };
@ -1102,7 +1102,7 @@ struct TORCH_API RRefType

  std::string str() const override {
    std::stringstream ss;
-    ss << "RRef(" << getElementType()->str() << ')';
+    ss << "RRef(" << getElementType()->str() << ")";
    return ss.str();
  }
  TypePtr createWithContained(
@ -1115,7 +1115,7 @@ struct TORCH_API RRefType

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "RRef[" << getElementType()->annotation_str(printer) << ']';
+    ss << "RRef[" << getElementType()->annotation_str(printer) << "]";
    return ss.str();
  }
 };
--- a/aten/src/ATen/core/operator_name.cpp
+++ b/aten/src/ATen/core/operator_name.cpp
@ -11,7 +11,7 @@ std::string toString(const OperatorName& opName) {
 std::ostream& operator<<(std::ostream& os, const OperatorName& opName) {
  os << opName.name;
  if (!opName.overload_name.empty()) {
-    os << '.' << opName.overload_name;
+    os << "." << opName.overload_name;
  }
  return os;
 }
--- a/aten/src/ATen/core/tensor_type.cpp
+++ b/aten/src/ATen/core/tensor_type.cpp
@ -65,7 +65,7 @@ VaryingShape<T> VaryingShape<T>::merge(const VaryingShape<T>& other) const {

 template <typename T>
 std::ostream& operator<<(std::ostream& out, const VaryingShape<T>& vs) {
-  out << '(';
+  out << "(";
  if (!vs.size()) {
    out << "*)";
    return out;
@ -79,10 +79,10 @@ std::ostream& operator<<(std::ostream& out, const VaryingShape<T>& vs) {
    if (v.has_value()) {
      out << v.value();
    } else {
-      out << '*';
+      out << "*";
    }
  }
-  out << ')';
+  out << ")";
  return out;
 }

@ -105,7 +105,7 @@ std::ostream& operator<<(
  }
  auto sizes_opt = ss.sizes();

-  os << '(';
+  os << "(";
  for (size_t i = 0; i < rank_opt.value(); i++) {
    if (i > 0) {
      os << ", ";
@ -113,10 +113,10 @@ std::ostream& operator<<(
    if(sizes_opt.has_value() && sizes_opt.value()[i].is_static()) {
      os << sizes_opt.value()[i];
    } else {
-      os << '*';
+      os << "*";
    }
  }
-  os << ')';
+  os << ")";

  return os;
 }
@ -131,17 +131,17 @@ std::ostream& operator<<(std::ostream& os, const ShapeSymbol& s) {
 }

 std::ostream& operator<<(std::ostream& os, const Stride& s) {
-  os << '{';
+  os << "{";
  if (s.stride_index_.has_value()) {
    os << *s.stride_index_;
  } else {
-    os << '*';
+    os << "*";
  }
-  os << ':';
+  os << ":";
  if (s.stride_.has_value()) {
    os << *s.stride_;
  } else {
-    os << '*';
+    os << "*";
  }
  os << '}';
  return os;
--- a/aten/src/ATen/core/type.cpp
+++ b/aten/src/ATen/core/type.cpp
@ -67,7 +67,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
      bool has_valid_strides_info = ndim > 0 &&
          value->strides().isComplete() && value->strides().size() == ndim;

-      out << '(';
+      out << "(";
      size_t i = 0;
      bool symbolic = type_verbosity() == TypeVerbosity::Symbolic;
      for (i = 0; i < *ndim; ++i) {
@ -79,7 +79,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
        } else if (symbolic) {
          out << value->symbolic_sizes().at(i);
        } else {
-          out << '*';
+          out << "*";
        }
      }
      if (has_valid_strides_info &&
@ -91,7 +91,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
          }
          out << value->strides()[i].value();
        }
-        out << ']';
+        out << "]";
      }
      if (type_verbosity() >= TypeVerbosity::Full) {
        if (value->requiresGrad()) {
@ -107,12 +107,12 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
          out << "device=" << *value->device();
        }
      }
-      out << ')';
+      out << ")";
    } else {
      if (type_verbosity() >= TypeVerbosity::Full) {
        size_t i = 0;
        if (value->requiresGrad()) {
-          out << '('
+          out << "("
              << "requires_grad=" << *value->requiresGrad();
          i++;
        }
@ -120,7 +120,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
          out << ((i++ > 0) ? ", " : "(") << "device=" << *value->device();
        }
        if (i > 0) {
-          out << ')';
+          out << ")";
        }
      }
    }
@ -133,18 +133,18 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
    out << *prim << "[]";
  } else if (t.kind() == TypeKind::OptionalType) {
    auto prim = t.castRaw<OptionalType>()->getElementType();
-    out << *prim << '?';
+    out << *prim << "?";
  } else if(t.kind() == TypeKind::FutureType) {
    auto elem = t.castRaw<FutureType>()->getElementType();
-    out << "Future[" << *elem << ']';
+    out << "Future[" << *elem << "]";
  } else if(t.kind() == TypeKind::RRefType) {
    auto elem = t.castRaw<RRefType>()->getElementType();
-    out << "RRef[" << *elem << ']';
+    out << "RRef[" << *elem << "]";
  } else if(auto tup = t.cast<TupleType>()) {
    if (tup->schema()) {
      out << "NamedTuple";
    }
-    out << '(';
+    out << "(";
    for(size_t i = 0; i < tup->elements().size(); ++i) {
      if(i > 0)
        out << ", ";
@ -160,7 +160,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
        out << *(tup->elements()[i]);
      }
    }
-    out << ')';
+    out << ")";
  } else if (t.kind() == TypeKind::FunctionType) {
    out << "Function";
  } else {
@ -475,7 +475,7 @@ std::optional<TypePtr> unifyTypeList(
      why_not << "Could not unify type list since element " << i << " of type "
              << elements.at(i)->repr_str()
              << " did not match the types before it ("
-              << ret_type->repr_str() << ')';
+              << ret_type->repr_str() << ")";
      return std::nullopt;
    }
    ret_type = *maybe_unified;
@ -907,13 +907,13 @@ std::string TupleType::str() const {
    // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
    ss << name()->qualifiedName();
  } else {
-    ss << '(';
+    ss << "(";
    for(size_t i = 0; i < elements().size(); ++i) {
      if(i > 0)
        ss << ", ";
      ss << elements()[i]->str();
    }
-    ss << ')';
+    ss << ")";
  }
  return ss.str();
 }
@ -1003,8 +1003,8 @@ bool InterfaceType::isSubTypeImpl(
          *why_not << "Method on interface '" << lhs.repr_str()
                   << "' (1) is not compatible with interface '"
                   << rhs.repr_str() << "' (2)\n"
-                   << "  (1) " << *self_schema << '\n'
-                   << "  (2) " << schema << '\n';
+                   << "  (1) " << *self_schema << "\n"
+                   << "  (2) " << schema << "\n";
          return false;
        }
        return false;
@ -1078,7 +1078,7 @@ SymbolicShape SymbolicShape::merge(const SymbolicShape& other) const {
 }

 void SymbolicShape::dump() const {
-  std::cout << *this << '\n';
+  std::cout << *this << "\n";
 }

 bool EnumType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const {
--- a/aten/src/ATen/core/union_type.cpp
+++ b/aten/src/ATen/core/union_type.cpp
@ -205,9 +205,9 @@ UnionType::UnionType(std::vector<TypePtr> reference, TypeKind kind) : SharedType
    for (const auto i : c10::irange(reference.size())) {
      msg << reference[i]->repr_str();
      if (i > 0) {
-        msg << ',';
+        msg << ",";
      }
-      msg << ' ';
+      msg << " ";
    }
    msg << "} has the single type " << types_[0]->repr_str()
         << ". Use the common supertype instead of creating a Union"
--- a/aten/src/ATen/cpu/vec/vec256/vec256.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h
@ -80,7 +80,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
    }
    stream << buf[i];
  }
-  stream << ']';
+  stream << "]";
  return stream;
 }

--- a/aten/src/ATen/cpu/vec/vec512/vec512.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512.h
@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
    }
    stream << buf[i];
  }
-  stream << ']';
+  stream << "]";
  return stream;
 }

--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@ -411,16 +411,16 @@ std::string CUDAHooks::showConfig() const {
    // HIP_VERSION value format was changed after ROCm v4.2 to include the patch number
    if(v < 500) {
      // If major=xx, minor=yy then format -> xxyy
-      oss << (v / 100) << '.' << (v % 10);
+      oss << (v / 100) << "." << (v % 10);
    }
    else {
      // If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz
-      oss << (v / 10000000) << '.' << (v / 100000 % 100) << '.' << (v % 100000);
+      oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000);
    }
 #else
-    oss << (v / 1000) << '.' << (v / 10 % 100);
+    oss << (v / 1000) << "." << (v / 10 % 100);
    if (v % 10 != 0) {
-      oss << '.' << (v % 10);
+      oss << "." << (v % 10);
    }
 #endif
  };
@ -431,16 +431,16 @@ std::string CUDAHooks::showConfig() const {
  oss << "  - HIP Runtime ";
 #endif
  printCudaStyleVersion(runtimeVersion);
-  oss << '\n';
+  oss << "\n";

  // TODO: Make HIPIFY understand CUDART_VERSION macro
 #if !defined(USE_ROCM)
  if (runtimeVersion != CUDART_VERSION) {
    oss << "  - Built with CUDA Runtime ";
    printCudaStyleVersion(CUDART_VERSION);
-    oss << '\n';
+    oss << "\n";
  }
-  oss << "  - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << '\n';
+  oss << "  - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << "\n";
 #endif

 #if !defined(USE_ROCM)
@ -448,9 +448,9 @@ std::string CUDAHooks::showConfig() const {


  auto printCudnnStyleVersion = [&](size_t v) {
-    oss << (v / 1000) << '.' << (v / 100 % 10);
+    oss << (v / 1000) << "." << (v / 100 % 10);
    if (v % 100 != 0) {
-      oss << '.' << (v % 100);
+      oss << "." << (v % 100);
    }
  };

@ -461,22 +461,22 @@ std::string CUDAHooks::showConfig() const {
  if (cudnnCudartVersion != CUDART_VERSION) {
    oss << "  (built against CUDA ";
    printCudaStyleVersion(cudnnCudartVersion);
-    oss << ')';
+    oss << ")";
  }
-  oss << '\n';
+  oss << "\n";
  if (cudnnVersion != CUDNN_VERSION) {
    oss << "    - Built with CuDNN ";
    printCudnnStyleVersion(CUDNN_VERSION);
-    oss << '\n';
+    oss << "\n";
  }
 #endif
 #else
  // TODO: Check if miopen has the functions above and unify
-  oss << "  - MIOpen " << MIOPEN_VERSION_MAJOR << '.' << MIOPEN_VERSION_MINOR << '.' << MIOPEN_VERSION_PATCH << '\n';
+  oss << "  - MIOpen " << MIOPEN_VERSION_MAJOR << "." << MIOPEN_VERSION_MINOR << "." << MIOPEN_VERSION_PATCH << "\n";
 #endif

 #if AT_MAGMA_ENABLED()
-  oss << "  - Magma " << MAGMA_VERSION_MAJOR << '.' << MAGMA_VERSION_MINOR << '.' << MAGMA_VERSION_MICRO << '\n';
+  oss << "  - Magma " << MAGMA_VERSION_MAJOR << "." << MAGMA_VERSION_MINOR << "." << MAGMA_VERSION_MICRO << "\n";
 #endif

  return oss.str();
--- a/aten/src/ATen/cuda/jiterator.cu
+++ b/aten/src/ATen/cuda/jiterator.cu
@ -42,7 +42,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic(

  // The cache key includes all the parameters to generate_code + vec_size + dev_idx
  std::stringstream ss;
-  ss << nInputs << '_' << nOutputs << f;
+  ss << nInputs << "_" << nOutputs << f;
  ss << f_inputs_type_str << compute_type_str << result_type_str;
  ss << static_cast<int>(at::cuda::jit::BinaryFuncVariant::NoScalar);
  ss << extra_args_types;
@ -144,7 +144,7 @@ static inline void launch_jitted_unrolled_kernel_dynamic(

  // The cache key includes all the parameters to generate_code + dev_idx
  std::stringstream ss;
-  ss << nInputs << '_' << nOutputs << f;
+  ss << nInputs << "_" << nOutputs << f;
  ss << f_inputs_type_str << compute_type_str << result_type_str;
  ss << contiguous << dynamic_casting;
  ss << static_cast<int>(at::cuda::jit::BinaryFuncVariant::NoScalar);
--- a/aten/src/ATen/cuda/tunable/Tunable.cpp
+++ b/aten/src/ATen/cuda/tunable/Tunable.cpp
@ -52,10 +52,10 @@ TuningContext* getTuningContext() {
 std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry) {
  static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1";
  if (!blaslog) {
-    return stream << entry.key_ << ',' << entry.time_;
+    return stream << entry.key_ << "," << entry.time_;
  }
  else {
-    return stream << entry.key_ << ',' << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_;
+    return stream << entry.key_ << "," << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_;
  }
 }

@ -156,10 +156,10 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std
    if (isNew) {
      static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1";
      if (!blaslog) {
-        untuned_file << op_signature << ',' << params_signature << std::endl;
+        untuned_file << op_signature << "," << params_signature << std::endl;
      }
      else {
-        untuned_file << op_signature << ',' << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl;
+        untuned_file << op_signature << "," << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl;
      }
      TUNABLE_LOG3("Untuned,", op_signature, ",", params_signature);
    }
@ -201,7 +201,7 @@ void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const

  if(!file_exists || file_empty) {
    for(const auto& [key, val] : validators) {
-      (*realtime_out_) << "Validator," << key << ',' << val << std::endl;
+      (*realtime_out_) << "Validator," << key << "," << val << std::endl;
      realtime_out_->flush();
    }
    validators_written_ = true;
@ -219,7 +219,7 @@ void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std
    return;
  }

-  (*realtime_out_) << op_sig << ',' << param_sig << ',' << result << std::endl;
+  (*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl;
  realtime_out_->flush(); //ensure immediate write to disk

  TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result);
--- a/aten/src/ATen/cudnn/Descriptors.cpp
+++ b/aten/src/ATen/cudnn/Descriptors.cpp
@ -93,31 +93,31 @@ std::string cudnnTypeToString(cudnnDataType_t dtype) {
      return "CUDNN_DATA_UINT8x4";
    default:
      std::ostringstream oss;
-      oss << "(unknown data-type " << static_cast<int>(dtype) << ')';
+      oss << "(unknown data-type " << static_cast<int>(dtype) << ")";
      return oss.str();
  }
 }

 std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
-  out << "TensorDescriptor " << static_cast<void*>(d.desc()) << '\n';
+  out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
  int nbDims = 0;
  int dimA[CUDNN_DIM_MAX];
  int strideA[CUDNN_DIM_MAX];
  cudnnDataType_t dtype{};
  cudnnGetTensorNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &nbDims, dimA, strideA);
-  out << "    type = " << cudnnTypeToString(dtype) << '\n';
-  out << "    nbDims = " << nbDims << '\n';
+  out << "    type = " << cudnnTypeToString(dtype) << "\n";
+  out << "    nbDims = " << nbDims << "\n";
  // Read out only nbDims of the arrays!
  out << "    dimA = ";
  for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << '\n';
+  out << "\n";
  out << "    strideA = ";
  for (auto i : ArrayRef<int>{strideA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << '\n';
+  out << "\n";
  return out;
 }

@ -168,27 +168,27 @@ std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) {
      return "CUDNN_TENSOR_NHWC";
    default:
      std::ostringstream oss;
-      oss << "(unknown cudnn tensor format " << static_cast<int>(tformat) << ')';
+      oss << "(unknown cudnn tensor format " << static_cast<int>(tformat) << ")";
      return oss.str();
  }
 }

 std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d) {
-  out << "FilterDescriptor " << static_cast<void*>(d.desc()) << '\n';
+  out << "FilterDescriptor " << static_cast<void*>(d.desc()) << "\n";
  int nbDims = 0;
  int dimA[CUDNN_DIM_MAX];
  cudnnDataType_t dtype{};
  cudnnTensorFormat_t tformat{};
  cudnnGetFilterNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &tformat, &nbDims, dimA);
-  out << "    type = " << cudnnTypeToString(dtype) << '\n';
-  out << "    tensor_format = " << cudnnMemoryFormatToString(tformat) << '\n';
-  out << "    nbDims = " << nbDims << '\n';
+  out << "    type = " << cudnnTypeToString(dtype) << "\n";
+  out << "    tensor_format = " << cudnnMemoryFormatToString(tformat) << "\n";
+  out << "    nbDims = " << nbDims << "\n";
  // Read out only nbDims of the arrays!
  out << "    dimA = ";
  for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << '\n';
+  out << "\n";
  return out;
 }

--- a/aten/src/ATen/functorch/DynamicLayer.cpp
+++ b/aten/src/ATen/functorch/DynamicLayer.cpp
@ -346,15 +346,15 @@ void foreachTensorInplaceWithFlag(std::vector<IValue>& args, int64_t begin, int6
 }

 std::ostream& operator<< (std::ostream& os, const DynamicLayer& layer) {
-  os << layer.layerId() << ':' << layer.key();
+  os << layer.layerId() << ":" << layer.key();
  return os;
 }
 std::ostream& operator<< (std::ostream& os, const std::vector<DynamicLayer>& dls) {
  os << "DynamicLayerStack[ ";
  for (const auto& layer : dls) {
-    os << layer << ' ';
+    os << layer << " ";
  }
-  os << ']';
+  os << "]";
  return os;
 }

--- a/aten/src/ATen/functorch/TensorWrapper.cpp
+++ b/aten/src/ATen/functorch/TensorWrapper.cpp
@ -22,7 +22,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) {
    if (batched) {
      ss << "Batched[lvl=" << batched->level() << " dim=" << batched->bdim() << ", ";
      dumpTensor(ss, batched->value());
-      ss << ']';
+      ss << "]";
      return;
    }
    ss << "Tensor" << tensor.sizes();
@ -36,7 +36,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) {
    ss << "dead, ";
  }
  dumpTensor(ss, wrapped->value());
-  ss << ']';
+  ss << "]";
 }

 void TensorWrapper::refreshMetadata() {
--- a/aten/src/ATen/miopen/Descriptors.cpp
+++ b/aten/src/ATen/miopen/Descriptors.cpp
@ -73,32 +73,32 @@ std::string miopenTypeToString(miopenDataType_t dtype) {
      return "miopenBFloat16";
    default:
      std::ostringstream oss;
-      oss << "(unknown data-type " << static_cast<int>(dtype) << ')';
+      oss << "(unknown data-type " << static_cast<int>(dtype) << ")";
      return oss.str();
  }
 }

 std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
-  out << "TensorDescriptor " << static_cast<void*>(d.desc()) << '\n';
+  out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
  int nbDims = 0;
  int dimA[MIOPEN_DIM_MAX];
  int strideA[MIOPEN_DIM_MAX];
  miopenDataType_t dtype;
  miopenGetTensorDescriptorSize(d.desc(), &nbDims);
  miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA);
-  out << "    type = " << miopenTypeToString(dtype) << '\n';
-  out << "    nbDims = " << nbDims << '\n';
+  out << "    type = " << miopenTypeToString(dtype) << "\n";
+  out << "    nbDims = " << nbDims << "\n";
  // Read out only nbDims of the arrays!
  out << "    dimA = ";
  for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << '\n';
+  out << "\n";
  out << "    strideA = ";
  for (auto i : ArrayRef<int>{strideA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << '\n';
+  out << "\n";
  return out;
 }

--- a/aten/src/ATen/mps/MPSDevice.h
+++ b/aten/src/ATen/mps/MPSDevice.h
@ -22,6 +22,7 @@ enum class MacOSVersion : uint32_t {
  MACOS_VER_15_0_PLUS,
  MACOS_VER_15_1_PLUS,
  MACOS_VER_15_2_PLUS,
+  MACOS_VER_26_0_PLUS,
 };

 //-----------------------------------------------------------------
--- a/aten/src/ATen/mps/MPSDevice.mm
+++ b/aten/src/ATen/mps/MPSDevice.mm
@ -65,6 +65,7 @@ bool MPSDevice::isMacOS13Plus(MacOSVersion version) const {
  static bool _macos_15_0_plus = is_os_version_at_least(15, 0);
  static bool _macos_15_1_plus = is_os_version_at_least(15, 1);
  static bool _macos_15_2_plus = is_os_version_at_least(15, 2);
+  static bool _macos_26_0_plus = is_os_version_at_least(26, 0);

  switch (version) {
    case MacOSVersion::MACOS_VER_14_4_PLUS:
@ -75,6 +76,8 @@ bool MPSDevice::isMacOS13Plus(MacOSVersion version) const {
      return _macos_15_1_plus;
    case MacOSVersion::MACOS_VER_15_2_PLUS:
      return _macos_15_2_plus;
+    case MacOSVersion::MACOS_VER_26_0_PLUS:
+      return _macos_26_0_plus;
    default:
      return false;
  }
--- a/aten/src/ATen/mps/MPSProfiler.h
+++ b/aten/src/ATen/mps/MPSProfiler.h
@ -91,7 +91,7 @@ struct OperationInfo : BaseInfo {
    std::stringstream kernelStr;
    kernelStr << kernelName;
    for (const Tensor& tensor : tensors) {
-      kernelStr << ':' << BaseInfo::buildTensorString(tensor, includeBufferId);
+      kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId);
    }
    return kernelStr.str();
  }
--- a/aten/src/ATen/mps/MPSProfiler.mm
+++ b/aten/src/ATen/mps/MPSProfiler.mm
@ -39,9 +39,9 @@ std::string BaseInfo::buildTensorString(const Tensor& tensor, bool includeBuffer
    // see comments for INCLUDE_BUFFER_ID
    if (includeBufferId && deviceType == at::kMPS) {
      id<MTLBuffer> buffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
-      tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ':' << buffer.retainCount << ')';
+      tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ":" << buffer.retainCount << ")";
    }
-    tensorStr << ':' << tensor.scalar_type() << tensor.sizes();
+    tensorStr << ":" << tensor.scalar_type() << tensor.sizes();
    return tensorStr.str();
  } else {
    return "undefined";
--- a/aten/src/ATen/native/ConvUtils.h
+++ b/aten/src/ATen/native/ConvUtils.h
@ -167,7 +167,7 @@ static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, co
    std::stringstream ss;
    ss << arg_name << " should be greater than zero but got (";
    std::copy(args.begin(), args.end() - 1, std::ostream_iterator<int>(ss,", "));
-    ss << args.back() <<  ")" << " (while checking arguments for " << c << ')';
+    ss << args.back() <<  ")" << " (while checking arguments for " << c << ")";
    TORCH_CHECK(false, ss.str());
  }
 }
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -639,7 +639,7 @@ static std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params)
      << "  deterministic = " << params.deterministic
      << "  cudnn_enabled = " << params.cudnn_enabled
      << "  allow_tf32 = " << params.allow_tf32
-      << '}';
+      << "}";
  return out;
 }

--- a/aten/src/ATen/native/SpectralOps.cpp
+++ b/aten/src/ATen/native/SpectralOps.cpp
@ -847,7 +847,7 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const std::optional<int64_t
       << ", hop_length=" << hop_length << ", win_length=" << win_length \
       << ", window="; \
    if (window.defined()) { \
-      SS << window.toString() << '{' << window.sizes() << '}'; \
+      SS << window.toString() << "{" << window.sizes() << "}"; \
    } else { \
      SS << "None"; \
    } \
@ -1046,7 +1046,7 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const std::optional<int64_
       << ", hop_length=" << hop_length << ", win_length=" << win_length \
       << ", window="; \
    if (window.defined()) { \
-      SS << window.toString() << '{' << window.sizes() << '}'; \
+      SS << window.toString() << "{" << window.sizes() << "}"; \
    } else { \
      SS << "None"; \
    } \
--- a/aten/src/ATen/native/TensorCompare.cpp
+++ b/aten/src/ATen/native/TensorCompare.cpp
@ -523,7 +523,7 @@ Tensor _functional_assert_async_msg_cpu(
 }

 void _print(std::string_view s) {
-  std::cout << s << '\n';
+  std::cout << s << "\n";
 }

 // Sorting-based algorithm for isin(); used when the number of test elements is
--- a/aten/src/ATen/native/cuda/GroupMM.cu
+++ b/aten/src/ATen/native/cuda/GroupMM.cu
@ -346,9 +346,8 @@ void dispatch_bf16_grouped_kernel_on_tile_size(
  bool small = (M <= 128 || N <= 128);
  cudaDeviceProp* properties = at::cuda::getCurrentDeviceProperties();
  const bool sm10x = properties != nullptr && properties->major == 10;
-  const bool sm11x = properties != nullptr && properties->major == 11;

-  if (sm10x || sm11x) {
+  if (sm10x) {
    if (small){
      bf16bf16_grouped_gemm_impl_sm90_sm100<
        cutlass::arch::Sm100,
--- a/aten/src/ATen/native/cuda/GroupedBlas.cpp
+++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp
@ -607,8 +607,6 @@ _scaled_grouped_mm_cuda_v2(
      // scale shape checks
      _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
      _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
-      // swizze checks
-      TORCH_CHECK_VALUE(swizzle_a_enum.size() == 1 && swizzle_b_enum.size() == 1, "Expected single swizzle argument");
      return _mx8_mx8_bf16_grouped_mm_fbgemm(
          mat_a,
          mat_b,
--- a/aten/src/ATen/native/cuda/KernelUtils.cuh
+++ b/aten/src/ATen/native/cuda/KernelUtils.cuh
@ -5,69 +5,11 @@
 #include <cuda_bf16.h>
 #endif

-// ROCm 6.3 is planned to have these functions, but until then here they are.
 #if defined(USE_ROCM)
 #include <device_functions.h>
 #include <hip/hip_fp16.h>
 #include <hip/hip_bf16.h>
-
-__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
-#if (defined(__gfx942__)) && \
-  __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
-  typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
-  static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));
-  union {
-    __hip_bfloat162_raw bf162_raw;
-    vec_short2 vs2;
-  } u{static_cast<__hip_bfloat162_raw>(value)};
-  u.vs2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, u.vs2);
-  return static_cast<__hip_bfloat162>(u.bf162_raw);
-#else
-  static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw));
-  union u_hold {
-    __hip_bfloat162_raw h2r;
-    unsigned int u32;
-  };
-  u_hold old_val, new_val;
-  old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-  do {
-    new_val.h2r = __hadd2(old_val.h2r, value);
-  } while (!__hip_atomic_compare_exchange_strong(
-        (unsigned int*)address, &old_val.u32, new_val.u32,
-        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
-  return old_val.h2r;
-#endif
-}
-
-__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) {
-#if (defined(__gfx942__)) && \
-  __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16)
-  // The api expects an ext_vector_type of half
-  typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162;
-  static_assert(sizeof(vec_fp162) == sizeof(__half2_raw));
-  union {
-    __half2_raw h2r;
-    vec_fp162 fp16;
-  } u {static_cast<__half2_raw>(value)};
-  u.fp16 = __builtin_amdgcn_flat_atomic_fadd_v2f16((vec_fp162*)address, u.fp16);
-  return static_cast<__half2>(u.h2r);
-#else
-  static_assert(sizeof(__half2_raw) == sizeof(unsigned int));
-  union u_hold {
-    __half2_raw h2r;
-    unsigned int u32;
-  };
-  u_hold old_val, new_val;
-  old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-  do {
-    new_val.h2r = __hadd2(old_val.h2r, value);
-  } while (!__hip_atomic_compare_exchange_strong(
-        (unsigned int*)address, &old_val.u32, new_val.u32,
-        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
-  return old_val.h2r;
-#endif
-}
-#define ATOMICADD preview_unsafeAtomicAdd
+#define ATOMICADD unsafeAtomicAdd
 #define NATIVE_ZERO_BF16 __float2bfloat16(0.0f)
 #else
 #define ATOMICADD atomicAdd
--- a/aten/src/ATen/native/cuda/Reduce.cu
+++ b/aten/src/ATen/native/cuda/Reduce.cu
@ -11,7 +11,7 @@ static inline std::ostream& operator<<(std::ostream& out, dim3 dim) {
  if (dim.y == 1 && dim.z == 1) {
    out << dim.x;
  } else {
-    out << '[' << dim.x << ',' << dim.y << ',' << dim.z << ']';
+    out << "[" << dim.x << "," << dim.y << "," << dim.z << "]";
  }
  return out;
 }
@ -27,7 +27,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
  out << "input_mult=[";
  for (int i = 0; i < 3; i++) {
    if (i != 0) {
-      out << ',';
+      out << ",";
    }
    out << config.input_mult[i];
  }
@ -35,7 +35,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
  out << "output_mult=[";
  for (int i = 0; i < 2; i++) {
    if (i != 0) {
-      out << ',';
+      out << ",";
    }
    out << config.output_mult[i];
  }
@ -49,7 +49,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
  out << "block=" << config.block() << ", ";
  out << "grid=" << config.grid() << ", ";
  out << "global_memory_size=" << config.global_memory_size();
-  out << ')';
+  out << ")";
  return out;
 }

--- a/aten/src/ATen/native/cuda/RowwiseScaledMM.cu
+++ b/aten/src/ATen/native/cuda/RowwiseScaledMM.cu
@ -958,9 +958,8 @@ void dispatch_fp8_rowwise_kernel_on_sm(
  const bool sm89 = properties != nullptr && properties->major == 8 && properties->minor == 9;
  const bool sm9x = properties != nullptr && properties->major == 9;
  const bool sm10x = properties != nullptr && properties->major == 10;
-  const bool sm11x = properties != nullptr && properties->major == 11;
  const bool sm12x = properties != nullptr && properties->major == 12;
-  if (!(sm89 || sm9x || sm10x || sm11x || sm12x)) {
+  if (!(sm89 || sm9x || sm10x || sm12x)) {
    TORCH_CHECK(
        false, "Rowwise scaling is not currently supported on your device");
  }
@ -969,7 +968,7 @@ void dispatch_fp8_rowwise_kernel_on_sm(
    dispatch_fp8_rowwise_kernel_on_cluster_size_and_transpose<
      /*ArchTag=*/cutlass::arch::Sm90,
      Types...>(XQ, WQ, x_scale, w_scale, bias, out);
-  } else if (sm10x || sm11x) {
+  } else if (sm10x) {
    dispatch_fp8_rowwise_kernel_on_cluster_size_and_transpose<
      /*ArchTag=*/cutlass::arch::Sm100,
      Types...>(XQ, WQ, x_scale, w_scale, bias, out);
--- a/aten/src/ATen/native/cuda/ScaledGroupMM.cu
+++ b/aten/src/ATen/native/cuda/ScaledGroupMM.cu
@ -364,9 +364,9 @@ void f8f8bf16_grouped_gemm_impl_sm90(
  //       reinterpret_cast<ProblemShape::UnderlyingProblemShape*>(
  //           stride_output_h + group_count);

-  //   std::cout << "PTRS " << mat_a.data_ptr() << ' ' << mat_b.data_ptr() << "
+  //   std::cout << "PTRS " << mat_a.data_ptr() << " " << mat_b.data_ptr() << "
  //   "
-  //             << out.data_ptr() << ' ' << scale_a.data_ptr() << ' '
+  //             << out.data_ptr() << " " << scale_a.data_ptr() << " "
  //             << scale_b.data_ptr() << "\n";
  //   for (int i = 0; i < group_count; i++) {
  //     std::cout << "A " << (void*)inputA_ptrs_h[i] << "\n";
--- a/aten/src/ATen/native/cuda/jit_utils.cpp
+++ b/aten/src/ATen/native/cuda/jit_utils.cpp
@ -1057,14 +1057,14 @@ std::string generate_code(
    // TODO these arrays are potentially of the different types, use function
    // traits to determine the types
    declare_load_arrays << f_inputs_type << " arg" << std::to_string(i)
-                        << '[' << std::to_string(thread_work_size) << "];\n";
+                        << "[" << std::to_string(thread_work_size) << "];\n";
  }
  env.s("declare_load_arrays", declare_load_arrays.str());

  std::stringstream declare_store_arrays;
  for (int i = 0; i < nOutputs; i++) {
    declare_store_arrays << result_type << " out" << std::to_string(i)
-                        << '[' << std::to_string(thread_work_size) << "];\n";
+                        << "[" << std::to_string(thread_work_size) << "];\n";
  }
  env.s("declare_store_arrays", declare_store_arrays.str());

@ -1217,7 +1217,7 @@ std::string generate_code(
  for (const auto i : c10::irange(nInputs)){
    auto i_string = std::to_string(i);
    vector_inputs << "auto * input" << i_string <<
-        " = reinterpret_cast<const scalar_t*>(data[" << i_string << '+' << nOutputs << "])" <<
+        " = reinterpret_cast<const scalar_t*>(data[" << i_string << "+" << nOutputs << "])" <<
        " + block_work_size * idx;\n";
  }
  env.s("vector_inputs", vector_inputs.str());
@ -1543,17 +1543,17 @@ NvrtcFunction jit_pwise_function(

    // Constructs file path by appending constructed cubin name to cache path
    std::stringstream ss;
-    ss << *cache_dir << '/';
+    ss << *cache_dir << "/";
    ss << kernel_name;
 #ifdef USE_ROCM
    ss << "_arch" << prop->gcnArchName;
 #else
-    ss << "_arch" << cuda_major << '.' << cuda_minor;
+    ss << "_arch" << cuda_major << "." << cuda_minor;
 #endif
-    ss << "_nvrtc" << nvrtc_major << '.' << nvrtc_minor;
+    ss << "_nvrtc" << nvrtc_major << "." << nvrtc_minor;
    ss << (compile_to_sass ? "_sass" : "_ptx");
-    ss << '_' << code.length();
-    ss << '_' << hash_code;
+    ss << "_" << code.length();
+    ss << "_" << hash_code;
    file_path = ss.str();

    std::ifstream readin{file_path, std::ios::in | std::ifstream::binary};
--- a/aten/src/ATen/native/cudnn/ConvShared.cpp
+++ b/aten/src/ATen/native/cudnn/ConvShared.cpp
@ -82,15 +82,15 @@ namespace native {

 std::ostream& operator<<(std::ostream& out, const ConvolutionParams& params) {
  out << "ConvolutionParams \n"
-      << "    memory_format = " << params.memory_format << '\n'
-      << "    data_type = " << cudnnTypeToString(params.dataType) << '\n'
-      << "    padding = " << ArrayRef<int>{params.padding} << '\n'
-      << "    stride = " << ArrayRef<int>{params.stride} << '\n'
-      << "    dilation = " << ArrayRef<int>{params.dilation} << '\n'
-      << "    groups = " << params.groups << '\n'
+      << "    memory_format = " << params.memory_format << "\n"
+      << "    data_type = " << cudnnTypeToString(params.dataType) << "\n"
+      << "    padding = " << ArrayRef<int>{params.padding} << "\n"
+      << "    stride = " << ArrayRef<int>{params.stride} << "\n"
+      << "    dilation = " << ArrayRef<int>{params.dilation} << "\n"
+      << "    groups = " << params.groups << "\n"
      << "    deterministic = " << (params.deterministic ? "true" : "false")
-      << '\n'
-      << "    allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << '\n';
+      << "\n"
+      << "    allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << "\n";

  return out;
 }
@ -173,16 +173,16 @@ std::string repro_from_args(const ConvolutionParams& params) {
            at::globalContext().float32Precision(
                at::Float32Backend::CUDA, at::Float32Op::MATMUL) ==
            at::Float32Precision::TF32)
-     << '\n';
+     << "\n";
  ss << "torch.backends.cudnn.benchmark = "
-     << pybool(at::globalContext().benchmarkCuDNN()) << '\n';
+     << pybool(at::globalContext().benchmarkCuDNN()) << "\n";
  ss << "torch.backends.cudnn.deterministic = " << pybool(params.deterministic)
-     << '\n';
+     << "\n";
  ss << "torch.backends.cudnn.allow_tf32 = " << pybool(params.allow_tf32)
-     << '\n';
+     << "\n";
  ss << "data = torch.randn(" << ArrayRef<int>(params.input_size, dim)
     << ", dtype=" << full_dtype << ", ";
-  ss << "device='cuda', requires_grad=True)" << to_channels_last << '\n';
+  ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n";
  ss << "net = torch.nn.Conv" << dim - 2 << "d(" << in_channels << ", "
     << out_channels << ", ";
  ss << "kernel_size=" << ArrayRef<int>(&params.weight_size[2], dim - 2)
@ -192,7 +192,7 @@ std::string repro_from_args(const ConvolutionParams& params) {
  ss << "dilation=" << ArrayRef<int>(params.dilation, dim - 2) << ", ";
  ss << "groups=" << params.groups << ")\n";
  ss << "net = net.cuda()." << partial_dtype << "()" << to_channels_last
-     << '\n';
+     << "\n";
  ss << "out = net(data)\n";
  ss << "out.backward(torch.randn_like(out))\n";
  ss << "torch.cuda.synchronize()\n\n";
--- a/aten/src/ATen/native/cudnn/Conv_v7.cpp
+++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp
@ -93,10 +93,11 @@ std::ostream& operator<<(std::ostream& out, const ConvolutionArgs& args) {
      << "input: " << args.idesc // already has a trailing newline
      << "output: " << args.odesc // already has a trailing newline
      << "weight: " << args.wdesc // already has a trailing newline
-      << "Pointer addresses: " << '\n'
-      << "    input: " << args.input.const_data_ptr() << '\n'
-      << "    output: " << args.output.const_data_ptr() << '\n'
-      << "    weight: " << args.weight.const_data_ptr() << '\n';
+      << "Pointer addresses: "
+      << "\n"
+      << "    input: " << args.input.const_data_ptr() << "\n"
+      << "    output: " << args.output.const_data_ptr() << "\n"
+      << "    weight: " << args.weight.const_data_ptr() << "\n";

  return out;
 }
--- a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm
+++ b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm
@ -115,7 +115,7 @@ std::ostream& operator<<(
  std::copy(
      strides.begin(), strides.end() - 1, std::ostream_iterator<int>(oss, ","));
  oss << sizes.back();
-  output << oss.str() << '}';
+  output << oss.str() << "}";
  return output;
 }

--- a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp
@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& out, const ConvParams& params) {
      << "  transposed = " << params.transposed
      << "  output_padding = " << IntArrayRef{params.output_padding}
      << "  groups = " << params.groups << "  benchmark = " << params.benchmark
-      << "  deterministic = " << params.deterministic << '}';
+      << "  deterministic = " << params.deterministic << "}";
  return out;
 }

--- a/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp
@ -5,7 +5,6 @@
 #include <ATen/native/Resize.h>
 #include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
 #include <ATen/native/xpu/Blas.h>
-#include <ATen/xpu/XPUScaledBlas.h>
 #include <torch/library.h>

 #ifndef AT_PER_OPERATOR_HEADERS
@ -340,399 +339,4 @@ Tensor _scaled_mm_xpu(
      out);
 }

-using acceptance_fn = std::function<bool(
-    c10::ScalarType,
-    std::vector<ScalingType>&,
-    ArrayRef<Tensor>&,
-    c10::ScalarType,
-    std::vector<ScalingType>&,
-    ArrayRef<Tensor>&)>;
-using namespace std::placeholders;
-
-namespace scaled_blas = at::native::onednn::scaled;
-using scaled_blas::convert_int_to_enum;
-using scaled_blas::ScaledGemmImplementation;
-
-std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2>
-    scale_kernel_dispatch = {{
-        {"tensorwise_tensorwise",
-         scaled_blas::check_tensorwise_recipe,
-         ScaledGemmImplementation::TENSORWISE_TENSORWISE},
-        {"rowwise_rowwise",
-         scaled_blas::check_rowwise_recipe,
-         ScaledGemmImplementation::ROWWISE_ROWWISE},
-
-    }};
-
-Tensor& _scaled_tensorwise_tensorwise(
-    const Tensor& mat_a,
-    const Tensor& mat_b,
-    const Tensor& scale_a,
-    const Tensor& scale_b,
-    const std::optional<Tensor>& bias,
-    const c10::ScalarType out_dtype,
-    bool use_fast_accum,
-    Tensor& out) {
-  // Restrictions:
-  // A, B are FP8, scales are fp32
-
-  TORCH_CHECK_VALUE(
-      isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
-      "mat_a and mat_b must be fp8 types, got: ",
-      mat_a.scalar_type(),
-      mat_b.scalar_type());
-  TORCH_CHECK_VALUE(
-      scale_a.numel() == 1 && scale_a.scalar_type() == kFloat,
-      "scale_a must have 1 Float element")
-  TORCH_CHECK_VALUE(
-      scale_b.numel() == 1 && scale_b.scalar_type() == kFloat,
-      "scale_b must have 1 Float element")
-
-  auto scaling_choice_a = ScalingType::TensorWise;
-  auto scaling_choice_b = ScalingType::TensorWise;
-
-  _scaled_gemm(
-      mat_a,
-      mat_b,
-      scale_a,
-      scale_b,
-      scaling_choice_a,
-      scaling_choice_b,
-      bias,
-      use_fast_accum,
-      out);
-
-  return out;
-}
-
-Tensor& _scaled_rowwise_rowwise(
-    const Tensor& mat_a,
-    const Tensor& mat_b,
-    const Tensor& scale_a,
-    const Tensor& scale_b,
-    const std::optional<Tensor>& bias,
-    const c10::ScalarType out_dtype,
-    bool use_fast_accum,
-    Tensor& out) {
-  // Restrictions:
-  // A, B are FP8, scales are fp32, shape M/N for A/B
-  TORCH_CHECK_VALUE(
-      isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
-      "mat_a and mat_b must be fp8 types, got: ",
-      mat_a.scalar_type(),
-      mat_b.scalar_type());
-  TORCH_CHECK_VALUE(
-      scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1,
-      "scale_a must have shape [",
-      mat_a.size(0),
-      ", 1], got [",
-      scale_a.sizes(),
-      "]");
-  TORCH_CHECK_VALUE(
-      scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat,
-      "scale_a must have ",
-      mat_a.size(0),
-      " Float elements, got ",
-      scale_a.numel())
-  TORCH_CHECK_VALUE(
-      scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat,
-      "scale_b must have ",
-      mat_b.size(1),
-      " Float elements, got ",
-      scale_b.numel())
-
-  TORCH_CHECK_VALUE(
-      scale_a.stride(1) == 1,
-      "expected scale_a.stride(1) to be 1, but got ",
-      scale_a.stride(1));
-  TORCH_CHECK_VALUE(
-      scale_b.stride(1) == 1,
-      "expected scale_b.stride(1) to be 1, but got ",
-      scale_b.stride(1));
-
-  auto scaling_choice_a = ScalingType::RowWise;
-  auto scaling_choice_b = ScalingType::RowWise;
-
-  _scaled_gemm(
-      mat_a,
-      mat_b,
-      scale_a,
-      scale_b,
-      scaling_choice_a,
-      scaling_choice_b,
-      bias,
-      use_fast_accum,
-      out);
-
-  return out;
-}
-
-// V2: Computes matrix multiply + bias while applying scaling to input and
-// output matrices Scales are only applicable when matrices are of Float8 type
-// and assumed to be equal to 1.0 by default. If output matrix type is 16 or
-// 32-bit type, scale_result is not applied. Known limitations:
-//  - Only works if mat1 is row-major and mat2 is column-major
-//  - Only works if matrices sizes are divisible by 32
-//  - If 1-dimensional tensors are used then scale_a should be size =
-//  mat1.size(0)
-//    and scale_b should have size = to mat2.size(1)
-//  Arguments:
-//    - `mat_a`: the first operand of the matrix multiply, can be type
-//    `torch.float8_e4m3fn` or `torch.float8_e5m2`
-//    - `mat_b`: the second operand of the matrix multiply, can be type
-//    `torch.float8_e4m3fn` or `torch.float8_e5m2`
-//    - `scale_a`: a tensor with the inverse scale of `mat1`, whose
-//    shape/strides/dtype depend on the scaling scheme
-//    - `scale_recipe_a`: An integer corresponding to an enum describing the
-//    scaling scheme used for `scale_a`
-//    - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing
-//    the swizzling scheme for `scale_a`.
-//        Not supported for XPU for now.
-//    - `scale_b`: a tensor with the inverse scale of `mat2`, whose
-//    shape/strides/dtype depend on the scaling scheme
-//    - `scale_recipe_b`: An integer corresponding to an enum describing the
-//    scaling scheme used for `scale_b`
-//    - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing
-//    the swizzling scheme for `scale_b`.
-//        Not supported for XPU for now.
-//    - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
-//    - `out_dtype`: the output dtype, can either be a float8 or a higher
-//    precision floating point type
-//    - `contraction_dim`: describe which dimensions are `K` in the matmul.
-//       Not supported for XPU. Should always be empty.
-//    - `use_fast_accum`: Not supported for XPU, should always be false.
-//    - `out`: a reference to the output tensor
-Tensor& _scaled_mm_xpu_v2_out(
-    const Tensor& mat_a,
-    const Tensor& mat_b,
-    ArrayRef<Tensor> scale_a,
-    IntArrayRef scale_recipe_a,
-    IntArrayRef swizzle_a,
-    ArrayRef<Tensor> scale_b,
-    IntArrayRef scale_recipe_b,
-    IntArrayRef swizzle_b,
-    const std::optional<Tensor>& bias,
-    const std::optional<c10::ScalarType> out_dtype,
-    IntArrayRef contraction_dim,
-    bool use_fast_accum,
-    Tensor& out) {
-  TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix");
-  TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix");
-
-  // If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm
-  // kernels do not support this case).
-  if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) {
-    // `out` was created with `at::empty`. In the case where we are multiplying
-    // MxK by KxN and K is the zero dim, we need to initialize here to properly
-    // return a tensor of zeros.
-    at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
-    if (mat_a.size(1) == 0) {
-      out.zero_();
-    }
-
-    return out;
-  }
-
-  // Note: The `contraction_dim` is not actually used for now. We will need to
-  // align this code when upstreamed CUDA code is done. Currently, only keeps
-  // the code here for check.
-
-  // Check if the input matrix sizes can be multiplied
-  // - if optional contraction dims are provided, use those
-  //   -- mostly for < 1B formats (i.e. nvfp4x2) where cheap .t() is not
-  //   available.
-  if (contraction_dim.size() > 0) {
-    TORCH_CHECK_VALUE(
-        contraction_dim.size() == 2,
-        "contraction_dim must have exactly 2 elements");
-    auto mat_a_dim = contraction_dim[0];
-    auto mat_b_dim = contraction_dim[1];
-    TORCH_CHECK_VALUE(
-        mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim),
-        "mat_a and mat_b shapes cannot be multiplied (",
-        mat_a.size(0),
-        "x",
-        mat_a.size(1),
-        " and ",
-        mat_b.size(0),
-        "x",
-        mat_b.size(1),
-        ") ",
-        "with contraction dims mat_a: ",
-        mat_a_dim,
-        ", mat_b: ",
-        mat_b_dim);
-  } else {
-    TORCH_CHECK_VALUE(
-        mat_a.size(1) == mat_b.size(0),
-        "mat_a and mat_b shapes cannot be multiplied (",
-        mat_a.size(0),
-        "x",
-        mat_a.size(1),
-        " and ",
-        mat_b.size(0),
-        "x",
-        mat_b.size(1),
-        ")");
-  }
-
-  TORCH_CHECK_VALUE(
-      !bias || bias->numel() == mat_b.sizes()[1],
-      "Bias must be size ",
-      mat_b.sizes()[1],
-      " but got ",
-      bias->numel());
-
-  TORCH_CHECK_VALUE(
-      !out_dtype || *out_dtype == out.scalar_type(),
-      "out_dtype must match output matrix type");
-
-  if (bias) {
-    TORCH_CHECK_VALUE(
-        bias->scalar_type() == kFloat ||
-            bias->scalar_type() == c10::ScalarType::BFloat16 ||
-            bias->scalar_type() == c10::ScalarType::Half,
-        "Bias must be Float32 or BFloat16 or Half, but got ",
-        bias->scalar_type());
-  }
-  {
-    auto bias_ = bias.value_or(Tensor());
-    // NOLINTNEXTLINE(*c-array*)
-    TensorArg targs[]{
-        {out, "out", 0},
-        {mat_a, "mat_a", 1},
-        {mat_b, "mat_b", 2},
-        {bias_, "bias", 3},
-        {scale_a[0], "scale_a", 4},
-        {scale_b[0], "scale_b", 5}};
-    checkAllSameGPU(__func__, targs);
-  }
-  // Align with CUDA's default out to be bf16
-  auto out_dtype_ = out_dtype.value_or(c10::ScalarType::BFloat16);
-
-  // Conversion of implicitly-defined enums to explicit
-  auto scale_recipe_a_enum = convert_int_to_enum<ScalingType>(scale_recipe_a);
-  auto swizzle_a_enum = convert_int_to_enum<SwizzleType>(swizzle_a);
-  auto scale_recipe_b_enum = convert_int_to_enum<ScalingType>(scale_recipe_b);
-  auto swizzle_b_enum = convert_int_to_enum<SwizzleType>(swizzle_b);
-
-  // XPU does not support swizzle for now. So directly return false.
-  TORCH_CHECK_VALUE(
-      swizzle_a_enum[0] == at::blas::SwizzleType::NO_SWIZZLE &&
-          swizzle_b_enum[0] == at::blas::SwizzleType::NO_SWIZZLE,
-      "XPU does not support swizzle yet.");
-
-  // at this point we can start working out what we want to be doing
-  // Try to do as few steps as possible.
-  // NOTE: support is deliberately sparse, can explicitly enumerate all
-  // combinations allowed. Do this via a list of defined (name, acceptance,
-  // concrete_impl) tuples.
-  bool found_impl = false;
-  ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE;
-
-  for (const auto& fn_entry : scale_kernel_dispatch) {
-    const auto [name, accept_fn, scaled_gemm_impl] = fn_entry;
-    bool ok = accept_fn(
-        mat_a.scalar_type(),
-        scale_recipe_a_enum,
-        scale_a,
-        mat_b.scalar_type(),
-        scale_recipe_b_enum,
-        scale_b);
-    if (ok) {
-      gemm_impl = scaled_gemm_impl;
-      found_impl = true;
-      break;
-    }
-  }
-  TORCH_CHECK_VALUE(
-      found_impl,
-      "Invalid scaling configuration.\n"
-      "- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n"
-      "- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (",
-      mat_a.size(0),
-      ", 1) and scale_b should be (1, ",
-      mat_b.size(1),
-      "), and both should be contiguous.\n"
-      "Got mat_a.dtype()=",
-      mat_a.scalar_type(),
-      ", scale_a[0].dtype()=",
-      scale_a[0].scalar_type(),
-      ", scale_a[0].size()=",
-      scale_a[0].sizes(),
-      ", scale_a[0].stride()=",
-      scale_a[0].strides(),
-      ", ",
-      "mat_b.dtype()=",
-      mat_b.scalar_type(),
-      ", scale_b[0].dtype()=",
-      scale_b[0].scalar_type(),
-      ", scale_b[0].size()=",
-      scale_b[0].sizes(),
-      " and scale_b[0].stride()=",
-      scale_b[0].strides());
-
-  at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
-
-  auto bias_ = bias.value_or(Tensor());
-
-  // dispatch to appropriate lower-level calls for error checking & execution
-  if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) {
-    return _scaled_tensorwise_tensorwise(
-        mat_a,
-        mat_b,
-        scale_a[0],
-        scale_b[0],
-        bias,
-        out_dtype_,
-        use_fast_accum,
-        out);
-  } else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) {
-    return _scaled_rowwise_rowwise(
-        mat_a,
-        mat_b,
-        scale_a[0],
-        scale_b[0],
-        bias,
-        out_dtype_,
-        use_fast_accum,
-        out);
-  } else {
-    TORCH_CHECK_VALUE(
-        false, "Invalid state - found an implementation, but not really");
-  }
-}
-
-Tensor _scaled_mm_xpu_v2(
-    const Tensor& mat_a,
-    const Tensor& mat_b,
-    ArrayRef<Tensor> scale_a,
-    IntArrayRef scale_recipe_a,
-    IntArrayRef swizzle_a,
-    ArrayRef<Tensor> scale_b,
-    IntArrayRef scale_recipe_b,
-    IntArrayRef swizzle_b,
-    const std::optional<Tensor>& bias,
-    const std::optional<c10::ScalarType> out_dtype,
-    IntArrayRef contraction_dim,
-    bool use_fast_accum) {
-  const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
-  Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
-
-  return _scaled_mm_xpu_v2_out(
-      mat_a,
-      mat_b,
-      scale_a,
-      scale_recipe_a,
-      swizzle_a,
-      scale_b,
-      scale_recipe_b,
-      swizzle_b,
-      bias,
-      out_dtype,
-      contraction_dim,
-      use_fast_accum,
-      out);
-}
-
 } // namespace at::native
--- a/aten/src/ATen/native/mps/operations/Attention.mm
+++ b/aten/src/ATen/native/mps/operations/Attention.mm
@ -69,75 +69,139 @@ static std::tuple<Tensor, Tensor> sdpa_general_mps(const Tensor& query,
  auto out = at::empty({batchSize, num_head, qSize, headSize}, query.options());
  auto attn = at::empty({batchSize, num_head, qSize, maxSeqLength}, query.options());
  auto scale_factor = sdp::calculate_scale(query, scale).expect_float();
+  static const bool is_macOS_26_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_26_0_PLUS);
  @autoreleasepool {
    auto mkey = __func__ + getTensorsStringKey({query, key, value}) + ":" + std::to_string(is_causal) + ":" +
        std::to_string(attn_mask.has_value());
-    auto cachedGraph =
-        LookUpOrCreateCachedGraph<CachedGraph>(mkey, [&, q_ = query, k_ = key, v_ = value](auto mpsGraph, auto graph) {
-          auto qTensor = mpsGraphRankedPlaceHolder(mpsGraph, q_);
-          auto kTensor = mpsGraphRankedPlaceHolder(mpsGraph, k_);
-          auto vTensor = mpsGraphRankedPlaceHolder(mpsGraph, v_);
-          auto kT = [mpsGraph transposeTensor:kTensor dimension:2 withDimension:3 name:nil];
-          auto scaleTensor = [mpsGraph constantWithScalar:scale_factor
-                                                    shape:getMPSShape({1})
-                                                 dataType:MPSDataTypeFloat32];

-          auto maskedMM = [mpsGraph matrixMultiplicationWithPrimaryTensor:qTensor secondaryTensor:kT name:nil];
+    CachedGraph* cachedGraph;
+    // Use the fused MPSGraph SDPA op only when the macOS 26 check computed above passes; otherwise build the manual graph.
+    if (is_macOS_26_0_or_newer) {
+        cachedGraph =
+            LookUpOrCreateCachedGraph<CachedGraph>(mkey, [&, q_ = query, k_ = key, v_ = value](auto mpsGraph, auto graph) {
+              auto qTensor = mpsGraphRankedPlaceHolder(mpsGraph, q_);
+              auto kTensor = mpsGraphRankedPlaceHolder(mpsGraph, k_);
+              auto vTensor = mpsGraphRankedPlaceHolder(mpsGraph, v_);

-          if (macOS15_0_plus && [maskedMM dataType] == MPSDataTypeFloat32) {
-            // bug in MacOS15, without this trick SDPA leaks memory, adding 0.0f gets ignored(still takes SDPA sequence
-            // path which leaks)
-            auto oneTensor = [mpsGraph constantWithScalar:1e-20f shape:getMPSShape({1}) dataType:MPSDataTypeFloat32];
-            maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM secondaryTensor:oneTensor name:nil];
-          }
+              if (is_causal) {
+                // Additive causal mask of shape (qSize, maxSeqLength): 0 where key index <= query index, -inf elsewhere.
+                auto colIdx = [mpsGraph coordinateAlongAxis:-1 withShape:@[@1, @(maxSeqLength)] name:nil];
+                auto rowIdx = [mpsGraph coordinateAlongAxis:-2 withShape:@[@(qSize), @1] name:nil];
+                auto isAllowed = [mpsGraph lessThanOrEqualToWithPrimaryTensor:colIdx secondaryTensor:rowIdx name:nil];
+                auto causalMask = [mpsGraph selectWithPredicateTensor:isAllowed
+                                                  truePredicateTensor:[mpsGraph constantWithScalar:0 dataType:qTensor.dataType]
+                                                 falsePredicateTensor:[mpsGraph constantWithScalar:-INFINITY dataType:qTensor.dataType]
+                                                                 name:nil];
+                graph->maskTensor = causalMask;
+              } else if (attn_mask) {
+                graph->maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *attn_mask);
+              }

-          // upcasting to float32 if needed to improve precision when multiplying by the scale factor
-          maskedMM = castMPSTensor(mpsGraph, maskedMM, MPSDataTypeFloat32);
-          maskedMM = [mpsGraph multiplicationWithPrimaryTensor:maskedMM secondaryTensor:scaleTensor name:nil];
+              // Account for case where all values were masked causing division by 0 in softmax (issue:#156707)
+              // Overwrites expected NANs in sm with zeros.
+//              auto negInfTensor = [mpsGraph constantWithScalar:-INFINITY shape:maskedMM.shape dataType:maskedMM.dataType];
+//              auto elem_neg_inf = [mpsGraph equalWithPrimaryTensor:maskedMM secondaryTensor:negInfTensor name:nil];
+//              auto all_neg_infs_along_axis = [mpsGraph reductionAndWithTensor:elem_neg_inf axis:3 name:nil];
+//              auto zero_mask = [mpsGraph broadcastTensor:all_neg_infs_along_axis toShape:maskedMM.shape name:nil];
+//              auto zeroTensor = [mpsGraph constantWithScalar:0.0 shape:maskedMM.shape dataType:maskedMM.dataType];
+//
+//              auto sm = [mpsGraph softMaxWithTensor:maskedMM axis:3 name:nil];
+//              MPSGraphTensor* correctedSM = [mpsGraph selectWithPredicateTensor:zero_mask
+//                                                            truePredicateTensor:zeroTensor
+//                                                           falsePredicateTensor:sm
+//                                                                           name:nil];
+//
+//              auto output = [mpsGraph matrixMultiplicationWithPrimaryTensor:correctedSM secondaryTensor:vTensor name:nil];

-          if (is_causal) {
-            auto causalMask = [mpsGraph constantWithScalar:1.0f
-                                                     shape:getMPSShape({qSize, maxSeqLength})
-                                                  dataType:MPSDataTypeBool];
-            causalMask = [mpsGraph bandPartWithTensor:causalMask numLower:-1 numUpper:0 name:nil];
-            auto minusInf = [mpsGraph constantWithScalar:-1e20 shape:maskedMM.shape dataType:maskedMM.dataType];
-            maskedMM = [mpsGraph selectWithPredicateTensor:causalMask
-                                       truePredicateTensor:maskedMM
-                                      falsePredicateTensor:minusInf
-                                                      name:nil];
-          } else if (attn_mask) {
-            graph->maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *attn_mask);
-            maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM
-                                           secondaryTensor:castMPSTensor(mpsGraph, graph->maskTensor, maskedMM.dataType)
-                                                      name:nil];
-          }
+              MPSGraphTensor* output;
+              if(graph->maskTensor != nil) {
+                output = [mpsGraph scaledDotProductAttentionWithQueryTensor:qTensor 
+                                                          keyTensor:kTensor 
+                                                        valueTensor:vTensor
+                                                         maskTensor:graph->maskTensor
+                                                              scale:scale_factor
+                                                               name:@"MPSGraph SDPA"];
+              } else {
+                output = [mpsGraph scaledDotProductAttentionWithQueryTensor:qTensor 
+                                                          keyTensor:kTensor 
+                                                        valueTensor:vTensor
+                                                              scale:scale_factor
+                                                               name:@"MPSGraph SDPA"];
+              }
+              graph->qTensor = qTensor;
+              graph->kTensor = kTensor;
+              graph->vTensor = vTensor;
+              graph->outputTensor = castMPSTensor(mpsGraph, output, qTensor.dataType);
+//              graph->attnTensor = castMPSTensor(mpsGraph, sm, qTensor.dataType);
+            });
+    } else {
+        cachedGraph =
+            LookUpOrCreateCachedGraph<CachedGraph>(mkey, [&, q_ = query, k_ = key, v_ = value](auto mpsGraph, auto graph) {
+              auto qTensor = mpsGraphRankedPlaceHolder(mpsGraph, q_);
+              auto kTensor = mpsGraphRankedPlaceHolder(mpsGraph, k_);
+              auto vTensor = mpsGraphRankedPlaceHolder(mpsGraph, v_);
+              auto kT = [mpsGraph transposeTensor:kTensor dimension:2 withDimension:3 name:nil];
+              auto scaleTensor = [mpsGraph constantWithScalar:scale_factor
+                                                        shape:getMPSShape({1})
+                                                     dataType:MPSDataTypeFloat32];

-          // Account for case where all values were masked causing division by 0 in softmax (issue:#156707)
-          // Overwrites expected NANs in sm with zeros.
-          auto negInfTensor = [mpsGraph constantWithScalar:-INFINITY shape:maskedMM.shape dataType:maskedMM.dataType];
-          auto elem_neg_inf = [mpsGraph equalWithPrimaryTensor:maskedMM secondaryTensor:negInfTensor name:nil];
-          auto all_neg_infs_along_axis = [mpsGraph reductionAndWithTensor:elem_neg_inf axis:3 name:nil];
-          auto zero_mask = [mpsGraph broadcastTensor:all_neg_infs_along_axis toShape:maskedMM.shape name:nil];
-          auto zeroTensor = [mpsGraph constantWithScalar:0.0 shape:maskedMM.shape dataType:maskedMM.dataType];
+              auto maskedMM = [mpsGraph matrixMultiplicationWithPrimaryTensor:qTensor secondaryTensor:kT name:nil];

-          auto sm = [mpsGraph softMaxWithTensor:maskedMM axis:3 name:nil];
-          MPSGraphTensor* correctedSM = [mpsGraph selectWithPredicateTensor:zero_mask
-                                                        truePredicateTensor:zeroTensor
-                                                       falsePredicateTensor:sm
-                                                                       name:nil];
+              if (macOS15_0_plus && [maskedMM dataType] == MPSDataTypeFloat32) {
+                // bug in MacOS15, without this trick SDPA leaks memory, adding 0.0f gets ignored(still takes SDPA sequence
+                // path which leaks)
+                auto oneTensor = [mpsGraph constantWithScalar:1e-20f shape:getMPSShape({1}) dataType:MPSDataTypeFloat32];
+                maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM secondaryTensor:oneTensor name:nil];
+              }

-          auto output = [mpsGraph matrixMultiplicationWithPrimaryTensor:correctedSM secondaryTensor:vTensor name:nil];
-          graph->qTensor = qTensor;
-          graph->kTensor = kTensor;
-          graph->vTensor = vTensor;
-          graph->outputTensor = castMPSTensor(mpsGraph, output, qTensor.dataType);
-          graph->attnTensor = castMPSTensor(mpsGraph, sm, qTensor.dataType);
-        });
+              // upcasting to float32 if needed to improve precision when multiplying by the scale factor
+              maskedMM = castMPSTensor(mpsGraph, maskedMM, MPSDataTypeFloat32);
+              maskedMM = [mpsGraph multiplicationWithPrimaryTensor:maskedMM secondaryTensor:scaleTensor name:nil];
+
+              if (is_causal) {
+                auto causalMask = [mpsGraph constantWithScalar:1.0f
+                                                         shape:getMPSShape({qSize, maxSeqLength})
+                                                      dataType:MPSDataTypeBool];
+                causalMask = [mpsGraph bandPartWithTensor:causalMask numLower:-1 numUpper:0 name:nil];
+                auto minusInf = [mpsGraph constantWithScalar:-1e20 shape:maskedMM.shape dataType:maskedMM.dataType];
+                maskedMM = [mpsGraph selectWithPredicateTensor:causalMask
+                                           truePredicateTensor:maskedMM
+                                          falsePredicateTensor:minusInf
+                                                          name:nil];
+              } else if (attn_mask) {
+                graph->maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *attn_mask);
+                maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM
+                                               secondaryTensor:castMPSTensor(mpsGraph, graph->maskTensor, maskedMM.dataType)
+                                                          name:nil];
+              }
+
+              // Account for case where all values were masked causing division by 0 in softmax (issue:#156707)
+              // Overwrites expected NANs in sm with zeros.
+              auto negInfTensor = [mpsGraph constantWithScalar:-INFINITY shape:maskedMM.shape dataType:maskedMM.dataType];
+              auto elem_neg_inf = [mpsGraph equalWithPrimaryTensor:maskedMM secondaryTensor:negInfTensor name:nil];
+              auto all_neg_infs_along_axis = [mpsGraph reductionAndWithTensor:elem_neg_inf axis:3 name:nil];
+              auto zero_mask = [mpsGraph broadcastTensor:all_neg_infs_along_axis toShape:maskedMM.shape name:nil];
+              auto zeroTensor = [mpsGraph constantWithScalar:0.0 shape:maskedMM.shape dataType:maskedMM.dataType];
+
+              auto sm = [mpsGraph softMaxWithTensor:maskedMM axis:3 name:nil];
+              MPSGraphTensor* correctedSM = [mpsGraph selectWithPredicateTensor:zero_mask
+                                                            truePredicateTensor:zeroTensor
+                                                           falsePredicateTensor:sm
+                                                                           name:nil];
+
+              auto output = [mpsGraph matrixMultiplicationWithPrimaryTensor:correctedSM secondaryTensor:vTensor name:nil];
+              graph->qTensor = qTensor;
+              graph->kTensor = kTensor;
+              graph->vTensor = vTensor;
+              graph->outputTensor = castMPSTensor(mpsGraph, output, qTensor.dataType);
+              graph->attnTensor = castMPSTensor(mpsGraph, sm, qTensor.dataType);
+            });
+    }
    auto qPlaceholder = Placeholder(cachedGraph->qTensor, query);
    auto kPlaceholder = Placeholder(cachedGraph->kTensor, key);
    auto vPlaceholder = Placeholder(cachedGraph->vTensor, value);
    auto outputPlaceholder = Placeholder(cachedGraph->outputTensor, out);
-    auto attnPlaceholder = Placeholder(cachedGraph->attnTensor, attn);
+//    auto attnPlaceholder = Placeholder(cachedGraph->attnTensor, attn);
    NSDictionary* feeds = nil;
    if (!attn_mask) {
      feeds = dictionaryFromPlaceholders(qPlaceholder, kPlaceholder, vPlaceholder);
@ -145,7 +209,8 @@ static std::tuple<Tensor, Tensor> sdpa_general_mps(const Tensor& query,
      auto mPlaceholder = Placeholder(cachedGraph->maskTensor, *attn_mask);
      feeds = dictionaryFromPlaceholders(qPlaceholder, kPlaceholder, vPlaceholder, mPlaceholder);
    }
-    NSDictionary* outs = dictionaryFromPlaceholders(outputPlaceholder, attnPlaceholder);
+//    NSDictionary* outs = dictionaryFromPlaceholders(outputPlaceholder, attnPlaceholder);
+    NSDictionary* outs = dictionaryFromPlaceholders(outputPlaceholder);
    runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outs);
  }

--- a/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h
@ -301,12 +301,12 @@ class AvgPoolMicrokernelTester {
          ASSERT_NEAR(
              float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f)
              << "at pixel " << i << ", channel " << k << ", n = " << n()
-              << ", ks = " << kh() << 'x' << kw() << " (" << ks()
+              << ", ks = " << kh() << "x" << kw() << " (" << ks()
              << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
          ASSERT_EQ(
              uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
              << "at pixel " << i << ", channel " << k << ", n = " << n()
-              << ", ks = " << kh() << 'x' << kw() << " (" << ks()
+              << ", ks = " << kh() << "x" << kw() << " (" << ks()
              << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
        }
      }
@ -396,12 +396,12 @@ class AvgPoolMicrokernelTester {
          ASSERT_NEAR(
              float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f)
              << "at pixel " << i << ", channel " << k << ", n = " << n()
-              << ", ks = " << kh() << 'x' << kw() << " (" << ks()
+              << ", ks = " << kh() << "x" << kw() << " (" << ks()
              << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
          ASSERT_EQ(
              uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
              << "at pixel " << i << ", channel " << k << ", n = " << n()
-              << ", ks = " << kh() << 'x' << kw() << " (" << ks()
+              << ", ks = " << kh() << "x" << kw() << " (" << ks()
              << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
        }
      }
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h
@ -232,7 +232,7 @@ class MaxPoolMicrokernelTester {
          ASSERT_EQ(
              uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
              << "at pixel " << i << ", channel " << k << ", n = " << n()
-              << ", ks = " << kh() << 'x' << kw() << " (" << ks()
+              << ", ks = " << kh() << "x" << kw() << " (" << ks()
              << "), kc = " << kc();
        }
      }
--- a/aten/src/ATen/native/utils/ParamUtils.h
+++ b/aten/src/ATen/native/utils/ParamUtils.h
@ -17,7 +17,7 @@ inline std::vector<T> _expand_param_if_needed(
    std::ostringstream ss;
    ss << "expected " << param_name << " to be a single integer value or a "
       << "list of " << expected_dim << " values to match the convolution "
-       << "dimensions, but got " << param_name << '=' << list_param;
+       << "dimensions, but got " << param_name << "=" << list_param;
    TORCH_CHECK(false, ss.str());
  } else {
    return list_param.vec();
--- a/aten/src/ATen/native/vulkan/api/Adapter.cpp
+++ b/aten/src/ATen/native/vulkan/api/Adapter.cpp
@ -358,9 +358,9 @@ std::string Adapter::stringize() const {
  std::string device_type = get_device_type_str(properties.deviceType);
  VkPhysicalDeviceLimits limits = properties.limits;

-  ss << '{' << std::endl;
+  ss << "{" << std::endl;
  ss << "  Physical Device Info {" << std::endl;
-  ss << "    apiVersion:    " << v_major << '.' << v_minor << std::endl;
+  ss << "    apiVersion:    " << v_major << "." << v_minor << std::endl;
  ss << "    driverversion: " << properties.driverVersion << std::endl;
  ss << "    deviceType:    " << device_type << std::endl;
  ss << "    deviceName:    " << properties.deviceName << std::endl;
@ -371,7 +371,7 @@ std::string Adapter::stringize() const {

 #define PRINT_LIMIT_PROP_VEC3(name)                                       \
  ss << "      " << std::left << std::setw(36) << #name << limits.name[0] \
-     << ',' << limits.name[1] << ',' << limits.name[2] << std::endl;
+     << "," << limits.name[1] << "," << limits.name[2] << std::endl;

  ss << "    Physical Device Limits {" << std::endl;
  PRINT_LIMIT_PROP(maxImageDimension1D);
@ -425,7 +425,7 @@ std::string Adapter::stringize() const {
    ;
  }
  ss << "  ]" << std::endl;
-  ss << '}';
+  ss << "}";

  return ss.str();
 }
--- a/aten/src/ATen/native/vulkan/api/Exception.cpp
+++ b/aten/src/ATen/native/vulkan/api/Exception.cpp
@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) {
    VK_RESULT_CASE(VK_ERROR_FORMAT_NOT_SUPPORTED)
    VK_RESULT_CASE(VK_ERROR_FRAGMENTED_POOL)
    default:
-      out << "VK_ERROR_UNKNOWN (VkResult " << result << ')';
+      out << "VK_ERROR_UNKNOWN (VkResult " << result << ")";
      break;
  }
  return out;
@ -46,7 +46,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) {
 //

 std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) {
-  out << loc.function << " at " << loc.file << ':' << loc.line;
+  out << loc.function << " at " << loc.file << ":" << loc.line;
  return out;
 }

@ -66,7 +66,7 @@ Error::Error(SourceLocation source_location, const char* cond, std::string msg)
    : msg_(std::move(msg)), source_location_{source_location} {
  std::ostringstream oss;
  oss << "Exception raised from " << source_location_ << ": ";
-  oss << '(' << cond << ") is false! ";
+  oss << "(" << cond << ") is false! ";
  oss << msg_;
  what_ = oss.str();
 }
--- a/aten/src/ATen/native/vulkan/api/QueryPool.cpp
+++ b/aten/src/ATen/native/vulkan/api/QueryPool.cpp
@ -173,8 +173,8 @@ void QueryPool::extract_results() {

 static std::string stringize(const VkExtent3D& extents) {
  std::stringstream ss;
-  ss << '{' << extents.width << ", " << extents.height << ", " << extents.depth
-     << '}';
+  ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth
+     << "}";
  return ss.str();
 }

--- a/aten/src/ATen/native/vulkan/api/Runtime.cpp
+++ b/aten/src/ATen/native/vulkan/api/Runtime.cpp
@ -149,7 +149,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn(
  (void)flags;

  std::stringstream stream;
-  stream << layer_prefix << ' ' << message_code << ' ' << message << std::endl;
+  stream << layer_prefix << " " << message_code << " " << message << std::endl;
  const std::string log = stream.str();

  std::cout << log;
--- a/aten/src/ATen/native/vulkan/api/Utils.h
+++ b/aten/src/ATen/native/vulkan/api/Utils.h
@ -253,7 +253,7 @@ using vec4 = vec<4u>;

 // uvec3 is the type representing tensor extents. Useful for debugging.
 inline std::ostream& operator<<(std::ostream& os, const uvec3& v) {
-  os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')';
+  os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")";
  return os;
 }

--- a/aten/src/ATen/test/basic.cpp
+++ b/aten/src/ATen/test/basic.cpp
@ -246,7 +246,7 @@ void TestToCFloat() {
 void TestToString() {
  Tensor b = ones({3, 7}) * .0000001f;
  std::stringstream s;
-  s << b << '\n';
+  s << b << "\n";
  std::string expect = "1e-07 *";
  ASSERT_EQ_RESOLVED(s.str().substr(0, expect.size()), expect);
 }
--- a/aten/src/ATen/test/scalar_test.cpp
+++ b/aten/src/ATen/test/scalar_test.cpp
@ -33,7 +33,7 @@ struct Foo {
  static void apply(Tensor a, Tensor b) {
    scalar_type s = 1;
    std::stringstream ss;
-    ss << "hello, dispatch: " << a.toString() << s << '\n';
+    ss << "hello, dispatch: " << a.toString() << s << "\n";
    auto data = (scalar_type*)a.data_ptr();
    (void)data;
  }
@ -73,8 +73,8 @@ TEST(TestScalar, TestScalar) {
  Scalar bar = 3.0;
  Half h = bar.toHalf();
  Scalar h2 = h;
-  cout << "H2: " << h2.toDouble() << ' ' << what.toFloat() << ' '
-       << bar.toDouble() << ' ' << what.isIntegral(false) << '\n';
+  cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " "
+       << bar.toDouble() << " " << what.isIntegral(false) << "\n";
  auto gen = at::detail::getDefaultCPUGenerator();
  {
    // See Note [Acquire lock when using random generators]
@ -84,7 +84,7 @@ TEST(TestScalar, TestScalar) {
  }
  if (at::hasCUDA()) {
    auto t2 = zeros({4, 4}, at::kCUDA);
-    cout << &t2 << '\n';
+    cout << &t2 << "\n";
  }
  auto t = ones({4, 4});

@ -129,7 +129,7 @@ TEST(TestScalar, TestScalar) {
      std::stringstream ss;
      // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
      ASSERT_NO_THROW(
-          ss << "hello, dispatch" << x.toString() << s << '\n');
+          ss << "hello, dispatch" << x.toString() << s << "\n");
      auto data = (scalar_t*)x.data_ptr();
      (void)data;
    });
--- a/aten/src/ATen/test/test_install/main.cpp
+++ b/aten/src/ATen/test/test_install/main.cpp
@ -1,5 +1,5 @@
 #include <ATen/ATen.h>

 int main() {
-  std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << '\n';
+  std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << "\n";
 }
--- a/aten/src/ATen/test/vec_test_all_types.cpp
+++ b/aten/src/ATen/test/vec_test_all_types.cpp
@ -1828,9 +1828,9 @@ namespace {
      #endif

        EXPECT_EQ(u16, c10::detail::fp16_ieee_from_fp32_value(f32s[i]))
-            << "Test failed for float to uint16 " << f32s[i] << '\n';
+            << "Test failed for float to uint16 " << f32s[i] << "\n";
        EXPECT_EQ(x, c10::detail::fp16_ieee_to_fp32_value(u16))
-            << "Test failed for uint16 to float " << u16 << '\n';
+            << "Test failed for uint16 to float " << u16 << "\n";
      }
    }
    TEST(FP8E4M3Test, FP8E4M3ConversionFloat) {
@ -1848,10 +1848,10 @@ namespace {
          EXPECT_TRUE(std::isnan(f32));
        } else {
          EXPECT_EQ(f32, c10::detail::fp8e4m3fn_to_fp32_value(input))
-              << "Test failed for u8 to float " << input << '\n';
+              << "Test failed for u8 to float " << input << "\n";
        }
        EXPECT_EQ(u8, c10::detail::fp8e4m3fn_from_fp32_value(f32))
-            << "Test failed for float to u8 " << f32 << '\n';
+            << "Test failed for float to u8 " << f32 << "\n";
      }
    }
    TEST(FP8E4M3Test, FP8E4M3BinaryAdd) {
@ -2015,10 +2015,10 @@ namespace {
          EXPECT_TRUE(std::isnan(f32));
        } else {
          EXPECT_EQ(f32, c10::detail::fp8e5m2_to_fp32_value(input))
-              << "Test failed for u8 to float " << input << '\n';
+              << "Test failed for u8 to float " << input << "\n";
        }
        EXPECT_EQ(u8, c10::detail::fp8e5m2_from_fp32_value(f32))
-            << "Test failed for float to u8 " << f32 << '\n';
+            << "Test failed for float to u8 " << f32 << "\n";
      }
    }
    TEST(FP8E5M2Test, FP8E5M2BinaryAdd) {
--- a/aten/src/ATen/test/vitals.cpp
+++ b/aten/src/ATen/test/vitals.cpp
@ -19,7 +19,7 @@ TEST(Vitals, Basic) {
    c10::utils::set_env("TORCH_VITAL", "1");
    TORCH_VITAL_DEFINE(Testing);
    TORCH_VITAL(Testing, Attribute0) << 1;
-    TORCH_VITAL(Testing, Attribute1) << '1';
+    TORCH_VITAL(Testing, Attribute1) << "1";
    TORCH_VITAL(Testing, Attribute2) << 1.0f;
    TORCH_VITAL(Testing, Attribute3) << 1.0;
    auto t = at::ones({1, 1});
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@ -129,14 +129,14 @@ void showRtol(const at::Tensor& a, const at::Tensor& b) {
  std::cout << "Max Diff allowed: " << maxDiff << std::endl;
  if (diff.sizes().size() == 2) {
    for (const auto y : c10::irange(diff.sizes()[0])) {
-      std::cout << y << ':';
+      std::cout << y << ":";
      for (const auto x : c10::irange(diff.sizes()[1])) {
        float diff_xy = diff[y][x].item<float>();
        if (diff_xy > maxDiff) {
          std::cout << std::setw(5) << x;
        }
        else {
-          std::cout << std::setw(5) << ' ';
+          std::cout << std::setw(5) << " ";
        }
      }
      std::cout << std::endl;
@ -3276,7 +3276,7 @@ TEST_F(VulkanAPITest, masked_fill_invalidinputs_exceptions) {

 void print_shape(const std::vector<int64_t>& shape) {
  for (const auto& num : shape) {
-    std::cout << num << ' ';
+    std::cout << num << " ";
  }
 }

@ -3367,7 +3367,7 @@ void test_masked_fill_scalar(
            print_shape(tmp_curr_input_shape);
            std::cout << "], and mask of shape [";
            print_shape(tmp_curr_mask_shape);
-            std::cout << ']' << std::endl;
+            std::cout << "]" << std::endl;
          }

          ASSERT_TRUE(check);
@ -4542,9 +4542,9 @@ void test_softmax(const at::IntArrayRef shape, bool log_softmax = false) {
    if (!check) {
      std::cout << "Softmax test failed on axis " << dim << "for tensor dims {";
      for (uint32_t place = 0; place < shape.size() - 1; place++) {
-        std::cout << shape[place] << ' ';
+        std::cout << shape[place] << " ";
      }
-      std::cout << shape.back() << '}' << std::endl;
+      std::cout << shape.back() << "}" << std::endl;
      showRtol(out_cpu, out_vulkan.cpu());
    }
    ASSERT_TRUE(check);
--- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
@ -95,7 +95,7 @@ void showRtol(
  std::cout << "Max Diff found is: " << diff.max().item<double>() << std::endl;
  if (diff.sizes().size() == 2) {
    for (const auto y : c10::irange(diff.sizes()[0])) {
-      std::cout << y << ':';
+      std::cout << y << ":";
      for (const auto x : c10::irange(diff.sizes()[1])) {
        double diff_xy = diff[y][x].item<double>();
        if (diff_xy > maxDiff) {
@ -109,7 +109,7 @@ void showRtol(
            }
          }
        } else {
-          std::cout << std::setw(5) << ' ';
+          std::cout << std::setw(5) << " ";
        }
      }
      std::cout << std::endl;
@ -148,19 +148,19 @@ using at::native::vulkan::api::utils::ivec4;
 using at::native::vulkan::api::utils::vec4;

 std::ostream& operator<<(std::ostream& os, const vec4& v) {
-  os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
-     << v.data[3u] << ')';
+  os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
+     << v.data[3u] << ")";
  return os;
 }

 std::ostream& operator<<(std::ostream& os, const ivec3& v) {
-  os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')';
+  os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")";
  return os;
 }

 std::ostream& operator<<(std::ostream& os, const ivec4& v) {
-  os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
-     << v.data[3u] << ')';
+  os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
+     << v.data[3u] << ")";
  return os;
 }

@ -3379,51 +3379,51 @@ bool _test_quantized_linear(
      showRtol(out_cpu_dequant, out_vk_to_cpu_dequant);
    }
    if (xpos != -1 && ypos != -1) {
-      std::cout << "\nFailure caused on row/col: " << ypos << '/' << xpos
-                << '\n';
+      std::cout << "\nFailure caused on row/col: " << ypos << "/" << xpos
+                << "\n";
      std::cout << "Input tensor scale: " << scale << " zerop: " << zero_point
-                << '\n';
-      std::cout << "Input tensor row " << ypos << '\n';
+                << "\n";
+      std::cout << "Input tensor row " << ypos << "\n";
      for (int i = 0; i < input_cpu.sizes()[1]; i++) {
        std::cout << input_cpu[ypos][i].item<double>() << ", ";
      }
-      std::cout << '\n';
+      std::cout << "\n";

      std::cout << "Weight tensor scale: " << w_scale
-                << " zerop: " << w_zero_point << '\n';
-      std::cout << "Weight tensor col " << xpos << '\n';
+                << " zerop: " << w_zero_point << "\n";
+      std::cout << "Weight tensor col " << xpos << "\n";
      for (int i = 0; i < weight.sizes()[1]; i++) {
        std::cout << weight[xpos][i].item<double>() << ", ";
      }
-      std::cout << '\n';
+      std::cout << "\n";

      std::cout << "Input tensor quantized row " << ypos << " with dtype "
-                << (input_quant_dtype_int8 ? "QInt8" : "QUInt8") << '\n';
+                << (input_quant_dtype_int8 ? "QInt8" : "QUInt8") << "\n";
      for (int i = 0; i < input_cpu.sizes()[1]; i++) {
        std::cout << input_cpu_quantized[ypos][i].item<double>() << ", ";
      }
-      std::cout << '\n';
+      std::cout << "\n";

      std::cout << "Weight tensor quantized col " << xpos << " with dtype "
-                << (weight_quant_dtype_int8 ? "QInt8" : "QUInt8") << '\n';
+                << (weight_quant_dtype_int8 ? "QInt8" : "QUInt8") << "\n";
      for (int i = 0; i < weight.sizes()[1]; i++) {
        std::cout << weight_cpu_quantized[xpos][i].item<double>() << ", ";
      }
-      std::cout << '\n';
+      std::cout << "\n";

      std::cout << "bias tensor\n";
      for (int i = 0; i < bias.sizes()[0]; i++) {
        std::cout << bias[i].item<double>() << ", ";
      }
-      std::cout << '\n';
+      std::cout << "\n";

      std::cout << "out_scale: " << out_scale
-                << " out_zero_point: " << out_zero_point << '\n';
+                << " out_zero_point: " << out_zero_point << "\n";

      std::cout << "cpu unmatched output: "
-                << out_cpu_dequant[ypos][xpos].item<double>() << '\n';
+                << out_cpu_dequant[ypos][xpos].item<double>() << "\n";
      std::cout << "vk unmatched output: "
-                << out_vk_to_cpu_dequant[ypos][xpos].item<double>() << '\n';
+                << out_vk_to_cpu_dequant[ypos][xpos].item<double>() << "\n";
    }
  }
  return check;
--- a/Show More
+++ b/Show More