Update operator benchmarks README

2025-11-20 02:24:54 +08:00 · 2025-11-19 07:58:11 +00:00
336 changed files with 3192 additions and 5082 deletions
--- a/.ci/aarch64_linux/README.md
+++ b/.ci/aarch64_linux/README.md
@ -1,19 +0,0 @@
-# Aarch64 (ARM/Graviton) Support Scripts
-Scripts for building aarch64 PyTorch PIP Wheels. These scripts build the following wheels:
-* torch
-* torchvision
-* torchaudio
-* torchtext
-* torchdata
-## Aarch64_ci_build.sh
-This script is design to support CD operations within PyPi manylinux aarch64 container, and be executed in the container. It prepares the container and then executes __aarch64_wheel_ci_build.py__ to build the wheels. The script "assumes" the PyTorch repo is located at: ```/pytorch``` and will put the wheels into ```/artifacts```.
-### Usage
-```DESIRED_PYTHON=<PythonVersion> aarch64_ci_build.sh```
-
-__NOTE:__ CI build is currently __EXPERMINTAL__
-
-## Build_aarch64_wheel.py
-This app allows a person to build using AWS EC3 resources and requires AWS-CLI and Boto3 with AWS credentials to support building EC2 instances for the wheel builds. Can be used in a codebuild CD or from a local system.
-
-### Usage
-```build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>```
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -1,53 +0,0 @@
-#!/bin/bash
-set -eux -o pipefail
-
-GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
-
-# Set CUDA architecture lists to match x86 build_cuda.sh
-if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0"
-elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
-elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
-elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
-fi
-
-# Compress the fatbin with -compress-mode=size for CUDA 13
-if [[ "$DESIRED_CUDA" == *"13"* ]]; then
-    export TORCH_NVCC_FLAGS="-compress-mode=size"
-    # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
-    export BUILD_BUNDLE_PTXAS=1
-fi
-
-SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
-source $SCRIPTPATH/aarch64_ci_setup.sh
-
-###############################################################################
-# Run aarch64 builder python
-###############################################################################
-cd /
-# adding safe directory for git as the permissions will be
-# on the mounted pytorch repo
-git config --global --add safe.directory /pytorch
-pip install -r /pytorch/requirements.txt
-pip install auditwheel==6.2.0 wheel
-if [ "$DESIRED_CUDA" = "cpu" ]; then
-    echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
-    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
-else
-    echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
-    export USE_SYSTEM_NCCL=1
-
-    # Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic)
-    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
-        echo "Bundling CUDA libraries with wheel for aarch64."
-    else
-        echo "Using nvidia libs from pypi for aarch64."
-        echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
-        export USE_NVIDIA_PYPI_LIBS=1
-    fi
-
-    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
-fi
--- a/.ci/aarch64_linux/aarch64_ci_setup.sh
+++ b/.ci/aarch64_linux/aarch64_ci_setup.sh
@ -1,21 +0,0 @@
-#!/bin/bash
-set -eux -o pipefail
-
-# This script is used to prepare the Docker container for aarch64_ci_wheel_build.py python script
-# By creating symlinks from desired /opt/python to /usr/local/bin/
-
-NUMPY_VERSION=2.0.2
-if [[ "$DESIRED_PYTHON"  == "3.13" || "$DESIRED_PYTHON" == "3.13t" ]]; then
-    NUMPY_VERSION=2.1.2
-fi
-
-SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
-source $SCRIPTPATH/../manywheel/set_desired_python.sh
-
-pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2
-
-for tool in python python3 pip pip3 ninja scons patchelf; do
-    ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
-done
-
-python --version
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -1,333 +0,0 @@
-#!/usr/bin/env python3
-# encoding: UTF-8
-
-import os
-import shutil
-from subprocess import check_call, check_output
-
-
-def list_dir(path: str) -> list[str]:
-    """'
-    Helper for getting paths for Python
-    """
-    return check_output(["ls", "-1", path]).decode().split("\n")
-
-
-def replace_tag(filename) -> None:
-    with open(filename) as f:
-        lines = f.readlines()
-    for i, line in enumerate(lines):
-        if line.startswith("Tag:"):
-            lines[i] = line.replace("-linux_", "-manylinux_2_28_")
-            print(f"Updated tag from {line} to {lines[i]}")
-            break
-
-    with open(filename, "w") as f:
-        f.writelines(lines)
-
-
-def patch_library_rpath(
-    folder: str,
-    lib_name: str,
-    use_nvidia_pypi_libs: bool = False,
-    desired_cuda: str = "",
-) -> None:
-    """Apply patchelf to set RPATH for a library in torch/lib"""
-    lib_path = f"{folder}/tmp/torch/lib/{lib_name}"
-
-    if use_nvidia_pypi_libs:
-        # For PyPI NVIDIA libraries, construct CUDA RPATH
-        cuda_rpaths = [
-            "$ORIGIN/../../nvidia/cudnn/lib",
-            "$ORIGIN/../../nvidia/nvshmem/lib",
-            "$ORIGIN/../../nvidia/nccl/lib",
-            "$ORIGIN/../../nvidia/cusparselt/lib",
-        ]
-
-        if "130" in desired_cuda:
-            cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib")
-        else:
-            cuda_rpaths.extend(
-                [
-                    "$ORIGIN/../../nvidia/cublas/lib",
-                    "$ORIGIN/../../nvidia/cuda_cupti/lib",
-                    "$ORIGIN/../../nvidia/cuda_nvrtc/lib",
-                    "$ORIGIN/../../nvidia/cuda_runtime/lib",
-                    "$ORIGIN/../../nvidia/cufft/lib",
-                    "$ORIGIN/../../nvidia/curand/lib",
-                    "$ORIGIN/../../nvidia/cusolver/lib",
-                    "$ORIGIN/../../nvidia/cusparse/lib",
-                    "$ORIGIN/../../nvidia/nvtx/lib",
-                    "$ORIGIN/../../nvidia/cufile/lib",
-                ]
-            )
-
-        # Add $ORIGIN for local torch libs
-        rpath = ":".join(cuda_rpaths) + ":$ORIGIN"
-    else:
-        # For bundled libraries, just use $ORIGIN
-        rpath = "$ORIGIN"
-
-    if os.path.exists(lib_path):
-        os.system(
-            f"cd {folder}/tmp/torch/lib/; "
-            f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}"
-        )
-
-
-def copy_and_patch_library(
-    src_path: str,
-    folder: str,
-    use_nvidia_pypi_libs: bool = False,
-    desired_cuda: str = "",
-) -> None:
-    """Copy a library to torch/lib and patch its RPATH"""
-    if os.path.exists(src_path):
-        lib_name = os.path.basename(src_path)
-        shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}")
-        patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
-
-
-def package_cuda_wheel(wheel_path, desired_cuda) -> None:
-    """
-    Package the cuda wheel libraries
-    """
-    folder = os.path.dirname(wheel_path)
-    os.mkdir(f"{folder}/tmp")
-    os.system(f"unzip {wheel_path} -d {folder}/tmp")
-    # Delete original wheel since it will be repackaged
-    os.system(f"rm {wheel_path}")
-
-    # Check if we should use PyPI NVIDIA libraries or bundle system libraries
-    use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
-
-    if use_nvidia_pypi_libs:
-        print("Using nvidia libs from pypi - skipping CUDA library bundling")
-        # For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages
-        # We only need to bundle non-NVIDIA libraries
-        minimal_libs_to_copy = [
-            "/lib64/libgomp.so.1",
-            "/usr/lib64/libgfortran.so.5",
-            "/acl/build/libarm_compute.so",
-            "/acl/build/libarm_compute_graph.so",
-            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_lapack_core.so.0",
-            "/usr/local/lib/libnvpl_blas_core.so.0",
-        ]
-
-        # Copy minimal libraries to unzipped_folder/torch/lib
-        for lib_path in minimal_libs_to_copy:
-            copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
-
-        # Patch torch libraries used for searching libraries
-        torch_libs_to_patch = [
-            "libtorch.so",
-            "libtorch_cpu.so",
-            "libtorch_cuda.so",
-            "libtorch_cuda_linalg.so",
-            "libtorch_global_deps.so",
-            "libtorch_python.so",
-            "libtorch_nvshmem.so",
-            "libc10.so",
-            "libc10_cuda.so",
-            "libcaffe2_nvrtc.so",
-            "libshm.so",
-        ]
-        for lib_name in torch_libs_to_patch:
-            patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
-    else:
-        print("Bundling CUDA libraries with wheel")
-        # Original logic for bundling system CUDA libraries
-        # Common libraries for all CUDA versions
-        common_libs = [
-            # Non-NVIDIA system libraries
-            "/lib64/libgomp.so.1",
-            "/usr/lib64/libgfortran.so.5",
-            "/acl/build/libarm_compute.so",
-            "/acl/build/libarm_compute_graph.so",
-            # Common CUDA libraries (same for all versions)
-            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_lapack_core.so.0",
-            "/usr/local/lib/libnvpl_blas_core.so.0",
-            "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
-            "/usr/local/cuda/lib64/libcudnn.so.9",
-            "/usr/local/cuda/lib64/libcusparseLt.so.0",
-            "/usr/local/cuda/lib64/libcurand.so.10",
-            "/usr/local/cuda/lib64/libnccl.so.2",
-            "/usr/local/cuda/lib64/libnvshmem_host.so.3",
-            "/usr/local/cuda/lib64/libcudnn_adv.so.9",
-            "/usr/local/cuda/lib64/libcudnn_cnn.so.9",
-            "/usr/local/cuda/lib64/libcudnn_graph.so.9",
-            "/usr/local/cuda/lib64/libcudnn_ops.so.9",
-            "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
-            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
-            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
-            "/usr/local/cuda/lib64/libcufile.so.0",
-            "/usr/local/cuda/lib64/libcufile_rdma.so.1",
-            "/usr/local/cuda/lib64/libcusparse.so.12",
-        ]
-
-        # CUDA version-specific libraries
-        if "13" in desired_cuda:
-            minor_version = desired_cuda[-1]
-            version_specific_libs = [
-                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
-                "/usr/local/cuda/lib64/libcublas.so.13",
-                "/usr/local/cuda/lib64/libcublasLt.so.13",
-                "/usr/local/cuda/lib64/libcudart.so.13",
-                "/usr/local/cuda/lib64/libcufft.so.12",
-                "/usr/local/cuda/lib64/libcusolver.so.12",
-                "/usr/local/cuda/lib64/libnvJitLink.so.13",
-                "/usr/local/cuda/lib64/libnvrtc.so.13",
-                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
-            ]
-        elif "12" in desired_cuda:
-            # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
-            minor_version = desired_cuda[-1]
-            version_specific_libs = [
-                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
-                "/usr/local/cuda/lib64/libcublas.so.12",
-                "/usr/local/cuda/lib64/libcublasLt.so.12",
-                "/usr/local/cuda/lib64/libcudart.so.12",
-                "/usr/local/cuda/lib64/libcufft.so.11",
-                "/usr/local/cuda/lib64/libcusolver.so.11",
-                "/usr/local/cuda/lib64/libnvJitLink.so.12",
-                "/usr/local/cuda/lib64/libnvrtc.so.12",
-                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
-            ]
-        else:
-            raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")
-
-        # Combine all libraries
-        libs_to_copy = common_libs + version_specific_libs
-
-        # Copy libraries to unzipped_folder/torch/lib
-        for lib_path in libs_to_copy:
-            copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
-
-    # Make sure the wheel is tagged with manylinux_2_28
-    for f in os.scandir(f"{folder}/tmp/"):
-        if f.is_dir() and f.name.endswith(".dist-info"):
-            replace_tag(f"{f.path}/WHEEL")
-            break
-
-    os.system(f"wheel pack {folder}/tmp/ -d {folder}")
-    os.system(f"rm -rf {folder}/tmp/")
-
-
-def complete_wheel(folder: str) -> str:
-    """
-    Complete wheel build and put in artifact location
-    """
-    wheel_name = list_dir(f"/{folder}/dist")[0]
-
-    # Please note for cuda we don't run auditwheel since we use custom script to package
-    # the cuda dependencies to the wheel file using update_wheel() method.
-    # However we need to make sure filename reflects the correct Manylinux platform.
-    if "pytorch" in folder and not enable_cuda:
-        print("Repairing Wheel with AuditWheel")
-        check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
-        repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0]
-
-        print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
-        os.rename(
-            f"/{folder}/wheelhouse/{repaired_wheel_name}",
-            f"/{folder}/dist/{repaired_wheel_name}",
-        )
-    else:
-        repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
-
-    print(f"Copying {repaired_wheel_name} to artifacts")
-    shutil.copy2(
-        f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}"
-    )
-
-    return repaired_wheel_name
-
-
-def parse_arguments():
-    """
-    Parse inline arguments
-    """
-    from argparse import ArgumentParser
-
-    parser = ArgumentParser("AARCH64 wheels python CD")
-    parser.add_argument("--debug", action="store_true")
-    parser.add_argument("--build-only", action="store_true")
-    parser.add_argument("--test-only", type=str)
-    parser.add_argument("--enable-mkldnn", action="store_true")
-    parser.add_argument("--enable-cuda", action="store_true")
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    """
-    Entry Point
-    """
-    args = parse_arguments()
-    enable_mkldnn = args.enable_mkldnn
-    enable_cuda = args.enable_cuda
-    branch = check_output(
-        ["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd="/pytorch"
-    ).decode()
-
-    print("Building PyTorch wheel")
-    build_vars = ""
-    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
-    if enable_cuda:
-        build_vars += "MAX_JOBS=5 "
-
-        # Handle PyPI NVIDIA libraries vs bundled libraries
-        use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
-        if use_nvidia_pypi_libs:
-            print("Configuring build for PyPI NVIDIA libraries")
-            # Configure for dynamic linking (matching x86 logic)
-            build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 "
-        else:
-            print("Configuring build for bundled NVIDIA libraries")
-            # Keep existing static linking approach - already configured above
-
-    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
-    desired_cuda = os.getenv("DESIRED_CUDA")
-    if override_package_version is not None:
-        version = override_package_version
-        build_vars += (
-            f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
-        )
-    elif branch in ["nightly", "main"]:
-        build_date = (
-            check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
-            .decode()
-            .replace("-", "")
-        )
-        version = (
-            check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
-        )
-        if enable_cuda:
-            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
-        else:
-            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
-    elif branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
-
-    if enable_mkldnn:
-        print("build pytorch with mkldnn+acl backend")
-        build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
-        build_vars += "ACL_ROOT_DIR=/acl "
-        if enable_cuda:
-            build_vars += "BLAS=NVPL "
-        else:
-            build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
-    else:
-        print("build pytorch without mkldnn backend")
-
-    os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
-    if enable_cuda:
-        print("Updating Cuda Dependency")
-        filename = os.listdir("/pytorch/dist/")
-        wheel_path = f"/pytorch/dist/{filename[0]}"
-        package_cuda_wheel(wheel_path, desired_cuda)
-    pytorch_wheel_name = complete_wheel("/pytorch/")
-    print(f"Build Complete. Created {pytorch_wheel_name}..")
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
@ -1,999 +0,0 @@
-#!/usr/bin/env python3
-
-# This script is for building  AARCH64 wheels using AWS EC2 instances.
-# To generate binaries for the release follow these steps:
-# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this:
-#         "v1.11.0": ("0.11.0", "rc1"),
-# 2. Run script with following arguments for each of the supported python versions and required tag, for example:
-# build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch v1.11.0-rc3
-
-
-import os
-import subprocess
-import sys
-import time
-from typing import Optional, Union
-
-import boto3
-
-
-# AMI images for us-east-1, change the following based on your ~/.aws/config
-os_amis = {
-    "ubuntu20_04": "ami-052eac90edaa9d08f",  # login_name: ubuntu
-    "ubuntu22_04": "ami-0c6c29c5125214c77",  # login_name: ubuntu
-    "redhat8": "ami-0698b90665a2ddcf1",  # login_name: ec2-user
-}
-
-ubuntu20_04_ami = os_amis["ubuntu20_04"]
-
-
-def compute_keyfile_path(key_name: Optional[str] = None) -> tuple[str, str]:
-    if key_name is None:
-        key_name = os.getenv("AWS_KEY_NAME")
-        if key_name is None:
-            return os.getenv("SSH_KEY_PATH", ""), ""
-
-    homedir_path = os.path.expanduser("~")
-    default_path = os.path.join(homedir_path, ".ssh", f"{key_name}.pem")
-    return os.getenv("SSH_KEY_PATH", default_path), key_name
-
-
-ec2 = boto3.resource("ec2")
-
-
-def ec2_get_instances(filter_name, filter_value):
-    return ec2.instances.filter(
-        Filters=[{"Name": filter_name, "Values": [filter_value]}]
-    )
-
-
-def ec2_instances_of_type(instance_type="t4g.2xlarge"):
-    return ec2_get_instances("instance-type", instance_type)
-
-
-def ec2_instances_by_id(instance_id):
-    rc = list(ec2_get_instances("instance-id", instance_id))
-    return rc[0] if len(rc) > 0 else None
-
-
-def start_instance(
-    key_name, ami=ubuntu20_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50
-):
-    inst = ec2.create_instances(
-        ImageId=ami,
-        InstanceType=instance_type,
-        SecurityGroups=["ssh-allworld"],
-        KeyName=key_name,
-        MinCount=1,
-        MaxCount=1,
-        BlockDeviceMappings=[
-            {
-                "DeviceName": "/dev/sda1",
-                "Ebs": {
-                    "DeleteOnTermination": True,
-                    "VolumeSize": ebs_size,
-                    "VolumeType": "standard",
-                },
-            }
-        ],
-    )[0]
-    print(f"Create instance {inst.id}")
-    inst.wait_until_running()
-    running_inst = ec2_instances_by_id(inst.id)
-    print(f"Instance started at {running_inst.public_dns_name}")
-    return running_inst
-
-
-class RemoteHost:
-    addr: str
-    keyfile_path: str
-    login_name: str
-    container_id: Optional[str] = None
-    ami: Optional[str] = None
-
-    def __init__(self, addr: str, keyfile_path: str, login_name: str = "ubuntu"):
-        self.addr = addr
-        self.keyfile_path = keyfile_path
-        self.login_name = login_name
-
-    def _gen_ssh_prefix(self) -> list[str]:
-        return [
-            "ssh",
-            "-o",
-            "StrictHostKeyChecking=no",
-            "-i",
-            self.keyfile_path,
-            f"{self.login_name}@{self.addr}",
-            "--",
-        ]
-
-    @staticmethod
-    def _split_cmd(args: Union[str, list[str]]) -> list[str]:
-        return args.split() if isinstance(args, str) else args
-
-    def run_ssh_cmd(self, args: Union[str, list[str]]) -> None:
-        subprocess.check_call(self._gen_ssh_prefix() + self._split_cmd(args))
-
-    def check_ssh_output(self, args: Union[str, list[str]]) -> str:
-        return subprocess.check_output(
-            self._gen_ssh_prefix() + self._split_cmd(args)
-        ).decode("utf-8")
-
-    def scp_upload_file(self, local_file: str, remote_file: str) -> None:
-        subprocess.check_call(
-            [
-                "scp",
-                "-i",
-                self.keyfile_path,
-                local_file,
-                f"{self.login_name}@{self.addr}:{remote_file}",
-            ]
-        )
-
-    def scp_download_file(
-        self, remote_file: str, local_file: Optional[str] = None
-    ) -> None:
-        if local_file is None:
-            local_file = "."
-        subprocess.check_call(
-            [
-                "scp",
-                "-i",
-                self.keyfile_path,
-                f"{self.login_name}@{self.addr}:{remote_file}",
-                local_file,
-            ]
-        )
-
-    def start_docker(self, image="quay.io/pypa/manylinux2014_aarch64:latest") -> None:
-        self.run_ssh_cmd("sudo apt-get install -y docker.io")
-        self.run_ssh_cmd(f"sudo usermod -a -G docker {self.login_name}")
-        self.run_ssh_cmd("sudo service docker start")
-        self.run_ssh_cmd(f"docker pull {image}")
-        self.container_id = self.check_ssh_output(
-            f"docker run -t -d -w /root {image}"
-        ).strip()
-
-    def using_docker(self) -> bool:
-        return self.container_id is not None
-
-    def run_cmd(self, args: Union[str, list[str]]) -> None:
-        if not self.using_docker():
-            return self.run_ssh_cmd(args)
-        assert self.container_id is not None
-        docker_cmd = self._gen_ssh_prefix() + [
-            "docker",
-            "exec",
-            "-i",
-            self.container_id,
-            "bash",
-        ]
-        p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE)
-        p.communicate(
-            input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
-                "utf-8"
-            )
-        )
-        rc = p.wait()
-        if rc != 0:
-            raise subprocess.CalledProcessError(rc, docker_cmd)
-
-    def check_output(self, args: Union[str, list[str]]) -> str:
-        if not self.using_docker():
-            return self.check_ssh_output(args)
-        assert self.container_id is not None
-        docker_cmd = self._gen_ssh_prefix() + [
-            "docker",
-            "exec",
-            "-i",
-            self.container_id,
-            "bash",
-        ]
-        p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-        (out, err) = p.communicate(
-            input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
-                "utf-8"
-            )
-        )
-        rc = p.wait()
-        if rc != 0:
-            raise subprocess.CalledProcessError(rc, docker_cmd, output=out, stderr=err)
-        return out.decode("utf-8")
-
-    def upload_file(self, local_file: str, remote_file: str) -> None:
-        if not self.using_docker():
-            return self.scp_upload_file(local_file, remote_file)
-        tmp_file = os.path.join("/tmp", os.path.basename(local_file))
-        self.scp_upload_file(local_file, tmp_file)
-        self.run_ssh_cmd(
-            ["docker", "cp", tmp_file, f"{self.container_id}:/root/{remote_file}"]
-        )
-        self.run_ssh_cmd(["rm", tmp_file])
-
-    def download_file(self, remote_file: str, local_file: Optional[str] = None) -> None:
-        if not self.using_docker():
-            return self.scp_download_file(remote_file, local_file)
-        tmp_file = os.path.join("/tmp", os.path.basename(remote_file))
-        self.run_ssh_cmd(
-            ["docker", "cp", f"{self.container_id}:/root/{remote_file}", tmp_file]
-        )
-        self.scp_download_file(tmp_file, local_file)
-        self.run_ssh_cmd(["rm", tmp_file])
-
-    def download_wheel(
-        self, remote_file: str, local_file: Optional[str] = None
-    ) -> None:
-        if self.using_docker() and local_file is None:
-            basename = os.path.basename(remote_file)
-            local_file = basename.replace(
-                "-linux_aarch64.whl", "-manylinux2014_aarch64.whl"
-            )
-        self.download_file(remote_file, local_file)
-
-    def list_dir(self, path: str) -> list[str]:
-        return self.check_output(["ls", "-1", path]).split("\n")
-
-
-def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
-    import socket
-
-    for i in range(attempt_cnt):
-        try:
-            with socket.create_connection((addr, port), timeout=timeout):
-                return
-        except (ConnectionRefusedError, TimeoutError):  # noqa: PERF203
-            if i == attempt_cnt - 1:
-                raise
-            time.sleep(timeout)
-
-
-def update_apt_repo(host: RemoteHost) -> None:
-    time.sleep(5)
-    host.run_cmd("sudo systemctl stop apt-daily.service || true")
-    host.run_cmd("sudo systemctl stop unattended-upgrades.service || true")
-    host.run_cmd(
-        "while systemctl is-active --quiet apt-daily.service; do sleep 1; done"
-    )
-    host.run_cmd(
-        "while systemctl is-active --quiet unattended-upgrades.service; do sleep 1; done"
-    )
-    host.run_cmd("sudo apt-get update")
-    time.sleep(3)
-    host.run_cmd("sudo apt-get update")
-
-
-def install_condaforge(
-    host: RemoteHost, suffix: str = "latest/download/Miniforge3-Linux-aarch64.sh"
-) -> None:
-    print("Install conda-forge")
-    host.run_cmd(f"curl -OL https://github.com/conda-forge/miniforge/releases/{suffix}")
-    host.run_cmd(f"sh -f {os.path.basename(suffix)} -b")
-    host.run_cmd(f"rm -f {os.path.basename(suffix)}")
-    if host.using_docker():
-        host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc")
-    else:
-        host.run_cmd(
-            [
-                "sed",
-                "-i",
-                "'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH'",
-                ".bashrc",
-            ]
-        )
-
-
-def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
-    if python_version == "3.6":
-        # Python-3.6 EOLed and not compatible with conda-4.11
-        install_condaforge(
-            host, suffix="download/4.10.3-10/Miniforge3-4.10.3-10-Linux-aarch64.sh"
-        )
-        host.run_cmd(f"conda install -y python={python_version} numpy pyyaml")
-    else:
-        install_condaforge(
-            host, suffix="download/4.11.0-4/Miniforge3-4.11.0-4-Linux-aarch64.sh"
-        )
-        # Pytorch-1.10 or older are not compatible with setuptools=59.6 or newer
-        host.run_cmd(
-            f"conda install -y python={python_version} numpy pyyaml setuptools>=59.5.0"
-        )
-
-
-def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
-    host.run_cmd("pip3 install auditwheel")
-    host.run_cmd(
-        "conda install -y patchelf" if use_conda else "sudo apt-get install -y patchelf"
-    )
-    from tempfile import NamedTemporaryFile
-
-    with NamedTemporaryFile() as tmp:
-        tmp.write(embed_library_script.encode("utf-8"))
-        tmp.flush()
-        host.upload_file(tmp.name, "embed_library.py")
-
-    print("Embedding libgomp into wheel")
-    if host.using_docker():
-        host.run_cmd(f"python3 embed_library.py {wheel_name} --update-tag")
-    else:
-        host.run_cmd(f"python3 embed_library.py {wheel_name}")
-
-
-def checkout_repo(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    url: str,
-    git_clone_flags: str,
-    mapping: dict[str, tuple[str, str]],
-) -> Optional[str]:
-    for prefix in mapping:
-        if not branch.startswith(prefix):
-            continue
-        tag = f"v{mapping[prefix][0]}-{mapping[prefix][1]}"
-        host.run_cmd(f"git clone {url} -b {tag} {git_clone_flags}")
-        return mapping[prefix][0]
-
-    host.run_cmd(f"git clone {url} -b {branch} {git_clone_flags}")
-    return None
-
-
-def build_torchvision(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str,
-    run_smoke_tests: bool = True,
-) -> str:
-    print("Checking out TorchVision repo")
-    build_version = checkout_repo(
-        host,
-        branch=branch,
-        url="https://github.com/pytorch/vision",
-        git_clone_flags=git_clone_flags,
-        mapping={
-            "v1.7.1": ("0.8.2", "rc2"),
-            "v1.8.0": ("0.9.0", "rc3"),
-            "v1.8.1": ("0.9.1", "rc1"),
-            "v1.9.0": ("0.10.0", "rc1"),
-            "v1.10.0": ("0.11.1", "rc1"),
-            "v1.10.1": ("0.11.2", "rc1"),
-            "v1.10.2": ("0.11.3", "rc1"),
-            "v1.11.0": ("0.12.0", "rc1"),
-            "v1.12.0": ("0.13.0", "rc4"),
-            "v1.12.1": ("0.13.1", "rc6"),
-            "v1.13.0": ("0.14.0", "rc4"),
-            "v1.13.1": ("0.14.1", "rc2"),
-            "v2.0.0": ("0.15.1", "rc2"),
-            "v2.0.1": ("0.15.2", "rc2"),
-        },
-    )
-    print("Building TorchVision wheel")
-
-    # Please note libnpg and jpeg are required to build image.so extension
-    if use_conda:
-        host.run_cmd("conda install -y libpng jpeg")
-        # Remove .so files to force static linking
-        host.run_cmd(
-            "rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so"
-        )
-        # And patch setup.py to include libz dependency for libpng
-        host.run_cmd(
-            [
-                'sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py'
-            ]
-        )
-
-    build_vars = ""
-    if branch == "nightly":
-        version = host.check_output(
-            ["if [ -f vision/version.txt ]; then cat vision/version.txt; fi"]
-        ).strip()
-        if len(version) == 0:
-            # In older revisions, version was embedded in setup.py
-            version = (
-                host.check_output(["grep", '"version = \'"', "vision/setup.py"])
-                .strip()
-                .split("'")[1][:-2]
-            )
-        build_date = (
-            host.check_output("cd vision && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
-    elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-
-    host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
-    vision_wheel_name = host.list_dir("vision/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))
-
-    print("Copying TorchVision wheel")
-    host.download_wheel(os.path.join("vision", "dist", vision_wheel_name))
-    if run_smoke_tests:
-        host.run_cmd(
-            f"pip3 install {os.path.join('vision', 'dist', vision_wheel_name)}"
-        )
-        host.run_cmd("python3 vision/test/smoke_test.py")
-    print("Delete vision checkout")
-    host.run_cmd("rm -rf vision")
-
-    return vision_wheel_name
-
-
-def build_torchdata(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str = "",
-) -> str:
-    print("Checking out TorchData repo")
-    git_clone_flags += " --recurse-submodules"
-    build_version = checkout_repo(
-        host,
-        branch=branch,
-        url="https://github.com/pytorch/data",
-        git_clone_flags=git_clone_flags,
-        mapping={
-            "v1.13.1": ("0.5.1", ""),
-            "v2.0.0": ("0.6.0", "rc5"),
-            "v2.0.1": ("0.6.1", "rc1"),
-        },
-    )
-    print("Building TorchData wheel")
-    build_vars = ""
-    if branch == "nightly":
-        version = host.check_output(
-            ["if [ -f data/version.txt ]; then cat data/version.txt; fi"]
-        ).strip()
-        build_date = (
-            host.check_output("cd data && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
-    elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-
-    host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")
-    wheel_name = host.list_dir("data/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))
-
-    print("Copying TorchData wheel")
-    host.download_wheel(os.path.join("data", "dist", wheel_name))
-
-    return wheel_name
-
-
-def build_torchtext(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str = "",
-) -> str:
-    print("Checking out TorchText repo")
-    git_clone_flags += " --recurse-submodules"
-    build_version = checkout_repo(
-        host,
-        branch=branch,
-        url="https://github.com/pytorch/text",
-        git_clone_flags=git_clone_flags,
-        mapping={
-            "v1.9.0": ("0.10.0", "rc1"),
-            "v1.10.0": ("0.11.0", "rc2"),
-            "v1.10.1": ("0.11.1", "rc1"),
-            "v1.10.2": ("0.11.2", "rc1"),
-            "v1.11.0": ("0.12.0", "rc1"),
-            "v1.12.0": ("0.13.0", "rc2"),
-            "v1.12.1": ("0.13.1", "rc5"),
-            "v1.13.0": ("0.14.0", "rc3"),
-            "v1.13.1": ("0.14.1", "rc1"),
-            "v2.0.0": ("0.15.1", "rc2"),
-            "v2.0.1": ("0.15.2", "rc2"),
-        },
-    )
-    print("Building TorchText wheel")
-    build_vars = ""
-    if branch == "nightly":
-        version = host.check_output(
-            ["if [ -f text/version.txt ]; then cat text/version.txt; fi"]
-        ).strip()
-        build_date = (
-            host.check_output("cd text && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
-    elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-
-    host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")
-    wheel_name = host.list_dir("text/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))
-
-    print("Copying TorchText wheel")
-    host.download_wheel(os.path.join("text", "dist", wheel_name))
-
-    return wheel_name
-
-
-def build_torchaudio(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str = "",
-) -> str:
-    print("Checking out TorchAudio repo")
-    git_clone_flags += " --recurse-submodules"
-    build_version = checkout_repo(
-        host,
-        branch=branch,
-        url="https://github.com/pytorch/audio",
-        git_clone_flags=git_clone_flags,
-        mapping={
-            "v1.9.0": ("0.9.0", "rc2"),
-            "v1.10.0": ("0.10.0", "rc5"),
-            "v1.10.1": ("0.10.1", "rc1"),
-            "v1.10.2": ("0.10.2", "rc1"),
-            "v1.11.0": ("0.11.0", "rc1"),
-            "v1.12.0": ("0.12.0", "rc3"),
-            "v1.12.1": ("0.12.1", "rc5"),
-            "v1.13.0": ("0.13.0", "rc4"),
-            "v1.13.1": ("0.13.1", "rc2"),
-            "v2.0.0": ("2.0.1", "rc3"),
-            "v2.0.1": ("2.0.2", "rc2"),
-        },
-    )
-    print("Building TorchAudio wheel")
-    build_vars = ""
-    if branch == "nightly":
-        version = (
-            host.check_output(["grep", '"version = \'"', "audio/setup.py"])
-            .strip()
-            .split("'")[1][:-2]
-        )
-        build_date = (
-            host.check_output("cd audio && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
-    elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-
-    host.run_cmd(
-        f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
-        && ./packaging/ffmpeg/build.sh \
-        && {build_vars} python3 -m build --wheel --no-isolation"
-    )
-
-    wheel_name = host.list_dir("audio/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("audio", "dist", wheel_name))
-
-    print("Copying TorchAudio wheel")
-    host.download_wheel(os.path.join("audio", "dist", wheel_name))
-
-    return wheel_name
-
-
-def configure_system(
-    host: RemoteHost,
-    *,
-    compiler: str = "gcc-8",
-    use_conda: bool = True,
-    python_version: str = "3.8",
-) -> None:
-    if use_conda:
-        install_condaforge_python(host, python_version)
-
-    print("Configuring the system")
-    if not host.using_docker():
-        update_apt_repo(host)
-        host.run_cmd("sudo apt-get install -y ninja-build g++ git cmake gfortran unzip")
-    else:
-        host.run_cmd("yum install -y sudo")
-        host.run_cmd("conda install -y ninja scons")
-
-    if not use_conda:
-        host.run_cmd(
-            "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
-        )
-    host.run_cmd("pip3 install dataclasses typing-extensions")
-    if not use_conda:
-        print("Installing Cython + numpy from PyPy")
-        host.run_cmd("sudo pip3 install Cython")
-        host.run_cmd("sudo pip3 install numpy")
-
-
-def build_domains(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str = "",
-) -> tuple[str, str, str, str]:
-    vision_wheel_name = build_torchvision(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-    audio_wheel_name = build_torchaudio(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-    data_wheel_name = build_torchdata(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-    text_wheel_name = build_torchtext(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-    return (vision_wheel_name, audio_wheel_name, data_wheel_name, text_wheel_name)
-
-
-def start_build(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    compiler: str = "gcc-8",
-    use_conda: bool = True,
-    python_version: str = "3.8",
-    pytorch_only: bool = False,
-    pytorch_build_number: Optional[str] = None,
-    shallow_clone: bool = True,
-    enable_mkldnn: bool = False,
-) -> tuple[str, str, str, str, str]:
-    git_clone_flags = " --depth 1 --shallow-submodules" if shallow_clone else ""
-    if host.using_docker() and not use_conda:
-        print("Auto-selecting conda option for docker images")
-        use_conda = True
-    if not host.using_docker():
-        print("Disable mkldnn for host builds")
-        enable_mkldnn = False
-
-    configure_system(
-        host, compiler=compiler, use_conda=use_conda, python_version=python_version
-    )
-
-    if host.using_docker():
-        print("Move libgfortant.a into a standard location")
-        # HACK: pypa gforntran.a is compiled without PIC, which leads to the following error
-        # libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17'  # noqa: E501, B950
-        # Workaround by copying gfortran library from the host
-        host.run_ssh_cmd("sudo apt-get install -y gfortran-8")
-        host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8")
-        host.run_ssh_cmd(
-            [
-                "docker",
-                "cp",
-                "/usr/lib/gcc/aarch64-linux-gnu/8/libgfortran.a",
-                f"{host.container_id}:/opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/",
-            ]
-        )
-
-    print("Checking out PyTorch repo")
-    host.run_cmd(
-        f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
-    )
-
-    host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")
-
-    print("Building PyTorch wheel")
-    build_opts = ""
-    if pytorch_build_number is not None:
-        build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
-    # Breakpad build fails on aarch64
-    build_vars = "USE_BREAKPAD=0 "
-    if branch == "nightly":
-        build_date = (
-            host.check_output("cd pytorch && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        version = host.check_output("cat pytorch/version.txt").strip()[:-2]
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
-    if branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-    if enable_mkldnn:
-        host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
-        print("build pytorch with mkldnn+acl backend")
-        build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
-        build_vars += " BLAS=OpenBLAS"
-        build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
-        build_vars += " ACL_ROOT_DIR=/acl"
-        host.run_cmd(
-            f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
-        )
-        print("Repair the wheel")
-        pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
-        ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
-        host.run_cmd(
-            f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
-        )
-        print("replace the original wheel with the repaired one")
-        pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0]
-        host.run_cmd(
-            f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}"
-        )
-    else:
-        print("build pytorch without mkldnn backend")
-        host.run_cmd(
-            f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
-        )
-
-    print("Deleting build folder")
-    host.run_cmd("cd pytorch && rm -rf build")
-    pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("pytorch", "dist", pytorch_wheel_name))
-    print("Copying the wheel")
-    host.download_wheel(os.path.join("pytorch", "dist", pytorch_wheel_name))
-
-    print("Installing PyTorch wheel")
-    host.run_cmd(f"pip3 install pytorch/dist/{pytorch_wheel_name}")
-
-    if pytorch_only:
-        return (pytorch_wheel_name, None, None, None, None)
-    domain_wheels = build_domains(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-
-    return (pytorch_wheel_name, *domain_wheels)
-
-
-embed_library_script = """
-#!/usr/bin/env python3
-
-from auditwheel.patcher import Patchelf
-from auditwheel.wheeltools import InWheelCtx
-from auditwheel.elfutils import elf_file_filter
-from auditwheel.repair import copylib
-from auditwheel.lddtree import lddtree
-from subprocess import check_call
-import os
-import shutil
-import sys
-from tempfile import TemporaryDirectory
-
-
-def replace_tag(filename):
-   with open(filename, 'r') as f:
-     lines = f.read().split("\\n")
-   for i,line in enumerate(lines):
-       if not line.startswith("Tag: "):
-           continue
-       lines[i] = line.replace("-linux_", "-manylinux2014_")
-       print(f'Updated tag from {line} to {lines[i]}')
-
-   with open(filename, 'w') as f:
-       f.write("\\n".join(lines))
-
-
-class AlignedPatchelf(Patchelf):
-    def set_soname(self, file_name: str, new_soname: str) -> None:
-        check_call(['patchelf', '--page-size', '65536', '--set-soname', new_soname, file_name])
-
-    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
-        check_call(['patchelf', '--page-size', '65536', '--replace-needed', soname, new_soname, file_name])
-
-
-def embed_library(whl_path, lib_soname, update_tag=False):
-    patcher = AlignedPatchelf()
-    out_dir = TemporaryDirectory()
-    whl_name = os.path.basename(whl_path)
-    tmp_whl_name = os.path.join(out_dir.name, whl_name)
-    with InWheelCtx(whl_path) as ctx:
-        torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib')
-        ctx.out_wheel=tmp_whl_name
-        new_lib_path, new_lib_soname = None, None
-        for filename, elf in elf_file_filter(ctx.iter_files()):
-            if not filename.startswith('torch/lib'):
-                continue
-            libtree = lddtree(filename)
-            if lib_soname not in libtree['needed']:
-                continue
-            lib_path = libtree['libs'][lib_soname]['path']
-            if lib_path is None:
-                print(f"Can't embed {lib_soname} as it could not be found")
-                break
-            if lib_path.startswith(torchlib_path):
-                continue
-
-            if new_lib_path is None:
-                new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
-            patcher.replace_needed(filename, lib_soname, new_lib_soname)
-            print(f'Replacing {lib_soname} with {new_lib_soname} for {filename}')
-        if update_tag:
-            # Add manylinux2014 tag
-            for filename in ctx.iter_files():
-                if os.path.basename(filename) != 'WHEEL':
-                    continue
-                replace_tag(filename)
-    shutil.move(tmp_whl_name, whl_path)
-
-
-if __name__ == '__main__':
-    embed_library(sys.argv[1], 'libgomp.so.1', len(sys.argv) > 2 and sys.argv[2] == '--update-tag')
-"""
-
-
-def run_tests(host: RemoteHost, whl: str, branch="main") -> None:
-    print("Configuring the system")
-    update_apt_repo(host)
-    host.run_cmd("sudo apt-get install -y python3-pip git")
-    host.run_cmd("sudo pip3 install Cython")
-    host.run_cmd("sudo pip3 install numpy")
-    host.upload_file(whl, ".")
-    host.run_cmd(f"sudo pip3 install {whl}")
-    host.run_cmd("python3 -c 'import torch;print(torch.rand((3,3))'")
-    host.run_cmd(f"git clone -b {branch} https://github.com/pytorch/pytorch")
-    host.run_cmd("cd pytorch/test; python3 test_torch.py -v")
-
-
-def get_instance_name(instance) -> Optional[str]:
-    if instance.tags is None:
-        return None
-    for tag in instance.tags:
-        if tag["Key"] == "Name":
-            return tag["Value"]
-    return None
-
-
-def list_instances(instance_type: str) -> None:
-    print(f"All instances of type {instance_type}")
-    for instance in ec2_instances_of_type(instance_type):
-        ifaces = instance.network_interfaces
-        az = ifaces[0].subnet.availability_zone if len(ifaces) > 0 else None
-        print(
-            f"{instance.id} {get_instance_name(instance)} {instance.public_dns_name} {instance.state['Name']} {az}"
-        )
-
-
-def terminate_instances(instance_type: str) -> None:
-    print(f"Terminating all instances of type {instance_type}")
-    instances = list(ec2_instances_of_type(instance_type))
-    for instance in instances:
-        print(f"Terminating {instance.id}")
-        instance.terminate()
-    print("Waiting for termination to complete")
-    for instance in instances:
-        instance.wait_until_terminated()
-
-
-def parse_arguments():
-    from argparse import ArgumentParser
-
-    parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
-    parser.add_argument("--key-name", type=str)
-    parser.add_argument("--debug", action="store_true")
-    parser.add_argument("--build-only", action="store_true")
-    parser.add_argument("--test-only", type=str)
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument("--os", type=str, choices=list(os_amis.keys()))
-    group.add_argument("--ami", type=str)
-    parser.add_argument(
-        "--python-version",
-        type=str,
-        choices=[f"3.{d}" for d in range(6, 12)],
-        default=None,
-    )
-    parser.add_argument("--alloc-instance", action="store_true")
-    parser.add_argument("--list-instances", action="store_true")
-    parser.add_argument("--pytorch-only", action="store_true")
-    parser.add_argument("--keep-running", action="store_true")
-    parser.add_argument("--terminate-instances", action="store_true")
-    parser.add_argument("--instance-type", type=str, default="t4g.2xlarge")
-    parser.add_argument("--ebs-size", type=int, default=50)
-    parser.add_argument("--branch", type=str, default="main")
-    parser.add_argument("--use-docker", action="store_true")
-    parser.add_argument(
-        "--compiler",
-        type=str,
-        choices=["gcc-7", "gcc-8", "gcc-9", "clang"],
-        default="gcc-8",
-    )
-    parser.add_argument("--use-torch-from-pypi", action="store_true")
-    parser.add_argument("--pytorch-build-number", type=str, default=None)
-    parser.add_argument("--disable-mkldnn", action="store_true")
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_arguments()
-    ami = (
-        args.ami
-        if args.ami is not None
-        else os_amis[args.os]
-        if args.os is not None
-        else ubuntu20_04_ami
-    )
-    keyfile_path, key_name = compute_keyfile_path(args.key_name)
-
-    if args.list_instances:
-        list_instances(args.instance_type)
-        sys.exit(0)
-
-    if args.terminate_instances:
-        terminate_instances(args.instance_type)
-        sys.exit(0)
-
-    if len(key_name) == 0:
-        raise RuntimeError("""
-            Cannot start build without key_name, please specify
-            --key-name argument or AWS_KEY_NAME environment variable.""")
-    if len(keyfile_path) == 0 or not os.path.exists(keyfile_path):
-        raise RuntimeError(f"""
-            Cannot find keyfile with name: [{key_name}] in path: [{keyfile_path}], please
-            check `~/.ssh/` folder or manually set SSH_KEY_PATH environment variable.""")
-
-    # Starting the instance
-    inst = start_instance(
-        key_name, ami=ami, instance_type=args.instance_type, ebs_size=args.ebs_size
-    )
-    instance_name = f"{args.key_name}-{args.os}"
-    if args.python_version is not None:
-        instance_name += f"-py{args.python_version}"
-    inst.create_tags(
-        DryRun=False,
-        Tags=[
-            {
-                "Key": "Name",
-                "Value": instance_name,
-            }
-        ],
-    )
-    addr = inst.public_dns_name
-    wait_for_connection(addr, 22)
-    host = RemoteHost(addr, keyfile_path)
-    host.ami = ami
-    if args.use_docker:
-        update_apt_repo(host)
-        host.start_docker()
-
-    if args.test_only:
-        run_tests(host, args.test_only)
-        sys.exit(0)
-
-    if args.alloc_instance:
-        if args.python_version is None:
-            sys.exit(0)
-        install_condaforge_python(host, args.python_version)
-        sys.exit(0)
-
-    python_version = args.python_version if args.python_version is not None else "3.10"
-
-    if args.use_torch_from_pypi:
-        configure_system(host, compiler=args.compiler, python_version=python_version)
-        print("Installing PyTorch wheel")
-        host.run_cmd("pip3 install torch")
-        build_domains(
-            host, branch=args.branch, git_clone_flags=" --depth 1 --shallow-submodules"
-        )
-    else:
-        start_build(
-            host,
-            branch=args.branch,
-            compiler=args.compiler,
-            python_version=python_version,
-            pytorch_only=args.pytorch_only,
-            pytorch_build_number=args.pytorch_build_number,
-            enable_mkldnn=not args.disable_mkldnn,
-        )
-    if not args.keep_running:
-        print(f"Waiting for instance {inst.id} to terminate")
-        inst.terminate()
-        inst.wait_until_terminated()
--- a/.ci/aarch64_linux/embed_library.py
+++ b/.ci/aarch64_linux/embed_library.py
@ -1,87 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import shutil
-import sys
-from subprocess import check_call
-from tempfile import TemporaryDirectory
-
-from auditwheel.elfutils import elf_file_filter
-from auditwheel.lddtree import lddtree
-from auditwheel.patcher import Patchelf
-from auditwheel.repair import copylib
-from auditwheel.wheeltools import InWheelCtx
-
-
-def replace_tag(filename):
-    with open(filename) as f:
-        lines = f.read().split("\\n")
-    for i, line in enumerate(lines):
-        if not line.startswith("Tag: "):
-            continue
-        lines[i] = line.replace("-linux_", "-manylinux2014_")
-        print(f"Updated tag from {line} to {lines[i]}")
-
-    with open(filename, "w") as f:
-        f.write("\\n".join(lines))
-
-
-class AlignedPatchelf(Patchelf):
-    def set_soname(self, file_name: str, new_soname: str) -> None:
-        check_call(
-            ["patchelf", "--page-size", "65536", "--set-soname", new_soname, file_name]
-        )
-
-    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
-        check_call(
-            [
-                "patchelf",
-                "--page-size",
-                "65536",
-                "--replace-needed",
-                soname,
-                new_soname,
-                file_name,
-            ]
-        )
-
-
-def embed_library(whl_path, lib_soname, update_tag=False):
-    patcher = AlignedPatchelf()
-    out_dir = TemporaryDirectory()
-    whl_name = os.path.basename(whl_path)
-    tmp_whl_name = os.path.join(out_dir.name, whl_name)
-    with InWheelCtx(whl_path) as ctx:
-        torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib")
-        ctx.out_wheel = tmp_whl_name
-        new_lib_path, new_lib_soname = None, None
-        for filename, _ in elf_file_filter(ctx.iter_files()):
-            if not filename.startswith("torch/lib"):
-                continue
-            libtree = lddtree(filename)
-            if lib_soname not in libtree["needed"]:
-                continue
-            lib_path = libtree["libs"][lib_soname]["path"]
-            if lib_path is None:
-                print(f"Can't embed {lib_soname} as it could not be found")
-                break
-            if lib_path.startswith(torchlib_path):
-                continue
-
-            if new_lib_path is None:
-                new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
-            patcher.replace_needed(filename, lib_soname, new_lib_soname)
-            print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}")
-        if update_tag:
-            # Add manylinux2014 tag
-            for filename in ctx.iter_files():
-                if os.path.basename(filename) != "WHEEL":
-                    continue
-                replace_tag(filename)
-    shutil.move(tmp_whl_name, whl_path)
-
-
-if __name__ == "__main__":
-    embed_library(
-        sys.argv[1], "libgomp.so.1", len(sys.argv) > 2 and sys.argv[2] == "--update-tag"
-    )
--- a/.ci/manywheel/build.sh
+++ b/.ci/manywheel/build.sh
@ -4,14 +4,17 @@ set -ex

 SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

+# Source the common build script for architecture-specific configurations (MKLDNN, ACL, etc.)
+source "${SCRIPTPATH}/../pytorch/build.sh" || true
+
 case "${GPU_ARCH_TYPE:-BLANK}" in
-    cuda)
+    cuda | cuda-aarch64)
        bash "${SCRIPTPATH}/build_cuda.sh"
        ;;
    rocm)
        bash "${SCRIPTPATH}/build_rocm.sh"
        ;;
-    cpu | cpu-cxx11-abi | cpu-s390x)
+    cpu | cpu-cxx11-abi | cpu-aarch64 | cpu-s390x)
        bash "${SCRIPTPATH}/build_cpu.sh"
        ;;
    xpu)
--- a/.ci/manywheel/build_common.sh
+++ b/.ci/manywheel/build_common.sh
@ -18,12 +18,31 @@ retry () {
    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
 }

+# Detect architecture first
+ARCH=$(uname -m)
+echo "Detected architecture: $ARCH"
+
 PLATFORM=""
 # TODO move this into the Docker images
 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
 if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    retry yum install -q -y zip openssl
-    PLATFORM="manylinux_2_28_x86_64"
+    # Set platform based on architecture
+    case $ARCH in
+        x86_64)
+            PLATFORM="manylinux_2_28_x86_64"
+            ;;
+        aarch64)
+            PLATFORM="manylinux_2_28_aarch64"
+            ;;
+        s390x)
+            PLATFORM="manylinux_2_28_s390x"
+            ;;
+        *)
+            echo "Unsupported architecture: $ARCH"
+            exit 1
+            ;;
+    esac
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    retry dnf install -q -y zip openssl
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
@ -38,6 +57,8 @@ else
    exit 1
 fi

+echo "Platform set to: $PLATFORM"
+
 # We use the package name to test the package by passing this to 'pip install'
 # This is the env variable that setup.py uses to name the package. Note that
 # pip 'normalizes' the name first by changing all - to _
@ -299,8 +320,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
            # ROCm workaround for roctracer dlopens
            if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
                patchedpath=$(fname_without_so_number $destpath)
-            # Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load
-            elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
+            # Keep the so number for XPU dependencies, libgomp.so.1, libgfortran.so.5, ACL libraries, and NVPL libraries to avoid loading them twice
+            elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" || "$filename" == libarm_compute* || "$filename" == libnvpl* || "$filename" == "libgfortran.so.5" ]]; then
                patchedpath=$destpath
            else
                patchedpath=$(fname_with_sha256 $destpath)
@ -346,9 +367,22 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
    done

    # create Manylinux 2_28 tag this needs to happen before regenerate the RECORD
-    if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
+    # Support all architectures (x86_64, aarch64, s390x)
+    if [[ "$IS_MANYLINUX2_28" == "1" && $GPU_ARCH_TYPE != "xpu" ]]; then
        wheel_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/WHEEL/g')
-        sed -i -e s#linux_x86_64#"${PLATFORM}"# $wheel_file;
+        echo "Updating wheel tag for $ARCH architecture"
+        # Replace linux_* with manylinux_2_28_* based on architecture
+        case $ARCH in
+            x86_64)
+                sed -i -e 's#linux_x86_64#manylinux_2_28_x86_64#g' $wheel_file
+                ;;
+            aarch64)
+                sed -i -e 's#linux_aarch64#manylinux_2_28_aarch64#g' $wheel_file
+                ;;
+            s390x)
+                sed -i -e 's#linux_s390x#manylinux_2_28_s390x#g' $wheel_file
+                ;;
+        esac
    fi

    # regenerate the RECORD file with new hashes
--- a/.ci/manywheel/build_cpu.sh
+++ b/.ci/manywheel/build_cpu.sh
@ -15,6 +15,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    EXTRA_CAFFE2_CMAKE_FLAGS=()
 fi

+# Detect architecture
+ARCH=$(uname -m)
+echo "Building CPU wheel for architecture: $ARCH"
+
 WHEELHOUSE_DIR="wheelhousecpu"
 LIBTORCH_HOUSE_DIR="libtorch_housecpu"
 if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
@ -34,8 +38,10 @@ elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
 elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
-    if [[ "$(uname -m)" == "s390x" ]]; then
+    if [[ "$ARCH" == "s390x" ]]; then
        LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
+    elif [[ "$ARCH" == "aarch64" ]]; then
+        LIBGOMP_PATH="/usr/lib/aarch64-linux-gnu/libgomp.so.1"
    else
        LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
    fi
@ -49,6 +55,32 @@ DEPS_SONAME=(
    "libgomp.so.1"
 )

+# Add ARM-specific library dependencies for CPU builds
+if [[ "$ARCH" == "aarch64" ]]; then
+    echo "Adding ARM-specific CPU library dependencies"
+
+    # ARM Compute Library (if available)
+    if [[ -d "/acl/build" ]]; then
+        echo "Adding ARM Compute Library for CPU"
+        DEPS_LIST+=(
+            "/acl/build/libarm_compute.so"
+            "/acl/build/libarm_compute_graph.so"
+        )
+        DEPS_SONAME+=(
+            "libarm_compute.so"
+            "libarm_compute_graph.so"
+        )
+    fi
+
+    # ARM system libraries
+    DEPS_LIST+=(
+        "/usr/lib64/libgfortran.so.5"
+    )
+    DEPS_SONAME+=(
+        "libgfortran.so.5"
+    )
+fi
+
 rm -rf /usr/local/cuda*

 SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -29,6 +29,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    EXTRA_CAFFE2_CMAKE_FLAGS=()
 fi

+# Detect architecture
+ARCH=$(uname -m)
+echo "Building for architecture: $ARCH"
+
 # Determine CUDA version and architectures to build for
 #
 # NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
@ -53,34 +57,60 @@ fi
 cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
 EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")

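+# Remove the given architectures from a ';'-separated architecture list.
+# Each entry is matched as "<arch>;", so an architecture that is the last
+# element of the list (no trailing ';') is left in place.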
+remove_archs() {
+    local result="$1"
+    shift
+    for arch in "$@"; do
+        result="${result//${arch};/}"
+    done
+    echo "$result"
+}
+
+# Function to filter CUDA architectures for aarch64
+# aarch64 builds only need the compute capabilities of GPUs that are paired with ARM hosts
+# Keep: 8.0 (A100), 9.0+ (Hopper, Grace Hopper, newer)
+# Remove: < 8.0 (no ARM GPUs), 8.6 (x86_64 RTX 3090/A6000 only)
+filter_aarch64_archs() {
+    local arch_list="$1"
+    # Explicitly remove architectures not needed on aarch64
+    arch_list=$(remove_archs "$arch_list" "5.0" "6.0" "7.0" "7.5" "8.6")
+    echo "$arch_list"
+}
+
+# Base: Common architectures across all modern CUDA versions
+TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0"
+
 case ${CUDA_VERSION} in
-    #removing sm_50-sm_60 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
-    #however we would like to keep sm_70 architecture see: https://github.com/pytorch/pytorch/issues/157517
-    12.8)
-        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0"
-        ;;
-    12.9)
-        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX"
-        # WAR to resolve the ld error in libtorch build with CUDA 12.9
+    12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;${TORCH_CUDA_ARCH_LIST}" ;;  # Only 12.6 includes Legacy Maxwell/Pascal that will be removed in future releases
+    12.8) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0" ;;  # +Blackwell (10.0, 12.0) support; Hopper (9.0) is already in the base list
+    12.9) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0+PTX" # +Blackwell (10.0, 12.0) support + PTX for forward compatibility
        if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
-            TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
+            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//7.0;/}"  # Remove 7.0 to resolve the ld error
+            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//8.6;/}"  # Remove 8.6 for libtorch
        fi
        ;;
    13.0)
-        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
-        ;;
-    12.6)
-        TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
-        ;;
-    *)
-        echo "unknown cuda version $CUDA_VERSION"
-        exit 1
+        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0+PTX"
+        export TORCH_NVCC_FLAGS="-compress-mode=size"
+        export BUILD_BUNDLE_PTXAS=1
        ;;
+    *) echo "unknown cuda version $CUDA_VERSION"; exit 1 ;;
 esac

+# Filter for aarch64: Remove < 8.0 and 8.6
+[[ "$ARCH" == "aarch64" ]] && TORCH_CUDA_ARCH_LIST=$(filter_aarch64_archs "$TORCH_CUDA_ARCH_LIST")
+
+echo "TORCH_CUDA_ARCH_LIST set to: $TORCH_CUDA_ARCH_LIST"
 export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 echo "${TORCH_CUDA_ARCH_LIST}"

+# Disable MAGMA for aarch64 as pre-built libraries are x86-64 only
+if [[ "$ARCH" == "aarch64" ]]; then
+    echo "Disabling MAGMA for aarch64 architecture"
+    export USE_MAGMA=0
+fi
+
 # Package directories
 WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot"
 LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot"
@ -244,6 +274,51 @@ else
    exit 1
 fi

+# Add ARM-specific library dependencies
+if [[ "$ARCH" == "aarch64" ]]; then
+    echo "Adding ARM-specific library dependencies"
+
+    # ARM Compute Library (if available)
+    if [[ -d "/acl/build" ]]; then
+        echo "Adding ARM Compute Library"
+        DEPS_LIST+=(
+            "/acl/build/libarm_compute.so"
+            "/acl/build/libarm_compute_graph.so"
+        )
+        DEPS_SONAME+=(
+            "libarm_compute.so"
+            "libarm_compute_graph.so"
+        )
+    fi
+
+    # ARM system libraries
+    DEPS_LIST+=(
+        "/lib64/libgomp.so.1"
+        "/usr/lib64/libgfortran.so.5"
+    )
+    DEPS_SONAME+=(
+        "libgomp.so.1"
+        "libgfortran.so.5"
+    )
+
+    # NVPL libraries (ARM optimized BLAS/LAPACK)
+    if [[ -d "/usr/local/lib" && -f "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0" ]]; then
+        echo "Adding NVPL libraries for ARM"
+        DEPS_LIST+=(
+            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0"
+            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0"
+            "/usr/local/lib/libnvpl_lapack_core.so.0"
+            "/usr/local/lib/libnvpl_blas_core.so.0"
+        )
+        DEPS_SONAME+=(
+            "libnvpl_lapack_lp64_gomp.so.0"
+            "libnvpl_blas_lp64_gomp.so.0"
+            "libnvpl_lapack_core.so.0"
+            "libnvpl_blas_core.so.0"
+        )
+    fi
+fi
+
 # run_tests.sh requires DESIRED_CUDA to know what tests to exclude
 export DESIRED_CUDA="$cuda_version_nodot"

@ -251,9 +326,11 @@ export DESIRED_CUDA="$cuda_version_nodot"
 rm -rf /usr/local/cuda || true
 ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda

-# Switch `/usr/local/magma` to the desired CUDA version
-rm -rf /usr/local/magma || true
-ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
+# Switch `/usr/local/magma` to the desired CUDA version (skip for aarch64)
+if [[ "$ARCH" != "aarch64" ]]; then
+    rm -rf /usr/local/magma || true
+    ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
+fi

 export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
 export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -86,10 +86,20 @@ else
  fi
 fi

+# Enable MKLDNN with ARM Compute Library for ARM builds
 if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
  export USE_MKLDNN=1
+
+  # ACL is required for aarch64 builds
+  if [[ ! -d "/acl" ]]; then
+    echo "ERROR: ARM Compute Library not found at /acl"
+    echo "ACL is required for aarch64 builds. Check Docker image setup."
+    exit 1
+  fi
+
  export USE_MKLDNN_ACL=1
  export ACL_ROOT_DIR=/acl
+  echo "ARM Compute Library enabled for MKLDNN: ACL_ROOT_DIR=/acl"
 fi

 if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-94631807d22c09723dd006f7be5beb649d5f88d0
+e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@ -260,11 +260,8 @@ jobs:
            "${DOCKER_IMAGE}"
          )
          docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
-          if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
-            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
-          else
-            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
-          fi
+          # Unified build script for all architectures (x86_64, aarch64, s390x)
+          docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"

      - name: Chown artifacts
        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
--- a/aten/src/ATen/LegacyBatchedTensorImpl.h
+++ b/aten/src/ATen/LegacyBatchedTensorImpl.h
@ -144,7 +144,7 @@ inline std::bitset<kVmapNumLevels> createVmapLevelsBitset(BatchDimsRef bdims) {
 }

 inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) {
-  out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ")";
+  out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ')';
  return out;
 }

--- a/aten/src/ATen/TensorIndexing.cpp
+++ b/aten/src/ATen/TensorIndexing.cpp
@ -9,7 +9,7 @@ namespace indexing {
 const EllipsisIndexType Ellipsis = EllipsisIndexType();

 std::ostream& operator<<(std::ostream& stream, const Slice& slice) {
-  stream << slice.start() << ":" << slice.stop() << ":" << slice.step();
+  stream << slice.start() << ':' << slice.stop() << ':' << slice.step();
  return stream;
 }

@ -31,12 +31,12 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index)
 }

 std::ostream& operator<<(std::ostream& stream, const std::vector<TensorIndex>& tensor_indices) {
-  stream << "(";
+  stream << '(';
  for (const auto i : c10::irange(tensor_indices.size())) {
    stream << tensor_indices[i];
    if (i < tensor_indices.size() - 1) stream << ", ";
  }
-  stream << ")";
+  stream << ')';
  return stream;
 }

--- a/aten/src/ATen/TensorNames.cpp
+++ b/aten/src/ATen/TensorNames.cpp
@ -113,7 +113,7 @@ void TensorNames::checkUnique(const char* op_name) const {
 std::ostream& operator<<(std::ostream& out, const TensorName& tensorname) {
  out << tensorname.name_ << " (index ";
  out << tensorname.origin_idx_ << " of ";
-  out << tensorname.origin_ << ")";
+  out << tensorname.origin_ << ')';
  return out;
 }

--- a/aten/src/ATen/TensorUtils.cpp
+++ b/aten/src/ATen/TensorUtils.cpp
@ -13,9 +13,9 @@ std::ostream& operator<<(std::ostream & out, const TensorGeometryArg& t) {
  if (t.pos == 0) {
    // 0 is distinguished; it usually indicates 'self' or the return
    // tensor
-    out << "'" << t.name << "'";
+    out << '\'' << t.name << '\'';
  } else {
-    out << "argument #" << t.pos << " '" << t.name << "'";
+    out << "argument #" << t.pos << " '" << t.name << '\'';
  }
  return out;
 }
@ -154,7 +154,7 @@ void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) {
      oss << "Tensor for " << t2 << " is on CPU, ";
    }
    oss << "but expected " << ((!t1->is_cpu() && !t2->is_cpu()) ? "them" : "it")
-        << " to be on GPU (while checking arguments for " << c << ")";
+        << " to be on GPU (while checking arguments for " << c << ')';
    TORCH_CHECK(false, oss.str());
  }
  TORCH_CHECK(
@ -199,7 +199,7 @@ void checkScalarTypes(CheckedFrom c, const TensorArg& t,
        i++;
      }
      oss << "; but got " << t->toString()
-          << " instead (while checking arguments for " << c << ")";
+          << " instead (while checking arguments for " << c << ')';
      TORCH_CHECK(false, oss.str());
    }
 }
--- a/aten/src/ATen/Version.cpp
+++ b/aten/src/ATen/Version.cpp
@ -43,8 +43,8 @@ std::string get_mkldnn_version() {
    // https://github.com/intel/ideep/issues/29
    {
      const dnnl_version_t* ver = dnnl_version();
-      ss << "Intel(R) MKL-DNN v" << ver->major << "." << ver->minor << "." << ver->patch
-         << " (Git Hash " << ver->hash << ")";
+      ss << "Intel(R) MKL-DNN v" << ver->major << '.' << ver->minor << '.' << ver->patch
+         << " (Git Hash " << ver->hash << ')';
    }
  #else
    ss << "MKLDNN not found";
@ -81,7 +81,7 @@ std::string get_openmp_version() {
          break;
      }
      if (ver_str) {
-        ss << " (a.k.a. OpenMP " << ver_str << ")";
+        ss << " (a.k.a. OpenMP " << ver_str << ')';
      }
    }
  #else
@ -135,38 +135,38 @@ std::string show_config() {

 #if defined(__GNUC__)
  {
-    ss << "  - GCC " << __GNUC__ << "." << __GNUC_MINOR__ << "\n";
+    ss << "  - GCC " << __GNUC__ << '.' << __GNUC_MINOR__ << '\n';
  }
 #endif

 #if defined(__cplusplus)
  {
-    ss << "  - C++ Version: " << __cplusplus << "\n";
+    ss << "  - C++ Version: " << __cplusplus << '\n';
  }
 #endif

 #if defined(__clang_major__)
  {
-    ss << "  - clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__ << "\n";
+    ss << "  - clang " << __clang_major__ << '.' << __clang_minor__ << '.' << __clang_patchlevel__ << '\n';
  }
 #endif

 #if defined(_MSC_VER)
  {
-    ss << "  - MSVC " << _MSC_FULL_VER << "\n";
+    ss << "  - MSVC " << _MSC_FULL_VER << '\n';
  }
 #endif

 #if AT_MKL_ENABLED()
-  ss << "  - " << get_mkl_version() << "\n";
+  ss << "  - " << get_mkl_version() << '\n';
 #endif

 #if AT_MKLDNN_ENABLED()
-  ss << "  - " << get_mkldnn_version() << "\n";
+  ss << "  - " << get_mkldnn_version() << '\n';
 #endif

 #ifdef _OPENMP
-  ss << "  - " << get_openmp_version() << "\n";
+  ss << "  - " << get_openmp_version() << '\n';
 #endif

 #if AT_BUILD_WITH_LAPACK()
@ -183,7 +183,7 @@ std::string show_config() {
  ss << "  - Cross compiling on MacOSX\n";
 #endif

-  ss << "  - "<< used_cpu_capability() << "\n";
+  ss << "  - "<< used_cpu_capability() << '\n';

  if (hasCUDA()) {
    ss << detail::getCUDAHooks().showConfig();
@ -200,10 +200,10 @@ std::string show_config() {
  ss << "  - Build settings: ";
  for (const auto& pair : caffe2::GetBuildOptions()) {
    if (!pair.second.empty()) {
-      ss << pair.first << "=" << pair.second << ", ";
+      ss << pair.first << '=' << pair.second << ", ";
    }
  }
-  ss << "\n";
+  ss << '\n';

  // TODO: do HIP
  // TODO: do XLA
--- a/aten/src/ATen/code_template.h
+++ b/aten/src/ATen/code_template.h
@ -209,7 +209,7 @@ struct CodeTemplate {
  // to indent correctly in the context.
  void emitIndent(std::ostream& out, size_t indent) const {
    for ([[maybe_unused]] const auto i : c10::irange(indent)) {
-      out << " ";
+      out << ' ';
    }
  }
  void emitStringWithIndents(
--- a/aten/src/ATen/core/Dimname.cpp
+++ b/aten/src/ATen/core/Dimname.cpp
@ -10,7 +10,7 @@ std::ostream& operator<<(std::ostream& out, const Dimname& dimname) {
  if (dimname.type() == NameType::WILDCARD) {
    out << "None";
  } else {
-    out << "'" << dimname.symbol().toUnqualString() << "'";
+    out << '\'' << dimname.symbol().toUnqualString() << '\'';
  }
  return out;
 }
--- a/aten/src/ATen/core/Range.cpp
+++ b/aten/src/ATen/core/Range.cpp
@ -5,7 +5,7 @@
 namespace at {

 std::ostream& operator<<(std::ostream& out, const Range& range) {
-  out << "Range[" << range.begin << ", " << range.end << "]";
+  out << "Range[" << range.begin << ", " << range.end << ']';
  return out;
 }

--- a/aten/src/ATen/core/Tensor.cpp
+++ b/aten/src/ATen/core/Tensor.cpp
@ -71,7 +71,7 @@ void TensorBase::enforce_invariants() {

 void TensorBase::print() const {
  if (defined()) {
-    std::cerr << "[" << toString() << " " << sizes() << "]" << '\n';
+    std::cerr << '[' << toString() << ' ' << sizes() << ']' << '\n';
  } else {
    std::cerr << "[UndefinedTensor]" << '\n';
  }
--- a/aten/src/ATen/core/TensorBase.h
+++ b/aten/src/ATen/core/TensorBase.h
@ -245,9 +245,6 @@ class TORCH_API TensorBase {
  size_t weak_use_count() const noexcept {
    return impl_.weak_use_count();
  }
-  bool is_uniquely_owned() const noexcept {
-    return impl_.is_uniquely_owned();
-  }

  std::string toString() const;

--- a/aten/src/ATen/core/Vitals.cpp
+++ b/aten/src/ATen/core/Vitals.cpp
@ -9,8 +9,8 @@ APIVitals VitalsAPI;

 std::ostream& operator<<(std::ostream& os, TorchVital const& tv) {
  for (const auto& m : tv.attrs) {
-    os << "[TORCH_VITAL] " << tv.name << "." << m.first << "\t\t "
-       << m.second.value << "\n";
+    os << "[TORCH_VITAL] " << tv.name << '.' << m.first << "\t\t "
+       << m.second.value << '\n';
  }
  return os;
 }
--- a/aten/src/ATen/core/alias_info.h
+++ b/aten/src/ATen/core/alias_info.h
@ -100,18 +100,18 @@ inline bool operator==(const AliasInfo& lhs, const AliasInfo& rhs) {

 // this does match the way things are represented in the schema
 inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
-  out << "(";
+  out << '(';
  bool first = true;
  for (const auto& set : aliasInfo.beforeSets()) {
    if (first) {
      first = false;
    } else {
-      out << "|";
+      out << '|';
    }
    out << set.toUnqualString();
  }
  if (aliasInfo.isWrite()) {
-    out << "!";
+    out << '!';
  }
  if (aliasInfo.beforeSets() != aliasInfo.afterSets()) {
    out << " -> ";
@ -120,12 +120,12 @@ inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
      if (first) {
        first = false;
      } else {
-        out << "|";
+        out << '|';
      }
      out << set.toUnqualString();
    }
  }
-  out << ")";
+  out << ')';
  return out;
 }
 } // namespace c10
--- a/aten/src/ATen/core/blob.h
+++ b/aten/src/ATen/core/blob.h
@ -198,7 +198,7 @@ inline void swap(Blob& lhs, Blob& rhs)  noexcept {
 }

 inline std::ostream& operator<<(std::ostream& out, const Blob& v) {
-  return out << "Blob[" << v.TypeName() << "]";
+  return out << "Blob[" << v.TypeName() << ']';
 }

 } // namespace caffe2
--- a/aten/src/ATen/core/class_type.cpp
+++ b/aten/src/ATen/core/class_type.cpp
@ -456,8 +456,8 @@ bool ClassType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const {
          *why_not << "Method on class '" << repr_str()
                   << "' (1) is not compatible with interface '"
                   << rhs.repr_str() << "' (2)\n"
-                   << "  (1) " << self_method->getSchema() << "\n"
-                   << "  (2) " << schema << "\n";
+                   << "  (1) " << self_method->getSchema() << '\n'
+                   << "  (2) " << schema << '\n';
        }
        return false;
      }
--- a/aten/src/ATen/core/class_type.h
+++ b/aten/src/ATen/core/class_type.h
@ -100,7 +100,7 @@ struct TORCH_API ClassType : public NamedType {
  std::string repr_str() const override {
    std::stringstream ss;
    ss << str()
-       << " (of Python compilation unit at: " << compilation_unit().get() << ")";
+       << " (of Python compilation unit at: " << compilation_unit().get() << ')';
    return ss.str();
  }

--- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp
+++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp
@ -58,12 +58,12 @@ std::string DispatchKeyExtractor::dumpState() const {
  std::ostringstream oss;
  for (const auto i : c10::irange(c10::utils::bitset::NUM_BITS())) {
    if (dispatch_arg_indices_reverse_.get(i)) {
-      oss << "1";
+      oss << '1';
    } else {
-      oss << "0";
+      oss << '0';
    }
  }
-  oss << " " << nonFallthroughKeys_ << "\n";
+  oss << ' ' << nonFallthroughKeys_ << '\n';
  return oss.str();
 }

--- a/aten/src/ATen/core/dispatch/Dispatcher.cpp
+++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp
@ -69,8 +69,8 @@ private:

 void _print_dispatch_trace(const std::string& label, const std::string& op_name, const DispatchKeySet& dispatchKeySet) {
  auto nesting_value = dispatch_trace_nesting_value();
-  for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
-  std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
+  for (int64_t i = 0; i < nesting_value; ++i) std::cerr << ' ';
+  std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << ']' << std::endl;
 }
 } // namespace detail

--- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp
+++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp
@ -570,7 +570,7 @@ void OperatorEntry::checkInvariants() const {

 std::string OperatorEntry::listAllDispatchKeys() const {
  std::ostringstream str;
-  str << "[";
+  str << '[';

  bool has_kernels = false;
  for (auto k : allDispatchKeysInFullSet()) {
@ -584,7 +584,7 @@ std::string OperatorEntry::listAllDispatchKeys() const {
    str << k;
    has_kernels = true;
  }
-  str << "]";
+  str << ']';
  return str.str();
 }

@ -683,12 +683,12 @@ void OperatorEntry::setReportErrorCallback_(std::unique_ptr<c10::SafePyObject> c
 // This WON'T report backend fallbacks.
 std::string OperatorEntry::dumpState() const {
  std::ostringstream oss;
-  oss << "name: " << name_ << "\n";
+  oss << "name: " << name_ << '\n';
  if (schema_) {
-    oss << "schema: " << schema_->schema << "\n";
-    oss << "debug: " << schema_->debug << "\n";
+    oss << "schema: " << schema_->schema << '\n';
+    oss << "debug: " << schema_->debug << '\n';
    oss << "alias analysis kind: " << toString(schema_->schema.aliasAnalysis())
-        << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << "\n";
+        << (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << '\n';
  } else {
    oss << "schema: (none)\n";
  }
--- a/aten/src/ATen/core/function_schema.cpp
+++ b/aten/src/ATen/core/function_schema.cpp
@ -7,7 +7,7 @@
 namespace c10 {

 void FunctionSchema::dump() const {
-  std::cout << *this << "\n";
+  std::cout << *this << '\n';
 }

 const std::vector<Argument>& FunctionSchema::getCorrectList(SchemaArgType type) const {
@ -210,9 +210,9 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {

  out << schema.name();
  if (!schema.overload_name().empty()) {
-    out << "." << schema.overload_name();
+    out << '.' << schema.overload_name();
  }
-  out << "(";
+  out << '(';

  bool seen_kwarg_only = false;
  for (const auto i : c10::irange(schema.arguments().size())) {
@ -273,7 +273,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
  }

  if (need_paren) {
-    out << "(";
+    out << '(';
  }
  for (const auto i : c10::irange(returns.size())) {
    if (i > 0) {
@ -288,7 +288,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
    out << "...";
  }
  if (need_paren) {
-    out << ")";
+    out << ')';
  }
  return out;
 }
@ -471,7 +471,7 @@ bool FunctionSchema::isForwardCompatibleWith(
    if (!arguments().at(i).isForwardCompatibleWith(old.arguments().at(i))) {
      if (why_not) {
        why_not
-            << "'" << arguments().at(i).name() << "'"
+            << '\'' << arguments().at(i).name() << '\''
            << " is not forward compatible with the older version of the schema";
      }
      return false;
@ -511,7 +511,7 @@ bool FunctionSchema::isForwardCompatibleWith(
             .isForwardCompatibleWith(old.arguments().at(i))) {
      if (why_not) {
        why_not << "Out argument '"
-                << "'" << arguments().at(i).name()
+                << '\'' << arguments().at(i).name()
                << " is not FC with the older version of the schema";
      }
      return false;
--- a/aten/src/ATen/core/function_schema.h
+++ b/aten/src/ATen/core/function_schema.h
@ -571,7 +571,7 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
    if (arg.N()) {
        N = std::to_string(*arg.N());
    }
-    out << "[" << N << "]";
+    out << '[' << N << ']';
  } else {
    out << unopt_type->str();
  }
@ -582,15 +582,15 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
  }

  if (is_opt) {
-    out << "?";
+    out << '?';
  }

  if (!arg.name().empty()) {
-    out << " " << arg.name();
+    out << ' ' << arg.name();
  }

  if (arg.default_value()) {
-    out << "=";
+    out << '=';
    if ((type->kind() == c10::TypeKind::StringType ||
        unopt_type->kind() == c10::TypeKind::StringType) &&
        arg.default_value().value().isString()) {
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@ -66,7 +66,7 @@ bool operator==(const ivalue::Tuple& lhs, const ivalue::Tuple& rhs) {
 }

 std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
-  out << v.qualifiedClassName() << "." << v.name();
+  out << v.qualifiedClassName() << '.' << v.name();
  return out;
 }

@ -526,7 +526,7 @@ std::ostream& printMaybeAnnotatedList(
      !elementTypeCanBeInferredFromMembers(list_elem_type)) {
    out << "annotate(" << the_list.type<c10::Type>()->annotation_str() << ", ";
    printList(out, the_list.toListRef(), "[", "]", formatter);
-    out << ")";
+    out << ')';
    return out;
  } else {
    return printList(out, the_list.toListRef(), "[", "]", formatter);
@ -538,7 +538,7 @@ std::ostream& printDict(
    std::ostream& out,
    const Dict& v,
    const IValueFormatter& formatter) {
-  out << "{";
+  out << '{';

  bool first = true;
  for (const auto& pair : v) {
@ -552,7 +552,7 @@ std::ostream& printDict(
    first = false;
  }

-  out << "}";
+  out << '}';
  return out;
 }
 }
@ -565,8 +565,8 @@ static std::ostream& printMaybeAnnotatedDict(
  auto value_type = the_dict.type()->castRaw<DictType>()->getValueType();
  if (the_dict.toGenericDict().empty() ||
      !elementTypeCanBeInferredFromMembers(value_type)) {
-    out << "annotate(" << the_dict.type<c10::Type>()->annotation_str() << ",";
-    printDict(out, the_dict.toGenericDict(), formatter) << ")";
+    out << "annotate(" << the_dict.type<c10::Type>()->annotation_str() << ',';
+    printDict(out, the_dict.toGenericDict(), formatter) << ')';
  } else {
    return printDict(out, the_dict.toGenericDict(), formatter);
  }
@ -577,7 +577,7 @@ static std::ostream& printComplex(std::ostream & out, const IValue & v) {
  c10::complex<double> d = v.toComplexDouble();
  IValue real(d.real()), imag(std::abs(d.imag()));
  auto sign = d.imag() >= 0 ? '+' : '-';
-  return out << real << sign << imag << "j";
+  return out << real << sign << imag << 'j';
 }

 std::ostream& IValue::repr(
@ -605,9 +605,9 @@ std::ostream& IValue::repr(
        if (static_cast<double>(i) == d) {
          // -0.0 (signed zero) needs to be parsed as -0.
          if (i == 0 && std::signbit(d)) {
-            return out << "-" << i << ".";
+            return out << '-' << i << '.';
          }
-          return out << i << ".";
+          return out << i << '.';
        }
      }
      auto orig_prec = out.precision();
@ -643,20 +643,20 @@ std::ostream& IValue::repr(
      device_stream << v.toDevice();
      out << "torch.device(";
      c10::printQuotedString(out, device_stream.str());
-      return out << ")";
+      return out << ')';
    }
    case IValue::Tag::Generator: {
      auto generator = v.toGenerator();
      out << "torch.Generator(device=";
      c10::printQuotedString(out, generator.device().str());
-      out << ", seed=" << generator.current_seed() << ")";
+      out << ", seed=" << generator.current_seed() << ')';
      return out;
    }
    case IValue::Tag::GenericDict:
      return printMaybeAnnotatedDict(out, v, formatter);
    case IValue::Tag::Enum: {
      auto enum_holder = v.toEnumHolder();
-      return out << enum_holder->qualifiedClassName() << "." <<
+      return out << enum_holder->qualifiedClassName() << '.' <<
          enum_holder->name();
    }
    case IValue::Tag::Object: {
@ -801,7 +801,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
      if (c == FP_NORMAL || c == FP_ZERO) {
        int64_t i = static_cast<int64_t>(d);
        if (static_cast<double>(i) == d) {
-          return out << i << ".";
+          return out << i << '.';
        }
      }
      auto orig_prec = out.precision();
@ -852,7 +852,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
      return printDict(out, v.toGenericDict(), formatter);
    case IValue::Tag::PyObject: {
      auto py_obj = v.toPyObject();
-      return out << "<PyObject at" << py_obj << ">";
+      return out << "<PyObject at" << py_obj << '>';
    }
    case IValue::Tag::Generator:
      return out << "Generator";
@ -862,22 +862,22 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
      // TODO we should attempt to call __str__ if the object defines it.
      auto obj = v.toObject();
      // print this out the way python would do it
-      return out << "<" << obj->name() << " object at " << obj.get() << ">";
+      return out << '<' << obj->name() << " object at " << obj.get() << '>';
    }
    case IValue::Tag::Enum: {
      auto enum_holder = v.toEnumHolder();
-      return out << "Enum<" << enum_holder->unqualifiedClassName() << "." <<
-          enum_holder->name() << ">";
+      return out << "Enum<" << enum_holder->unqualifiedClassName() << '.' <<
+          enum_holder->name() << '>';
    }

  }
-  return out << "<Invalid IValue tag=" << std::to_string(static_cast<uint32_t>(v.tag)) << ">";
+  return out << "<Invalid IValue tag=" << std::to_string(static_cast<uint32_t>(v.tag)) << '>';
 }

 #undef TORCH_FORALL_TAGS

 void IValue::dump() const {
-  std::cout << *this << "\n";
+  std::cout << *this << '\n';
 }

 std::shared_ptr<ClassType> ivalue::Object::type() const {
@ -1050,7 +1050,7 @@ c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy(
      std::stringstream err;
      err << "Cannot serialize custom bound C++ class";
      if (auto qualname = type()->name()) {
-        err << " " << qualname->qualifiedName();
+        err << ' ' << qualname->qualifiedName();
      }
      err << ". Please define serialization methods via def_pickle() for "
            "this class.";
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@ -211,7 +211,7 @@ struct TORCH_API OptionalType : public UnionType {

  std::string str() const override {
    std::stringstream ss;
-    ss << getElementType()->str() << "?";
+    ss << getElementType()->str() << '?';
    return ss.str();
  }

@ -240,7 +240,7 @@ struct TORCH_API OptionalType : public UnionType {

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "Optional[" << getElementType()->annotation_str(printer) << "]";
+    ss << "Optional[" << getElementType()->annotation_str(printer) << ']';
    return ss.str();
  }
 };
@ -906,7 +906,7 @@ struct TORCH_API ListType

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "List[" << getElementType()->annotation_str(printer) << "]";
+    ss << "List[" << getElementType()->annotation_str(printer) << ']';
    return ss.str();
  }
 };
@ -946,7 +946,7 @@ struct TORCH_API DictType : public SharedType {
  std::string str() const override {
    std::stringstream ss;
    ss << "Dict(" << getKeyType()->str() << ", " << getValueType()->str()
-       << ")";
+       << ')';
    return ss.str();
  }

@ -1018,7 +1018,7 @@ struct TORCH_API FutureType

  std::string str() const override {
    std::stringstream ss;
-    ss << "Future(" << getElementType()->str() << ")";
+    ss << "Future(" << getElementType()->str() << ')';
    return ss.str();
  }
  TypePtr createWithContained(
@ -1041,7 +1041,7 @@ struct TORCH_API FutureType

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "Future[" << getElementType()->annotation_str(printer) << "]";
+    ss << "Future[" << getElementType()->annotation_str(printer) << ']';
    return ss.str();
  }
 };
@ -1060,7 +1060,7 @@ struct TORCH_API AwaitType

  std::string str() const override {
    std::stringstream ss;
-    ss << "Await(" << getElementType()->str() << ")";
+    ss << "Await(" << getElementType()->str() << ')';
    return ss.str();
  }
  TypePtr createWithContained(
@ -1083,7 +1083,7 @@ struct TORCH_API AwaitType

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "Await[" << getElementType()->annotation_str(printer) << "]";
+    ss << "Await[" << getElementType()->annotation_str(printer) << ']';
    return ss.str();
  }
 };
@ -1102,7 +1102,7 @@ struct TORCH_API RRefType

  std::string str() const override {
    std::stringstream ss;
-    ss << "RRef(" << getElementType()->str() << ")";
+    ss << "RRef(" << getElementType()->str() << ')';
    return ss.str();
  }
  TypePtr createWithContained(
@ -1115,7 +1115,7 @@ struct TORCH_API RRefType

  std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
    std::stringstream ss;
-    ss << "RRef[" << getElementType()->annotation_str(printer) << "]";
+    ss << "RRef[" << getElementType()->annotation_str(printer) << ']';
    return ss.str();
  }
 };
--- a/aten/src/ATen/core/operator_name.cpp
+++ b/aten/src/ATen/core/operator_name.cpp
@ -11,7 +11,7 @@ std::string toString(const OperatorName& opName) {
 std::ostream& operator<<(std::ostream& os, const OperatorName& opName) {
  os << opName.name;
  if (!opName.overload_name.empty()) {
-    os << "." << opName.overload_name;
+    os << '.' << opName.overload_name;
  }
  return os;
 }
--- a/aten/src/ATen/core/tensor_type.cpp
+++ b/aten/src/ATen/core/tensor_type.cpp
@ -65,7 +65,7 @@ VaryingShape<T> VaryingShape<T>::merge(const VaryingShape<T>& other) const {

 template <typename T>
 std::ostream& operator<<(std::ostream& out, const VaryingShape<T>& vs) {
-  out << "(";
+  out << '(';
  if (!vs.size()) {
    out << "*)";
    return out;
@ -79,10 +79,10 @@ std::ostream& operator<<(std::ostream& out, const VaryingShape<T>& vs) {
    if (v.has_value()) {
      out << v.value();
    } else {
-      out << "*";
+      out << '*';
    }
  }
-  out << ")";
+  out << ')';
  return out;
 }

@ -105,7 +105,7 @@ std::ostream& operator<<(
  }
  auto sizes_opt = ss.sizes();

-  os << "(";
+  os << '(';
  for (size_t i = 0; i < rank_opt.value(); i++) {
    if (i > 0) {
      os << ", ";
@ -113,10 +113,10 @@ std::ostream& operator<<(
    if(sizes_opt.has_value() && sizes_opt.value()[i].is_static()) {
      os << sizes_opt.value()[i];
    } else {
-      os << "*";
+      os << '*';
    }
  }
-  os << ")";
+  os << ')';

  return os;
 }
@ -131,17 +131,17 @@ std::ostream& operator<<(std::ostream& os, const ShapeSymbol& s) {
 }

 std::ostream& operator<<(std::ostream& os, const Stride& s) {
-  os << "{";
+  os << '{';
  if (s.stride_index_.has_value()) {
    os << *s.stride_index_;
  } else {
-    os << "*";
+    os << '*';
  }
-  os << ":";
+  os << ':';
  if (s.stride_.has_value()) {
    os << *s.stride_;
  } else {
-    os << "*";
+    os << '*';
  }
  os << '}';
  return os;
--- a/aten/src/ATen/core/type.cpp
+++ b/aten/src/ATen/core/type.cpp
@ -67,7 +67,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
      bool has_valid_strides_info = ndim > 0 &&
          value->strides().isComplete() && value->strides().size() == ndim;

-      out << "(";
+      out << '(';
      size_t i = 0;
      bool symbolic = type_verbosity() == TypeVerbosity::Symbolic;
      for (i = 0; i < *ndim; ++i) {
@ -79,7 +79,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
        } else if (symbolic) {
          out << value->symbolic_sizes().at(i);
        } else {
-          out << "*";
+          out << '*';
        }
      }
      if (has_valid_strides_info &&
@ -91,7 +91,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
          }
          out << value->strides()[i].value();
        }
-        out << "]";
+        out << ']';
      }
      if (type_verbosity() >= TypeVerbosity::Full) {
        if (value->requiresGrad()) {
@ -107,12 +107,12 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
          out << "device=" << *value->device();
        }
      }
-      out << ")";
+      out << ')';
    } else {
      if (type_verbosity() >= TypeVerbosity::Full) {
        size_t i = 0;
        if (value->requiresGrad()) {
-          out << "("
+          out << '('
              << "requires_grad=" << *value->requiresGrad();
          i++;
        }
@ -120,7 +120,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
          out << ((i++ > 0) ? ", " : "(") << "device=" << *value->device();
        }
        if (i > 0) {
-          out << ")";
+          out << ')';
        }
      }
    }
@ -133,18 +133,18 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
    out << *prim << "[]";
  } else if (t.kind() == TypeKind::OptionalType) {
    auto prim = t.castRaw<OptionalType>()->getElementType();
-    out << *prim << "?";
+    out << *prim << '?';
  } else if(t.kind() == TypeKind::FutureType) {
    auto elem = t.castRaw<FutureType>()->getElementType();
-    out << "Future[" << *elem << "]";
+    out << "Future[" << *elem << ']';
  } else if(t.kind() == TypeKind::RRefType) {
    auto elem = t.castRaw<RRefType>()->getElementType();
-    out << "RRef[" << *elem << "]";
+    out << "RRef[" << *elem << ']';
  } else if(auto tup = t.cast<TupleType>()) {
    if (tup->schema()) {
      out << "NamedTuple";
    }
-    out << "(";
+    out << '(';
    for(size_t i = 0; i < tup->elements().size(); ++i) {
      if(i > 0)
        out << ", ";
@ -160,7 +160,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
        out << *(tup->elements()[i]);
      }
    }
-    out << ")";
+    out << ')';
  } else if (t.kind() == TypeKind::FunctionType) {
    out << "Function";
  } else {
@ -475,7 +475,7 @@ std::optional<TypePtr> unifyTypeList(
      why_not << "Could not unify type list since element " << i << " of type "
              << elements.at(i)->repr_str()
              << " did not match the types before it ("
-              << ret_type->repr_str() << ")";
+              << ret_type->repr_str() << ')';
      return std::nullopt;
    }
    ret_type = *maybe_unified;
@ -907,13 +907,13 @@ std::string TupleType::str() const {
    // NOLINTNEXTLINE(bugprone-unchecked-optional-access)
    ss << name()->qualifiedName();
  } else {
-    ss << "(";
+    ss << '(';
    for(size_t i = 0; i < elements().size(); ++i) {
      if(i > 0)
        ss << ", ";
      ss << elements()[i]->str();
    }
-    ss << ")";
+    ss << ')';
  }
  return ss.str();
 }
@ -1003,8 +1003,8 @@ bool InterfaceType::isSubTypeImpl(
          *why_not << "Method on interface '" << lhs.repr_str()
                   << "' (1) is not compatible with interface '"
                   << rhs.repr_str() << "' (2)\n"
-                   << "  (1) " << *self_schema << "\n"
-                   << "  (2) " << schema << "\n";
+                   << "  (1) " << *self_schema << '\n'
+                   << "  (2) " << schema << '\n';
          return false;
        }
        return false;
@ -1078,7 +1078,7 @@ SymbolicShape SymbolicShape::merge(const SymbolicShape& other) const {
 }

 void SymbolicShape::dump() const {
-  std::cout << *this << "\n";
+  std::cout << *this << '\n';
 }

 bool EnumType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const {
--- a/aten/src/ATen/core/union_type.cpp
+++ b/aten/src/ATen/core/union_type.cpp
@ -205,9 +205,9 @@ UnionType::UnionType(std::vector<TypePtr> reference, TypeKind kind) : SharedType
    for (const auto i : c10::irange(reference.size())) {
      msg << reference[i]->repr_str();
      if (i > 0) {
-        msg << ",";
+        msg << ',';
      }
-      msg << " ";
+      msg << ' ';
    }
    msg << "} has the single type " << types_[0]->repr_str()
         << ". Use the common supertype instead of creating a Union"
--- a/aten/src/ATen/cpu/vec/vec256/vec256.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h
@ -80,7 +80,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
    }
    stream << buf[i];
  }
-  stream << "]";
+  stream << ']';
  return stream;
 }

--- a/aten/src/ATen/cpu/vec/vec512/vec512.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512.h
@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
    }
    stream << buf[i];
  }
-  stream << "]";
+  stream << ']';
  return stream;
 }

--- a/aten/src/ATen/cuda/CUDAContextLight.h
+++ b/aten/src/ATen/cuda/CUDAContextLight.h
@ -3,7 +3,6 @@

 #include <cstdint>
 #include <map>
-#include <shared_mutex>

 #include <cuda_runtime_api.h>
 #include <cusparse.h>
@ -89,13 +88,8 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
 TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();

 TORCH_CUDA_CPP_API void clearCublasWorkspaces();
-struct WorkspaceMapWithMutex {
-  std::map<std::tuple<void*, void*>, at::DataPtr> map;
-  std::shared_mutex mutex;
-};
-
-TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
-TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
+TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace();
+TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace();
 TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize();
 TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize();
 TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace();
--- a/aten/src/ATen/cuda/CublasHandlePool.cpp
+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp
@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) {
 //   - Comments of @soumith copied from cuDNN handle pool implementation
 #ifdef NO_CUDNN_DESTROY_HANDLE
 #else
-  cublasDestroy(handle);
+    cublasDestroy(handle);
 #endif
 }

@ -107,27 +107,19 @@ using CuBlasPoolType = DeviceThreadHandlePool<cublasHandle_t, createCublasHandle

 } // namespace

-WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() {
-  static auto& instance = *new WorkspaceMapWithMutex;
+std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace() {
+  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
  return instance;
 }

-WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() {
-  static auto& instance = *new WorkspaceMapWithMutex;
+std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace() {
+  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
  return instance;
 }

 void clearCublasWorkspaces() {
-  {
-    auto& workspace = cublas_handle_stream_to_workspace();
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    workspace.map.clear();
-  }
-  {
-    auto& workspace = cublaslt_handle_stream_to_workspace();
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    workspace.map.clear();
-  }
+  cublas_handle_stream_to_workspace().clear();
+  cublaslt_handle_stream_to_workspace().clear();
 }

 size_t parseChosenWorkspaceSize() {
@ -241,38 +233,6 @@ at::DataPtr getNewCUDABlasLtWorkspace() {
  return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize());
 }

-void setWorkspaceForHandle(cublasHandle_t handle, c10::cuda::CUDAStream stream) {
-  cudaStream_t _stream = stream;
-  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-
-  auto& workspace = cublas_handle_stream_to_workspace();
-
-  size_t workspace_size = getChosenWorkspaceSize();
-
-  // Fast path: check if workspace already exists
-  {
-    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.find(key);
-    if (workspace_it != workspace.map.end()) {
-      TORCH_CUDABLAS_CHECK(cublasSetWorkspace(
-          handle, workspace_it->second.get(), workspace_size));
-      return;
-    }
-  }
-
-  // Slow path: allocate workspace outside the lock
-  auto new_workspace = getNewWorkspace();
-
-  // Insert with lock (double-check in case another thread inserted while we
-  // were allocating)
-  {
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.try_emplace(key, std::move(new_workspace)).first;
-    TORCH_CUDABLAS_CHECK(
-        cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size));
-  }
-}
-
 void* getCUDABlasLtWorkspace() {
 #ifndef USE_ROCM
  static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true;
@ -281,10 +241,8 @@ void* getCUDABlasLtWorkspace() {
    auto stream = c10::cuda::getCurrentCUDAStream();
    cudaStream_t _stream = stream;
    auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-    auto& workspace = at::cuda::cublas_handle_stream_to_workspace();
-    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.find(key);
-    TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end());
+    auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
+    TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
    return workspace_it->second.mutable_get();
  }
 #endif
@ -292,29 +250,11 @@ void* getCUDABlasLtWorkspace() {
  auto stream = c10::cuda::getCurrentCUDAStream();
  cudaStream_t _stream = stream;
  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-
-  auto& workspace = cublaslt_handle_stream_to_workspace();
-
-  // Fast path: check if workspace already exists
-  {
-    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.find(key);
-    if (workspace_it != workspace.map.end()) {
-      return workspace_it->second.mutable_get();
-    }
-  }
-
-  // Slow path: allocate workspace outside the lock
-  auto new_workspace = getNewCUDABlasLtWorkspace();
-
-  // Insert with lock (double-check in case another thread inserted while we
-  // were allocating)
-  {
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it =
-          workspace.map.try_emplace(key, std::move(new_workspace)).first;
-    return workspace_it->second.mutable_get();
+  auto workspace_it = cublaslt_handle_stream_to_workspace().find(key);
+  if (workspace_it == cublaslt_handle_stream_to_workspace().end()) {
+    workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()});
  }
+  return workspace_it->second.mutable_get();
 }

 cublasHandle_t getCurrentCUDABlasHandle() {
@ -358,8 +298,13 @@ cublasHandle_t getCurrentCUDABlasHandle() {
  // will allocate memory dynamically (even if they're cheap) outside
  // PyTorch's CUDA caching allocator. It's possible that CCA used up
  // all the memory and cublas's cudaMallocAsync will return OOM
-  setWorkspaceForHandle(handle, stream);
-
+  cudaStream_t _stream = stream;
+  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
+  auto workspace_it = cublas_handle_stream_to_workspace().find(key);
+  if (workspace_it == cublas_handle_stream_to_workspace().end()) {
+    workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()});
+  }
+  TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize()));
 #if !defined(USE_ROCM)
  // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup
  // FP32 data type calculations based on the value of the allow_tf32 flag.
--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@ -411,16 +411,16 @@ std::string CUDAHooks::showConfig() const {
    // HIP_VERSION value format was changed after ROCm v4.2 to include the patch number
    if(v < 500) {
      // If major=xx, minor=yy then format -> xxyy
-      oss << (v / 100) << "." << (v % 10);
+      oss << (v / 100) << '.' << (v % 10);
    }
    else {
      // If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz
-      oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000);
+      oss << (v / 10000000) << '.' << (v / 100000 % 100) << '.' << (v % 100000);
    }
 #else
-    oss << (v / 1000) << "." << (v / 10 % 100);
+    oss << (v / 1000) << '.' << (v / 10 % 100);
    if (v % 10 != 0) {
-      oss << "." << (v % 10);
+      oss << '.' << (v % 10);
    }
 #endif
  };
@ -431,16 +431,16 @@ std::string CUDAHooks::showConfig() const {
  oss << "  - HIP Runtime ";
 #endif
  printCudaStyleVersion(runtimeVersion);
-  oss << "\n";
+  oss << '\n';

  // TODO: Make HIPIFY understand CUDART_VERSION macro
 #if !defined(USE_ROCM)
  if (runtimeVersion != CUDART_VERSION) {
    oss << "  - Built with CUDA Runtime ";
    printCudaStyleVersion(CUDART_VERSION);
-    oss << "\n";
+    oss << '\n';
  }
-  oss << "  - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << "\n";
+  oss << "  - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << '\n';
 #endif

 #if !defined(USE_ROCM)
@ -448,9 +448,9 @@ std::string CUDAHooks::showConfig() const {


  auto printCudnnStyleVersion = [&](size_t v) {
-    oss << (v / 1000) << "." << (v / 100 % 10);
+    oss << (v / 1000) << '.' << (v / 100 % 10);
    if (v % 100 != 0) {
-      oss << "." << (v % 100);
+      oss << '.' << (v % 100);
    }
  };

@ -461,22 +461,22 @@ std::string CUDAHooks::showConfig() const {
  if (cudnnCudartVersion != CUDART_VERSION) {
    oss << "  (built against CUDA ";
    printCudaStyleVersion(cudnnCudartVersion);
-    oss << ")";
+    oss << ')';
  }
-  oss << "\n";
+  oss << '\n';
  if (cudnnVersion != CUDNN_VERSION) {
    oss << "    - Built with CuDNN ";
    printCudnnStyleVersion(CUDNN_VERSION);
-    oss << "\n";
+    oss << '\n';
  }
 #endif
 #else
  // TODO: Check if miopen has the functions above and unify
-  oss << "  - MIOpen " << MIOPEN_VERSION_MAJOR << "." << MIOPEN_VERSION_MINOR << "." << MIOPEN_VERSION_PATCH << "\n";
+  oss << "  - MIOpen " << MIOPEN_VERSION_MAJOR << '.' << MIOPEN_VERSION_MINOR << '.' << MIOPEN_VERSION_PATCH << '\n';
 #endif

 #if AT_MAGMA_ENABLED()
-  oss << "  - Magma " << MAGMA_VERSION_MAJOR << "." << MAGMA_VERSION_MINOR << "." << MAGMA_VERSION_MICRO << "\n";
+  oss << "  - Magma " << MAGMA_VERSION_MAJOR << '.' << MAGMA_VERSION_MINOR << '.' << MAGMA_VERSION_MICRO << '\n';
 #endif

  return oss.str();
--- a/aten/src/ATen/cuda/jiterator.cu
+++ b/aten/src/ATen/cuda/jiterator.cu
@ -42,7 +42,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic(

  // The cache key includes all the parameters to generate_code + vec_size + dev_idx
  std::stringstream ss;
-  ss << nInputs << "_" << nOutputs << f;
+  ss << nInputs << '_' << nOutputs << f;
  ss << f_inputs_type_str << compute_type_str << result_type_str;
  ss << static_cast<int>(at::cuda::jit::BinaryFuncVariant::NoScalar);
  ss << extra_args_types;
@ -144,7 +144,7 @@ static inline void launch_jitted_unrolled_kernel_dynamic(

  // The cache key includes all the parameters to generate_code + dev_idx
  std::stringstream ss;
-  ss << nInputs << "_" << nOutputs << f;
+  ss << nInputs << '_' << nOutputs << f;
  ss << f_inputs_type_str << compute_type_str << result_type_str;
  ss << contiguous << dynamic_casting;
  ss << static_cast<int>(at::cuda::jit::BinaryFuncVariant::NoScalar);
--- a/aten/src/ATen/cuda/tunable/Tunable.cpp
+++ b/aten/src/ATen/cuda/tunable/Tunable.cpp
@ -52,10 +52,10 @@ TuningContext* getTuningContext() {
 std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry) {
  static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1";
  if (!blaslog) {
-    return stream << entry.key_ << "," << entry.time_;
+    return stream << entry.key_ << ',' << entry.time_;
  }
  else {
-    return stream << entry.key_ << "," << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_;
+    return stream << entry.key_ << ',' << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_;
  }
 }

@ -156,10 +156,10 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std
    if (isNew) {
      static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1";
      if (!blaslog) {
-        untuned_file << op_signature << "," << params_signature << std::endl;
+        untuned_file << op_signature << ',' << params_signature << std::endl;
      }
      else {
-        untuned_file << op_signature << "," << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl;
+        untuned_file << op_signature << ',' << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl;
      }
      TUNABLE_LOG3("Untuned,", op_signature, ",", params_signature);
    }
@ -201,7 +201,7 @@ void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const

  if(!file_exists || file_empty) {
    for(const auto& [key, val] : validators) {
-      (*realtime_out_) << "Validator," << key << "," << val << std::endl;
+      (*realtime_out_) << "Validator," << key << ',' << val << std::endl;
      realtime_out_->flush();
    }
    validators_written_ = true;
@ -219,7 +219,7 @@ void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std
    return;
  }

-  (*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl;
+  (*realtime_out_) << op_sig << ',' << param_sig << ',' << result << std::endl;
  realtime_out_->flush(); //ensure immediate write to disk

  TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result);
--- a/aten/src/ATen/cudnn/Descriptors.cpp
+++ b/aten/src/ATen/cudnn/Descriptors.cpp
@ -93,31 +93,31 @@ std::string cudnnTypeToString(cudnnDataType_t dtype) {
      return "CUDNN_DATA_UINT8x4";
    default:
      std::ostringstream oss;
-      oss << "(unknown data-type " << static_cast<int>(dtype) << ")";
+      oss << "(unknown data-type " << static_cast<int>(dtype) << ')';
      return oss.str();
  }
 }

 std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
-  out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
+  out << "TensorDescriptor " << static_cast<void*>(d.desc()) << '\n';
  int nbDims = 0;
  int dimA[CUDNN_DIM_MAX];
  int strideA[CUDNN_DIM_MAX];
  cudnnDataType_t dtype{};
  cudnnGetTensorNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &nbDims, dimA, strideA);
-  out << "    type = " << cudnnTypeToString(dtype) << "\n";
-  out << "    nbDims = " << nbDims << "\n";
+  out << "    type = " << cudnnTypeToString(dtype) << '\n';
+  out << "    nbDims = " << nbDims << '\n';
  // Read out only nbDims of the arrays!
  out << "    dimA = ";
  for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << "\n";
+  out << '\n';
  out << "    strideA = ";
  for (auto i : ArrayRef<int>{strideA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << "\n";
+  out << '\n';
  return out;
 }

@ -168,27 +168,27 @@ std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) {
      return "CUDNN_TENSOR_NHWC";
    default:
      std::ostringstream oss;
-      oss << "(unknown cudnn tensor format " << static_cast<int>(tformat) << ")";
+      oss << "(unknown cudnn tensor format " << static_cast<int>(tformat) << ')';
      return oss.str();
  }
 }

 std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d) {
-  out << "FilterDescriptor " << static_cast<void*>(d.desc()) << "\n";
+  out << "FilterDescriptor " << static_cast<void*>(d.desc()) << '\n';
  int nbDims = 0;
  int dimA[CUDNN_DIM_MAX];
  cudnnDataType_t dtype{};
  cudnnTensorFormat_t tformat{};
  cudnnGetFilterNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &tformat, &nbDims, dimA);
-  out << "    type = " << cudnnTypeToString(dtype) << "\n";
-  out << "    tensor_format = " << cudnnMemoryFormatToString(tformat) << "\n";
-  out << "    nbDims = " << nbDims << "\n";
+  out << "    type = " << cudnnTypeToString(dtype) << '\n';
+  out << "    tensor_format = " << cudnnMemoryFormatToString(tformat) << '\n';
+  out << "    nbDims = " << nbDims << '\n';
  // Read out only nbDims of the arrays!
  out << "    dimA = ";
  for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << "\n";
+  out << '\n';
  return out;
 }

--- a/aten/src/ATen/functorch/DynamicLayer.cpp
+++ b/aten/src/ATen/functorch/DynamicLayer.cpp
@ -346,15 +346,15 @@ void foreachTensorInplaceWithFlag(std::vector<IValue>& args, int64_t begin, int6
 }

 std::ostream& operator<< (std::ostream& os, const DynamicLayer& layer) {
-  os << layer.layerId() << ":" << layer.key();
+  os << layer.layerId() << ':' << layer.key();
  return os;
 }
 std::ostream& operator<< (std::ostream& os, const std::vector<DynamicLayer>& dls) {
  os << "DynamicLayerStack[ ";
  for (const auto& layer : dls) {
-    os << layer << " ";
+    os << layer << ' ';
  }
-  os << "]";
+  os << ']';
  return os;
 }

--- a/aten/src/ATen/functorch/TensorWrapper.cpp
+++ b/aten/src/ATen/functorch/TensorWrapper.cpp
@ -22,7 +22,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) {
    if (batched) {
      ss << "Batched[lvl=" << batched->level() << " dim=" << batched->bdim() << ", ";
      dumpTensor(ss, batched->value());
-      ss << "]";
+      ss << ']';
      return;
    }
    ss << "Tensor" << tensor.sizes();
@ -36,7 +36,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) {
    ss << "dead, ";
  }
  dumpTensor(ss, wrapped->value());
-  ss << "]";
+  ss << ']';
 }

 void TensorWrapper::refreshMetadata() {
--- a/aten/src/ATen/miopen/Descriptors.cpp
+++ b/aten/src/ATen/miopen/Descriptors.cpp
@ -73,32 +73,32 @@ std::string miopenTypeToString(miopenDataType_t dtype) {
      return "miopenBFloat16";
    default:
      std::ostringstream oss;
-      oss << "(unknown data-type " << static_cast<int>(dtype) << ")";
+      oss << "(unknown data-type " << static_cast<int>(dtype) << ')';
      return oss.str();
  }
 }

 std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
-  out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
+  out << "TensorDescriptor " << static_cast<void*>(d.desc()) << '\n';
  int nbDims = 0;
  int dimA[MIOPEN_DIM_MAX];
  int strideA[MIOPEN_DIM_MAX];
  miopenDataType_t dtype;
  miopenGetTensorDescriptorSize(d.desc(), &nbDims);
  miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA);
-  out << "    type = " << miopenTypeToString(dtype) << "\n";
-  out << "    nbDims = " << nbDims << "\n";
+  out << "    type = " << miopenTypeToString(dtype) << '\n';
+  out << "    nbDims = " << nbDims << '\n';
  // Read out only nbDims of the arrays!
  out << "    dimA = ";
  for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << "\n";
+  out << '\n';
  out << "    strideA = ";
  for (auto i : ArrayRef<int>{strideA, static_cast<size_t>(nbDims)}) {
    out << i << ", ";
  }
-  out << "\n";
+  out << '\n';
  return out;
 }

--- a/aten/src/ATen/mps/MPSProfiler.h
+++ b/aten/src/ATen/mps/MPSProfiler.h
@ -91,7 +91,7 @@ struct OperationInfo : BaseInfo {
    std::stringstream kernelStr;
    kernelStr << kernelName;
    for (const Tensor& tensor : tensors) {
-      kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId);
+      kernelStr << ':' << BaseInfo::buildTensorString(tensor, includeBufferId);
    }
    return kernelStr.str();
  }
--- a/aten/src/ATen/mps/MPSProfiler.mm
+++ b/aten/src/ATen/mps/MPSProfiler.mm
@ -39,9 +39,9 @@ std::string BaseInfo::buildTensorString(const Tensor& tensor, bool includeBuffer
    // see comments for INCLUDE_BUFFER_ID
    if (includeBufferId && deviceType == at::kMPS) {
      id<MTLBuffer> buffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
-      tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ":" << buffer.retainCount << ")";
+      tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ':' << buffer.retainCount << ')';
    }
-    tensorStr << ":" << tensor.scalar_type() << tensor.sizes();
+    tensorStr << ':' << tensor.scalar_type() << tensor.sizes();
    return tensorStr.str();
  } else {
    return "undefined";
--- a/aten/src/ATen/native/ConvUtils.h
+++ b/aten/src/ATen/native/ConvUtils.h
@ -167,7 +167,7 @@ static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, co
    std::stringstream ss;
    ss << arg_name << " should be greater than zero but got (";
    std::copy(args.begin(), args.end() - 1, std::ostream_iterator<int>(ss,", "));
-    ss << args.back() <<  ")" << " (while checking arguments for " << c << ")";
+    ss << args.back() <<  ")" << " (while checking arguments for " << c << ')';
    TORCH_CHECK(false, ss.str());
  }
 }
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -639,7 +639,7 @@ static std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params)
      << "  deterministic = " << params.deterministic
      << "  cudnn_enabled = " << params.cudnn_enabled
      << "  allow_tf32 = " << params.allow_tf32
-      << "}";
+      << '}';
  return out;
 }

--- a/aten/src/ATen/native/SpectralOps.cpp
+++ b/aten/src/ATen/native/SpectralOps.cpp
@ -847,7 +847,7 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const std::optional<int64_t
       << ", hop_length=" << hop_length << ", win_length=" << win_length \
       << ", window="; \
    if (window.defined()) { \
-      SS << window.toString() << "{" << window.sizes() << "}"; \
+      SS << window.toString() << '{' << window.sizes() << '}'; \
    } else { \
      SS << "None"; \
    } \
@ -1046,7 +1046,7 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const std::optional<int64_
       << ", hop_length=" << hop_length << ", win_length=" << win_length \
       << ", window="; \
    if (window.defined()) { \
-      SS << window.toString() << "{" << window.sizes() << "}"; \
+      SS << window.toString() << '{' << window.sizes() << '}'; \
    } else { \
      SS << "None"; \
    } \
--- a/aten/src/ATen/native/TensorCompare.cpp
+++ b/aten/src/ATen/native/TensorCompare.cpp
@ -523,7 +523,7 @@ Tensor _functional_assert_async_msg_cpu(
 }

 void _print(std::string_view s) {
-  std::cout << s << "\n";
+  std::cout << s << '\n';
 }

 // Sorting-based algorithm for isin(); used when the number of test elements is
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -296,7 +296,7 @@ template <typename scalar_t, typename res_scalar_t = scalar_t>
 bool launchGemmAndBiasCublasLt(
    // args contains result which is modified
    cublasCommonArgs& args,
-    const std::optional<Tensor>& self,
+    const Tensor& self,
    const Scalar& alpha,
    Activation activation = Activation::None
 ) {
@ -304,8 +304,12 @@ bool launchGemmAndBiasCublasLt(
  // or when it can be squeezed to 1D.
  // self_ptr == nullptr implies ignore bias epilogue
  // and use standard gemm-like API.
-  const auto* self_ptr = self.has_value() ? self.value().const_data_ptr<scalar_t>() : static_cast<const scalar_t*>(nullptr);
-
+  const auto* self_ptr = [&]() -> auto {
+    if (self.dim() == 1 || self.squeeze().dim() == 1) {
+      return self.const_data_ptr<scalar_t>();
+    }
+    return static_cast<const scalar_t*>(nullptr);
+  }();

  const auto tuning_ctx = at::cuda::tunable::getTuningContext();
  if (tuning_ctx->IsTunableOpEnabled()) {
@ -388,30 +392,35 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
  #ifdef USE_ROCM
  // Conditioned on the device index, which is not persistent
-  disable_addmm_cuda_lt = disable_addmm_cuda_lt || isGloballyDisabledAddmmCudaLt(self.device());
+  disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
  #endif
  // Condition on the input
-  disable_addmm_cuda_lt = disable_addmm_cuda_lt || !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation);
+  disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
+  // }

  at::ScalarType scalar_type = mat1.scalar_type();
  bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;

-  #ifdef USE_ROCM
-  disable_addmm_cuda_lt = disable_addmm_cuda_lt || is_float_output_with_half_input;
-  #endif
-
-  bool use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
-  // for float output with half input cublasLT with bias produces wrong results
-  use_bias_ptr_lt &= !is_float_output_with_half_input;
-
  // Handle result/self shapes
  if (!result.is_same(self)) {
    at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});

-      // We do not copy bias only when we need the bias ptr
+    // We use bias ptr in the Lt path only when bias is 1D
+    const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
+    const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
+      if (!use_bias_ptr_lt) {
+        // We do expand self even before
+        // check for beta != 0.0 to make sure that
+        // test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
+        // runs green.
+        return expand_size(self, result.sizes(), "addmm");
+      }
+      return c10::MaybeOwned<Tensor>::borrowed(self);
+    }();
+    // We do not copy bias only when we need the bias ptr
    if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
      // NOTE: self should broadcast over result
-      at::native::copy_(result, *expand_size(self, result.sizes(), "addmm"));
+      at::native::copy_(result, *self_maybe_expanded);
    }
  }

@ -459,7 +468,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
        scalar_type,
        "addmm_cuda_lt",
        [&] {
-          lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
+          lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, self, alpha, activation);
        }
      );
      #endif
@ -471,7 +480,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
        scalar_type,
        "addmm_cuda_lt",
        [&] {
-          lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
+          lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, self, alpha, activation);
        }
      );
    } // end is_float_output_with_half_input
@ -927,7 +936,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
  return _int_mm_out_cuda(self, mat2, result);
 }

-static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
+static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
  // ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm
  TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor");
  TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor");
@ -951,7 +960,7 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
    (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
    "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");

-  if (self_baddbmm.has_value()) {
+  if (!is_bmm && self_baddbmm.has_value()) {
    const auto& self = self_baddbmm.value();
    TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor");
    TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output");
@ -959,12 +968,15 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
 }

 Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
-  Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
+  IntArrayRef batch1_sizes = batch1.sizes();
+  IntArrayRef batch2_sizes = batch2.sizes();
+
+  Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype));
  return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out);
 }

 Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) {
-  baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype);
+  baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true);
  Scalar beta(0.0);
  Scalar alpha(1.0);
  {
@ -976,16 +988,14 @@ Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at
 }

 Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
-  TORCH_CHECK(self.scalar_type() == out_dtype || self.scalar_type() == batch1.dtype(),
-  "self dtype must match either out_dtype or batch1 dtype");
-  Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
-  return _baddbmm_out_dtype_cuda(self, batch1, batch2, out_dtype, beta, alpha, out);
+  // We need to copy the tensor
+  Tensor out = self.clone().to(self.options().dtype(out_dtype));
+
+  return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out);
 }

 Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
-  baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, out);
-  // We need to copy the tensor
-  out.copy_(self);
+  baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self);
  {
    NoNamesGuard guard;
    baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
@ -1020,27 +1030,24 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca
 }

 Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
-  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
-  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
-  Tensor result = at::empty({mat1.size(0), mat2.size(1)}, self.options().dtype(out_dtype));
+  Tensor result = at::empty(self.sizes(), self.options().dtype(out_dtype));
  return _addmm_dtype_out_cuda(self, mat1, mat2, out_dtype, beta, alpha, result);
 }

 Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
-// repeat dimensionality checks for direct calls to `out` overload
+  TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type());
+  TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
  TORCH_CHECK(
      mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
      mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
-  TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
-  TORCH_CHECK(out_dtype == mat1.scalar_type() ||
-  (out_dtype == at::ScalarType::Float && (mat1.scalar_type() == at::ScalarType::Half || mat1.scalar_type() == at::ScalarType::BFloat16)),
-  "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");

  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
-  TORCH_CHECK(out_dtype == self.scalar_type() || self.scalar_type() == mat1.scalar_type(),
-    "self dtype must match either out_dtype or mat1 dtype");
+  TORCH_CHECK(out_dtype == self.scalar_type() ||
+    (out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),
+    "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
+  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");

  addmm_out_cuda_impl(out, self, mat1, mat2, beta, alpha);

--- a/aten/src/ATen/native/cuda/KernelUtils.cuh
+++ b/aten/src/ATen/native/cuda/KernelUtils.cuh
@ -5,11 +5,69 @@
 #include <cuda_bf16.h>
 #endif

+// ROCm 6.3 is planned to have these functions, but until then here they are.
 #if defined(USE_ROCM)
 #include <device_functions.h>
 #include <hip/hip_fp16.h>
 #include <hip/hip_bf16.h>
-#define ATOMICADD unsafeAtomicAdd
+
+__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
+#if (defined(__gfx942__)) && \
+  __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
+  typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
+  static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));
+  union {
+    __hip_bfloat162_raw bf162_raw;
+    vec_short2 vs2;
+  } u{static_cast<__hip_bfloat162_raw>(value)};
+  u.vs2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, u.vs2);
+  return static_cast<__hip_bfloat162>(u.bf162_raw);
+#else
+  static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw));
+  union u_hold {
+    __hip_bfloat162_raw h2r;
+    unsigned int u32;
+  };
+  u_hold old_val, new_val;
+  old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  do {
+    new_val.h2r = __hadd2(old_val.h2r, value);
+  } while (!__hip_atomic_compare_exchange_strong(
+        (unsigned int*)address, &old_val.u32, new_val.u32,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
+  return old_val.h2r;
+#endif
+}
+
+__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) {
+#if (defined(__gfx942__)) && \
+  __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16)
+  // The api expects an ext_vector_type of half
+  typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162;
+  static_assert(sizeof(vec_fp162) == sizeof(__half2_raw));
+  union {
+    __half2_raw h2r;
+    vec_fp162 fp16;
+  } u {static_cast<__half2_raw>(value)};
+  u.fp16 = __builtin_amdgcn_flat_atomic_fadd_v2f16((vec_fp162*)address, u.fp16);
+  return static_cast<__half2>(u.h2r);
+#else
+  static_assert(sizeof(__half2_raw) == sizeof(unsigned int));
+  union u_hold {
+    __half2_raw h2r;
+    unsigned int u32;
+  };
+  u_hold old_val, new_val;
+  old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  do {
+    new_val.h2r = __hadd2(old_val.h2r, value);
+  } while (!__hip_atomic_compare_exchange_strong(
+        (unsigned int*)address, &old_val.u32, new_val.u32,
+        __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
+  return old_val.h2r;
+#endif
+}
+#define ATOMICADD preview_unsafeAtomicAdd
 #define NATIVE_ZERO_BF16 __float2bfloat16(0.0f)
 #else
 #define ATOMICADD atomicAdd
--- a/aten/src/ATen/native/cuda/LogAddExpKernel.cu
+++ b/aten/src/ATen/native/cuda/LogAddExpKernel.cu
@ -2,250 +2,18 @@
 #include <ATen/Dispatch.h>
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/cuda/Loops.cuh>
-#include <ATen/native/cuda/JitLoops.cuh>
-#include <ATen/native/cuda/jit_utils.h>
-#include <ATen/native/cuda/ScanUtils.cuh>
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/BinaryOps.h>
 #include <ATen/OpMathType.h>
 #include <c10/util/MathConstants.h>
-#include <c10/util/complex.h>
-
-#include <cmath>
-#include <limits>

 // NOTE: CUDA on Windows requires that the enclosing function
 // of a __device__ lambda not have internal linkage.

 namespace at::native {

-// custom min and max to be used in logaddexp for  complex arguments
-template <typename scalar_t, bool min>
-__host__ __device__ c10::complex<scalar_t> _logaddexp_minmax(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
-  scalar_t xr = std::real(x);
-  scalar_t yr = std::real(y);
-  if (::isnan(yr) || (::isnan(std::imag(y)))) {
-    return y;
-  } else if (::isnan(xr) || (::isnan(std::imag(x)))) {
-    return x;
-  } else if (min) { // min
-    return (xr < yr) ? x : y;
-  } else { // max
-    return (xr >= yr) ? x : y;
-  }
-}
-
-template <typename scalar_t>
-__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) {
-  // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
-  // Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM
-  const auto isnan_x = at::_isnan(x);
-  const auto isnan_y = at::_isnan(y);
-  scalar_t min = isnan_y ? y : (isnan_x ? x : std::min(x, y));
-  scalar_t max = isnan_y ? y : (isnan_x ? x : std::max(x, y));
-  if (min != max || ::isfinite(min)) {
-    // nan will be propagated here
-    return ::log1p(std::exp(min - max)) + max;
-  } else {
-    // special case to correctly handle infinite cases
-    return x;
-  }
-}
-
-template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _fast_build_exp(const c10::complex<scalar_t>& x) {
-  // complex exponential function, but implemented manually to get fast compilation time
-  // this function only handles the case where the x is finite (not inf nor nan)
-  const auto xreal = std::real(x);
-  const auto ximag = std::imag(x);
-  const auto exp_x_abs = std::exp(xreal);
-  auto exp_x_real = exp_x_abs * std::cos(ximag);
-  auto exp_x_imag = exp_x_abs * std::sin(ximag);
-  return {exp_x_real, exp_x_imag};
-}
-
-template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _fast_build_exp_inf(const c10::complex<scalar_t>& x) {
-  // complex exponential function, but implemented manually to get fast compilation time
-  // this function only handles the case where the real part of x is infinite
-  const auto ximag = std::imag(x);
-  constexpr auto exp_x_abs = std::numeric_limits<scalar_t>::infinity();
-  if (!::isfinite(ximag)) {  // add this to make consitent with std::exp(x+yi)
-    return {exp_x_abs, std::numeric_limits<scalar_t>::quiet_NaN()};
-  }
-  const auto sin = std::sin(ximag);
-  const auto cos = std::cos(ximag);
-  // special case if the angle is exactly the multiple of pi/2
-  auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos;
-  auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin;
-  return {exp_x_real, exp_x_imag};
-}
-
-template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
-  c10::complex<scalar_t> min = _logaddexp_minmax<scalar_t, /*min=*/true>(x, y);
-  c10::complex<scalar_t> max = _logaddexp_minmax<scalar_t, /*min=*/false>(x, y);
-  scalar_t min_real = std::real(min);
-  scalar_t max_real = std::real(max);
-
-  if (::isnan(min_real) || ::isnan(std::imag(min))) {
-    // handling the "infectious" NaNs
-    return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
-  }
-  else if ((!::isfinite(min_real)) && (min_real == max_real)) {
-    if (min_real < 0) {
-      // handle the -inf case, the imaginary part here does not really matter as the exp(value)
-      // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
-      // It does not matter if we're taking the exp of this value
-      return min;
-    } else {
-      // handle the +inf case, we don't need the special precision for log1p for small values
-      // and to avoid producing nan in case of real(max) == real(min) == +inf
-      const auto exp_min = _fast_build_exp_inf(min);
-      const auto exp_max = _fast_build_exp_inf(max);
-      return ::log1p(exp_min + exp_max - 1);  // log1p(x - 1) builds faster than log
-    }
-  } else {
-    const auto minmax = min - max;
-    c10::complex<scalar_t> exp_minmax;
-    if (!::isfinite(minmax.real())) {
-        exp_minmax = minmax.real() < 0 ? c10::complex<scalar_t>{0.0, 0.0} : _fast_build_exp_inf(minmax);
-    } else {
-        exp_minmax = _fast_build_exp(minmax);
-    }
-    return ::log1p(exp_minmax) + max;
-  }
-}
-
-// Complex logaddexp jiterator string
-const auto logaddexp_complex_string = jiterator_stringify(
-    template<typename T>
-    std::complex<T> log1p(const std::complex<T>& z)
-    {
-      using complex_t = std::complex<T>;
-      T x = z.real();
-      T y = z.imag();
-      T zabs = abs(z);
-      T theta = atan2(y, x + T(1));
-      if (zabs < 0.5) {
-          T r = x * (T(2) + x) + y * y;
-          if (r == 0) { // handle underflow
-              return complex_t(x, theta);
-          }
-          return complex_t(T(0.5) * std::log1p(r), theta);
-      } else {
-          T z0 = std::hypot(x + 1, y);
-          return complex_t(log(z0), theta);
-      }
-    }
-
-    // separated _logaddexp_minmax into 2 different functions for jiterator_string
-    template <typename T>
-    std::complex<T> logaddexp_min(const std::complex<T>& x, const std::complex<T>& y) {
-        T xr = x.real();
-        T yr = y.real();
-        if (isnan(yr) || isnan(y.imag())) {
-            return y;
-        } else if (isnan(xr) || isnan(x.imag())) {
-            return x;
-        } else {
-            return (xr < yr) ? x : y;
-        }
-    }
-
-    template <typename T>
-    std::complex<T> logaddexp_max(const std::complex<T>& x, const std::complex<T>& y) {
-        T xr = x.real();
-        T yr = y.real();
-        if (isnan(yr) || isnan(y.imag())) {
-            return y;
-        } else if (isnan(xr) || isnan(x.imag())) {
-            return x;
-        } else {
-            return (xr >= yr) ? x : y;
-        }
-    }
-
-    template <typename T>
-    std::complex<T> fast_build_exp(const std::complex<T>& x) {
-        const auto xreal = x.real();
-        const auto ximag = x.imag();
-        const auto exp_x_abs = exp(xreal);
-        auto exp_x_real = exp_x_abs * cos(ximag);
-        auto exp_x_imag = exp_x_abs * sin(ximag);
-        return std::complex<T>(exp_x_real, exp_x_imag);
-    }
-
-    template <typename T>
-    std::complex<T> fast_build_exp_inf(const std::complex<T>& x) {
-        using complex_t = std::complex<T>;
-        const auto ximag = x.imag();
-        const T exp_x_abs = INFINITY;
-        if (!isfinite(ximag)) {
-            return complex_t(exp_x_abs, NAN);
-        }
-        const auto sin_val = sin(ximag);
-        const auto cos_val = cos(ximag);
-        auto exp_x_real = (cos_val == T(0)) ? T(0) : exp_x_abs * cos_val;
-        auto exp_x_imag = (sin_val == T(0)) ? T(0) : exp_x_abs * sin_val;
-        return complex_t(exp_x_real, exp_x_imag);
-    }
-
-    template <typename complex_t>
-    complex_t logaddexp_complex(complex_t x, complex_t y) {
-        using T = typename complex_t::value_type;
-        complex_t min_val = logaddexp_min(x, y);
-        complex_t max_val = logaddexp_max(x, y);
-        T min_real = min_val.real();
-        T max_real = max_val.real();
-
-        if (isnan(min_real) || isnan(min_val.imag())) {
-            return complex_t(NAN, NAN);
-        }
-        else if ((!isfinite(min_real)) && (min_real == max_real)) {
-            if (min_real < T(0)) {
-                return min_val;
-            } else {
-                const auto exp_min = fast_build_exp_inf<T>(min_val);
-                const auto exp_max = fast_build_exp_inf<T>(max_val);
-                return log1p(exp_min + exp_max - complex_t(1, 0));
-            }
-        } else {
-            const auto minmax = min_val - max_val;
-            complex_t exp_minmax;
-            if (!isfinite(minmax.real())) {
-                exp_minmax = (minmax.real() < T(0)) ? complex_t(0, 0) : fast_build_exp_inf<T>(minmax);
-            } else {
-                exp_minmax = fast_build_exp<T>(minmax);
-            }
-            return log1p(exp_minmax) + max_val;
-        }
-    }
-);
-
-constexpr char logaddexp_complex_name[] = "logaddexp_complex";
 void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
-  if (at::isComplexType(iter.dtype())) {
-#if AT_USE_JITERATOR()
-    AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
-      jitted_gpu_kernel<
-          /*name=*/logaddexp_complex_name,
-          /*return_dtype=*/scalar_t,
-          /*common_dtype=*/scalar_t,
-          /*arity=*/2>(iter, logaddexp_complex_string);
-    });
-#else
-    AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
-      using opmath_t = at::opmath_type<scalar_t>;
-      gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t {
-        const auto a = static_cast<opmath_t>(a_);
-        const auto b = static_cast<opmath_t>(b_);
-        return static_cast<scalar_t>(_log_add_exp_helper(a, b));
-      });
-    });
-#endif
-  } else {
-    AT_DISPATCH_FLOATING_TYPES_AND2(
+  AT_DISPATCH_FLOATING_TYPES_AND2(
      ScalarType::BFloat16, ScalarType::Half,
      iter.dtype(), "logaddexp_cuda",
      [&]() {
@ -261,7 +29,6 @@ void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
          }
        });
      });
-  }
 }

 void logaddexp2_kernel_cuda(TensorIteratorBase& iter) {
--- a/aten/src/ATen/native/cuda/Reduce.cu
+++ b/aten/src/ATen/native/cuda/Reduce.cu
@ -11,7 +11,7 @@ static inline std::ostream& operator<<(std::ostream& out, dim3 dim) {
  if (dim.y == 1 && dim.z == 1) {
    out << dim.x;
  } else {
-    out << "[" << dim.x << "," << dim.y << "," << dim.z << "]";
+    out << '[' << dim.x << ',' << dim.y << ',' << dim.z << ']';
  }
  return out;
 }
@ -27,7 +27,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
  out << "input_mult=[";
  for (int i = 0; i < 3; i++) {
    if (i != 0) {
-      out << ",";
+      out << ',';
    }
    out << config.input_mult[i];
  }
@ -35,7 +35,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
  out << "output_mult=[";
  for (int i = 0; i < 2; i++) {
    if (i != 0) {
-      out << ",";
+      out << ',';
    }
    out << config.output_mult[i];
  }
@ -49,7 +49,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
  out << "block=" << config.block() << ", ";
  out << "grid=" << config.grid() << ", ";
  out << "global_memory_size=" << config.global_memory_size();
-  out << ")";
+  out << ')';
  return out;
 }

--- a/aten/src/ATen/native/cuda/ScaledBlas.cpp
+++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp
@ -1101,19 +1101,6 @@ _scaled_mxfp8_mxfp8(
  return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
 }

-void
-_check_mxfp4_support() {
-#ifndef USE_ROCM
-  auto dprops = at::cuda::getCurrentDeviceProperties();
-  // Only on B200 GPUs
-  TORCH_CHECK_NOT_IMPLEMENTED(
-    // B200 = 10.0, B300 = 10.3
-    dprops->major == 10,
-    "MXFP4 scaling only supported in CUDA for B200/B300"
-  );
-#endif
-}
-

 Tensor&
 _scaled_mxfp4_mxfp4(
@ -1126,7 +1113,6 @@ _scaled_mxfp4_mxfp4(
 #if defined(_WIN32) || (!defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI))
  TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only");
 #else
-  _check_mxfp4_support();
  // Restrictions:
  // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
  TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",
--- a/aten/src/ATen/native/cuda/ScaledGroupMM.cu
+++ b/aten/src/ATen/native/cuda/ScaledGroupMM.cu
@ -364,9 +364,9 @@ void f8f8bf16_grouped_gemm_impl_sm90(
  //       reinterpret_cast<ProblemShape::UnderlyingProblemShape*>(
  //           stride_output_h + group_count);

-  //   std::cout << "PTRS " << mat_a.data_ptr() << " " << mat_b.data_ptr() << "
+  //   std::cout << "PTRS " << mat_a.data_ptr() << ' ' << mat_b.data_ptr() << "
  //   "
-  //             << out.data_ptr() << " " << scale_a.data_ptr() << " "
+  //             << out.data_ptr() << ' ' << scale_a.data_ptr() << ' '
  //             << scale_b.data_ptr() << "\n";
  //   for (int i = 0; i < group_count; i++) {
  //     std::cout << "A " << (void*)inputA_ptrs_h[i] << "\n";
--- a/aten/src/ATen/native/cuda/jit_utils.cpp
+++ b/aten/src/ATen/native/cuda/jit_utils.cpp
@ -1057,14 +1057,14 @@ std::string generate_code(
    // TODO these arrays are potentially of the different types, use function
    // traits to determine the types
    declare_load_arrays << f_inputs_type << " arg" << std::to_string(i)
-                        << "[" << std::to_string(thread_work_size) << "];\n";
+                        << '[' << std::to_string(thread_work_size) << "];\n";
  }
  env.s("declare_load_arrays", declare_load_arrays.str());

  std::stringstream declare_store_arrays;
  for (int i = 0; i < nOutputs; i++) {
    declare_store_arrays << result_type << " out" << std::to_string(i)
-                        << "[" << std::to_string(thread_work_size) << "];\n";
+                        << '[' << std::to_string(thread_work_size) << "];\n";
  }
  env.s("declare_store_arrays", declare_store_arrays.str());

@ -1217,7 +1217,7 @@ std::string generate_code(
  for (const auto i : c10::irange(nInputs)){
    auto i_string = std::to_string(i);
    vector_inputs << "auto * input" << i_string <<
-        " = reinterpret_cast<const scalar_t*>(data[" << i_string << "+" << nOutputs << "])" <<
+        " = reinterpret_cast<const scalar_t*>(data[" << i_string << '+' << nOutputs << "])" <<
        " + block_work_size * idx;\n";
  }
  env.s("vector_inputs", vector_inputs.str());
@ -1543,17 +1543,17 @@ NvrtcFunction jit_pwise_function(

    // Constructs file path by appending constructed cubin name to cache path
    std::stringstream ss;
-    ss << *cache_dir << "/";
+    ss << *cache_dir << '/';
    ss << kernel_name;
 #ifdef USE_ROCM
    ss << "_arch" << prop->gcnArchName;
 #else
-    ss << "_arch" << cuda_major << "." << cuda_minor;
+    ss << "_arch" << cuda_major << '.' << cuda_minor;
 #endif
-    ss << "_nvrtc" << nvrtc_major << "." << nvrtc_minor;
+    ss << "_nvrtc" << nvrtc_major << '.' << nvrtc_minor;
    ss << (compile_to_sass ? "_sass" : "_ptx");
-    ss << "_" << code.length();
-    ss << "_" << hash_code;
+    ss << '_' << code.length();
+    ss << '_' << hash_code;
    file_path = ss.str();

    std::ifstream readin{file_path, std::ios::in | std::ifstream::binary};
--- a/aten/src/ATen/native/cudnn/ConvShared.cpp
+++ b/aten/src/ATen/native/cudnn/ConvShared.cpp
@ -82,15 +82,15 @@ namespace native {

 std::ostream& operator<<(std::ostream& out, const ConvolutionParams& params) {
  out << "ConvolutionParams \n"
-      << "    memory_format = " << params.memory_format << "\n"
-      << "    data_type = " << cudnnTypeToString(params.dataType) << "\n"
-      << "    padding = " << ArrayRef<int>{params.padding} << "\n"
-      << "    stride = " << ArrayRef<int>{params.stride} << "\n"
-      << "    dilation = " << ArrayRef<int>{params.dilation} << "\n"
-      << "    groups = " << params.groups << "\n"
+      << "    memory_format = " << params.memory_format << '\n'
+      << "    data_type = " << cudnnTypeToString(params.dataType) << '\n'
+      << "    padding = " << ArrayRef<int>{params.padding} << '\n'
+      << "    stride = " << ArrayRef<int>{params.stride} << '\n'
+      << "    dilation = " << ArrayRef<int>{params.dilation} << '\n'
+      << "    groups = " << params.groups << '\n'
      << "    deterministic = " << (params.deterministic ? "true" : "false")
-      << "\n"
-      << "    allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << "\n";
+      << '\n'
+      << "    allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << '\n';

  return out;
 }
@ -173,16 +173,16 @@ std::string repro_from_args(const ConvolutionParams& params) {
            at::globalContext().float32Precision(
                at::Float32Backend::CUDA, at::Float32Op::MATMUL) ==
            at::Float32Precision::TF32)
-     << "\n";
+     << '\n';
  ss << "torch.backends.cudnn.benchmark = "
-     << pybool(at::globalContext().benchmarkCuDNN()) << "\n";
+     << pybool(at::globalContext().benchmarkCuDNN()) << '\n';
  ss << "torch.backends.cudnn.deterministic = " << pybool(params.deterministic)
-     << "\n";
+     << '\n';
  ss << "torch.backends.cudnn.allow_tf32 = " << pybool(params.allow_tf32)
-     << "\n";
+     << '\n';
  ss << "data = torch.randn(" << ArrayRef<int>(params.input_size, dim)
     << ", dtype=" << full_dtype << ", ";
-  ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n";
+  ss << "device='cuda', requires_grad=True)" << to_channels_last << '\n';
  ss << "net = torch.nn.Conv" << dim - 2 << "d(" << in_channels << ", "
     << out_channels << ", ";
  ss << "kernel_size=" << ArrayRef<int>(&params.weight_size[2], dim - 2)
@ -192,7 +192,7 @@ std::string repro_from_args(const ConvolutionParams& params) {
  ss << "dilation=" << ArrayRef<int>(params.dilation, dim - 2) << ", ";
  ss << "groups=" << params.groups << ")\n";
  ss << "net = net.cuda()." << partial_dtype << "()" << to_channels_last
-     << "\n";
+     << '\n';
  ss << "out = net(data)\n";
  ss << "out.backward(torch.randn_like(out))\n";
  ss << "torch.cuda.synchronize()\n\n";
--- a/aten/src/ATen/native/cudnn/Conv_v7.cpp
+++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp
@ -93,11 +93,10 @@ std::ostream& operator<<(std::ostream& out, const ConvolutionArgs& args) {
      << "input: " << args.idesc // already has a trailing newline
      << "output: " << args.odesc // already has a trailing newline
      << "weight: " << args.wdesc // already has a trailing newline
-      << "Pointer addresses: "
-      << "\n"
-      << "    input: " << args.input.const_data_ptr() << "\n"
-      << "    output: " << args.output.const_data_ptr() << "\n"
-      << "    weight: " << args.weight.const_data_ptr() << "\n";
+      << "Pointer addresses: " << '\n'
+      << "    input: " << args.input.const_data_ptr() << '\n'
+      << "    output: " << args.output.const_data_ptr() << '\n'
+      << "    weight: " << args.weight.const_data_ptr() << '\n';

  return out;
 }
--- a/aten/src/ATen/native/metal/MetalTensorImplStorage.mm
+++ b/aten/src/ATen/native/metal/MetalTensorImplStorage.mm
@ -115,7 +115,7 @@ std::ostream& operator<<(
  std::copy(
      strides.begin(), strides.end() - 1, std::ostream_iterator<int>(oss, ","));
  oss << sizes.back();
-  output << oss.str() << "}";
+  output << oss.str() << '}';
  return output;
 }

--- a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp
@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& out, const ConvParams& params) {
      << "  transposed = " << params.transposed
      << "  output_padding = " << IntArrayRef{params.output_padding}
      << "  groups = " << params.groups << "  benchmark = " << params.benchmark
-      << "  deterministic = " << params.deterministic << "}";
+      << "  deterministic = " << params.deterministic << '}';
  return out;
 }

--- a/aten/src/ATen/native/mps/operations/Repeat.mm
+++ b/aten/src/ATen/native/mps/operations/Repeat.mm
@ -91,30 +91,25 @@ static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
 #include <ATen/native/mps/Repeat_metallib.h>
 #endif

-Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
-  TORCH_CHECK(repeat.dim() == 1, "repeat_interleave only accept 1D vector as repeat");
+template <typename index_t>
+void computeRepeatIndices(const index_t* repeat_ptr,
+                          const int64_t* cumsum_ptr,
+                          index_t* result_ptr,
+                          int64_t size,
+                          int64_t result_size) {
+  id<MTLBuffer> repeatBuffer = reinterpret_cast<id<MTLBuffer>>(repeat_ptr);
+  id<MTLBuffer> cumsumBuffer = reinterpret_cast<id<MTLBuffer>>(cumsum_ptr);
+  id<MTLBuffer> resultBuffer = reinterpret_cast<id<MTLBuffer>>(result_ptr);
+  TORCH_CHECK(repeatBuffer && cumsumBuffer && resultBuffer);
+
  std::string scalar_type;
-  if (repeat.scalar_type() == kInt) {
+  if constexpr (std::is_same_v<index_t, int32_t>) {
    scalar_type = "int32_t";
-  } else if (repeat.scalar_type() == kLong) {
+  } else if constexpr (std::is_same_v<index_t, int64_t>) {
    scalar_type = "int64_t";
  } else {
-    TORCH_CHECK(false, "repeats has to be Long or Int tensor");
+    TORCH_CHECK(false, "repeat_interleave: unsupported indexing data type");
  }
-  if (repeat.size(0) == 0) {
-    return at::empty_like(repeat, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  }
-  Tensor repeat_ = repeat.contiguous();
-  Tensor cumsum = repeat.cumsum(0);
-  int64_t total = 0;
-  if (output_size.has_value()) {
-    total = output_size.value();
-  } else {
-    total = cumsum[-1].item<int64_t>();
-    TORCH_CHECK((repeat >= 0).all().item<uint8_t>(), "repeats can not be negative");
-  }
-
-  auto result = at::empty({total}, repeat.options());

  MPSStream* mpsStream = getCurrentMPSStream();
  dispatch_sync(mpsStream->queue(), ^() {
@ -126,13 +121,20 @@ Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output
      getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false);

      [computeEncoder setComputePipelineState:pipelineState];
-      mps::mtl_setArgs(computeEncoder, repeat_, cumsum, result, repeat.size(0));
-      mps::mtl_dispatch1DJob(computeEncoder, pipelineState, repeat.size(0));
+      mps::mtl_setArgs(computeEncoder, repeatBuffer, cumsumBuffer, resultBuffer, size);
+      mps::mtl_dispatch1DJob(computeEncoder, pipelineState, size);

      getMPSProfiler().endProfileKernel(pipelineState);
    }
  });
-  return result;
+}
+
+Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
+  Tensor output;
+  AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() {
+    output = repeat_interleave_common<index_t, computeRepeatIndices<index_t>>(repeat, output_size);
+  });
+  return output;
 }

 } // namespace at::native
--- a/aten/src/ATen/native/mps/operations/TensorCompare.mm
+++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm
@ -5,7 +5,6 @@
 #include <ATen/native/Resize.h>
 #include <ATen/native/TensorCompare.h>
 #include <ATen/native/mps/OperationUtils.h>
-#include <algorithm>

 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@ -90,21 +89,13 @@ static void check_min_max_dims(const OptionalTensorRef clamp_opt, const Tensor&
    auto clamp_shape = clamp_opt->sizes();
    auto input_shape = input_t.sizes();

-    if (num_clamp_dims > num_input_dims) {
-      auto leading_dims = num_clamp_dims - num_input_dims;
-      for (int64_t i = 0; i < leading_dims; ++i) {
-        TORCH_CHECK(clamp_shape[i] == 1,
-                    op_name + ": clamp tensor leading shape must be 1 to broadcast with input tensor");
-      }
-    }
+    TORCH_CHECK(num_clamp_dims <= num_input_dims,
+                op_name + ": clamp tensor number of dims must not be greater than that of input tensor")

-    auto clamp_idx = num_clamp_dims - 1;
-    auto input_idx = num_input_dims - 1;
-    auto common_dims = std::min(num_clamp_dims, num_input_dims);
-    for (int64_t i = 0; i < common_dims; ++i)
+    for (int i = 0; i < num_clamp_dims; i++)
      // One of the indices is allowed to be 1; will be handled by broadcast
-      TORCH_CHECK(clamp_shape[clamp_idx - i] == input_shape[input_idx - i] || clamp_shape[clamp_idx - i] == 1 ||
-                      input_shape[input_idx - i] == 1,
+      TORCH_CHECK(clamp_shape[num_clamp_dims - 1 - i] == input_shape[num_input_dims - 1 - i] ||
+                      clamp_shape[num_clamp_dims - 1 - i] == 1 || input_shape[num_input_dims - 1 - i] == 1,
                  op_name + ": clamp tensor trailing shape must match input tensor")
  }
 }
@ -145,6 +136,9 @@ static void clamp_tensor_out_mps(const Tensor& input_t,

  auto result_type = output_t.scalar_type();

+  IntArrayRef new_min_shape;
+  IntArrayRef new_max_shape;
+
  auto num_min_dims = min_opt->dim();
  auto num_max_dims = max_opt->dim();
  auto num_input_dims = input_t.dim();
@ -152,32 +146,24 @@ static void clamp_tensor_out_mps(const Tensor& input_t,
  std::vector<int64_t> new_min_arr(num_input_dims);
  std::vector<int64_t> new_max_arr(num_input_dims);

+  if (has_min && num_min_dims < num_input_dims) {
+    fill_new_shape(num_input_dims, num_min_dims, new_min_arr.data(), min_opt->sizes());
+    new_min_shape = IntArrayRef(new_min_arr);
+  }
+
+  if (has_max && num_max_dims < num_input_dims) {
+    fill_new_shape(num_input_dims, num_max_dims, new_max_arr.data(), max_opt->sizes());
+    new_max_shape = IntArrayRef(new_max_arr);
+  }
+
  Tensor min_opt_tensor;
  Tensor max_opt_tensor;

-  auto reshape_clamp_tensor = [&](const OptionalTensorRef clamp_tensor_ref,
-                                  int64_t num_clamp_dims,
-                                  std::vector<int64_t>& new_shape_storage) -> Tensor {
-    IntArrayRef clamp_shape = clamp_tensor_ref->sizes();
-    bool requires_view = false;
-
-    if (num_clamp_dims > num_input_dims) {
-      clamp_shape = clamp_shape.slice(num_clamp_dims - num_input_dims);
-      requires_view = true;
-    } else if (num_clamp_dims < num_input_dims) {
-      fill_new_shape(num_input_dims, num_clamp_dims, new_shape_storage.data(), clamp_shape);
-      clamp_shape = IntArrayRef(new_shape_storage);
-      requires_view = true;
-    }
-
-    return requires_view ? (*clamp_tensor_ref).view(clamp_shape) : *clamp_tensor_ref;
-  };
-
  if (has_min) {
-    min_opt_tensor = reshape_clamp_tensor(min_opt, num_min_dims, new_min_arr);
+    min_opt_tensor = (num_min_dims < num_input_dims) ? (*min_opt).view(new_min_shape) : *min_opt;
  }
  if (has_max) {
-    max_opt_tensor = reshape_clamp_tensor(max_opt, num_max_dims, new_max_arr);
+    max_opt_tensor = (num_max_dims < num_input_dims) ? (*max_opt).view(new_max_shape) : *max_opt;
  }

  @autoreleasepool {
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -4225,7 +4225,7 @@
    MTIA: mm_out_mtia
    MPS: mm_out_mps
    XPU: mm_out_xpu
-    SparseCPU, SparseCUDA, SparseMPS: _sparse_mm_out
+    SparseCPU, SparseCUDA: _sparse_mm_out
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out

 - func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/avgpool-microkernel-tester.h
@ -301,12 +301,12 @@ class AvgPoolMicrokernelTester {
          ASSERT_NEAR(
              float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f)
              << "at pixel " << i << ", channel " << k << ", n = " << n()
-              << ", ks = " << kh() << "x" << kw() << " (" << ks()
+              << ", ks = " << kh() << 'x' << kw() << " (" << ks()
              << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
          ASSERT_EQ(
              uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
              << "at pixel " << i << ", channel " << k << ", n = " << n()
-              << ", ks = " << kh() << "x" << kw() << " (" << ks()
+              << ", ks = " << kh() << 'x' << kw() << " (" << ks()
              << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
        }
      }
@ -396,12 +396,12 @@ class AvgPoolMicrokernelTester {
          ASSERT_NEAR(
              float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f)
              << "at pixel " << i << ", channel " << k << ", n = " << n()
-              << ", ks = " << kh() << "x" << kw() << " (" << ks()
+              << ", ks = " << kh() << 'x' << kw() << " (" << ks()
              << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
          ASSERT_EQ(
              uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
              << "at pixel " << i << ", channel " << k << ", n = " << n()
-              << ", ks = " << kh() << "x" << kw() << " (" << ks()
+              << ", ks = " << kh() << 'x' << kw() << " (" << ks()
              << "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
        }
      }
--- a/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack/test/maxpool-microkernel-tester.h
@ -232,7 +232,7 @@ class MaxPoolMicrokernelTester {
          ASSERT_EQ(
              uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
              << "at pixel " << i << ", channel " << k << ", n = " << n()
-              << ", ks = " << kh() << "x" << kw() << " (" << ks()
+              << ", ks = " << kh() << 'x' << kw() << " (" << ks()
              << "), kc = " << kc();
        }
      }
--- a/aten/src/ATen/native/utils/ParamUtils.h
+++ b/aten/src/ATen/native/utils/ParamUtils.h
@ -17,7 +17,7 @@ inline std::vector<T> _expand_param_if_needed(
    std::ostringstream ss;
    ss << "expected " << param_name << " to be a single integer value or a "
       << "list of " << expected_dim << " values to match the convolution "
-       << "dimensions, but got " << param_name << "=" << list_param;
+       << "dimensions, but got " << param_name << '=' << list_param;
    TORCH_CHECK(false, ss.str());
  } else {
    return list_param.vec();
--- a/aten/src/ATen/native/vulkan/api/Adapter.cpp
+++ b/aten/src/ATen/native/vulkan/api/Adapter.cpp
@ -358,9 +358,9 @@ std::string Adapter::stringize() const {
  std::string device_type = get_device_type_str(properties.deviceType);
  VkPhysicalDeviceLimits limits = properties.limits;

-  ss << "{" << std::endl;
+  ss << '{' << std::endl;
  ss << "  Physical Device Info {" << std::endl;
-  ss << "    apiVersion:    " << v_major << "." << v_minor << std::endl;
+  ss << "    apiVersion:    " << v_major << '.' << v_minor << std::endl;
  ss << "    driverversion: " << properties.driverVersion << std::endl;
  ss << "    deviceType:    " << device_type << std::endl;
  ss << "    deviceName:    " << properties.deviceName << std::endl;
@ -371,7 +371,7 @@ std::string Adapter::stringize() const {

 #define PRINT_LIMIT_PROP_VEC3(name)                                       \
  ss << "      " << std::left << std::setw(36) << #name << limits.name[0] \
-     << "," << limits.name[1] << "," << limits.name[2] << std::endl;
+     << ',' << limits.name[1] << ',' << limits.name[2] << std::endl;

  ss << "    Physical Device Limits {" << std::endl;
  PRINT_LIMIT_PROP(maxImageDimension1D);
@ -425,7 +425,7 @@ std::string Adapter::stringize() const {
    ;
  }
  ss << "  ]" << std::endl;
-  ss << "}";
+  ss << '}';

  return ss.str();
 }
--- a/aten/src/ATen/native/vulkan/api/Exception.cpp
+++ b/aten/src/ATen/native/vulkan/api/Exception.cpp
@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) {
    VK_RESULT_CASE(VK_ERROR_FORMAT_NOT_SUPPORTED)
    VK_RESULT_CASE(VK_ERROR_FRAGMENTED_POOL)
    default:
-      out << "VK_ERROR_UNKNOWN (VkResult " << result << ")";
+      out << "VK_ERROR_UNKNOWN (VkResult " << result << ')';
      break;
  }
  return out;
@ -46,7 +46,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) {
 //

 std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) {
-  out << loc.function << " at " << loc.file << ":" << loc.line;
+  out << loc.function << " at " << loc.file << ':' << loc.line;
  return out;
 }

@ -66,7 +66,7 @@ Error::Error(SourceLocation source_location, const char* cond, std::string msg)
    : msg_(std::move(msg)), source_location_{source_location} {
  std::ostringstream oss;
  oss << "Exception raised from " << source_location_ << ": ";
-  oss << "(" << cond << ") is false! ";
+  oss << '(' << cond << ") is false! ";
  oss << msg_;
  what_ = oss.str();
 }
--- a/aten/src/ATen/native/vulkan/api/QueryPool.cpp
+++ b/aten/src/ATen/native/vulkan/api/QueryPool.cpp
@ -173,8 +173,8 @@ void QueryPool::extract_results() {

 static std::string stringize(const VkExtent3D& extents) {
  std::stringstream ss;
-  ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth
-     << "}";
+  ss << '{' << extents.width << ", " << extents.height << ", " << extents.depth
+     << '}';
  return ss.str();
 }

--- a/aten/src/ATen/native/vulkan/api/Runtime.cpp
+++ b/aten/src/ATen/native/vulkan/api/Runtime.cpp
@ -149,7 +149,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn(
  (void)flags;

  std::stringstream stream;
-  stream << layer_prefix << " " << message_code << " " << message << std::endl;
+  stream << layer_prefix << ' ' << message_code << ' ' << message << std::endl;
  const std::string log = stream.str();

  std::cout << log;
--- a/aten/src/ATen/native/vulkan/api/Utils.h
+++ b/aten/src/ATen/native/vulkan/api/Utils.h
@ -253,7 +253,7 @@ using vec4 = vec<4u>;

 // uvec3 is the type representing tensor extents. Useful for debugging.
 inline std::ostream& operator<<(std::ostream& os, const uvec3& v) {
-  os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")";
+  os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')';
  return os;
 }

--- a/aten/src/ATen/test/CMakeLists.txt
+++ b/aten/src/ATen/test/CMakeLists.txt
@ -61,7 +61,6 @@ list(APPEND ATen_CUDA_TEST_SRCS
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_math_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cub_test.cu
-  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cublas_handle_pool_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_device_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_dlconvertor_test.cpp
--- a/aten/src/ATen/test/basic.cpp
+++ b/aten/src/ATen/test/basic.cpp
@ -246,7 +246,7 @@ void TestToCFloat() {
 void TestToString() {
  Tensor b = ones({3, 7}) * .0000001f;
  std::stringstream s;
-  s << b << "\n";
+  s << b << '\n';
  std::string expect = "1e-07 *";
  ASSERT_EQ_RESOLVED(s.str().substr(0, expect.size()), expect);
 }
--- a/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp
+++ b/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp
@ -1,77 +0,0 @@
-#include <gtest/gtest.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include <atomic>
-#include <thread>
-#include <vector>
-
-// Test concurrent access to getCurrentCUDABlasHandle and getCUDABlasLtWorkspace
-// to verify that the data race fix is working correctly
-
-TEST(CUDABlasHandlePoolTest, ConcurrentGetAndClearWorkspaces) {
-  if (!at::cuda::is_available()) {
-    return;
-  }
-
-  constexpr int num_accessor_threads = 15;
-  constexpr int num_clear_threads = 5;
-  constexpr int iterations_per_thread = 50;
-
-  std::atomic<bool> stop{false};
-  std::atomic<int> error_count{0};
-  std::vector<std::thread> threads;
-  threads.reserve(num_accessor_threads + num_clear_threads);
-
-  // Launch accessor threads
-  for (int i = 0; i < num_accessor_threads; ++i) {
-    threads.emplace_back([&stop, &error_count]() {
-      try {
-        at::cuda::CUDAGuard device_guard(0);
-
-        while (!stop.load(std::memory_order_relaxed)) {
-          const auto handle = at::cuda::getCurrentCUDABlasHandle();
-          const auto workspace = at::cuda::getCUDABlasLtWorkspace();
-
-          if (handle == nullptr || workspace == nullptr) {
-            error_count++;
-          }
-        }
-      } catch (const std::exception& e) {
-        error_count++;
-      }
-    });
-  }
-
-  // Launch threads that clear workspaces
-  for (int i = 0; i < num_clear_threads; ++i) {
-    threads.emplace_back([&error_count]() {
-      try {
-        for (int j = 0; j < iterations_per_thread; ++j) {
-          at::cuda::clearCublasWorkspaces();
-          std::this_thread::yield();
-        }
-      } catch (const std::exception& e) {
-        error_count++;
-      }
-    });
-  }
-
-  // Let them run for a bit
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));
-  stop.store(true, std::memory_order_relaxed);
-
-  for (auto& thread : threads) {
-    thread.join();
-  }
-
-  EXPECT_EQ(error_count.load(), 0);
-}
-
-int main(int argc, char* argv[]) {
-  ::testing::InitGoogleTest(&argc, argv);
-  c10::cuda::CUDACachingAllocator::init(1);
-  return RUN_ALL_TESTS();
-}
--- a/aten/src/ATen/test/scalar_test.cpp
+++ b/aten/src/ATen/test/scalar_test.cpp
@ -33,7 +33,7 @@ struct Foo {
  static void apply(Tensor a, Tensor b) {
    scalar_type s = 1;
    std::stringstream ss;
-    ss << "hello, dispatch: " << a.toString() << s << "\n";
+    ss << "hello, dispatch: " << a.toString() << s << '\n';
    auto data = (scalar_type*)a.data_ptr();
    (void)data;
  }
@ -73,8 +73,8 @@ TEST(TestScalar, TestScalar) {
  Scalar bar = 3.0;
  Half h = bar.toHalf();
  Scalar h2 = h;
-  cout << "H2: " << h2.toDouble() << " " << what.toFloat() << " "
-       << bar.toDouble() << " " << what.isIntegral(false) << "\n";
+  cout << "H2: " << h2.toDouble() << ' ' << what.toFloat() << ' '
+       << bar.toDouble() << ' ' << what.isIntegral(false) << '\n';
  auto gen = at::detail::getDefaultCPUGenerator();
  {
    // See Note [Acquire lock when using random generators]
@ -84,7 +84,7 @@ TEST(TestScalar, TestScalar) {
  }
  if (at::hasCUDA()) {
    auto t2 = zeros({4, 4}, at::kCUDA);
-    cout << &t2 << "\n";
+    cout << &t2 << '\n';
  }
  auto t = ones({4, 4});

@ -129,7 +129,7 @@ TEST(TestScalar, TestScalar) {
      std::stringstream ss;
      // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
      ASSERT_NO_THROW(
-          ss << "hello, dispatch" << x.toString() << s << "\n");
+          ss << "hello, dispatch" << x.toString() << s << '\n');
      auto data = (scalar_t*)x.data_ptr();
      (void)data;
    });
--- a/aten/src/ATen/test/test_install/main.cpp
+++ b/aten/src/ATen/test/test_install/main.cpp
@ -1,5 +1,5 @@
 #include <ATen/ATen.h>

 int main() {
-  std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << "\n";
+  std::cout << at::ones({3,4}, at::CPU(at::kFloat)) << '\n';
 }
--- a/aten/src/ATen/test/vec_test_all_types.cpp
+++ b/aten/src/ATen/test/vec_test_all_types.cpp
@ -1828,9 +1828,9 @@ namespace {
      #endif

        EXPECT_EQ(u16, c10::detail::fp16_ieee_from_fp32_value(f32s[i]))
-            << "Test failed for float to uint16 " << f32s[i] << "\n";
+            << "Test failed for float to uint16 " << f32s[i] << '\n';
        EXPECT_EQ(x, c10::detail::fp16_ieee_to_fp32_value(u16))
-            << "Test failed for uint16 to float " << u16 << "\n";
+            << "Test failed for uint16 to float " << u16 << '\n';
      }
    }
    TEST(FP8E4M3Test, FP8E4M3ConversionFloat) {
@ -1848,10 +1848,10 @@ namespace {
          EXPECT_TRUE(std::isnan(f32));
        } else {
          EXPECT_EQ(f32, c10::detail::fp8e4m3fn_to_fp32_value(input))
-              << "Test failed for u8 to float " << input << "\n";
+              << "Test failed for u8 to float " << input << '\n';
        }
        EXPECT_EQ(u8, c10::detail::fp8e4m3fn_from_fp32_value(f32))
-            << "Test failed for float to u8 " << f32 << "\n";
+            << "Test failed for float to u8 " << f32 << '\n';
      }
    }
    TEST(FP8E4M3Test, FP8E4M3BinaryAdd) {
@ -2015,10 +2015,10 @@ namespace {
          EXPECT_TRUE(std::isnan(f32));
        } else {
          EXPECT_EQ(f32, c10::detail::fp8e5m2_to_fp32_value(input))
-              << "Test failed for u8 to float " << input << "\n";
+              << "Test failed for u8 to float " << input << '\n';
        }
        EXPECT_EQ(u8, c10::detail::fp8e5m2_from_fp32_value(f32))
-            << "Test failed for float to u8 " << f32 << "\n";
+            << "Test failed for float to u8 " << f32 << '\n';
      }
    }
    TEST(FP8E5M2Test, FP8E5M2BinaryAdd) {
--- a/aten/src/ATen/test/vitals.cpp
+++ b/aten/src/ATen/test/vitals.cpp
@ -19,7 +19,7 @@ TEST(Vitals, Basic) {
    c10::utils::set_env("TORCH_VITAL", "1");
    TORCH_VITAL_DEFINE(Testing);
    TORCH_VITAL(Testing, Attribute0) << 1;
-    TORCH_VITAL(Testing, Attribute1) << "1";
+    TORCH_VITAL(Testing, Attribute1) << '1';
    TORCH_VITAL(Testing, Attribute2) << 1.0f;
    TORCH_VITAL(Testing, Attribute3) << 1.0;
    auto t = at::ones({1, 1});
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@ -129,14 +129,14 @@ void showRtol(const at::Tensor& a, const at::Tensor& b) {
  std::cout << "Max Diff allowed: " << maxDiff << std::endl;
  if (diff.sizes().size() == 2) {
    for (const auto y : c10::irange(diff.sizes()[0])) {
-      std::cout << y << ":";
+      std::cout << y << ':';
      for (const auto x : c10::irange(diff.sizes()[1])) {
        float diff_xy = diff[y][x].item<float>();
        if (diff_xy > maxDiff) {
          std::cout << std::setw(5) << x;
        }
        else {
-          std::cout << std::setw(5) << " ";
+          std::cout << std::setw(5) << ' ';
        }
      }
      std::cout << std::endl;
@ -3276,7 +3276,7 @@ TEST_F(VulkanAPITest, masked_fill_invalidinputs_exceptions) {

 void print_shape(const std::vector<int64_t>& shape) {
  for (const auto& num : shape) {
-    std::cout << num << " ";
+    std::cout << num << ' ';
  }
 }

@ -3367,7 +3367,7 @@ void test_masked_fill_scalar(
            print_shape(tmp_curr_input_shape);
            std::cout << "], and mask of shape [";
            print_shape(tmp_curr_mask_shape);
-            std::cout << "]" << std::endl;
+            std::cout << ']' << std::endl;
          }

          ASSERT_TRUE(check);
@ -4542,9 +4542,9 @@ void test_softmax(const at::IntArrayRef shape, bool log_softmax = false) {
    if (!check) {
      std::cout << "Softmax test failed on axis " << dim << "for tensor dims {";
      for (uint32_t place = 0; place < shape.size() - 1; place++) {
-        std::cout << shape[place] << " ";
+        std::cout << shape[place] << ' ';
      }
-      std::cout << shape.back() << "}" << std::endl;
+      std::cout << shape.back() << '}' << std::endl;
      showRtol(out_cpu, out_vulkan.cpu());
    }
    ASSERT_TRUE(check);
--- a/aten/src/ATen/test/vulkan_quantized_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_quantized_api_test.cpp
@ -95,7 +95,7 @@ void showRtol(
  std::cout << "Max Diff found is: " << diff.max().item<double>() << std::endl;
  if (diff.sizes().size() == 2) {
    for (const auto y : c10::irange(diff.sizes()[0])) {
-      std::cout << y << ":";
+      std::cout << y << ':';
      for (const auto x : c10::irange(diff.sizes()[1])) {
        double diff_xy = diff[y][x].item<double>();
        if (diff_xy > maxDiff) {
@ -109,7 +109,7 @@ void showRtol(
            }
          }
        } else {
-          std::cout << std::setw(5) << " ";
+          std::cout << std::setw(5) << ' ';
        }
      }
      std::cout << std::endl;
@ -148,19 +148,19 @@ using at::native::vulkan::api::utils::ivec4;
 using at::native::vulkan::api::utils::vec4;

 std::ostream& operator<<(std::ostream& os, const vec4& v) {
-  os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
-     << v.data[3u] << ")";
+  os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
+     << v.data[3u] << ')';
  return os;
 }

 std::ostream& operator<<(std::ostream& os, const ivec3& v) {
-  os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ")";
+  os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ')';
  return os;
 }

 std::ostream& operator<<(std::ostream& os, const ivec4& v) {
-  os << "(" << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
-     << v.data[3u] << ")";
+  os << '(' << v.data[0u] << ", " << v.data[1u] << ", " << v.data[2u] << ", "
+     << v.data[3u] << ')';
  return os;
 }

@ -3379,51 +3379,51 @@ bool _test_quantized_linear(
      showRtol(out_cpu_dequant, out_vk_to_cpu_dequant);
    }
    if (xpos != -1 && ypos != -1) {
-      std::cout << "\nFailure caused on row/col: " << ypos << "/" << xpos
-                << "\n";
+      std::cout << "\nFailure caused on row/col: " << ypos << '/' << xpos
+                << '\n';
      std::cout << "Input tensor scale: " << scale << " zerop: " << zero_point
-                << "\n";
-      std::cout << "Input tensor row " << ypos << "\n";
+                << '\n';
+      std::cout << "Input tensor row " << ypos << '\n';
      for (int i = 0; i < input_cpu.sizes()[1]; i++) {
        std::cout << input_cpu[ypos][i].item<double>() << ", ";
      }
-      std::cout << "\n";
+      std::cout << '\n';

      std::cout << "Weight tensor scale: " << w_scale
-                << " zerop: " << w_zero_point << "\n";
-      std::cout << "Weight tensor col " << xpos << "\n";
+                << " zerop: " << w_zero_point << '\n';
+      std::cout << "Weight tensor col " << xpos << '\n';
      for (int i = 0; i < weight.sizes()[1]; i++) {
        std::cout << weight[xpos][i].item<double>() << ", ";
      }
-      std::cout << "\n";
+      std::cout << '\n';

      std::cout << "Input tensor quantized row " << ypos << " with dtype "
-                << (input_quant_dtype_int8 ? "QInt8" : "QUInt8") << "\n";
+                << (input_quant_dtype_int8 ? "QInt8" : "QUInt8") << '\n';
      for (int i = 0; i < input_cpu.sizes()[1]; i++) {
        std::cout << input_cpu_quantized[ypos][i].item<double>() << ", ";
      }
-      std::cout << "\n";
+      std::cout << '\n';

      std::cout << "Weight tensor quantized col " << xpos << " with dtype "
-                << (weight_quant_dtype_int8 ? "QInt8" : "QUInt8") << "\n";
+                << (weight_quant_dtype_int8 ? "QInt8" : "QUInt8") << '\n';
      for (int i = 0; i < weight.sizes()[1]; i++) {
        std::cout << weight_cpu_quantized[xpos][i].item<double>() << ", ";
      }
-      std::cout << "\n";
+      std::cout << '\n';

      std::cout << "bias tensor\n";
      for (int i = 0; i < bias.sizes()[0]; i++) {
        std::cout << bias[i].item<double>() << ", ";
      }
-      std::cout << "\n";
+      std::cout << '\n';

      std::cout << "out_scale: " << out_scale
-                << " out_zero_point: " << out_zero_point << "\n";
+                << " out_zero_point: " << out_zero_point << '\n';

      std::cout << "cpu unmatched output: "
-                << out_cpu_dequant[ypos][xpos].item<double>() << "\n";
+                << out_cpu_dequant[ypos][xpos].item<double>() << '\n';
      std::cout << "vk unmatched output: "
-                << out_vk_to_cpu_dequant[ypos][xpos].item<double>() << "\n";
+                << out_vk_to_cpu_dequant[ypos][xpos].item<double>() << '\n';
    }
  }
  return check;
--- a/aten/tools/valgrind.sup
+++ b/aten/tools/valgrind.sup
@ -10,13 +10,6 @@
   ...
 }

-{
-   ignore_empty_generic_uninitialised_conditional_jump
-   Memcheck:Cond
-   fun:_ZN2at6detail13empty_genericEN3c108ArrayRefIlEEPNS1_9AllocatorENS1_14DispatchKeySetENS1_10ScalarTypeESt8optionalINS1_12MemoryFormatEE
-   ...
-}
-
 {
   Cond_cuda
   Memcheck:Cond
--- a/benchmarks/dynamo/check_perf_csv.py
+++ b/benchmarks/dynamo/check_perf_csv.py
@ -9,61 +9,28 @@ def check_perf_csv(filename, threshold, threshold_scale):
    """
    Basic performance checking.
    """
-    try:
-        df = pd.read_csv(filename)
-    except FileNotFoundError:
-        print(f"Error: File {filename} not found")
-        sys.exit(1)

-    effective_threshold = threshold * threshold_scale
-    print(f"Checking {filename} (speedup threshold >= {effective_threshold:.2f}x)\n")
+    df = pd.read_csv(filename)

    failed = []
    for _, row in df.iterrows():
        model_name = row["name"]
-        speedup = float(row["speedup"])
-        abs_latency = float(row["abs_latency"])
-        compilation_latency = float(row["compilation_latency"])
-        compression_ratio = float(row["compression_ratio"])
-        eager_peak_mem = float(row["eager_peak_mem"])
-        dynamo_peak_mem = float(row["dynamo_peak_mem"])
+        speedup = row["speedup"]
+        if speedup < threshold * threshold_scale:
+            failed.append(model_name)

-        perf_summary = f"{model_name:34} speedup={speedup:.3f}x"
-        if pd.notna(abs_latency):
-            perf_summary += f", latency={abs_latency:.1f} ms/iter"
-        if pd.notna(compilation_latency):
-            perf_summary += f", compile={compilation_latency:.3f}s"
-        if pd.notna(compression_ratio):
-            perf_summary += f", mem_ratio={1 / compression_ratio:.2f}x"
-            if pd.notna(eager_peak_mem) and pd.notna(dynamo_peak_mem):
-                perf_summary += (
-                    f" (eager={eager_peak_mem:.1f} GB, dynamo={dynamo_peak_mem:.1f} GB)"
-                )
-
-        if speedup < effective_threshold:
-            failed.append((model_name, speedup))
-
-        print(perf_summary)
+        print(f"{model_name:34} {speedup}")

    if failed:
        print(
            textwrap.dedent(
                f"""
-                Error {len(failed)} model(s) performance regressed
-                    {" ".join([name for name, _ in failed])}
+                Error {len(failed)} models performance regressed
+                    {" ".join(failed)}
                """
            )
        )
-        for name, sp in sorted(failed, key=lambda x: x[1]):
-            pct_from_target = (sp / effective_threshold - 1.0) * 100.0
-            print(
-                f"  - {name}: {sp:.3f}x (< {effective_threshold:.2f}x; {pct_from_target:.1f}% from target)"
-            )
        sys.exit(1)
-    else:
-        print(
-            f"\nAll {len(df)} model(s) passed threshold check (>= {effective_threshold:.2f}x)"
-        )


 if __name__ == "__main__":
@ -77,7 +44,7 @@ if __name__ == "__main__":
        "-s",
        type=float,
        default=1.0,
-        help="multiply threshold by this value to relax the check",
+        help="multiple threshold by this value to relax the check",
    )
    args = parser.parse_args()
    check_perf_csv(args.file, args.threshold, args.threshold_scale)
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@ -2379,9 +2379,7 @@ class BenchmarkRunner:
                    print(
                        f"Load model outputs from {self.args.compare_model_outputs_with} to compare"
                    )
-                    saved_result = torch.load(
-                        self.args.compare_model_outputs_with, weights_only=False
-                    )
+                    saved_result = torch.load(self.args.compare_model_outputs_with)
                    is_bitwise_same = bitwise_same(saved_result, new_result)
                    if not is_bitwise_same:
                        print(
--- a/benchmarks/operator_benchmark/README.md
+++ b/benchmarks/operator_benchmark/README.md
@ -145,6 +145,64 @@ Run torch.add benchmark with tag 'long':
 python -m pt.add_test --tag-filter long
 ```

+## CI Regression Tracking
+
+The operator benchmarks are continuously monitored in CI to track performance regressions across a diverse set of CPU and GPU devices. Two GitHub Actions workflows run these benchmarks on a regular schedule:
+
+### CPU Benchmarks
+
+The [operator_benchmark.yml](../../.github/workflows/operator_benchmark.yml) workflow runs operator benchmarks on CPU devices:
+
+**Devices:**
+- x86_64: `linux.12xlarge` (Intel/AMD CPUs)
+- aarch64: `linux.arm64.m8g.4xlarge` (ARM64 CPUs)
+
+**Operators Tracked:** All operators in the `pt/` directory with tag `short`
+
+**Schedule:** Weekly on Sundays at 07:00 UTC
+
+**Test Modes:** `short`, `long`, or `all` (default: `short`)
+
+**Triggers:**
+- Scheduled runs (weekly)
+- Manual workflow dispatch with configurable test mode
+- Push to `ciflow/op-benchmark/*` tags
+- Pull requests that modify benchmark files
+
+### GPU Microbenchmarks
+
+The [operator_microbenchmark.yml](../../.github/workflows/operator_microbenchmark.yml) workflow runs operator microbenchmarks on GPU devices:
+
+**CUDA Devices:**
+- H100 GPUs (`linux.aws.h100`) - CUDA 12.8, sm_90
+- A100 GPUs (`linux.aws.a100`) - CUDA 12.8, sm_80
+- B200 GPUs (`linux.dgx.b200`) - CUDA 12.8, sm_100
+
+**ROCm Devices:**
+- MI300X GPUs (`linux.rocm.gpu.gfx942.1`) - gfx942
+
+**Operators Tracked in CI:** `matmul`, `mm`, `addmm`, `bmm`, `conv` (with tag `long`)
+- Other operators in the `pt/` directory can be run ad-hoc using the workflow dispatch
+
+**Schedule:** Daily at 06:00 UTC
+
+**Performance Dashboard:** [PyTorch Operator Microbenchmark Dashboard](https://hud.pytorch.org/benchmark/v3/dashboard/pytorch_operator_microbenchmark)
+
+**Triggers:**
+- Scheduled runs (daily)
+- Manual workflow dispatch
+- Push to `ciflow/op-benchmark/*` tags
+
+### Running Manual Benchmarks
+
+To trigger a manual run of the benchmarks:
+
+1. Navigate to the [GitHub Actions workflows](https://github.com/pytorch/pytorch/actions)
+2. Select either `operator_benchmark` or `operator_microbenchmark`
+3. Click "Run workflow" in the top right
+4. For CPU benchmarks, optionally select a test mode (`short`, `long`, or `all`)
+5. Click "Run workflow" to start the benchmark run
+
 ## Adding New Operators to the Benchmark Suite
 In the previous sections, we gave several examples to show how to run the already available operators in the benchmark suite. In the following sections, we'll step through the complete flow of adding PyTorch operators to the benchmark suite. Existing benchmarks for operators are in the `pt` directory and we highly recommend putting your new operators in those directories as well.

--- a/c10/core/DispatchKeySet.cpp
+++ b/c10/core/DispatchKeySet.cpp
@ -176,7 +176,7 @@ std::ostream& operator<<(std::ostream& os, DispatchKeySet ts) {
    os << k;
    first = false;
  }
-  os << ")";
+  os << ')';
  return os;
 }

--- a/c10/core/SafePyObject.h
+++ b/c10/core/SafePyObject.h
@ -44,7 +44,7 @@ struct C10_API SafePyObject {
      (*other.pyinterpreter_)->incref(other.data_);
    }
    if (data_ != nullptr) {
-      (*pyinterpreter_)->decref(data_);
+      (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
    }
    data_ = other.data_;
    pyinterpreter_ = other.pyinterpreter_;
@ -53,7 +53,7 @@ struct C10_API SafePyObject {

  ~SafePyObject() {
    if (data_ != nullptr) {
-      (*pyinterpreter_)->decref(data_);
+      (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
    }
  }

--- a/c10/core/ScalarType.h
+++ b/c10/core/ScalarType.h
@ -92,6 +92,13 @@ inline bool isComplexType(ScalarType t) {
      t == ScalarType::ComplexDouble);
 }

+inline bool isQIntType(ScalarType t) {
+  // Don't forget to extend this when adding new QInt types
+  return t == ScalarType::QInt8 || t == ScalarType::QUInt8 ||
+      t == ScalarType::QInt32 || t == ScalarType::QUInt4x2 ||
+      t == ScalarType::QUInt2x4;
+}
+
 inline bool isBitsType(ScalarType t) {
  return t == ScalarType::Bits1x8 || t == ScalarType::Bits2x4 ||
      t == ScalarType::Bits4x2 || t == ScalarType::Bits8 ||
--- a/c10/core/StorageImpl.cpp
+++ b/c10/core/StorageImpl.cpp
@ -48,30 +48,6 @@ void warnDeprecatedDataPtr() {
  TORCH_CHECK(false, "Cannot access data pointer of Storage that is invalid.");
 }

-void StorageImpl::incref_pyobject() const {
-  // Because intrusive_ptr incref uses relaxed memory order, we need to
-  // do an acquire fence to ensure that the kHasPyObject bit was
-  // observed before the load of the PyObject* below.
-  // NB: This is a no-op on x86/x86-64
-  std::atomic_thread_fence(std::memory_order_acquire);
-
-  PyObject* obj = pyobj_slot_.load_pyobj();
-  (*pyobj_slot_.pyobj_interpreter())->incref(obj);
-}
-
-void StorageImpl::decref_pyobject() const {
-  PyObject* obj = pyobj_slot_.load_pyobj();
-  (*pyobj_slot_.pyobj_interpreter())->decref(obj);
-}
-
-bool StorageImpl::try_incref_pyobject() const {
-  c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
-  if (C10_UNLIKELY(!interp)) {
-    return false;
-  }
-  return (*interp)->try_incref(pyobj_slot_);
-}
-
 void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr) {
  // Allowlist verification.
  // Only if the devicetype is in the allowlist,
--- a/c10/core/StorageImpl.h
+++ b/c10/core/StorageImpl.h
@ -105,12 +105,6 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target {
    data_ptr_.clear();
  }

-  void incref_pyobject() const override final;
-
-  void decref_pyobject() const override final;
-
-  bool try_incref_pyobject() const override final;
-
  size_t nbytes() const {
    // OK to do this instead of maybe_as_int as nbytes is guaranteed positive
    TORCH_CHECK(!size_bytes_is_heap_allocated_);
@ -376,18 +370,4 @@ C10_API c10::intrusive_ptr<c10::StorageImpl> make_storage_impl(
    bool resizable,
    std::optional<at::Device> device_opt);

-namespace detail {
-
-#ifndef C10_MOBILE
-template <class T>
-struct TargetTraits<
-    T,
-    std::enable_if_t<
-        std::is_base_of_v<c10::StorageImpl, std::remove_cv_t<T>>>> {
-  static constexpr bool can_have_pyobject = true;
-};
-#endif
-
-} // namespace detail
-
 } // namespace c10
--- a/c10/core/TensorImpl.cpp
+++ b/c10/core/TensorImpl.cpp
@ -277,6 +277,7 @@ void TensorImpl::release_resources() {
  if (storage_) {
    storage_ = {};
  }
+  pyobj_slot_.maybe_destroy_pyobj();
 }

 #ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY
@ -988,30 +989,6 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) {
  }
 }

-void TensorImpl::incref_pyobject() const {
-  // Because intrusive_ptr incref uses relaxed memory order, we need to
-  // do an acquire fence to ensure that the kHasPyObject bit was
-  // observed before the load of the PyObject* below.
-  // NB: This is a no-op on x86/x86-64
-  std::atomic_thread_fence(std::memory_order_acquire);
-
-  PyObject* obj = pyobj_slot_.load_pyobj();
-  (*pyobj_slot_.pyobj_interpreter())->incref(obj);
-}
-
-void TensorImpl::decref_pyobject() const {
-  PyObject* obj = pyobj_slot_.load_pyobj();
-  (*pyobj_slot_.pyobj_interpreter())->decref(obj);
-}
-
-bool TensorImpl::try_incref_pyobject() const {
-  c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
-  if (C10_UNLIKELY(!interp)) {
-    return false;
-  }
-  return (*interp)->try_incref(pyobj_slot_);
-}
-
 namespace impl {

 namespace {
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@ -2178,12 +2178,6 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
    return &pyobj_slot_;
  }

-  void incref_pyobject() const override final;
-
-  void decref_pyobject() const override final;
-
-  bool try_incref_pyobject() const override final;
-
 private:
  // See NOTE [std::optional operator usage in CUDA]
  // We probably don't want to expose this publicly until
@ -3085,19 +3079,6 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
  friend class C10_TensorImpl_Size_Check_Dummy_Class;
 };

-namespace detail {
-
-#ifndef C10_MOBILE
-template <class T>
-struct TargetTraits<
-    T,
-    std::enable_if_t<std::is_base_of_v<c10::TensorImpl, std::remove_cv_t<T>>>> {
-  static constexpr bool can_have_pyobject = true;
-};
-#endif
-
-} // namespace detail
-
 // Note [TensorImpl size constraints]
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 // Changed the size of TensorImpl?  If the size went down, good for
--- a/c10/core/TensorOptions.cpp
+++ b/c10/core/TensorOptions.cpp
@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& stream, const TensorOptions& options) {
  } else {
    stream << "(nullopt)";
  }
-  stream << ")";
+  stream << ')';

  return stream;
 }
--- a/c10/core/impl/PyInterpreter.cpp
+++ b/c10/core/impl/PyInterpreter.cpp
@ -11,11 +11,8 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {

  void incref(PyObject* pyobj) const override {} // do nothing

-  void decref(PyObject* pyobj) const override {} // do nothing
-
-  bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const override {
-    return false;
-  }
+  void decref(PyObject* pyobj, bool has_pyobj_slot) const override {
+  } // do nothing

 #define PANIC(m)              \
  TORCH_INTERNAL_ASSERT(      \
@ -23,10 +20,6 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
      "attempted to call " #m \
      " on a Tensor with nontrivial PyObject after corresponding interpreter died")

-  size_t refcnt(PyObject* pyobj) const override {
-    PANIC(refcnt);
-  }
-
  c10::intrusive_ptr<TensorImpl> detach(const TensorImpl* self) const override {
    PANIC(detach);
  }
--- a/Show More
+++ b/Show More