[DTensor] Add pointwise ops strategy for aten.fmin, aten.fmax, aten.heaviside

2025-11-19 18:14:54 +08:00 · 2025-11-17 16:08:38 +05:30
270 changed files with 3662 additions and 10381 deletions
--- a/.ci/aarch64_linux/README.md
+++ b/.ci/aarch64_linux/README.md
@ -1,19 +0,0 @@
-# Aarch64 (ARM/Graviton) Support Scripts
-Scripts for building aarch64 PyTorch PIP Wheels. These scripts build the following wheels:
-* torch
-* torchvision
-* torchaudio
-* torchtext
-* torchdata
-## Aarch64_ci_build.sh
-This script is design to support CD operations within PyPi manylinux aarch64 container, and be executed in the container. It prepares the container and then executes __aarch64_wheel_ci_build.py__ to build the wheels. The script "assumes" the PyTorch repo is located at: ```/pytorch``` and will put the wheels into ```/artifacts```.
-### Usage
-```DESIRED_PYTHON=<PythonVersion> aarch64_ci_build.sh```
-
-__NOTE:__ CI build is currently __EXPERMINTAL__
-
-## Build_aarch64_wheel.py
-This app allows a person to build using AWS EC3 resources and requires AWS-CLI and Boto3 with AWS credentials to support building EC2 instances for the wheel builds. Can be used in a codebuild CD or from a local system.
-
-### Usage
-```build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>```
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -1,53 +0,0 @@
-#!/bin/bash
-set -eux -o pipefail
-
-GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
-
-# Set CUDA architecture lists to match x86 build_cuda.sh
-if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0"
-elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
-elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
-elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
-fi
-
-# Compress the fatbin with -compress-mode=size for CUDA 13
-if [[ "$DESIRED_CUDA" == *"13"* ]]; then
-    export TORCH_NVCC_FLAGS="-compress-mode=size"
-    # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
-    export BUILD_BUNDLE_PTXAS=1
-fi
-
-SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
-source $SCRIPTPATH/aarch64_ci_setup.sh
-
-###############################################################################
-# Run aarch64 builder python
-###############################################################################
-cd /
-# adding safe directory for git as the permissions will be
-# on the mounted pytorch repo
-git config --global --add safe.directory /pytorch
-pip install -r /pytorch/requirements.txt
-pip install auditwheel==6.2.0 wheel
-if [ "$DESIRED_CUDA" = "cpu" ]; then
-    echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
-    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
-else
-    echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
-    export USE_SYSTEM_NCCL=1
-
-    # Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic)
-    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
-        echo "Bundling CUDA libraries with wheel for aarch64."
-    else
-        echo "Using nvidia libs from pypi for aarch64."
-        echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
-        export USE_NVIDIA_PYPI_LIBS=1
-    fi
-
-    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
-fi
--- a/.ci/aarch64_linux/aarch64_ci_setup.sh
+++ b/.ci/aarch64_linux/aarch64_ci_setup.sh
@ -1,21 +0,0 @@
-#!/bin/bash
-set -eux -o pipefail
-
-# This script is used to prepare the Docker container for aarch64_ci_wheel_build.py python script
-# By creating symlinks from desired /opt/python to /usr/local/bin/
-
-NUMPY_VERSION=2.0.2
-if [[ "$DESIRED_PYTHON"  == "3.13" || "$DESIRED_PYTHON" == "3.13t" ]]; then
-    NUMPY_VERSION=2.1.2
-fi
-
-SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
-source $SCRIPTPATH/../manywheel/set_desired_python.sh
-
-pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2
-
-for tool in python python3 pip pip3 ninja scons patchelf; do
-    ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
-done
-
-python --version
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -1,333 +0,0 @@
-#!/usr/bin/env python3
-# encoding: UTF-8
-
-import os
-import shutil
-from subprocess import check_call, check_output
-
-
-def list_dir(path: str) -> list[str]:
-    """'
-    Helper for getting paths for Python
-    """
-    return check_output(["ls", "-1", path]).decode().split("\n")
-
-
-def replace_tag(filename) -> None:
-    with open(filename) as f:
-        lines = f.readlines()
-    for i, line in enumerate(lines):
-        if line.startswith("Tag:"):
-            lines[i] = line.replace("-linux_", "-manylinux_2_28_")
-            print(f"Updated tag from {line} to {lines[i]}")
-            break
-
-    with open(filename, "w") as f:
-        f.writelines(lines)
-
-
-def patch_library_rpath(
-    folder: str,
-    lib_name: str,
-    use_nvidia_pypi_libs: bool = False,
-    desired_cuda: str = "",
-) -> None:
-    """Apply patchelf to set RPATH for a library in torch/lib"""
-    lib_path = f"{folder}/tmp/torch/lib/{lib_name}"
-
-    if use_nvidia_pypi_libs:
-        # For PyPI NVIDIA libraries, construct CUDA RPATH
-        cuda_rpaths = [
-            "$ORIGIN/../../nvidia/cudnn/lib",
-            "$ORIGIN/../../nvidia/nvshmem/lib",
-            "$ORIGIN/../../nvidia/nccl/lib",
-            "$ORIGIN/../../nvidia/cusparselt/lib",
-        ]
-
-        if "130" in desired_cuda:
-            cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib")
-        else:
-            cuda_rpaths.extend(
-                [
-                    "$ORIGIN/../../nvidia/cublas/lib",
-                    "$ORIGIN/../../nvidia/cuda_cupti/lib",
-                    "$ORIGIN/../../nvidia/cuda_nvrtc/lib",
-                    "$ORIGIN/../../nvidia/cuda_runtime/lib",
-                    "$ORIGIN/../../nvidia/cufft/lib",
-                    "$ORIGIN/../../nvidia/curand/lib",
-                    "$ORIGIN/../../nvidia/cusolver/lib",
-                    "$ORIGIN/../../nvidia/cusparse/lib",
-                    "$ORIGIN/../../nvidia/nvtx/lib",
-                    "$ORIGIN/../../nvidia/cufile/lib",
-                ]
-            )
-
-        # Add $ORIGIN for local torch libs
-        rpath = ":".join(cuda_rpaths) + ":$ORIGIN"
-    else:
-        # For bundled libraries, just use $ORIGIN
-        rpath = "$ORIGIN"
-
-    if os.path.exists(lib_path):
-        os.system(
-            f"cd {folder}/tmp/torch/lib/; "
-            f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}"
-        )
-
-
-def copy_and_patch_library(
-    src_path: str,
-    folder: str,
-    use_nvidia_pypi_libs: bool = False,
-    desired_cuda: str = "",
-) -> None:
-    """Copy a library to torch/lib and patch its RPATH"""
-    if os.path.exists(src_path):
-        lib_name = os.path.basename(src_path)
-        shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}")
-        patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
-
-
-def package_cuda_wheel(wheel_path, desired_cuda) -> None:
-    """
-    Package the cuda wheel libraries
-    """
-    folder = os.path.dirname(wheel_path)
-    os.mkdir(f"{folder}/tmp")
-    os.system(f"unzip {wheel_path} -d {folder}/tmp")
-    # Delete original wheel since it will be repackaged
-    os.system(f"rm {wheel_path}")
-
-    # Check if we should use PyPI NVIDIA libraries or bundle system libraries
-    use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
-
-    if use_nvidia_pypi_libs:
-        print("Using nvidia libs from pypi - skipping CUDA library bundling")
-        # For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages
-        # We only need to bundle non-NVIDIA libraries
-        minimal_libs_to_copy = [
-            "/lib64/libgomp.so.1",
-            "/usr/lib64/libgfortran.so.5",
-            "/acl/build/libarm_compute.so",
-            "/acl/build/libarm_compute_graph.so",
-            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_lapack_core.so.0",
-            "/usr/local/lib/libnvpl_blas_core.so.0",
-        ]
-
-        # Copy minimal libraries to unzipped_folder/torch/lib
-        for lib_path in minimal_libs_to_copy:
-            copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
-
-        # Patch torch libraries used for searching libraries
-        torch_libs_to_patch = [
-            "libtorch.so",
-            "libtorch_cpu.so",
-            "libtorch_cuda.so",
-            "libtorch_cuda_linalg.so",
-            "libtorch_global_deps.so",
-            "libtorch_python.so",
-            "libtorch_nvshmem.so",
-            "libc10.so",
-            "libc10_cuda.so",
-            "libcaffe2_nvrtc.so",
-            "libshm.so",
-        ]
-        for lib_name in torch_libs_to_patch:
-            patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
-    else:
-        print("Bundling CUDA libraries with wheel")
-        # Original logic for bundling system CUDA libraries
-        # Common libraries for all CUDA versions
-        common_libs = [
-            # Non-NVIDIA system libraries
-            "/lib64/libgomp.so.1",
-            "/usr/lib64/libgfortran.so.5",
-            "/acl/build/libarm_compute.so",
-            "/acl/build/libarm_compute_graph.so",
-            # Common CUDA libraries (same for all versions)
-            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_lapack_core.so.0",
-            "/usr/local/lib/libnvpl_blas_core.so.0",
-            "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
-            "/usr/local/cuda/lib64/libcudnn.so.9",
-            "/usr/local/cuda/lib64/libcusparseLt.so.0",
-            "/usr/local/cuda/lib64/libcurand.so.10",
-            "/usr/local/cuda/lib64/libnccl.so.2",
-            "/usr/local/cuda/lib64/libnvshmem_host.so.3",
-            "/usr/local/cuda/lib64/libcudnn_adv.so.9",
-            "/usr/local/cuda/lib64/libcudnn_cnn.so.9",
-            "/usr/local/cuda/lib64/libcudnn_graph.so.9",
-            "/usr/local/cuda/lib64/libcudnn_ops.so.9",
-            "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
-            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
-            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
-            "/usr/local/cuda/lib64/libcufile.so.0",
-            "/usr/local/cuda/lib64/libcufile_rdma.so.1",
-            "/usr/local/cuda/lib64/libcusparse.so.12",
-        ]
-
-        # CUDA version-specific libraries
-        if "13" in desired_cuda:
-            minor_version = desired_cuda[-1]
-            version_specific_libs = [
-                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
-                "/usr/local/cuda/lib64/libcublas.so.13",
-                "/usr/local/cuda/lib64/libcublasLt.so.13",
-                "/usr/local/cuda/lib64/libcudart.so.13",
-                "/usr/local/cuda/lib64/libcufft.so.12",
-                "/usr/local/cuda/lib64/libcusolver.so.12",
-                "/usr/local/cuda/lib64/libnvJitLink.so.13",
-                "/usr/local/cuda/lib64/libnvrtc.so.13",
-                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
-            ]
-        elif "12" in desired_cuda:
-            # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
-            minor_version = desired_cuda[-1]
-            version_specific_libs = [
-                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
-                "/usr/local/cuda/lib64/libcublas.so.12",
-                "/usr/local/cuda/lib64/libcublasLt.so.12",
-                "/usr/local/cuda/lib64/libcudart.so.12",
-                "/usr/local/cuda/lib64/libcufft.so.11",
-                "/usr/local/cuda/lib64/libcusolver.so.11",
-                "/usr/local/cuda/lib64/libnvJitLink.so.12",
-                "/usr/local/cuda/lib64/libnvrtc.so.12",
-                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
-            ]
-        else:
-            raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")
-
-        # Combine all libraries
-        libs_to_copy = common_libs + version_specific_libs
-
-        # Copy libraries to unzipped_folder/torch/lib
-        for lib_path in libs_to_copy:
-            copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
-
-    # Make sure the wheel is tagged with manylinux_2_28
-    for f in os.scandir(f"{folder}/tmp/"):
-        if f.is_dir() and f.name.endswith(".dist-info"):
-            replace_tag(f"{f.path}/WHEEL")
-            break
-
-    os.system(f"wheel pack {folder}/tmp/ -d {folder}")
-    os.system(f"rm -rf {folder}/tmp/")
-
-
-def complete_wheel(folder: str) -> str:
-    """
-    Complete wheel build and put in artifact location
-    """
-    wheel_name = list_dir(f"/{folder}/dist")[0]
-
-    # Please note for cuda we don't run auditwheel since we use custom script to package
-    # the cuda dependencies to the wheel file using update_wheel() method.
-    # However we need to make sure filename reflects the correct Manylinux platform.
-    if "pytorch" in folder and not enable_cuda:
-        print("Repairing Wheel with AuditWheel")
-        check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
-        repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0]
-
-        print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
-        os.rename(
-            f"/{folder}/wheelhouse/{repaired_wheel_name}",
-            f"/{folder}/dist/{repaired_wheel_name}",
-        )
-    else:
-        repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
-
-    print(f"Copying {repaired_wheel_name} to artifacts")
-    shutil.copy2(
-        f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}"
-    )
-
-    return repaired_wheel_name
-
-
-def parse_arguments():
-    """
-    Parse inline arguments
-    """
-    from argparse import ArgumentParser
-
-    parser = ArgumentParser("AARCH64 wheels python CD")
-    parser.add_argument("--debug", action="store_true")
-    parser.add_argument("--build-only", action="store_true")
-    parser.add_argument("--test-only", type=str)
-    parser.add_argument("--enable-mkldnn", action="store_true")
-    parser.add_argument("--enable-cuda", action="store_true")
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    """
-    Entry Point
-    """
-    args = parse_arguments()
-    enable_mkldnn = args.enable_mkldnn
-    enable_cuda = args.enable_cuda
-    branch = check_output(
-        ["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd="/pytorch"
-    ).decode()
-
-    print("Building PyTorch wheel")
-    build_vars = ""
-    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
-    if enable_cuda:
-        build_vars += "MAX_JOBS=5 "
-
-        # Handle PyPI NVIDIA libraries vs bundled libraries
-        use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
-        if use_nvidia_pypi_libs:
-            print("Configuring build for PyPI NVIDIA libraries")
-            # Configure for dynamic linking (matching x86 logic)
-            build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 "
-        else:
-            print("Configuring build for bundled NVIDIA libraries")
-            # Keep existing static linking approach - already configured above
-
-    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
-    desired_cuda = os.getenv("DESIRED_CUDA")
-    if override_package_version is not None:
-        version = override_package_version
-        build_vars += (
-            f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
-        )
-    elif branch in ["nightly", "main"]:
-        build_date = (
-            check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
-            .decode()
-            .replace("-", "")
-        )
-        version = (
-            check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
-        )
-        if enable_cuda:
-            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
-        else:
-            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
-    elif branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
-
-    if enable_mkldnn:
-        print("build pytorch with mkldnn+acl backend")
-        build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
-        build_vars += "ACL_ROOT_DIR=/acl "
-        if enable_cuda:
-            build_vars += "BLAS=NVPL "
-        else:
-            build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
-    else:
-        print("build pytorch without mkldnn backend")
-
-    os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
-    if enable_cuda:
-        print("Updating Cuda Dependency")
-        filename = os.listdir("/pytorch/dist/")
-        wheel_path = f"/pytorch/dist/{filename[0]}"
-        package_cuda_wheel(wheel_path, desired_cuda)
-    pytorch_wheel_name = complete_wheel("/pytorch/")
-    print(f"Build Complete. Created {pytorch_wheel_name}..")
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
@ -1,999 +0,0 @@
-#!/usr/bin/env python3
-
-# This script is for building  AARCH64 wheels using AWS EC2 instances.
-# To generate binaries for the release follow these steps:
-# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this:
-#         "v1.11.0": ("0.11.0", "rc1"),
-# 2. Run script with following arguments for each of the supported python versions and required tag, for example:
-# build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch v1.11.0-rc3
-
-
-import os
-import subprocess
-import sys
-import time
-from typing import Optional, Union
-
-import boto3
-
-
-# AMI images for us-east-1, change the following based on your ~/.aws/config
-os_amis = {
-    "ubuntu20_04": "ami-052eac90edaa9d08f",  # login_name: ubuntu
-    "ubuntu22_04": "ami-0c6c29c5125214c77",  # login_name: ubuntu
-    "redhat8": "ami-0698b90665a2ddcf1",  # login_name: ec2-user
-}
-
-ubuntu20_04_ami = os_amis["ubuntu20_04"]
-
-
-def compute_keyfile_path(key_name: Optional[str] = None) -> tuple[str, str]:
-    if key_name is None:
-        key_name = os.getenv("AWS_KEY_NAME")
-        if key_name is None:
-            return os.getenv("SSH_KEY_PATH", ""), ""
-
-    homedir_path = os.path.expanduser("~")
-    default_path = os.path.join(homedir_path, ".ssh", f"{key_name}.pem")
-    return os.getenv("SSH_KEY_PATH", default_path), key_name
-
-
-ec2 = boto3.resource("ec2")
-
-
-def ec2_get_instances(filter_name, filter_value):
-    return ec2.instances.filter(
-        Filters=[{"Name": filter_name, "Values": [filter_value]}]
-    )
-
-
-def ec2_instances_of_type(instance_type="t4g.2xlarge"):
-    return ec2_get_instances("instance-type", instance_type)
-
-
-def ec2_instances_by_id(instance_id):
-    rc = list(ec2_get_instances("instance-id", instance_id))
-    return rc[0] if len(rc) > 0 else None
-
-
-def start_instance(
-    key_name, ami=ubuntu20_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50
-):
-    inst = ec2.create_instances(
-        ImageId=ami,
-        InstanceType=instance_type,
-        SecurityGroups=["ssh-allworld"],
-        KeyName=key_name,
-        MinCount=1,
-        MaxCount=1,
-        BlockDeviceMappings=[
-            {
-                "DeviceName": "/dev/sda1",
-                "Ebs": {
-                    "DeleteOnTermination": True,
-                    "VolumeSize": ebs_size,
-                    "VolumeType": "standard",
-                },
-            }
-        ],
-    )[0]
-    print(f"Create instance {inst.id}")
-    inst.wait_until_running()
-    running_inst = ec2_instances_by_id(inst.id)
-    print(f"Instance started at {running_inst.public_dns_name}")
-    return running_inst
-
-
-class RemoteHost:
-    addr: str
-    keyfile_path: str
-    login_name: str
-    container_id: Optional[str] = None
-    ami: Optional[str] = None
-
-    def __init__(self, addr: str, keyfile_path: str, login_name: str = "ubuntu"):
-        self.addr = addr
-        self.keyfile_path = keyfile_path
-        self.login_name = login_name
-
-    def _gen_ssh_prefix(self) -> list[str]:
-        return [
-            "ssh",
-            "-o",
-            "StrictHostKeyChecking=no",
-            "-i",
-            self.keyfile_path,
-            f"{self.login_name}@{self.addr}",
-            "--",
-        ]
-
-    @staticmethod
-    def _split_cmd(args: Union[str, list[str]]) -> list[str]:
-        return args.split() if isinstance(args, str) else args
-
-    def run_ssh_cmd(self, args: Union[str, list[str]]) -> None:
-        subprocess.check_call(self._gen_ssh_prefix() + self._split_cmd(args))
-
-    def check_ssh_output(self, args: Union[str, list[str]]) -> str:
-        return subprocess.check_output(
-            self._gen_ssh_prefix() + self._split_cmd(args)
-        ).decode("utf-8")
-
-    def scp_upload_file(self, local_file: str, remote_file: str) -> None:
-        subprocess.check_call(
-            [
-                "scp",
-                "-i",
-                self.keyfile_path,
-                local_file,
-                f"{self.login_name}@{self.addr}:{remote_file}",
-            ]
-        )
-
-    def scp_download_file(
-        self, remote_file: str, local_file: Optional[str] = None
-    ) -> None:
-        if local_file is None:
-            local_file = "."
-        subprocess.check_call(
-            [
-                "scp",
-                "-i",
-                self.keyfile_path,
-                f"{self.login_name}@{self.addr}:{remote_file}",
-                local_file,
-            ]
-        )
-
-    def start_docker(self, image="quay.io/pypa/manylinux2014_aarch64:latest") -> None:
-        self.run_ssh_cmd("sudo apt-get install -y docker.io")
-        self.run_ssh_cmd(f"sudo usermod -a -G docker {self.login_name}")
-        self.run_ssh_cmd("sudo service docker start")
-        self.run_ssh_cmd(f"docker pull {image}")
-        self.container_id = self.check_ssh_output(
-            f"docker run -t -d -w /root {image}"
-        ).strip()
-
-    def using_docker(self) -> bool:
-        return self.container_id is not None
-
-    def run_cmd(self, args: Union[str, list[str]]) -> None:
-        if not self.using_docker():
-            return self.run_ssh_cmd(args)
-        assert self.container_id is not None
-        docker_cmd = self._gen_ssh_prefix() + [
-            "docker",
-            "exec",
-            "-i",
-            self.container_id,
-            "bash",
-        ]
-        p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE)
-        p.communicate(
-            input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
-                "utf-8"
-            )
-        )
-        rc = p.wait()
-        if rc != 0:
-            raise subprocess.CalledProcessError(rc, docker_cmd)
-
-    def check_output(self, args: Union[str, list[str]]) -> str:
-        if not self.using_docker():
-            return self.check_ssh_output(args)
-        assert self.container_id is not None
-        docker_cmd = self._gen_ssh_prefix() + [
-            "docker",
-            "exec",
-            "-i",
-            self.container_id,
-            "bash",
-        ]
-        p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-        (out, err) = p.communicate(
-            input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
-                "utf-8"
-            )
-        )
-        rc = p.wait()
-        if rc != 0:
-            raise subprocess.CalledProcessError(rc, docker_cmd, output=out, stderr=err)
-        return out.decode("utf-8")
-
-    def upload_file(self, local_file: str, remote_file: str) -> None:
-        if not self.using_docker():
-            return self.scp_upload_file(local_file, remote_file)
-        tmp_file = os.path.join("/tmp", os.path.basename(local_file))
-        self.scp_upload_file(local_file, tmp_file)
-        self.run_ssh_cmd(
-            ["docker", "cp", tmp_file, f"{self.container_id}:/root/{remote_file}"]
-        )
-        self.run_ssh_cmd(["rm", tmp_file])
-
-    def download_file(self, remote_file: str, local_file: Optional[str] = None) -> None:
-        if not self.using_docker():
-            return self.scp_download_file(remote_file, local_file)
-        tmp_file = os.path.join("/tmp", os.path.basename(remote_file))
-        self.run_ssh_cmd(
-            ["docker", "cp", f"{self.container_id}:/root/{remote_file}", tmp_file]
-        )
-        self.scp_download_file(tmp_file, local_file)
-        self.run_ssh_cmd(["rm", tmp_file])
-
-    def download_wheel(
-        self, remote_file: str, local_file: Optional[str] = None
-    ) -> None:
-        if self.using_docker() and local_file is None:
-            basename = os.path.basename(remote_file)
-            local_file = basename.replace(
-                "-linux_aarch64.whl", "-manylinux2014_aarch64.whl"
-            )
-        self.download_file(remote_file, local_file)
-
-    def list_dir(self, path: str) -> list[str]:
-        return self.check_output(["ls", "-1", path]).split("\n")
-
-
-def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
-    import socket
-
-    for i in range(attempt_cnt):
-        try:
-            with socket.create_connection((addr, port), timeout=timeout):
-                return
-        except (ConnectionRefusedError, TimeoutError):  # noqa: PERF203
-            if i == attempt_cnt - 1:
-                raise
-            time.sleep(timeout)
-
-
-def update_apt_repo(host: RemoteHost) -> None:
-    time.sleep(5)
-    host.run_cmd("sudo systemctl stop apt-daily.service || true")
-    host.run_cmd("sudo systemctl stop unattended-upgrades.service || true")
-    host.run_cmd(
-        "while systemctl is-active --quiet apt-daily.service; do sleep 1; done"
-    )
-    host.run_cmd(
-        "while systemctl is-active --quiet unattended-upgrades.service; do sleep 1; done"
-    )
-    host.run_cmd("sudo apt-get update")
-    time.sleep(3)
-    host.run_cmd("sudo apt-get update")
-
-
-def install_condaforge(
-    host: RemoteHost, suffix: str = "latest/download/Miniforge3-Linux-aarch64.sh"
-) -> None:
-    print("Install conda-forge")
-    host.run_cmd(f"curl -OL https://github.com/conda-forge/miniforge/releases/{suffix}")
-    host.run_cmd(f"sh -f {os.path.basename(suffix)} -b")
-    host.run_cmd(f"rm -f {os.path.basename(suffix)}")
-    if host.using_docker():
-        host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc")
-    else:
-        host.run_cmd(
-            [
-                "sed",
-                "-i",
-                "'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH'",
-                ".bashrc",
-            ]
-        )
-
-
-def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
-    if python_version == "3.6":
-        # Python-3.6 EOLed and not compatible with conda-4.11
-        install_condaforge(
-            host, suffix="download/4.10.3-10/Miniforge3-4.10.3-10-Linux-aarch64.sh"
-        )
-        host.run_cmd(f"conda install -y python={python_version} numpy pyyaml")
-    else:
-        install_condaforge(
-            host, suffix="download/4.11.0-4/Miniforge3-4.11.0-4-Linux-aarch64.sh"
-        )
-        # Pytorch-1.10 or older are not compatible with setuptools=59.6 or newer
-        host.run_cmd(
-            f"conda install -y python={python_version} numpy pyyaml setuptools>=59.5.0"
-        )
-
-
-def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
-    host.run_cmd("pip3 install auditwheel")
-    host.run_cmd(
-        "conda install -y patchelf" if use_conda else "sudo apt-get install -y patchelf"
-    )
-    from tempfile import NamedTemporaryFile
-
-    with NamedTemporaryFile() as tmp:
-        tmp.write(embed_library_script.encode("utf-8"))
-        tmp.flush()
-        host.upload_file(tmp.name, "embed_library.py")
-
-    print("Embedding libgomp into wheel")
-    if host.using_docker():
-        host.run_cmd(f"python3 embed_library.py {wheel_name} --update-tag")
-    else:
-        host.run_cmd(f"python3 embed_library.py {wheel_name}")
-
-
-def checkout_repo(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    url: str,
-    git_clone_flags: str,
-    mapping: dict[str, tuple[str, str]],
-) -> Optional[str]:
-    for prefix in mapping:
-        if not branch.startswith(prefix):
-            continue
-        tag = f"v{mapping[prefix][0]}-{mapping[prefix][1]}"
-        host.run_cmd(f"git clone {url} -b {tag} {git_clone_flags}")
-        return mapping[prefix][0]
-
-    host.run_cmd(f"git clone {url} -b {branch} {git_clone_flags}")
-    return None
-
-
-def build_torchvision(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str,
-    run_smoke_tests: bool = True,
-) -> str:
-    print("Checking out TorchVision repo")
-    build_version = checkout_repo(
-        host,
-        branch=branch,
-        url="https://github.com/pytorch/vision",
-        git_clone_flags=git_clone_flags,
-        mapping={
-            "v1.7.1": ("0.8.2", "rc2"),
-            "v1.8.0": ("0.9.0", "rc3"),
-            "v1.8.1": ("0.9.1", "rc1"),
-            "v1.9.0": ("0.10.0", "rc1"),
-            "v1.10.0": ("0.11.1", "rc1"),
-            "v1.10.1": ("0.11.2", "rc1"),
-            "v1.10.2": ("0.11.3", "rc1"),
-            "v1.11.0": ("0.12.0", "rc1"),
-            "v1.12.0": ("0.13.0", "rc4"),
-            "v1.12.1": ("0.13.1", "rc6"),
-            "v1.13.0": ("0.14.0", "rc4"),
-            "v1.13.1": ("0.14.1", "rc2"),
-            "v2.0.0": ("0.15.1", "rc2"),
-            "v2.0.1": ("0.15.2", "rc2"),
-        },
-    )
-    print("Building TorchVision wheel")
-
-    # Please note libnpg and jpeg are required to build image.so extension
-    if use_conda:
-        host.run_cmd("conda install -y libpng jpeg")
-        # Remove .so files to force static linking
-        host.run_cmd(
-            "rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so"
-        )
-        # And patch setup.py to include libz dependency for libpng
-        host.run_cmd(
-            [
-                'sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py'
-            ]
-        )
-
-    build_vars = ""
-    if branch == "nightly":
-        version = host.check_output(
-            ["if [ -f vision/version.txt ]; then cat vision/version.txt; fi"]
-        ).strip()
-        if len(version) == 0:
-            # In older revisions, version was embedded in setup.py
-            version = (
-                host.check_output(["grep", '"version = \'"', "vision/setup.py"])
-                .strip()
-                .split("'")[1][:-2]
-            )
-        build_date = (
-            host.check_output("cd vision && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
-    elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-
-    host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
-    vision_wheel_name = host.list_dir("vision/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))
-
-    print("Copying TorchVision wheel")
-    host.download_wheel(os.path.join("vision", "dist", vision_wheel_name))
-    if run_smoke_tests:
-        host.run_cmd(
-            f"pip3 install {os.path.join('vision', 'dist', vision_wheel_name)}"
-        )
-        host.run_cmd("python3 vision/test/smoke_test.py")
-    print("Delete vision checkout")
-    host.run_cmd("rm -rf vision")
-
-    return vision_wheel_name
-
-
-def build_torchdata(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str = "",
-) -> str:
-    print("Checking out TorchData repo")
-    git_clone_flags += " --recurse-submodules"
-    build_version = checkout_repo(
-        host,
-        branch=branch,
-        url="https://github.com/pytorch/data",
-        git_clone_flags=git_clone_flags,
-        mapping={
-            "v1.13.1": ("0.5.1", ""),
-            "v2.0.0": ("0.6.0", "rc5"),
-            "v2.0.1": ("0.6.1", "rc1"),
-        },
-    )
-    print("Building TorchData wheel")
-    build_vars = ""
-    if branch == "nightly":
-        version = host.check_output(
-            ["if [ -f data/version.txt ]; then cat data/version.txt; fi"]
-        ).strip()
-        build_date = (
-            host.check_output("cd data && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
-    elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-
-    host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")
-    wheel_name = host.list_dir("data/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))
-
-    print("Copying TorchData wheel")
-    host.download_wheel(os.path.join("data", "dist", wheel_name))
-
-    return wheel_name
-
-
-def build_torchtext(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str = "",
-) -> str:
-    print("Checking out TorchText repo")
-    git_clone_flags += " --recurse-submodules"
-    build_version = checkout_repo(
-        host,
-        branch=branch,
-        url="https://github.com/pytorch/text",
-        git_clone_flags=git_clone_flags,
-        mapping={
-            "v1.9.0": ("0.10.0", "rc1"),
-            "v1.10.0": ("0.11.0", "rc2"),
-            "v1.10.1": ("0.11.1", "rc1"),
-            "v1.10.2": ("0.11.2", "rc1"),
-            "v1.11.0": ("0.12.0", "rc1"),
-            "v1.12.0": ("0.13.0", "rc2"),
-            "v1.12.1": ("0.13.1", "rc5"),
-            "v1.13.0": ("0.14.0", "rc3"),
-            "v1.13.1": ("0.14.1", "rc1"),
-            "v2.0.0": ("0.15.1", "rc2"),
-            "v2.0.1": ("0.15.2", "rc2"),
-        },
-    )
-    print("Building TorchText wheel")
-    build_vars = ""
-    if branch == "nightly":
-        version = host.check_output(
-            ["if [ -f text/version.txt ]; then cat text/version.txt; fi"]
-        ).strip()
-        build_date = (
-            host.check_output("cd text && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
-    elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-
-    host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")
-    wheel_name = host.list_dir("text/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))
-
-    print("Copying TorchText wheel")
-    host.download_wheel(os.path.join("text", "dist", wheel_name))
-
-    return wheel_name
-
-
-def build_torchaudio(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str = "",
-) -> str:
-    print("Checking out TorchAudio repo")
-    git_clone_flags += " --recurse-submodules"
-    build_version = checkout_repo(
-        host,
-        branch=branch,
-        url="https://github.com/pytorch/audio",
-        git_clone_flags=git_clone_flags,
-        mapping={
-            "v1.9.0": ("0.9.0", "rc2"),
-            "v1.10.0": ("0.10.0", "rc5"),
-            "v1.10.1": ("0.10.1", "rc1"),
-            "v1.10.2": ("0.10.2", "rc1"),
-            "v1.11.0": ("0.11.0", "rc1"),
-            "v1.12.0": ("0.12.0", "rc3"),
-            "v1.12.1": ("0.12.1", "rc5"),
-            "v1.13.0": ("0.13.0", "rc4"),
-            "v1.13.1": ("0.13.1", "rc2"),
-            "v2.0.0": ("2.0.1", "rc3"),
-            "v2.0.1": ("2.0.2", "rc2"),
-        },
-    )
-    print("Building TorchAudio wheel")
-    build_vars = ""
-    if branch == "nightly":
-        version = (
-            host.check_output(["grep", '"version = \'"', "audio/setup.py"])
-            .strip()
-            .split("'")[1][:-2]
-        )
-        build_date = (
-            host.check_output("cd audio && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
-    elif build_version is not None:
-        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-
-    host.run_cmd(
-        f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
-        && ./packaging/ffmpeg/build.sh \
-        && {build_vars} python3 -m build --wheel --no-isolation"
-    )
-
-    wheel_name = host.list_dir("audio/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("audio", "dist", wheel_name))
-
-    print("Copying TorchAudio wheel")
-    host.download_wheel(os.path.join("audio", "dist", wheel_name))
-
-    return wheel_name
-
-
-def configure_system(
-    host: RemoteHost,
-    *,
-    compiler: str = "gcc-8",
-    use_conda: bool = True,
-    python_version: str = "3.8",
-) -> None:
-    if use_conda:
-        install_condaforge_python(host, python_version)
-
-    print("Configuring the system")
-    if not host.using_docker():
-        update_apt_repo(host)
-        host.run_cmd("sudo apt-get install -y ninja-build g++ git cmake gfortran unzip")
-    else:
-        host.run_cmd("yum install -y sudo")
-        host.run_cmd("conda install -y ninja scons")
-
-    if not use_conda:
-        host.run_cmd(
-            "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
-        )
-    host.run_cmd("pip3 install dataclasses typing-extensions")
-    if not use_conda:
-        print("Installing Cython + numpy from PyPy")
-        host.run_cmd("sudo pip3 install Cython")
-        host.run_cmd("sudo pip3 install numpy")
-
-
-def build_domains(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    use_conda: bool = True,
-    git_clone_flags: str = "",
-) -> tuple[str, str, str, str]:
-    vision_wheel_name = build_torchvision(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-    audio_wheel_name = build_torchaudio(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-    data_wheel_name = build_torchdata(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-    text_wheel_name = build_torchtext(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-    return (vision_wheel_name, audio_wheel_name, data_wheel_name, text_wheel_name)
-
-
-def start_build(
-    host: RemoteHost,
-    *,
-    branch: str = "main",
-    compiler: str = "gcc-8",
-    use_conda: bool = True,
-    python_version: str = "3.8",
-    pytorch_only: bool = False,
-    pytorch_build_number: Optional[str] = None,
-    shallow_clone: bool = True,
-    enable_mkldnn: bool = False,
-) -> tuple[str, str, str, str, str]:
-    git_clone_flags = " --depth 1 --shallow-submodules" if shallow_clone else ""
-    if host.using_docker() and not use_conda:
-        print("Auto-selecting conda option for docker images")
-        use_conda = True
-    if not host.using_docker():
-        print("Disable mkldnn for host builds")
-        enable_mkldnn = False
-
-    configure_system(
-        host, compiler=compiler, use_conda=use_conda, python_version=python_version
-    )
-
-    if host.using_docker():
-        print("Move libgfortant.a into a standard location")
-        # HACK: pypa gforntran.a is compiled without PIC, which leads to the following error
-        # libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17'  # noqa: E501, B950
-        # Workaround by copying gfortran library from the host
-        host.run_ssh_cmd("sudo apt-get install -y gfortran-8")
-        host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8")
-        host.run_ssh_cmd(
-            [
-                "docker",
-                "cp",
-                "/usr/lib/gcc/aarch64-linux-gnu/8/libgfortran.a",
-                f"{host.container_id}:/opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/",
-            ]
-        )
-
-    print("Checking out PyTorch repo")
-    host.run_cmd(
-        f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
-    )
-
-    host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")
-
-    print("Building PyTorch wheel")
-    build_opts = ""
-    if pytorch_build_number is not None:
-        build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
-    # Breakpad build fails on aarch64
-    build_vars = "USE_BREAKPAD=0 "
-    if branch == "nightly":
-        build_date = (
-            host.check_output("cd pytorch && git log --pretty=format:%s -1")
-            .strip()
-            .split()[0]
-            .replace("-", "")
-        )
-        version = host.check_output("cat pytorch/version.txt").strip()[:-2]
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
-    if branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
-    if host.using_docker():
-        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
-    if enable_mkldnn:
-        host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
-        print("build pytorch with mkldnn+acl backend")
-        build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
-        build_vars += " BLAS=OpenBLAS"
-        build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
-        build_vars += " ACL_ROOT_DIR=/acl"
-        host.run_cmd(
-            f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
-        )
-        print("Repair the wheel")
-        pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
-        ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
-        host.run_cmd(
-            f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
-        )
-        print("replace the original wheel with the repaired one")
-        pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0]
-        host.run_cmd(
-            f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}"
-        )
-    else:
-        print("build pytorch without mkldnn backend")
-        host.run_cmd(
-            f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
-        )
-
-    print("Deleting build folder")
-    host.run_cmd("cd pytorch && rm -rf build")
-    pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
-    embed_libgomp(host, use_conda, os.path.join("pytorch", "dist", pytorch_wheel_name))
-    print("Copying the wheel")
-    host.download_wheel(os.path.join("pytorch", "dist", pytorch_wheel_name))
-
-    print("Installing PyTorch wheel")
-    host.run_cmd(f"pip3 install pytorch/dist/{pytorch_wheel_name}")
-
-    if pytorch_only:
-        return (pytorch_wheel_name, None, None, None, None)
-    domain_wheels = build_domains(
-        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
-    )
-
-    return (pytorch_wheel_name, *domain_wheels)
-
-
-embed_library_script = """
-#!/usr/bin/env python3
-
-from auditwheel.patcher import Patchelf
-from auditwheel.wheeltools import InWheelCtx
-from auditwheel.elfutils import elf_file_filter
-from auditwheel.repair import copylib
-from auditwheel.lddtree import lddtree
-from subprocess import check_call
-import os
-import shutil
-import sys
-from tempfile import TemporaryDirectory
-
-
-def replace_tag(filename):
-   with open(filename, 'r') as f:
-     lines = f.read().split("\\n")
-   for i,line in enumerate(lines):
-       if not line.startswith("Tag: "):
-           continue
-       lines[i] = line.replace("-linux_", "-manylinux2014_")
-       print(f'Updated tag from {line} to {lines[i]}')
-
-   with open(filename, 'w') as f:
-       f.write("\\n".join(lines))
-
-
-class AlignedPatchelf(Patchelf):
-    def set_soname(self, file_name: str, new_soname: str) -> None:
-        check_call(['patchelf', '--page-size', '65536', '--set-soname', new_soname, file_name])
-
-    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
-        check_call(['patchelf', '--page-size', '65536', '--replace-needed', soname, new_soname, file_name])
-
-
-def embed_library(whl_path, lib_soname, update_tag=False):
-    patcher = AlignedPatchelf()
-    out_dir = TemporaryDirectory()
-    whl_name = os.path.basename(whl_path)
-    tmp_whl_name = os.path.join(out_dir.name, whl_name)
-    with InWheelCtx(whl_path) as ctx:
-        torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib')
-        ctx.out_wheel=tmp_whl_name
-        new_lib_path, new_lib_soname = None, None
-        for filename, elf in elf_file_filter(ctx.iter_files()):
-            if not filename.startswith('torch/lib'):
-                continue
-            libtree = lddtree(filename)
-            if lib_soname not in libtree['needed']:
-                continue
-            lib_path = libtree['libs'][lib_soname]['path']
-            if lib_path is None:
-                print(f"Can't embed {lib_soname} as it could not be found")
-                break
-            if lib_path.startswith(torchlib_path):
-                continue
-
-            if new_lib_path is None:
-                new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
-            patcher.replace_needed(filename, lib_soname, new_lib_soname)
-            print(f'Replacing {lib_soname} with {new_lib_soname} for {filename}')
-        if update_tag:
-            # Add manylinux2014 tag
-            for filename in ctx.iter_files():
-                if os.path.basename(filename) != 'WHEEL':
-                    continue
-                replace_tag(filename)
-    shutil.move(tmp_whl_name, whl_path)
-
-
-if __name__ == '__main__':
-    embed_library(sys.argv[1], 'libgomp.so.1', len(sys.argv) > 2 and sys.argv[2] == '--update-tag')
-"""
-
-
-def run_tests(host: RemoteHost, whl: str, branch="main") -> None:
-    print("Configuring the system")
-    update_apt_repo(host)
-    host.run_cmd("sudo apt-get install -y python3-pip git")
-    host.run_cmd("sudo pip3 install Cython")
-    host.run_cmd("sudo pip3 install numpy")
-    host.upload_file(whl, ".")
-    host.run_cmd(f"sudo pip3 install {whl}")
-    host.run_cmd("python3 -c 'import torch;print(torch.rand((3,3))'")
-    host.run_cmd(f"git clone -b {branch} https://github.com/pytorch/pytorch")
-    host.run_cmd("cd pytorch/test; python3 test_torch.py -v")
-
-
-def get_instance_name(instance) -> Optional[str]:
-    if instance.tags is None:
-        return None
-    for tag in instance.tags:
-        if tag["Key"] == "Name":
-            return tag["Value"]
-    return None
-
-
-def list_instances(instance_type: str) -> None:
-    print(f"All instances of type {instance_type}")
-    for instance in ec2_instances_of_type(instance_type):
-        ifaces = instance.network_interfaces
-        az = ifaces[0].subnet.availability_zone if len(ifaces) > 0 else None
-        print(
-            f"{instance.id} {get_instance_name(instance)} {instance.public_dns_name} {instance.state['Name']} {az}"
-        )
-
-
-def terminate_instances(instance_type: str) -> None:
-    print(f"Terminating all instances of type {instance_type}")
-    instances = list(ec2_instances_of_type(instance_type))
-    for instance in instances:
-        print(f"Terminating {instance.id}")
-        instance.terminate()
-    print("Waiting for termination to complete")
-    for instance in instances:
-        instance.wait_until_terminated()
-
-
-def parse_arguments():
-    from argparse import ArgumentParser
-
-    parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
-    parser.add_argument("--key-name", type=str)
-    parser.add_argument("--debug", action="store_true")
-    parser.add_argument("--build-only", action="store_true")
-    parser.add_argument("--test-only", type=str)
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument("--os", type=str, choices=list(os_amis.keys()))
-    group.add_argument("--ami", type=str)
-    parser.add_argument(
-        "--python-version",
-        type=str,
-        choices=[f"3.{d}" for d in range(6, 12)],
-        default=None,
-    )
-    parser.add_argument("--alloc-instance", action="store_true")
-    parser.add_argument("--list-instances", action="store_true")
-    parser.add_argument("--pytorch-only", action="store_true")
-    parser.add_argument("--keep-running", action="store_true")
-    parser.add_argument("--terminate-instances", action="store_true")
-    parser.add_argument("--instance-type", type=str, default="t4g.2xlarge")
-    parser.add_argument("--ebs-size", type=int, default=50)
-    parser.add_argument("--branch", type=str, default="main")
-    parser.add_argument("--use-docker", action="store_true")
-    parser.add_argument(
-        "--compiler",
-        type=str,
-        choices=["gcc-7", "gcc-8", "gcc-9", "clang"],
-        default="gcc-8",
-    )
-    parser.add_argument("--use-torch-from-pypi", action="store_true")
-    parser.add_argument("--pytorch-build-number", type=str, default=None)
-    parser.add_argument("--disable-mkldnn", action="store_true")
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_arguments()
-    ami = (
-        args.ami
-        if args.ami is not None
-        else os_amis[args.os]
-        if args.os is not None
-        else ubuntu20_04_ami
-    )
-    keyfile_path, key_name = compute_keyfile_path(args.key_name)
-
-    if args.list_instances:
-        list_instances(args.instance_type)
-        sys.exit(0)
-
-    if args.terminate_instances:
-        terminate_instances(args.instance_type)
-        sys.exit(0)
-
-    if len(key_name) == 0:
-        raise RuntimeError("""
-            Cannot start build without key_name, please specify
-            --key-name argument or AWS_KEY_NAME environment variable.""")
-    if len(keyfile_path) == 0 or not os.path.exists(keyfile_path):
-        raise RuntimeError(f"""
-            Cannot find keyfile with name: [{key_name}] in path: [{keyfile_path}], please
-            check `~/.ssh/` folder or manually set SSH_KEY_PATH environment variable.""")
-
-    # Starting the instance
-    inst = start_instance(
-        key_name, ami=ami, instance_type=args.instance_type, ebs_size=args.ebs_size
-    )
-    instance_name = f"{args.key_name}-{args.os}"
-    if args.python_version is not None:
-        instance_name += f"-py{args.python_version}"
-    inst.create_tags(
-        DryRun=False,
-        Tags=[
-            {
-                "Key": "Name",
-                "Value": instance_name,
-            }
-        ],
-    )
-    addr = inst.public_dns_name
-    wait_for_connection(addr, 22)
-    host = RemoteHost(addr, keyfile_path)
-    host.ami = ami
-    if args.use_docker:
-        update_apt_repo(host)
-        host.start_docker()
-
-    if args.test_only:
-        run_tests(host, args.test_only)
-        sys.exit(0)
-
-    if args.alloc_instance:
-        if args.python_version is None:
-            sys.exit(0)
-        install_condaforge_python(host, args.python_version)
-        sys.exit(0)
-
-    python_version = args.python_version if args.python_version is not None else "3.10"
-
-    if args.use_torch_from_pypi:
-        configure_system(host, compiler=args.compiler, python_version=python_version)
-        print("Installing PyTorch wheel")
-        host.run_cmd("pip3 install torch")
-        build_domains(
-            host, branch=args.branch, git_clone_flags=" --depth 1 --shallow-submodules"
-        )
-    else:
-        start_build(
-            host,
-            branch=args.branch,
-            compiler=args.compiler,
-            python_version=python_version,
-            pytorch_only=args.pytorch_only,
-            pytorch_build_number=args.pytorch_build_number,
-            enable_mkldnn=not args.disable_mkldnn,
-        )
-    if not args.keep_running:
-        print(f"Waiting for instance {inst.id} to terminate")
-        inst.terminate()
-        inst.wait_until_terminated()
--- a/.ci/aarch64_linux/embed_library.py
+++ b/.ci/aarch64_linux/embed_library.py
@ -1,87 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import shutil
-import sys
-from subprocess import check_call
-from tempfile import TemporaryDirectory
-
-from auditwheel.elfutils import elf_file_filter
-from auditwheel.lddtree import lddtree
-from auditwheel.patcher import Patchelf
-from auditwheel.repair import copylib
-from auditwheel.wheeltools import InWheelCtx
-
-
-def replace_tag(filename):
-    with open(filename) as f:
-        lines = f.read().split("\\n")
-    for i, line in enumerate(lines):
-        if not line.startswith("Tag: "):
-            continue
-        lines[i] = line.replace("-linux_", "-manylinux2014_")
-        print(f"Updated tag from {line} to {lines[i]}")
-
-    with open(filename, "w") as f:
-        f.write("\\n".join(lines))
-
-
-class AlignedPatchelf(Patchelf):
-    def set_soname(self, file_name: str, new_soname: str) -> None:
-        check_call(
-            ["patchelf", "--page-size", "65536", "--set-soname", new_soname, file_name]
-        )
-
-    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
-        check_call(
-            [
-                "patchelf",
-                "--page-size",
-                "65536",
-                "--replace-needed",
-                soname,
-                new_soname,
-                file_name,
-            ]
-        )
-
-
-def embed_library(whl_path, lib_soname, update_tag=False):
-    patcher = AlignedPatchelf()
-    out_dir = TemporaryDirectory()
-    whl_name = os.path.basename(whl_path)
-    tmp_whl_name = os.path.join(out_dir.name, whl_name)
-    with InWheelCtx(whl_path) as ctx:
-        torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib")
-        ctx.out_wheel = tmp_whl_name
-        new_lib_path, new_lib_soname = None, None
-        for filename, _ in elf_file_filter(ctx.iter_files()):
-            if not filename.startswith("torch/lib"):
-                continue
-            libtree = lddtree(filename)
-            if lib_soname not in libtree["needed"]:
-                continue
-            lib_path = libtree["libs"][lib_soname]["path"]
-            if lib_path is None:
-                print(f"Can't embed {lib_soname} as it could not be found")
-                break
-            if lib_path.startswith(torchlib_path):
-                continue
-
-            if new_lib_path is None:
-                new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
-            patcher.replace_needed(filename, lib_soname, new_lib_soname)
-            print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}")
-        if update_tag:
-            # Add manylinux2014 tag
-            for filename in ctx.iter_files():
-                if os.path.basename(filename) != "WHEEL":
-                    continue
-                replace_tag(filename)
-    shutil.move(tmp_whl_name, whl_path)
-
-
-if __name__ == "__main__":
-    embed_library(
-        sys.argv[1], "libgomp.so.1", len(sys.argv) > 2 and sys.argv[2] == "--update-tag"
-    )
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -188,7 +188,7 @@ case "$tag" in
    fi
    GCC_VERSION=11
    VISION=yes
-    ROCM_VERSION=7.1
+    ROCM_VERSION=7.0
    NINJA_VERSION=1.9.0
    TRITON=yes
    KATEX=yes
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -60,16 +60,14 @@ EOF
        DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
    fi

-    if [[ $(ver $ROCM_VERSION) -lt $(ver 7.1) ]]; then
-      # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5, removed in ROCm 7.1
-      # search for all unversioned packages
-      # if search fails it will abort this script; use true to avoid case where search fails
-      MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
-      if [[ "x${MIOPENHIPGFX}" = x ]]; then
-        echo "miopen-hip-gfx package not available" && exit 1
-      else
-        DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
-      fi
+    # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
+    # search for all unversioned packages
+    # if search fails it will abort this script; use true to avoid case where search fails
+    MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
+    if [[ "x${MIOPENHIPGFX}" = x ]]; then
+      echo "miopen-hip-gfx package not available" && exit 1
+    else
+      DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
    fi

    # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -12,8 +12,8 @@ function do_install() {

    rocm_version_nodot=${rocm_version//./}

-    # https://github.com/icl-utk-edu/magma/pull/65
-    MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+    # post merge of https://github.com/icl-utk-edu/magma/pull/65
+    MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
    magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"

    rocm_dir="/opt/rocm"
--- a/.ci/manywheel/build.sh
+++ b/.ci/manywheel/build.sh
@ -4,14 +4,17 @@ set -ex

 SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

+# Source the common build script for architecture-specific configurations (MKLDNN, ACL, etc.)
+source "${SCRIPTPATH}/../pytorch/build.sh" || true
+
 case "${GPU_ARCH_TYPE:-BLANK}" in
-    cuda)
+    cuda | cuda-aarch64)
        bash "${SCRIPTPATH}/build_cuda.sh"
        ;;
    rocm)
        bash "${SCRIPTPATH}/build_rocm.sh"
        ;;
-    cpu | cpu-cxx11-abi | cpu-s390x)
+    cpu | cpu-cxx11-abi | cpu-aarch64 | cpu-s390x)
        bash "${SCRIPTPATH}/build_cpu.sh"
        ;;
    xpu)
--- a/.ci/manywheel/build_common.sh
+++ b/.ci/manywheel/build_common.sh
@ -18,12 +18,31 @@ retry () {
    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
 }

+# Detect architecture first
+ARCH=$(uname -m)
+echo "Detected architecture: $ARCH"
+
 PLATFORM=""
 # TODO move this into the Docker images
 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
 if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    retry yum install -q -y zip openssl
-    PLATFORM="manylinux_2_28_x86_64"
+    # Set platform based on architecture
+    case $ARCH in
+        x86_64)
+            PLATFORM="manylinux_2_28_x86_64"
+            ;;
+        aarch64)
+            PLATFORM="manylinux_2_28_aarch64"
+            ;;
+        s390x)
+            PLATFORM="manylinux_2_28_s390x"
+            ;;
+        *)
+            echo "Unsupported architecture: $ARCH"
+            exit 1
+            ;;
+    esac
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    retry dnf install -q -y zip openssl
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
@ -38,6 +57,8 @@ else
    exit 1
 fi

+echo "Platform set to: $PLATFORM"
+
 # We use the package name to test the package by passing this to 'pip install'
 # This is the env variable that setup.py uses to name the package. Note that
 # pip 'normalizes' the name first by changing all - to _
@ -299,8 +320,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
            # ROCm workaround for roctracer dlopens
            if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
                patchedpath=$(fname_without_so_number $destpath)
-            # Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load
-            elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
+            # Keep the so number for XPU dependencies, libgomp.so.1, ACL libraries, and NVPL libraries to avoid twice load
+            elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" || "$filename" == libarm_compute* || "$filename" == libnvpl* || "$filename" == "libgfortran.so.5" ]]; then
                patchedpath=$destpath
            else
                patchedpath=$(fname_with_sha256 $destpath)
@ -346,9 +367,22 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
    done

    # create Manylinux 2_28 tag this needs to happen before regenerate the RECORD
-    if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
+    # Support all architectures (x86_64, aarch64, s390x)
+    if [[ "$IS_MANYLINUX2_28" == "1" && $GPU_ARCH_TYPE != "xpu" ]]; then
        wheel_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/WHEEL/g')
-        sed -i -e s#linux_x86_64#"${PLATFORM}"# $wheel_file;
+        echo "Updating wheel tag for $ARCH architecture"
+        # Replace linux_* with manylinux_2_28_* based on architecture
+        case $ARCH in
+            x86_64)
+                sed -i -e 's#linux_x86_64#manylinux_2_28_x86_64#g' $wheel_file
+                ;;
+            aarch64)
+                sed -i -e 's#linux_aarch64#manylinux_2_28_aarch64#g' $wheel_file
+                ;;
+            s390x)
+                sed -i -e 's#linux_s390x#manylinux_2_28_s390x#g' $wheel_file
+                ;;
+        esac
    fi

    # regenerate the RECORD file with new hashes
--- a/.ci/manywheel/build_cpu.sh
+++ b/.ci/manywheel/build_cpu.sh
@ -15,6 +15,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    EXTRA_CAFFE2_CMAKE_FLAGS=()
 fi

+# Detect architecture
+ARCH=$(uname -m)
+echo "Building CPU wheel for architecture: $ARCH"
+
 WHEELHOUSE_DIR="wheelhousecpu"
 LIBTORCH_HOUSE_DIR="libtorch_housecpu"
 if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
@ -34,8 +38,10 @@ elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
 elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
-    if [[ "$(uname -m)" == "s390x" ]]; then
+    if [[ "$ARCH" == "s390x" ]]; then
        LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
+    elif [[ "$ARCH" == "aarch64" ]]; then
+        LIBGOMP_PATH="/usr/lib/aarch64-linux-gnu/libgomp.so.1"
    else
        LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
    fi
@ -49,6 +55,32 @@ DEPS_SONAME=(
    "libgomp.so.1"
 )

+# Add ARM-specific library dependencies for CPU builds
+if [[ "$ARCH" == "aarch64" ]]; then
+    echo "Adding ARM-specific CPU library dependencies"
+
+    # ARM Compute Library (if available)
+    if [[ -d "/acl/build" ]]; then
+        echo "Adding ARM Compute Library for CPU"
+        DEPS_LIST+=(
+            "/acl/build/libarm_compute.so"
+            "/acl/build/libarm_compute_graph.so"
+        )
+        DEPS_SONAME+=(
+            "libarm_compute.so"
+            "libarm_compute_graph.so"
+        )
+    fi
+
+    # ARM system libraries
+    DEPS_LIST+=(
+        "/usr/lib64/libgfortran.so.5"
+    )
+    DEPS_SONAME+=(
+        "libgfortran.so.5"
+    )
+fi
+
 rm -rf /usr/local/cuda*

 SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -29,6 +29,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    EXTRA_CAFFE2_CMAKE_FLAGS=()
 fi

+# Detect architecture
+ARCH=$(uname -m)
+echo "Building for architecture: $ARCH"
+
 # Determine CUDA version and architectures to build for
 #
 # NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
@ -53,34 +57,60 @@ fi
 cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
 EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")

+# Function to remove architectures from a list
+remove_archs() {
+    local result="$1"
+    shift
+    for arch in "$@"; do
+        result="${result//${arch};/}"
+    done
+    echo "$result"
+}
+
+# Function to filter CUDA architectures for aarch64
+# aarch64 ARM GPUs only support certain compute capabilities
+# Keep: 8.0 (A100), 9.0+ (Hopper, Grace Hopper, newer)
+# Remove: < 8.0 (no ARM GPUs), 8.6 (x86_64 RTX 3090/A6000 only)
+filter_aarch64_archs() {
+    local arch_list="$1"
+    # Explicitly remove architectures not needed on aarch64
+    arch_list=$(remove_archs "$arch_list" "5.0" "6.0" "7.0" "7.5" "8.6")
+    echo "$arch_list"
+}
+
+# Base: Common architectures across all modern CUDA versions
+TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0"
+
 case ${CUDA_VERSION} in
-    #removing sm_50-sm_60 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
-    #however we would like to keep sm_70 architecture see: https://github.com/pytorch/pytorch/issues/157517
-    12.8)
-        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0"
-        ;;
-    12.9)
-        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX"
-        # WAR to resolve the ld error in libtorch build with CUDA 12.9
+    12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;${TORCH_CUDA_ARCH_LIST}" ;;  # Only 12.6 includes Legacy Maxwell/Pascal that will be removed in future releases
+    12.8) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0" ;;  # +Hopper/Blackwell support
+    12.9) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0+PTX" # +Hopper/Blackwell support + PTX for forward compatibility
        if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
-            TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
+            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//7.0;/}"  # Remove 7.0 to resolve the ld error
+            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//8.6;/}"  # Remove 8.6 for libtorch
        fi
        ;;
    13.0)
-        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
-        ;;
-    12.6)
-        TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
-        ;;
-    *)
-        echo "unknown cuda version $CUDA_VERSION"
-        exit 1
+        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0+PTX"
+        export TORCH_NVCC_FLAGS="-compress-mode=size"
+        export BUILD_BUNDLE_PTXAS=1
        ;;
+    *) echo "unknown cuda version $CUDA_VERSION"; exit 1 ;;
 esac

+# Filter for aarch64: Remove < 8.0 and 8.6
+[[ "$ARCH" == "aarch64" ]] && TORCH_CUDA_ARCH_LIST=$(filter_aarch64_archs "$TORCH_CUDA_ARCH_LIST")
+
+echo "TORCH_CUDA_ARCH_LIST set to: $TORCH_CUDA_ARCH_LIST"
 export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 echo "${TORCH_CUDA_ARCH_LIST}"

+# Disable MAGMA for aarch64 as pre-built libraries are x86-64 only
+if [[ "$ARCH" == "aarch64" ]]; then
+    echo "Disabling MAGMA for aarch64 architecture"
+    export USE_MAGMA=0
+fi
+
 # Package directories
 WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot"
 LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot"
@ -244,6 +274,51 @@ else
    exit 1
 fi

+# Add ARM-specific library dependencies
+if [[ "$ARCH" == "aarch64" ]]; then
+    echo "Adding ARM-specific library dependencies"
+
+    # ARM Compute Library (if available)
+    if [[ -d "/acl/build" ]]; then
+        echo "Adding ARM Compute Library"
+        DEPS_LIST+=(
+            "/acl/build/libarm_compute.so"
+            "/acl/build/libarm_compute_graph.so"
+        )
+        DEPS_SONAME+=(
+            "libarm_compute.so"
+            "libarm_compute_graph.so"
+        )
+    fi
+
+    # ARM system libraries
+    DEPS_LIST+=(
+        "/lib64/libgomp.so.1"
+        "/usr/lib64/libgfortran.so.5"
+    )
+    DEPS_SONAME+=(
+        "libgomp.so.1"
+        "libgfortran.so.5"
+    )
+
+    # NVPL libraries (ARM optimized BLAS/LAPACK)
+    if [[ -d "/usr/local/lib" && -f "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0" ]]; then
+        echo "Adding NVPL libraries for ARM"
+        DEPS_LIST+=(
+            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0"
+            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0"
+            "/usr/local/lib/libnvpl_lapack_core.so.0"
+            "/usr/local/lib/libnvpl_blas_core.so.0"
+        )
+        DEPS_SONAME+=(
+            "libnvpl_lapack_lp64_gomp.so.0"
+            "libnvpl_blas_lp64_gomp.so.0"
+            "libnvpl_lapack_core.so.0"
+            "libnvpl_blas_core.so.0"
+        )
+    fi
+fi
+
 # run_tests.sh requires DESIRED_CUDA to know what tests to exclude
 export DESIRED_CUDA="$cuda_version_nodot"

@ -251,9 +326,11 @@ export DESIRED_CUDA="$cuda_version_nodot"
 rm -rf /usr/local/cuda || true
 ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda

-# Switch `/usr/local/magma` to the desired CUDA version
-rm -rf /usr/local/magma || true
-ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
+# Switch `/usr/local/magma` to the desired CUDA version (skip for aarch64)
+if [[ "$ARCH" != "aarch64" ]]; then
+    rm -rf /usr/local/magma || true
+    ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
+fi

 export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
 export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0
--- a/.ci/onnx/common.sh
+++ b/.ci/onnx/common.sh
@ -21,87 +21,3 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
 fi

 mkdir -p "$pytest_reports_dir" || true
-
-##########################################
-# copied from .ci/pytorch/common_utils.sh
-##########################################
-
-function get_pinned_commit() {
-  cat .github/ci_commit_pins/"${1}".txt
-}
-
-function pip_install_whl() {
-  # This is used to install PyTorch and other build artifacts wheel locally
-  # without using any network connection
-
-  # Convert the input arguments into an array
-  local args=("$@")
-
-  # Check if the first argument contains multiple paths separated by spaces
-  if [[ "${args[0]}" == *" "* ]]; then
-    # Split the string by spaces into an array
-    IFS=' ' read -r -a paths <<< "${args[0]}"
-    # Loop through each path and install individually
-    for path in "${paths[@]}"; do
-      echo "Installing $path"
-      python3 -mpip install --no-index --no-deps "$path"
-    done
-  else
-    # Loop through each argument and install individually
-    for path in "${args[@]}"; do
-      echo "Installing $path"
-      python3 -mpip install --no-index --no-deps "$path"
-    done
-  fi
-}
-
-function pip_build_and_install() {
-  local build_target=$1
-  local wheel_dir=$2
-
-  local found_whl=0
-  for file in "${wheel_dir}"/*.whl
-  do
-    if [[ -f "${file}" ]]; then
-      found_whl=1
-      break
-    fi
-  done
-
-  # Build the wheel if it doesn't exist
-  if [ "${found_whl}" == "0" ]; then
-    python3 -m pip wheel \
-      --no-build-isolation \
-      --no-deps \
-      -w "${wheel_dir}" \
-      "${build_target}"
-  fi
-
-  for file in "${wheel_dir}"/*.whl
-  do
-    pip_install_whl "${file}"
-  done
-}
-
-function install_torchvision() {
-  local orig_preload
-  local commit
-  commit=$(get_pinned_commit vision)
-  orig_preload=${LD_PRELOAD}
-  if [ -n "${LD_PRELOAD}" ]; then
-    # Silence dlerror to work-around glibc ASAN bug, see https://sourceware.org/bugzilla/show_bug.cgi?id=27653#c9
-    echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c -
-    LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so
-  fi
-
-  if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
-    # Not sure if both are needed, but why not
-    export FORCE_CUDA=1
-    export WITH_CUDA=1
-  fi
-  pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision
-
-  if [ -n "${LD_PRELOAD}" ]; then
-    LD_PRELOAD=${orig_preload}
-  fi
-}
--- a/.ci/onnx/test.sh
+++ b/.ci/onnx/test.sh
@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace

 if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
  # TODO: This can be removed later once vision is also part of the Docker image
-  install_torchvision
+  pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
  # JIT C++ extensions require ninja, so put it into PATH.
  export PATH="/var/lib/jenkins/.local/bin:$PATH"
  # NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -86,10 +86,20 @@ else
  fi
 fi

+# Enable MKLDNN with ARM Compute Library for ARM builds
 if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
  export USE_MKLDNN=1
+
+  # ACL is required for aarch64 builds
+  if [[ ! -d "/acl" ]]; then
+    echo "ERROR: ARM Compute Library not found at /acl"
+    echo "ACL is required for aarch64 builds. Check Docker image setup."
+    exit 1
+  fi
+
  export USE_MKLDNN_ACL=1
  export ACL_ROOT_DIR=/acl
+  echo "ARM Compute Library enabled for MKLDNN: ACL_ROOT_DIR=/acl"
 fi

 if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -1250,97 +1250,6 @@ test_custom_script_ops() {
  assert_git_not_dirty
 }

-test_libtorch_agnostic_targetting() {
-    echo "Testing libtorch_agnostic runs correctly on TORCH_TARGET_VERSION"
-
-    REPO_DIR=$(pwd)
-    WHEEL_DIR="${REPO_DIR}/test/cpp_extensions/.wheels"
-
-    # Build wheel with current PyTorch (this has TORCH_TARGET_VERSION 2_9_0)
-    echo "Building 2.9 extension wheel with current PyTorch..."
-    pushd test/cpp_extensions/libtorch_agnostic_2_9_extension
-    time python setup.py bdist_wheel
-
-    # Save the wheel
-    mkdir -p "$WHEEL_DIR"
-    cp dist/*.whl "$WHEEL_DIR/"
-    WHEEL_FILE=$(find "$WHEEL_DIR" -maxdepth 1 -name "*.whl" -type f | head -1)
-    echo "Built wheel: $(basename "$WHEEL_FILE")"
-    popd
-
-    # Create venv and install PyTorch 2.9
-    python -m venv venv_pytorch_2_9
-    # shellcheck disable=SC1091
-    . venv_pytorch_2_9/bin/activate
-
-    # Clear PYTHONPATH to avoid using the development PyTorch
-    echo "Clearing PYTHONPATH to use only venv packages..."
-    unset PYTHONPATH
-
-    # Upgrade pip to latest version
-    echo "Upgrading pip to latest version..."
-    pip install --upgrade pip
-    pip --version
-
-    echo "Installing PyTorch 2.9..."
-
-    # Install from release channel only
-    PYTORCH_VERSION="2.9.0"
-
-    # Extract CUDA version from BUILD_ENVIRONMENT (e.g., "cuda12.1" -> "cu121")
-    if [[ "$BUILD_ENVIRONMENT" =~ cuda([0-9]+)\.([0-9]+) ]]; then
-        CUDA_MAJOR="${BASH_REMATCH[1]}"
-        CUDA_MINOR="${BASH_REMATCH[2]}"
-        CUDA_VERSION="cu${CUDA_MAJOR}${CUDA_MINOR}"
-        echo "  Detected CUDA ${CUDA_MAJOR}.${CUDA_MINOR} from BUILD_ENVIRONMENT, using ${CUDA_VERSION}"
-    else
-        # Default to CPU build
-        CUDA_VERSION="cpu"
-        echo "  No CUDA detected in BUILD_ENVIRONMENT, using CPU build"
-    fi
-
-    if pip install torch=="${PYTORCH_VERSION}" --index-url https://download.pytorch.org/whl/${CUDA_VERSION}/; then
-        echo "Installed PyTorch ${PYTORCH_VERSION} from release channel (${CUDA_VERSION})"
-    else
-        echo "  FAILED to install PyTorch 2.9.0 from release channel"
-        echo "  URL: https://download.pytorch.org/whl/${CUDA_VERSION}/"
-        deactivate
-        rm -rf venv_pytorch_2_9
-        return 1
-    fi
-
-    INSTALLED_VERSION=$(python -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown")
-    echo "  Installed version: $INSTALLED_VERSION"
-
-    # Install test dependencies
-    echo "Installing test dependencies..."
-    pip install expecttest numpy unittest-xml-reporting
-
-    # Install the pre-built wheel
-    echo ""
-    echo "Installing pre-built 2.9 extension wheel (built with PyTorch 2.10)..."
-    pip install "$WHEEL_FILE"
-    echo "Installed $(basename "$WHEEL_FILE") into PyTorch 2.9 environment"
-
-    # Run tests with PyTorch 2.9 runtime (2.10 tests will be skipped automatically)
-    echo ""
-    echo "Running tests with PyTorch 2.9 runtime (using wheel built on PyTorch 2.10)..."
-    if time python test/cpp_extensions/test_libtorch_agnostic.py -v; then
-        echo ""
-        echo "  Wheel built with current torch and TORCH_TARGET_VERSION 2_9_0 works with PyTorch 2.9 runtime!"
-    else
-        echo "targeting test failed"
-        deactivate
-        rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
-        return 1
-    fi
-
-    deactivate
-    rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
-
-    assert_git_not_dirty
-}
-
 test_jit_hooks() {
  echo "Testing jit hooks in cpp"
  HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
@ -1768,7 +1677,7 @@ test_operator_microbenchmark() {

  cd "${TEST_DIR}"/benchmarks/operator_benchmark

-  for OP_BENCHMARK_TESTS in optimizer; do
+  for OP_BENCHMARK_TESTS in matmul mm addmm bmm conv; do
    $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
      --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
      --benchmark-name "PyTorch operator microbenchmark" --use-compile
@ -1813,8 +1722,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]];
 elif [[ "${TEST_CONFIG}" == *backward* ]]; then
  test_forward_backward_compatibility
  # Do NOT add tests after bc check tests, see its comment.
-elif [[ "${TEST_CONFIG}" == *libtorch_agnostic_targetting* ]]; then
-  test_libtorch_agnostic_targetting
 elif [[ "${TEST_CONFIG}" == *xla* ]]; then
  install_torchvision
  build_xla
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-94631807d22c09723dd006f7be5beb649d5f88d0
+e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -91,6 +91,13 @@
 "ciflow/trunk":
 - .ci/docker/ci_commit_pins/triton.txt

+"oncall: distributed":
+- torch/csrc/distributed/**
+- torch/distributed/**
+- torch/nn/parallel/**
+- test/distributed/**
+- torch/testing/_internal/distributed/**
+
 "release notes: distributed (checkpoint)":
 - torch/distributed/checkpoint/**
 - test/distributed/checkpoint/**
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@ -260,11 +260,8 @@ jobs:
            "${DOCKER_IMAGE}"
          )
          docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
-          if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
-            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
-          else
-            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
-          fi
+          # Unified build script for all architectures (x86_64, aarch64, s390x)
+          docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"

      - name: Chown artifacts
        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -70,7 +70,6 @@ jobs:
          { config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
          { config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
          { config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
-          { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
        ]}
    secrets: inherit

--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@ -83,7 +83,6 @@ jobs:
          { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
          { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
          { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
-          { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
        ]}
    secrets: inherit

--- a/aten/src/ATen/core/TensorBase.h
+++ b/aten/src/ATen/core/TensorBase.h
@ -245,9 +245,6 @@ class TORCH_API TensorBase {
  size_t weak_use_count() const noexcept {
    return impl_.weak_use_count();
  }
-  bool is_uniquely_owned() const noexcept {
-    return impl_.is_uniquely_owned();
-  }

  std::string toString() const;

--- a/aten/src/ATen/cuda/CUDAContextLight.h
+++ b/aten/src/ATen/cuda/CUDAContextLight.h
@ -3,7 +3,6 @@

 #include <cstdint>
 #include <map>
-#include <shared_mutex>

 #include <cuda_runtime_api.h>
 #include <cusparse.h>
@ -89,13 +88,8 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
 TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();

 TORCH_CUDA_CPP_API void clearCublasWorkspaces();
-struct WorkspaceMapWithMutex {
-  std::map<std::tuple<void*, void*>, at::DataPtr> map;
-  std::shared_mutex mutex;
-};
-
-TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
-TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
+TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace();
+TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace();
 TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize();
 TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize();
 TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace();
--- a/aten/src/ATen/cuda/CublasHandlePool.cpp
+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp
@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) {
 //   - Comments of @soumith copied from cuDNN handle pool implementation
 #ifdef NO_CUDNN_DESTROY_HANDLE
 #else
-  cublasDestroy(handle);
+    cublasDestroy(handle);
 #endif
 }

@ -107,27 +107,19 @@ using CuBlasPoolType = DeviceThreadHandlePool<cublasHandle_t, createCublasHandle

 } // namespace

-WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() {
-  static auto& instance = *new WorkspaceMapWithMutex;
+std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace() {
+  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
  return instance;
 }

-WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() {
-  static auto& instance = *new WorkspaceMapWithMutex;
+std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace() {
+  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
  return instance;
 }

 void clearCublasWorkspaces() {
-  {
-    auto& workspace = cublas_handle_stream_to_workspace();
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    workspace.map.clear();
-  }
-  {
-    auto& workspace = cublaslt_handle_stream_to_workspace();
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    workspace.map.clear();
-  }
+  cublas_handle_stream_to_workspace().clear();
+  cublaslt_handle_stream_to_workspace().clear();
 }

 size_t parseChosenWorkspaceSize() {
@ -241,38 +233,6 @@ at::DataPtr getNewCUDABlasLtWorkspace() {
  return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize());
 }

-void setWorkspaceForHandle(cublasHandle_t handle, c10::cuda::CUDAStream stream) {
-  cudaStream_t _stream = stream;
-  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-
-  auto& workspace = cublas_handle_stream_to_workspace();
-
-  size_t workspace_size = getChosenWorkspaceSize();
-
-  // Fast path: check if workspace already exists
-  {
-    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.find(key);
-    if (workspace_it != workspace.map.end()) {
-      TORCH_CUDABLAS_CHECK(cublasSetWorkspace(
-          handle, workspace_it->second.get(), workspace_size));
-      return;
-    }
-  }
-
-  // Slow path: allocate workspace outside the lock
-  auto new_workspace = getNewWorkspace();
-
-  // Insert with lock (double-check in case another thread inserted while we
-  // were allocating)
-  {
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.try_emplace(key, std::move(new_workspace)).first;
-    TORCH_CUDABLAS_CHECK(
-        cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size));
-  }
-}
-
 void* getCUDABlasLtWorkspace() {
 #ifndef USE_ROCM
  static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true;
@ -281,10 +241,8 @@ void* getCUDABlasLtWorkspace() {
    auto stream = c10::cuda::getCurrentCUDAStream();
    cudaStream_t _stream = stream;
    auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-    auto& workspace = at::cuda::cublas_handle_stream_to_workspace();
-    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.find(key);
-    TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end());
+    auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
+    TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
    return workspace_it->second.mutable_get();
  }
 #endif
@ -292,29 +250,11 @@ void* getCUDABlasLtWorkspace() {
  auto stream = c10::cuda::getCurrentCUDAStream();
  cudaStream_t _stream = stream;
  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-
-  auto& workspace = cublaslt_handle_stream_to_workspace();
-
-  // Fast path: check if workspace already exists
-  {
-    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it = workspace.map.find(key);
-    if (workspace_it != workspace.map.end()) {
-      return workspace_it->second.mutable_get();
-    }
-  }
-
-  // Slow path: allocate workspace outside the lock
-  auto new_workspace = getNewCUDABlasLtWorkspace();
-
-  // Insert with lock (double-check in case another thread inserted while we
-  // were allocating)
-  {
-    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
-    auto workspace_it =
-          workspace.map.try_emplace(key, std::move(new_workspace)).first;
-    return workspace_it->second.mutable_get();
+  auto workspace_it = cublaslt_handle_stream_to_workspace().find(key);
+  if (workspace_it == cublaslt_handle_stream_to_workspace().end()) {
+    workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()});
  }
+  return workspace_it->second.mutable_get();
 }

 cublasHandle_t getCurrentCUDABlasHandle() {
@ -358,8 +298,13 @@ cublasHandle_t getCurrentCUDABlasHandle() {
  // will allocate memory dynamically (even if they're cheap) outside
  // PyTorch's CUDA caching allocator. It's possible that CCA used up
  // all the memory and cublas's cudaMallocAsync will return OOM
-  setWorkspaceForHandle(handle, stream);
-
+  cudaStream_t _stream = stream;
+  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
+  auto workspace_it = cublas_handle_stream_to_workspace().find(key);
+  if (workspace_it == cublas_handle_stream_to_workspace().end()) {
+    workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()});
+  }
+  TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize()));
 #if !defined(USE_ROCM)
  // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup
  // FP32 data type calculations based on the value of the allow_tf32 flag.
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -296,7 +296,7 @@ template <typename scalar_t, typename res_scalar_t = scalar_t>
 bool launchGemmAndBiasCublasLt(
    // args contains result which is modified
    cublasCommonArgs& args,
-    const std::optional<Tensor>& self,
+    const Tensor& self,
    const Scalar& alpha,
    Activation activation = Activation::None
 ) {
@ -304,8 +304,12 @@ bool launchGemmAndBiasCublasLt(
  // or when it can be squeezed to 1D.
  // self_ptr == nullptr implies ignore bias epilogue
  // and use standard gemm-like API.
-  const auto* self_ptr = self.has_value() ? self.value().const_data_ptr<scalar_t>() : static_cast<const scalar_t*>(nullptr);
-
+  const auto* self_ptr = [&]() -> auto {
+    if (self.dim() == 1 || self.squeeze().dim() == 1) {
+      return self.const_data_ptr<scalar_t>();
+    }
+    return static_cast<const scalar_t*>(nullptr);
+  }();

  const auto tuning_ctx = at::cuda::tunable::getTuningContext();
  if (tuning_ctx->IsTunableOpEnabled()) {
@ -388,30 +392,35 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
  #ifdef USE_ROCM
  // Conditioned on the device index, which is not persistent
-  disable_addmm_cuda_lt = disable_addmm_cuda_lt || isGloballyDisabledAddmmCudaLt(self.device());
+  disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
  #endif
  // Condition on the input
-  disable_addmm_cuda_lt = disable_addmm_cuda_lt || !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation);
+  disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
+  // }

  at::ScalarType scalar_type = mat1.scalar_type();
  bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;

-  #ifdef USE_ROCM
-  disable_addmm_cuda_lt = disable_addmm_cuda_lt || is_float_output_with_half_input;
-  #endif
-
-  bool use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
-  // for float output with half input cublasLT with bias produces wrong results
-  use_bias_ptr_lt &= !is_float_output_with_half_input;
-
  // Handle result/self shapes
  if (!result.is_same(self)) {
    at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});

-      // We do not copy bias only when we need the bias ptr
+    // We use bias ptr in the Lt path only when bias is 1D
+    const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
+    const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
+      if (!use_bias_ptr_lt) {
+        // We do expand self even before
+        // check for beta != 0.0 to make sure that
+        // test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
+        // runs green.
+        return expand_size(self, result.sizes(), "addmm");
+      }
+      return c10::MaybeOwned<Tensor>::borrowed(self);
+    }();
+    // We do not copy bias only when we need the bias ptr
    if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
      // NOTE: self should broadcast over result
-      at::native::copy_(result, *expand_size(self, result.sizes(), "addmm"));
+      at::native::copy_(result, *self_maybe_expanded);
    }
  }

@ -459,7 +468,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
        scalar_type,
        "addmm_cuda_lt",
        [&] {
-          lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
+          lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, self, alpha, activation);
        }
      );
      #endif
@ -471,7 +480,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
        scalar_type,
        "addmm_cuda_lt",
        [&] {
-          lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
+          lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, self, alpha, activation);
        }
      );
    } // end is_float_output_with_half_input
@ -927,7 +936,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
  return _int_mm_out_cuda(self, mat2, result);
 }

-static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
+static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
  // ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm
  TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor");
  TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor");
@ -951,7 +960,7 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
    (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
    "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");

-  if (self_baddbmm.has_value()) {
+  if (!is_bmm && self_baddbmm.has_value()) {
    const auto& self = self_baddbmm.value();
    TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor");
    TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output");
@ -959,12 +968,15 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
 }

 Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
-  Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
+  IntArrayRef batch1_sizes = batch1.sizes();
+  IntArrayRef batch2_sizes = batch2.sizes();
+
+  Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype));
  return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out);
 }

 Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) {
-  baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype);
+  baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true);
  Scalar beta(0.0);
  Scalar alpha(1.0);
  {
@ -976,16 +988,14 @@ Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at
 }

 Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
-  TORCH_CHECK(self.scalar_type() == out_dtype || self.scalar_type() == batch1.dtype(),
-  "self dtype must match either out_dtype or batch1 dtype");
-  Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
-  return _baddbmm_out_dtype_cuda(self, batch1, batch2, out_dtype, beta, alpha, out);
+  // We need to copy the tensor
+  Tensor out = self.clone().to(self.options().dtype(out_dtype));
+
+  return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out);
 }

 Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
-  baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, out);
-  // We need to copy the tensor
-  out.copy_(self);
+  baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self);
  {
    NoNamesGuard guard;
    baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
@ -1020,27 +1030,24 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca
 }

 Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
-  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
-  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
-  Tensor result = at::empty({mat1.size(0), mat2.size(1)}, self.options().dtype(out_dtype));
+  Tensor result = at::empty(self.sizes(), self.options().dtype(out_dtype));
  return _addmm_dtype_out_cuda(self, mat1, mat2, out_dtype, beta, alpha, result);
 }

 Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
-// repeat dimensionality checks for direct calls to `out` overload
+  TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type());
+  TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
  TORCH_CHECK(
      mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
      mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
-  TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
-  TORCH_CHECK(out_dtype == mat1.scalar_type() ||
-  (out_dtype == at::ScalarType::Float && (mat1.scalar_type() == at::ScalarType::Half || mat1.scalar_type() == at::ScalarType::BFloat16)),
-  "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");

  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
-  TORCH_CHECK(out_dtype == self.scalar_type() || self.scalar_type() == mat1.scalar_type(),
-    "self dtype must match either out_dtype or mat1 dtype");
+  TORCH_CHECK(out_dtype == self.scalar_type() ||
+    (out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),
+    "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
+  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");

  addmm_out_cuda_impl(out, self, mat1, mat2, beta, alpha);

--- a/aten/src/ATen/native/cuda/GroupedBlas.cpp
+++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp
@ -607,8 +607,6 @@ _scaled_grouped_mm_cuda_v2(
      // scale shape checks
      _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
      _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
-      // swizze checks
-      TORCH_CHECK_VALUE(swizzle_a_enum.size() == 1 && swizzle_b_enum.size() == 1, "Expected single swizzle argument");
      return _mx8_mx8_bf16_grouped_mm_fbgemm(
          mat_a,
          mat_b,
--- a/aten/src/ATen/native/cuda/LogAddExpKernel.cu
+++ b/aten/src/ATen/native/cuda/LogAddExpKernel.cu
@ -2,250 +2,18 @@
 #include <ATen/Dispatch.h>
 #include <ATen/native/DispatchStub.h>
 #include <ATen/native/cuda/Loops.cuh>
-#include <ATen/native/cuda/JitLoops.cuh>
-#include <ATen/native/cuda/jit_utils.h>
-#include <ATen/native/cuda/ScanUtils.cuh>
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/BinaryOps.h>
 #include <ATen/OpMathType.h>
 #include <c10/util/MathConstants.h>
-#include <c10/util/complex.h>
-
-#include <cmath>
-#include <limits>

 // NOTE: CUDA on Windows requires that the enclosing function
 // of a __device__ lambda not have internal linkage.

 namespace at::native {

-// custom min and max to be used in logaddexp for  complex arguments
-template <typename scalar_t, bool min>
-__host__ __device__ c10::complex<scalar_t> _logaddexp_minmax(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
-  scalar_t xr = std::real(x);
-  scalar_t yr = std::real(y);
-  if (::isnan(yr) || (::isnan(std::imag(y)))) {
-    return y;
-  } else if (::isnan(xr) || (::isnan(std::imag(x)))) {
-    return x;
-  } else if (min) { // min
-    return (xr < yr) ? x : y;
-  } else { // max
-    return (xr >= yr) ? x : y;
-  }
-}
-
-template <typename scalar_t>
-__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) {
-  // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
-  // Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM
-  const auto isnan_x = at::_isnan(x);
-  const auto isnan_y = at::_isnan(y);
-  scalar_t min = isnan_y ? y : (isnan_x ? x : std::min(x, y));
-  scalar_t max = isnan_y ? y : (isnan_x ? x : std::max(x, y));
-  if (min != max || ::isfinite(min)) {
-    // nan will be propagated here
-    return ::log1p(std::exp(min - max)) + max;
-  } else {
-    // special case to correctly handle infinite cases
-    return x;
-  }
-}
-
-template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _fast_build_exp(const c10::complex<scalar_t>& x) {
-  // complex exponential function, but implemented manually to get fast compilation time
-  // this function only handles the case where the x is finite (not inf nor nan)
-  const auto xreal = std::real(x);
-  const auto ximag = std::imag(x);
-  const auto exp_x_abs = std::exp(xreal);
-  auto exp_x_real = exp_x_abs * std::cos(ximag);
-  auto exp_x_imag = exp_x_abs * std::sin(ximag);
-  return {exp_x_real, exp_x_imag};
-}
-
-template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _fast_build_exp_inf(const c10::complex<scalar_t>& x) {
-  // complex exponential function, but implemented manually to get fast compilation time
-  // this function only handles the case where the real part of x is infinite
-  const auto ximag = std::imag(x);
-  constexpr auto exp_x_abs = std::numeric_limits<scalar_t>::infinity();
-  if (!::isfinite(ximag)) {  // add this to make consitent with std::exp(x+yi)
-    return {exp_x_abs, std::numeric_limits<scalar_t>::quiet_NaN()};
-  }
-  const auto sin = std::sin(ximag);
-  const auto cos = std::cos(ximag);
-  // special case if the angle is exactly the multiple of pi/2
-  auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos;
-  auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin;
-  return {exp_x_real, exp_x_imag};
-}
-
-template <typename scalar_t>
-__host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
-  c10::complex<scalar_t> min = _logaddexp_minmax<scalar_t, /*min=*/true>(x, y);
-  c10::complex<scalar_t> max = _logaddexp_minmax<scalar_t, /*min=*/false>(x, y);
-  scalar_t min_real = std::real(min);
-  scalar_t max_real = std::real(max);
-
-  if (::isnan(min_real) || ::isnan(std::imag(min))) {
-    // handling the "infectious" NaNs
-    return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
-  }
-  else if ((!::isfinite(min_real)) && (min_real == max_real)) {
-    if (min_real < 0) {
-      // handle the -inf case, the imaginary part here does not really matter as the exp(value)
-      // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
-      // It does not matter if we're taking the exp of this value
-      return min;
-    } else {
-      // handle the +inf case, we don't need the special precision for log1p for small values
-      // and to avoid producing nan in case of real(max) == real(min) == +inf
-      const auto exp_min = _fast_build_exp_inf(min);
-      const auto exp_max = _fast_build_exp_inf(max);
-      return ::log1p(exp_min + exp_max - 1);  // log1p(x - 1) builds faster than log
-    }
-  } else {
-    const auto minmax = min - max;
-    c10::complex<scalar_t> exp_minmax;
-    if (!::isfinite(minmax.real())) {
-        exp_minmax = minmax.real() < 0 ? c10::complex<scalar_t>{0.0, 0.0} : _fast_build_exp_inf(minmax);
-    } else {
-        exp_minmax = _fast_build_exp(minmax);
-    }
-    return ::log1p(exp_minmax) + max;
-  }
-}
-
-// Complex logaddexp jiterator string
-const auto logaddexp_complex_string = jiterator_stringify(
-    template<typename T>
-    std::complex<T> log1p(const std::complex<T>& z)
-    {
-      using complex_t = std::complex<T>;
-      T x = z.real();
-      T y = z.imag();
-      T zabs = abs(z);
-      T theta = atan2(y, x + T(1));
-      if (zabs < 0.5) {
-          T r = x * (T(2) + x) + y * y;
-          if (r == 0) { // handle underflow
-              return complex_t(x, theta);
-          }
-          return complex_t(T(0.5) * std::log1p(r), theta);
-      } else {
-          T z0 = std::hypot(x + 1, y);
-          return complex_t(log(z0), theta);
-      }
-    }
-
-    // separated _logaddexp_minmax into 2 different functions for jiterator_string
-    template <typename T>
-    std::complex<T> logaddexp_min(const std::complex<T>& x, const std::complex<T>& y) {
-        T xr = x.real();
-        T yr = y.real();
-        if (isnan(yr) || isnan(y.imag())) {
-            return y;
-        } else if (isnan(xr) || isnan(x.imag())) {
-            return x;
-        } else {
-            return (xr < yr) ? x : y;
-        }
-    }
-
-    template <typename T>
-    std::complex<T> logaddexp_max(const std::complex<T>& x, const std::complex<T>& y) {
-        T xr = x.real();
-        T yr = y.real();
-        if (isnan(yr) || isnan(y.imag())) {
-            return y;
-        } else if (isnan(xr) || isnan(x.imag())) {
-            return x;
-        } else {
-            return (xr >= yr) ? x : y;
-        }
-    }
-
-    template <typename T>
-    std::complex<T> fast_build_exp(const std::complex<T>& x) {
-        const auto xreal = x.real();
-        const auto ximag = x.imag();
-        const auto exp_x_abs = exp(xreal);
-        auto exp_x_real = exp_x_abs * cos(ximag);
-        auto exp_x_imag = exp_x_abs * sin(ximag);
-        return std::complex<T>(exp_x_real, exp_x_imag);
-    }
-
-    template <typename T>
-    std::complex<T> fast_build_exp_inf(const std::complex<T>& x) {
-        using complex_t = std::complex<T>;
-        const auto ximag = x.imag();
-        const T exp_x_abs = INFINITY;
-        if (!isfinite(ximag)) {
-            return complex_t(exp_x_abs, NAN);
-        }
-        const auto sin_val = sin(ximag);
-        const auto cos_val = cos(ximag);
-        auto exp_x_real = (cos_val == T(0)) ? T(0) : exp_x_abs * cos_val;
-        auto exp_x_imag = (sin_val == T(0)) ? T(0) : exp_x_abs * sin_val;
-        return complex_t(exp_x_real, exp_x_imag);
-    }
-
-    template <typename complex_t>
-    complex_t logaddexp_complex(complex_t x, complex_t y) {
-        using T = typename complex_t::value_type;
-        complex_t min_val = logaddexp_min(x, y);
-        complex_t max_val = logaddexp_max(x, y);
-        T min_real = min_val.real();
-        T max_real = max_val.real();
-
-        if (isnan(min_real) || isnan(min_val.imag())) {
-            return complex_t(NAN, NAN);
-        }
-        else if ((!isfinite(min_real)) && (min_real == max_real)) {
-            if (min_real < T(0)) {
-                return min_val;
-            } else {
-                const auto exp_min = fast_build_exp_inf<T>(min_val);
-                const auto exp_max = fast_build_exp_inf<T>(max_val);
-                return log1p(exp_min + exp_max - complex_t(1, 0));
-            }
-        } else {
-            const auto minmax = min_val - max_val;
-            complex_t exp_minmax;
-            if (!isfinite(minmax.real())) {
-                exp_minmax = (minmax.real() < T(0)) ? complex_t(0, 0) : fast_build_exp_inf<T>(minmax);
-            } else {
-                exp_minmax = fast_build_exp<T>(minmax);
-            }
-            return log1p(exp_minmax) + max_val;
-        }
-    }
-);
-
-constexpr char logaddexp_complex_name[] = "logaddexp_complex";
 void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
-  if (at::isComplexType(iter.dtype())) {
-#if AT_USE_JITERATOR()
-    AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
-      jitted_gpu_kernel<
-          /*name=*/logaddexp_complex_name,
-          /*return_dtype=*/scalar_t,
-          /*common_dtype=*/scalar_t,
-          /*arity=*/2>(iter, logaddexp_complex_string);
-    });
-#else
-    AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
-      using opmath_t = at::opmath_type<scalar_t>;
-      gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t {
-        const auto a = static_cast<opmath_t>(a_);
-        const auto b = static_cast<opmath_t>(b_);
-        return static_cast<scalar_t>(_log_add_exp_helper(a, b));
-      });
-    });
-#endif
-  } else {
-    AT_DISPATCH_FLOATING_TYPES_AND2(
+  AT_DISPATCH_FLOATING_TYPES_AND2(
      ScalarType::BFloat16, ScalarType::Half,
      iter.dtype(), "logaddexp_cuda",
      [&]() {
@ -261,7 +29,6 @@ void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
          }
        });
      });
-  }
 }

 void logaddexp2_kernel_cuda(TensorIteratorBase& iter) {
--- a/aten/src/ATen/native/cuda/ScaledBlas.cpp
+++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp
@ -1101,19 +1101,6 @@ _scaled_mxfp8_mxfp8(
  return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
 }

-void
-_check_mxfp4_support() {
-#ifndef USE_ROCM
-  auto dprops = at::cuda::getCurrentDeviceProperties();
-  // Only on B200 GPUs
-  TORCH_CHECK_NOT_IMPLEMENTED(
-    // B200 = 10.0, B300 = 10.3
-    dprops->major == 10,
-    "MXFP4 scaling only supported in CUDA for B200/B300"
-  );
-#endif
-}
-

 Tensor&
 _scaled_mxfp4_mxfp4(
@ -1126,7 +1113,6 @@ _scaled_mxfp4_mxfp4(
 #if defined(_WIN32) || (!defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI))
  TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only");
 #else
-  _check_mxfp4_support();
  // Restrictions:
  // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
  TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",
--- a/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/ScaledBlas.cpp
@ -5,7 +5,6 @@
 #include <ATen/native/Resize.h>
 #include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
 #include <ATen/native/xpu/Blas.h>
-#include <ATen/xpu/XPUScaledBlas.h>
 #include <torch/library.h>

 #ifndef AT_PER_OPERATOR_HEADERS
@ -340,399 +339,4 @@ Tensor _scaled_mm_xpu(
      out);
 }

-using acceptance_fn = std::function<bool(
-    c10::ScalarType,
-    std::vector<ScalingType>&,
-    ArrayRef<Tensor>&,
-    c10::ScalarType,
-    std::vector<ScalingType>&,
-    ArrayRef<Tensor>&)>;
-using namespace std::placeholders;
-
-namespace scaled_blas = at::native::onednn::scaled;
-using scaled_blas::convert_int_to_enum;
-using scaled_blas::ScaledGemmImplementation;
-
-std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2>
-    scale_kernel_dispatch = {{
-        {"tensorwise_tensorwise",
-         scaled_blas::check_tensorwise_recipe,
-         ScaledGemmImplementation::TENSORWISE_TENSORWISE},
-        {"rowwise_rowwise",
-         scaled_blas::check_rowwise_recipe,
-         ScaledGemmImplementation::ROWWISE_ROWWISE},
-
-    }};
-
-Tensor& _scaled_tensorwise_tensorwise(
-    const Tensor& mat_a,
-    const Tensor& mat_b,
-    const Tensor& scale_a,
-    const Tensor& scale_b,
-    const std::optional<Tensor>& bias,
-    const c10::ScalarType out_dtype,
-    bool use_fast_accum,
-    Tensor& out) {
-  // Restrictions:
-  // A, B are FP8, scales are fp32
-
-  TORCH_CHECK_VALUE(
-      isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
-      "mat_a and mat_b must be fp8 types, got: ",
-      mat_a.scalar_type(),
-      mat_b.scalar_type());
-  TORCH_CHECK_VALUE(
-      scale_a.numel() == 1 && scale_a.scalar_type() == kFloat,
-      "scale_a must have 1 Float element")
-  TORCH_CHECK_VALUE(
-      scale_b.numel() == 1 && scale_b.scalar_type() == kFloat,
-      "scale_b must have 1 Float element")
-
-  auto scaling_choice_a = ScalingType::TensorWise;
-  auto scaling_choice_b = ScalingType::TensorWise;
-
-  _scaled_gemm(
-      mat_a,
-      mat_b,
-      scale_a,
-      scale_b,
-      scaling_choice_a,
-      scaling_choice_b,
-      bias,
-      use_fast_accum,
-      out);
-
-  return out;
-}
-
-Tensor& _scaled_rowwise_rowwise(
-    const Tensor& mat_a,
-    const Tensor& mat_b,
-    const Tensor& scale_a,
-    const Tensor& scale_b,
-    const std::optional<Tensor>& bias,
-    const c10::ScalarType out_dtype,
-    bool use_fast_accum,
-    Tensor& out) {
-  // Restrictions:
-  // A, B are FP8, scales are fp32, shape M/N for A/B
-  TORCH_CHECK_VALUE(
-      isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
-      "mat_a and mat_b must be fp8 types, got: ",
-      mat_a.scalar_type(),
-      mat_b.scalar_type());
-  TORCH_CHECK_VALUE(
-      scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1,
-      "scale_a must have shape [",
-      mat_a.size(0),
-      ", 1], got [",
-      scale_a.sizes(),
-      "]");
-  TORCH_CHECK_VALUE(
-      scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat,
-      "scale_a must have ",
-      mat_a.size(0),
-      " Float elements, got ",
-      scale_a.numel())
-  TORCH_CHECK_VALUE(
-      scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat,
-      "scale_b must have ",
-      mat_b.size(1),
-      " Float elements, got ",
-      scale_b.numel())
-
-  TORCH_CHECK_VALUE(
-      scale_a.stride(1) == 1,
-      "expected scale_a.stride(1) to be 1, but got ",
-      scale_a.stride(1));
-  TORCH_CHECK_VALUE(
-      scale_b.stride(1) == 1,
-      "expected scale_b.stride(1) to be 1, but got ",
-      scale_b.stride(1));
-
-  auto scaling_choice_a = ScalingType::RowWise;
-  auto scaling_choice_b = ScalingType::RowWise;
-
-  _scaled_gemm(
-      mat_a,
-      mat_b,
-      scale_a,
-      scale_b,
-      scaling_choice_a,
-      scaling_choice_b,
-      bias,
-      use_fast_accum,
-      out);
-
-  return out;
-}
-
-// V2: Computes matrix multiply + bias while applying scaling to input and
-// output matrices Scales are only applicable when matrices are of Float8 type
-// and assumed to be equal to 1.0 by default. If output matrix type is 16 or
-// 32-bit type, scale_result is not applied. Known limitations:
-//  - Only works if mat1 is row-major and mat2 is column-major
-//  - Only works if matrices sizes are divisible by 32
-//  - If 1-dimensional tensors are used then scale_a should be size =
-//  mat1.size(0)
-//    and scale_b should have size = to mat2.size(1)
-//  Arguments:
-//    - `mat_a`: the first operand of the matrix multiply, can be type
-//    `torch.float8_e4m3fn` or `torch.float8_e5m2`
-//    - `mat_b`: the second operand of the matrix multiply, can be type
-//    `torch.float8_e4m3fn` or `torch.float8_e5m2`
-//    - `scale_a`: a tensor with the inverse scale of `mat1`, whose
-//    shape/strides/dtype depend on the scaling scheme
-//    - `scale_recipe_a`: An integer corresponding to an enum describing the
-//    scaling scheme used for `scale_a`
-//    - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing
-//    the swizzling scheme for `scale_a`.
-//        Not supported for XPU for now.
-//    - `scale_b`: a tensor with the inverse scale of `mat2`, whose
-//    shape/strides/dtype depend on the scaling scheme
-//    - `scale_recipe_b`: An integer corresponding to an enum describing the
-//    scaling scheme used for `scale_b`
-//    - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing
-//    the swizzling scheme for `scale_b`.
-//        Not supported for XPU for now.
-//    - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
-//    - `out_dtype`: the output dtype, can either be a float8 or a higher
-//    precision floating point type
-//    - `contraction_dim`: describe which dimensions are `K` in the matmul.
-//       Not supported for XPU. Should always be empty.
-//    - `use_fast_accum`: Not supported for XPU, should always be false.
-//    - `out`: a reference to the output tensor
-Tensor& _scaled_mm_xpu_v2_out(
-    const Tensor& mat_a,
-    const Tensor& mat_b,
-    ArrayRef<Tensor> scale_a,
-    IntArrayRef scale_recipe_a,
-    IntArrayRef swizzle_a,
-    ArrayRef<Tensor> scale_b,
-    IntArrayRef scale_recipe_b,
-    IntArrayRef swizzle_b,
-    const std::optional<Tensor>& bias,
-    const std::optional<c10::ScalarType> out_dtype,
-    IntArrayRef contraction_dim,
-    bool use_fast_accum,
-    Tensor& out) {
-  TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix");
-  TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix");
-
-  // If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm
-  // kernels do not support this case).
-  if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) {
-    // `out` was created with `at::empty`. In the case where we are multiplying
-    // MxK by KxN and K is the zero dim, we need to initialize here to properly
-    // return a tensor of zeros.
-    at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
-    if (mat_a.size(1) == 0) {
-      out.zero_();
-    }
-
-    return out;
-  }
-
-  // Note: The `contraction_dim` is not actually used for now. We will need to
-  // align this code when upstreamed CUDA code is done. Currently, only keeps
-  // the code here for check.
-
-  // Check if the input matrix sizes can be multiplied
-  // - if optional contraction dims are provided, use those
-  //   -- mostly for < 1B formats (i.e. nvfp4x2) where cheap .t() is not
-  //   available.
-  if (contraction_dim.size() > 0) {
-    TORCH_CHECK_VALUE(
-        contraction_dim.size() == 2,
-        "contraction_dim must have exactly 2 elements");
-    auto mat_a_dim = contraction_dim[0];
-    auto mat_b_dim = contraction_dim[1];
-    TORCH_CHECK_VALUE(
-        mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim),
-        "mat_a and mat_b shapes cannot be multiplied (",
-        mat_a.size(0),
-        "x",
-        mat_a.size(1),
-        " and ",
-        mat_b.size(0),
-        "x",
-        mat_b.size(1),
-        ") ",
-        "with contraction dims mat_a: ",
-        mat_a_dim,
-        ", mat_b: ",
-        mat_b_dim);
-  } else {
-    TORCH_CHECK_VALUE(
-        mat_a.size(1) == mat_b.size(0),
-        "mat_a and mat_b shapes cannot be multiplied (",
-        mat_a.size(0),
-        "x",
-        mat_a.size(1),
-        " and ",
-        mat_b.size(0),
-        "x",
-        mat_b.size(1),
-        ")");
-  }
-
-  TORCH_CHECK_VALUE(
-      !bias || bias->numel() == mat_b.sizes()[1],
-      "Bias must be size ",
-      mat_b.sizes()[1],
-      " but got ",
-      bias->numel());
-
-  TORCH_CHECK_VALUE(
-      !out_dtype || *out_dtype == out.scalar_type(),
-      "out_dtype must match output matrix type");
-
-  if (bias) {
-    TORCH_CHECK_VALUE(
-        bias->scalar_type() == kFloat ||
-            bias->scalar_type() == c10::ScalarType::BFloat16 ||
-            bias->scalar_type() == c10::ScalarType::Half,
-        "Bias must be Float32 or BFloat16 or Half, but got ",
-        bias->scalar_type());
-  }
-  {
-    auto bias_ = bias.value_or(Tensor());
-    // NOLINTNEXTLINE(*c-array*)
-    TensorArg targs[]{
-        {out, "out", 0},
-        {mat_a, "mat_a", 1},
-        {mat_b, "mat_b", 2},
-        {bias_, "bias", 3},
-        {scale_a[0], "scale_a", 4},
-        {scale_b[0], "scale_b", 5}};
-    checkAllSameGPU(__func__, targs);
-  }
-  // Align with CUDA's default out to be bf16
-  auto out_dtype_ = out_dtype.value_or(c10::ScalarType::BFloat16);
-
-  // Conversion of implicitly-defined enums to explicit
-  auto scale_recipe_a_enum = convert_int_to_enum<ScalingType>(scale_recipe_a);
-  auto swizzle_a_enum = convert_int_to_enum<SwizzleType>(swizzle_a);
-  auto scale_recipe_b_enum = convert_int_to_enum<ScalingType>(scale_recipe_b);
-  auto swizzle_b_enum = convert_int_to_enum<SwizzleType>(swizzle_b);
-
-  // XPU does not support swizzle for now. So directly return false.
-  TORCH_CHECK_VALUE(
-      swizzle_a_enum[0] == at::blas::SwizzleType::NO_SWIZZLE &&
-          swizzle_b_enum[0] == at::blas::SwizzleType::NO_SWIZZLE,
-      "XPU does not support swizzle yet.");
-
-  // at this point we can start working out what we want to be doing
-  // Try to do as few steps as possible.
-  // NOTE: support is deliberately sparse, can explicitly enumerate all
-  // combinations allowed. Do this via a list of defined (name, acceptance,
-  // concrete_impl) tuples.
-  bool found_impl = false;
-  ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE;
-
-  for (const auto& fn_entry : scale_kernel_dispatch) {
-    const auto [name, accept_fn, scaled_gemm_impl] = fn_entry;
-    bool ok = accept_fn(
-        mat_a.scalar_type(),
-        scale_recipe_a_enum,
-        scale_a,
-        mat_b.scalar_type(),
-        scale_recipe_b_enum,
-        scale_b);
-    if (ok) {
-      gemm_impl = scaled_gemm_impl;
-      found_impl = true;
-      break;
-    }
-  }
-  TORCH_CHECK_VALUE(
-      found_impl,
-      "Invalid scaling configuration.\n"
-      "- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n"
-      "- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (",
-      mat_a.size(0),
-      ", 1) and scale_b should be (1, ",
-      mat_b.size(1),
-      "), and both should be contiguous.\n"
-      "Got mat_a.dtype()=",
-      mat_a.scalar_type(),
-      ", scale_a[0].dtype()=",
-      scale_a[0].scalar_type(),
-      ", scale_a[0].size()=",
-      scale_a[0].sizes(),
-      ", scale_a[0].stride()=",
-      scale_a[0].strides(),
-      ", ",
-      "mat_b.dtype()=",
-      mat_b.scalar_type(),
-      ", scale_b[0].dtype()=",
-      scale_b[0].scalar_type(),
-      ", scale_b[0].size()=",
-      scale_b[0].sizes(),
-      " and scale_b[0].stride()=",
-      scale_b[0].strides());
-
-  at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
-
-  auto bias_ = bias.value_or(Tensor());
-
-  // dispatch to appropriate lower-level calls for error checking & execution
-  if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) {
-    return _scaled_tensorwise_tensorwise(
-        mat_a,
-        mat_b,
-        scale_a[0],
-        scale_b[0],
-        bias,
-        out_dtype_,
-        use_fast_accum,
-        out);
-  } else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) {
-    return _scaled_rowwise_rowwise(
-        mat_a,
-        mat_b,
-        scale_a[0],
-        scale_b[0],
-        bias,
-        out_dtype_,
-        use_fast_accum,
-        out);
-  } else {
-    TORCH_CHECK_VALUE(
-        false, "Invalid state - found an implementation, but not really");
-  }
-}
-
-Tensor _scaled_mm_xpu_v2(
-    const Tensor& mat_a,
-    const Tensor& mat_b,
-    ArrayRef<Tensor> scale_a,
-    IntArrayRef scale_recipe_a,
-    IntArrayRef swizzle_a,
-    ArrayRef<Tensor> scale_b,
-    IntArrayRef scale_recipe_b,
-    IntArrayRef swizzle_b,
-    const std::optional<Tensor>& bias,
-    const std::optional<c10::ScalarType> out_dtype,
-    IntArrayRef contraction_dim,
-    bool use_fast_accum) {
-  const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
-  Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
-
-  return _scaled_mm_xpu_v2_out(
-      mat_a,
-      mat_b,
-      scale_a,
-      scale_recipe_a,
-      swizzle_a,
-      scale_b,
-      scale_recipe_b,
-      swizzle_b,
-      bias,
-      out_dtype,
-      contraction_dim,
-      use_fast_accum,
-      out);
-}
-
 } // namespace at::native
--- a/aten/src/ATen/native/mps/operations/Repeat.mm
+++ b/aten/src/ATen/native/mps/operations/Repeat.mm
@ -91,30 +91,25 @@ static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
 #include <ATen/native/mps/Repeat_metallib.h>
 #endif

-Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
-  TORCH_CHECK(repeat.dim() == 1, "repeat_interleave only accept 1D vector as repeat");
+template <typename index_t>
+void computeRepeatIndices(const index_t* repeat_ptr,
+                          const int64_t* cumsum_ptr,
+                          index_t* result_ptr,
+                          int64_t size,
+                          int64_t result_size) {
+  id<MTLBuffer> repeatBuffer = reinterpret_cast<id<MTLBuffer>>(repeat_ptr);
+  id<MTLBuffer> cumsumBuffer = reinterpret_cast<id<MTLBuffer>>(cumsum_ptr);
+  id<MTLBuffer> resultBuffer = reinterpret_cast<id<MTLBuffer>>(result_ptr);
+  TORCH_CHECK(repeatBuffer && cumsumBuffer && resultBuffer);
+
  std::string scalar_type;
-  if (repeat.scalar_type() == kInt) {
+  if constexpr (std::is_same_v<index_t, int32_t>) {
    scalar_type = "int32_t";
-  } else if (repeat.scalar_type() == kLong) {
+  } else if constexpr (std::is_same_v<index_t, int64_t>) {
    scalar_type = "int64_t";
  } else {
-    TORCH_CHECK(false, "repeats has to be Long or Int tensor");
+    TORCH_CHECK(false, "repeat_interleave: unsupported indexing data type");
  }
-  if (repeat.size(0) == 0) {
-    return at::empty_like(repeat, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
-  }
-  Tensor repeat_ = repeat.contiguous();
-  Tensor cumsum = repeat.cumsum(0);
-  int64_t total = 0;
-  if (output_size.has_value()) {
-    total = output_size.value();
-  } else {
-    total = cumsum[-1].item<int64_t>();
-    TORCH_CHECK((repeat >= 0).all().item<uint8_t>(), "repeats can not be negative");
-  }
-
-  auto result = at::empty({total}, repeat.options());

  MPSStream* mpsStream = getCurrentMPSStream();
  dispatch_sync(mpsStream->queue(), ^() {
@ -126,13 +121,20 @@ Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output
      getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false);

      [computeEncoder setComputePipelineState:pipelineState];
-      mps::mtl_setArgs(computeEncoder, repeat_, cumsum, result, repeat.size(0));
-      mps::mtl_dispatch1DJob(computeEncoder, pipelineState, repeat.size(0));
+      mps::mtl_setArgs(computeEncoder, repeatBuffer, cumsumBuffer, resultBuffer, size);
+      mps::mtl_dispatch1DJob(computeEncoder, pipelineState, size);

      getMPSProfiler().endProfileKernel(pipelineState);
    }
  });
-  return result;
+}
+
+Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
+  Tensor output;
+  AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() {
+    output = repeat_interleave_common<index_t, computeRepeatIndices<index_t>>(repeat, output_size);
+  });
+  return output;
 }

 } // namespace at::native
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -4225,7 +4225,7 @@
    MTIA: mm_out_mtia
    MPS: mm_out_mps
    XPU: mm_out_xpu
-    SparseCPU, SparseCUDA, SparseMPS: _sparse_mm_out
+    SparseCPU, SparseCUDA: _sparse_mm_out
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out

 - func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
--- a/aten/src/ATen/test/CMakeLists.txt
+++ b/aten/src/ATen/test/CMakeLists.txt
@ -61,7 +61,6 @@ list(APPEND ATen_CUDA_TEST_SRCS
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_math_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cub_test.cu
-  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cublas_handle_pool_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_device_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_dlconvertor_test.cpp
--- a/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp
+++ b/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp
@ -1,77 +0,0 @@
-#include <gtest/gtest.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDACachingAllocator.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include <atomic>
-#include <thread>
-#include <vector>
-
-// Test concurrent access to getCurrentCUDABlasHandle and getCUDABlasLtWorkspace
-// to verify that the data race fix is working correctly
-
-TEST(CUDABlasHandlePoolTest, ConcurrentGetAndClearWorkspaces) {
-  if (!at::cuda::is_available()) {
-    return;
-  }
-
-  constexpr int num_accessor_threads = 15;
-  constexpr int num_clear_threads = 5;
-  constexpr int iterations_per_thread = 50;
-
-  std::atomic<bool> stop{false};
-  std::atomic<int> error_count{0};
-  std::vector<std::thread> threads;
-  threads.reserve(num_accessor_threads + num_clear_threads);
-
-  // Launch accessor threads
-  for (int i = 0; i < num_accessor_threads; ++i) {
-    threads.emplace_back([&stop, &error_count]() {
-      try {
-        at::cuda::CUDAGuard device_guard(0);
-
-        while (!stop.load(std::memory_order_relaxed)) {
-          const auto handle = at::cuda::getCurrentCUDABlasHandle();
-          const auto workspace = at::cuda::getCUDABlasLtWorkspace();
-
-          if (handle == nullptr || workspace == nullptr) {
-            error_count++;
-          }
-        }
-      } catch (const std::exception& e) {
-        error_count++;
-      }
-    });
-  }
-
-  // Launch threads that clear workspaces
-  for (int i = 0; i < num_clear_threads; ++i) {
-    threads.emplace_back([&error_count]() {
-      try {
-        for (int j = 0; j < iterations_per_thread; ++j) {
-          at::cuda::clearCublasWorkspaces();
-          std::this_thread::yield();
-        }
-      } catch (const std::exception& e) {
-        error_count++;
-      }
-    });
-  }
-
-  // Let them run for a bit
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));
-  stop.store(true, std::memory_order_relaxed);
-
-  for (auto& thread : threads) {
-    thread.join();
-  }
-
-  EXPECT_EQ(error_count.load(), 0);
-}
-
-int main(int argc, char* argv[]) {
-  ::testing::InitGoogleTest(&argc, argv);
-  c10::cuda::CUDACachingAllocator::init(1);
-  return RUN_ALL_TESTS();
-}
--- a/aten/src/ATen/xpu/XPUScaledBlas.cpp
+++ b/aten/src/ATen/xpu/XPUScaledBlas.cpp
@ -1,122 +0,0 @@
-#include <c10/core/Scalar.h>
-#include <c10/core/ScalarType.h>
-#include <c10/util/Exception.h>
-#include <c10/util/SmallVector.h>
-#include <c10/util/typeid.h>
-#include <cstdint>
-#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
-#include <ATen/BlasBackend.h>
-#include <ATen/Dispatch.h>
-#include <ATen/ExpandUtils.h>
-#include <ATen/OpMathType.h>
-#include <ATen/TensorUtils.h>
-#include <ATen/core/NamedTensor.h>
-#include <ATen/core/Tensor.h>
-#include <ATen/native/GroupedMMUtils.h>
-#include <ATen/native/Resize.h>
-#include <c10/util/MaybeOwned.h>
-
-#include <ATen/ceil_div.h>
-#include <ATen/xpu/XPUScaledBlas.h>
-
-#ifndef AT_PER_OPERATOR_HEADERS
-#include <ATen/Functions.h>
-#include <ATen/NativeFunctions.h>
-#else
-#include <ATen/ops/_addmm_activation_native.h>
-#include <ATen/ops/_efficientzerotensor.h>
-#include <ATen/ops/_scaled_mm_native.h>
-#include <ATen/ops/_unsafe_view_native.h>
-#include <ATen/ops/abs.h>
-#include <ATen/ops/addmm_native.h>
-#include <ATen/ops/addmv_native.h>
-#include <ATen/ops/baddbmm_native.h>
-#include <ATen/ops/bmm_native.h>
-#include <ATen/ops/copy_native.h>
-#include <ATen/ops/dot_native.h>
-#include <ATen/ops/empty.h>
-#include <ATen/ops/empty_strided.h>
-#include <ATen/ops/gelu.h>
-#include <ATen/ops/max.h>
-#include <ATen/ops/mm_native.h>
-#include <ATen/ops/mul.h>
-#include <ATen/ops/ones.h>
-#include <ATen/ops/relu.h>
-#include <ATen/ops/scalar_tensor_native.h>
-#include <ATen/ops/vdot_native.h>
-#endif
-
-using at::blas::ScalingType;
-
-namespace at::native::onednn::scaled {
-
-/**
- * Both inputs must be fp8,
- * Each needs a single scale, {Tensorwise (float)}
- */
-bool check_tensorwise_recipe(
-    c10::ScalarType type_a,
-    std::vector<ScalingType>& recipe_a,
-    ArrayRef<Tensor>& scales_a,
-    c10::ScalarType type_b,
-    std::vector<ScalingType>& recipe_b,
-    ArrayRef<Tensor>& scales_b) {
-  // both types must be fp8
-  if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
-    return false;
-  }
-
-  // 1 scale each, {Tensorwise, float}
-  if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 ||
-      recipe_b.size() != 1) {
-    return false;
-  }
-  // Need {Blockwise_1x32, e8m0} for A & B
-  if (recipe_a[0] != ScalingType::TensorWise)
-    return false;
-  if (scales_a[0].scalar_type() != ScalarType::Float)
-    return false;
-  if (recipe_b[0] != ScalingType::TensorWise)
-    return false;
-  if (scales_b[0].scalar_type() != ScalarType::Float)
-    return false;
-
-  return true;
-}
-
-/**
- * Both inputs must be fp8,
- * Each needs scales, {Rowwise (float)}
- */
-bool check_rowwise_recipe(
-    c10::ScalarType type_a,
-    std::vector<ScalingType>& recipe_a,
-    ArrayRef<Tensor>& scales_a,
-    c10::ScalarType type_b,
-    std::vector<ScalingType>& recipe_b,
-    ArrayRef<Tensor>& scales_b) {
-  // both types must be fp8
-  if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
-    return false;
-  }
-
-  // 1 scale each, {Tensorwise, float}
-  if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 ||
-      recipe_b.size() != 1) {
-    return false;
-  }
-
-  // Need {RowWise, dp32} for A & B
-  if (recipe_a[0] != ScalingType::RowWise)
-    return false;
-  if (scales_a[0].scalar_type() != ScalarType::Float)
-    return false;
-  if (recipe_b[0] != ScalingType::RowWise)
-    return false;
-  if (scales_b[0].scalar_type() != ScalarType::Float)
-    return false;
-
-  return true;
-}
-
-} // namespace at::native::onednn::scaled
--- a/aten/src/ATen/xpu/XPUScaledBlas.h
+++ b/aten/src/ATen/xpu/XPUScaledBlas.h
@ -1,95 +0,0 @@
-#include <c10/core/Scalar.h>
-#include <c10/core/ScalarType.h>
-#include <c10/util/Exception.h>
-#include <c10/util/SmallVector.h>
-#include <c10/util/typeid.h>
-#include <cstdint>
-#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
-#include <ATen/Dispatch.h>
-#include <ATen/ExpandUtils.h>
-#include <ATen/OpMathType.h>
-#include <ATen/TensorUtils.h>
-#include <ATen/core/NamedTensor.h>
-#include <ATen/core/Tensor.h>
-#include <ATen/native/Resize.h>
-#include <c10/util/MaybeOwned.h>
-
-#include <ATen/BlasBackend.h>
-#include <ATen/ceil_div.h>
-
-#ifdef USE_FBGEMM_GENAI
-#include <fbgemm_gpu/torch_ops.h>
-#endif
-
-#ifndef AT_PER_OPERATOR_HEADERS
-#include <ATen/Functions.h>
-#include <ATen/NativeFunctions.h>
-#else
-#include <ATen/ops/_addmm_activation_native.h>
-#include <ATen/ops/_efficientzerotensor.h>
-#include <ATen/ops/_scaled_mm_native.h>
-#include <ATen/ops/_unsafe_view_native.h>
-#include <ATen/ops/abs.h>
-#include <ATen/ops/addmm_native.h>
-#include <ATen/ops/addmv_native.h>
-#include <ATen/ops/baddbmm_native.h>
-#include <ATen/ops/bmm_native.h>
-#include <ATen/ops/copy_native.h>
-#include <ATen/ops/dot_native.h>
-#include <ATen/ops/empty.h>
-#include <ATen/ops/empty_strided.h>
-#include <ATen/ops/gelu.h>
-#include <ATen/ops/max.h>
-#include <ATen/ops/mm_native.h>
-#include <ATen/ops/mul.h>
-#include <ATen/ops/ones.h>
-#include <ATen/ops/relu.h>
-#include <ATen/ops/scalar_tensor_native.h>
-#include <ATen/ops/vdot_native.h>
-#endif
-
-using at::blas::ScalingType;
-
-namespace at::native::onednn::scaled {
-
-/**
- * Track concrete implementations available
- */
-enum class ScaledGemmImplementation {
-  NONE = 0,
-  TENSORWISE_TENSORWISE = 1,
-  ROWWISE_ROWWISE = 2,
-};
-
-/**
- * Convert passed int (enum) from python back into a
- * strictly-typed enum
- */
-template <class EnumType, class ArrayType>
-std::vector<EnumType> convert_int_to_enum(ArrayType& v) {
-  std::vector<EnumType> converted;
-  converted.reserve(v.size());
-
-  for (auto vi : v) {
-    converted.push_back(static_cast<EnumType>(vi));
-  }
-  return converted;
-}
-
-bool check_tensorwise_recipe(
-    c10::ScalarType,
-    std::vector<ScalingType>&,
-    ArrayRef<Tensor>&,
-    c10::ScalarType,
-    std::vector<ScalingType>&,
-    ArrayRef<Tensor>&);
-
-bool check_rowwise_recipe(
-    c10::ScalarType,
-    std::vector<ScalingType>&,
-    ArrayRef<Tensor>&,
-    c10::ScalarType,
-    std::vector<ScalingType>&,
-    ArrayRef<Tensor>&);
-
-} // namespace at::native::onednn::scaled
--- a/aten/tools/valgrind.sup
+++ b/aten/tools/valgrind.sup
@ -10,13 +10,6 @@
   ...
 }

-{
-   ignore_empty_generic_uninitialised_conditional_jump
-   Memcheck:Cond
-   fun:_ZN2at6detail13empty_genericEN3c108ArrayRefIlEEPNS1_9AllocatorENS1_14DispatchKeySetENS1_10ScalarTypeESt8optionalINS1_12MemoryFormatEE
-   ...
-}
-
 {
   Cond_cuda
   Memcheck:Cond
--- a/benchmarks/dynamo/check_perf_csv.py
+++ b/benchmarks/dynamo/check_perf_csv.py
@ -9,61 +9,28 @@ def check_perf_csv(filename, threshold, threshold_scale):
    """
    Basic performance checking.
    """
-    try:
-        df = pd.read_csv(filename)
-    except FileNotFoundError:
-        print(f"Error: File {filename} not found")
-        sys.exit(1)

-    effective_threshold = threshold * threshold_scale
-    print(f"Checking {filename} (speedup threshold >= {effective_threshold:.2f}x)\n")
+    df = pd.read_csv(filename)

    failed = []
    for _, row in df.iterrows():
        model_name = row["name"]
-        speedup = float(row["speedup"])
-        abs_latency = float(row["abs_latency"])
-        compilation_latency = float(row["compilation_latency"])
-        compression_ratio = float(row["compression_ratio"])
-        eager_peak_mem = float(row["eager_peak_mem"])
-        dynamo_peak_mem = float(row["dynamo_peak_mem"])
+        speedup = row["speedup"]
+        if speedup < threshold * threshold_scale:
+            failed.append(model_name)

-        perf_summary = f"{model_name:34} speedup={speedup:.3f}x"
-        if pd.notna(abs_latency):
-            perf_summary += f", latency={abs_latency:.1f} ms/iter"
-        if pd.notna(compilation_latency):
-            perf_summary += f", compile={compilation_latency:.3f}s"
-        if pd.notna(compression_ratio):
-            perf_summary += f", mem_ratio={1 / compression_ratio:.2f}x"
-            if pd.notna(eager_peak_mem) and pd.notna(dynamo_peak_mem):
-                perf_summary += (
-                    f" (eager={eager_peak_mem:.1f} GB, dynamo={dynamo_peak_mem:.1f} GB)"
-                )
-
-        if speedup < effective_threshold:
-            failed.append((model_name, speedup))
-
-        print(perf_summary)
+        print(f"{model_name:34} {speedup}")

    if failed:
        print(
            textwrap.dedent(
                f"""
-                Error {len(failed)} model(s) performance regressed
-                    {" ".join([name for name, _ in failed])}
+                Error {len(failed)} models performance regressed
+                    {" ".join(failed)}
                """
            )
        )
-        for name, sp in sorted(failed, key=lambda x: x[1]):
-            pct_from_target = (sp / effective_threshold - 1.0) * 100.0
-            print(
-                f"  - {name}: {sp:.3f}x (< {effective_threshold:.2f}x; {pct_from_target:.1f}% from target)"
-            )
        sys.exit(1)
-    else:
-        print(
-            f"\nAll {len(df)} model(s) passed threshold check (>= {effective_threshold:.2f}x)"
-        )


 if __name__ == "__main__":
@ -77,7 +44,7 @@ if __name__ == "__main__":
        "-s",
        type=float,
        default=1.0,
-        help="multiply threshold by this value to relax the check",
+        help="multiple threshold by this value to relax the check",
    )
    args = parser.parse_args()
    check_perf_csv(args.file, args.threshold, args.threshold_scale)
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@ -2379,9 +2379,7 @@ class BenchmarkRunner:
                    print(
                        f"Load model outputs from {self.args.compare_model_outputs_with} to compare"
                    )
-                    saved_result = torch.load(
-                        self.args.compare_model_outputs_with, weights_only=False
-                    )
+                    saved_result = torch.load(self.args.compare_model_outputs_with)
                    is_bitwise_same = bitwise_same(saved_result, new_result)
                    if not is_bitwise_same:
                        print(
--- a/benchmarks/operator_benchmark/benchmark_core.py
+++ b/benchmarks/operator_benchmark/benchmark_core.py
@ -266,11 +266,7 @@ class BenchmarkRunner:
                print(
                    f"{mode} Execution Time (us) : {results['reported_run_time_us'][0]:.3f}"
                )
-                print(f"Peak Memory (KB) : {results['peak_memory']}")
-                # Calculate and print memory bandwidth if operator provides memory traffic
-                if results.get('memory_bandwidth_gb_s') is not None:
-                    print(f"Memory Bandwidth (GB/s) : {results['memory_bandwidth_gb_s']:.2f}")
-                print()
+                print(f"Peak Memory (KB) : {results['peak_memory']}\n")

    def _perf_result_to_dict(self, results, test_case):
        """This function is the parallel of _print_perf_result, which instead of
@ -715,15 +711,6 @@ class BenchmarkRunner:
                result_dict = dict()
                result_dict["reported_run_time_us"] = [r[0] for r in results]
                result_dict["peak_memory"] = results[0][1]
-
-                # Calculate memory bandwidth if operator provides memory traffic
-                memory_traffic_bytes = test_case.op_bench.get_memory_traffic_bytes()
-                if memory_traffic_bytes is not None:
-                    execution_time_s = result_dict["reported_run_time_us"][0] / 1e6
-                    result_dict["memory_bandwidth_gb_s"] = memory_traffic_bytes / execution_time_s / 1e9
-                else:
-                    result_dict["memory_bandwidth_gb_s"] = None
-
                self._print_perf_result(results=result_dict, test_case=test_case)

                # output results to csv
--- a/benchmarks/operator_benchmark/benchmark_pytorch.py
+++ b/benchmarks/operator_benchmark/benchmark_pytorch.py
@ -118,54 +118,6 @@ class TorchBenchmarkBase(torch.nn.Module):
        name = (self.module_name() + "_" + "_".join(test_name_str)).replace(" ", "")
        return name

-    def get_memory_traffic_bytes(self):
-        """Return the number of bytes read/written by this operator.
-
-        Override this method in subclasses to enable memory bandwidth calculation.
-        The framework will use this value along with execution time to compute
-        and report memory bandwidth in GB/s.
-
-        This provides automatic calculation for matmul-like operations by
-        inferring dimensions from input tensor shapes:
-        - 2D inputs: (M, N) @ (N, K) → matmul, mm
-        - 3D inputs: (B, M, N) @ (B, N, K) → bmm, baddbmm
-
-        For custom memory patterns, override this method.
-
-        Returns:
-            int or None: Total bytes transferred (reads + writes), or None if not applicable
-        """
-        if not hasattr(self, 'inputs') or not self.inputs:
-            return None
-
-        input_tensors = [v for v in self.inputs.values() if isinstance(v, torch.Tensor)]
-        if len(input_tensors) < 2:
-            return None
-
-        input_a, input_b = input_tensors[0], input_tensors[1]
-
-        if input_a.dim() != input_b.dim() or input_a.dim() not in (2, 3):
-            return None
-
-        bytes_per_element = input_a.element_size()
-
-        if input_a.dim() == 3:
-            B_a, M, N_a = input_a.shape
-            B_b, N_b, K = input_b.shape
-            if B_a != B_b or N_a != N_b:
-                return None
-            B = B_a
-        else:
-            M, N_a = input_a.shape
-            N_b, K = input_b.shape
-            if N_a != N_b:
-                return None
-            B = 1
-
-        N = N_a
-        total_elements = B * (M * N + N * K + M * K)
-        return total_elements * bytes_per_element
-

 class PyTorchOperatorTestCase:
    """This class includes all the information needed to benchmark an operator.
--- a/benchmarks/operator_benchmark/pt/optimizer_test.py
+++ b/benchmarks/operator_benchmark/pt/optimizer_test.py
@ -1,79 +0,0 @@
-import operator_benchmark as op_bench
-
-import torch
-import torch.optim as optim
-
-
-"""Microbenchmarks for optimizer operators."""
-
-
-optimizer_list = op_bench.op_list(
-    attr_names=["op_name", "op_func"],
-    attrs=[
-        ["adamw", optim.AdamW],
-        ["adam", optim.Adam],
-        ["sgd", optim.SGD],
-        ["rmsprop", optim.RMSprop],
-        ["adagrad", optim.Adagrad],
-    ],
-)
-
-optimizer_configs_long = op_bench.cross_product_configs(
-    num_params=[1, 10, 100],
-    param_size=[100000, 1000000, 10000000],
-    device=["cuda"],
-    tags=["long"],
-)
-
-
-class OptimizerBenchmark(op_bench.TorchBenchmarkBase):
-    def init(self, op_func, device, shape=None, num_params=None, param_size=None):
-        if shape is not None:
-            num_params = num_params if num_params is not None else 1
-            self.params = [
-                torch.randn(shape, device=device, requires_grad=True)
-                for _ in range(num_params)
-            ]
-            for param in self.params:
-                param.grad = torch.randn(shape, device=device)
-        else:
-            self.params = [
-                torch.randn(param_size, device=device, requires_grad=True)
-                for _ in range(num_params)
-            ]
-            for param in self.params:
-                param.grad = torch.randn_like(param)
-
-        kwargs = {"momentum": 0.9} if op_func == optim.SGD else {}
-        self.optimizer = op_func(self.params, lr=0.001, **kwargs)
-
-        # Memory traffic calculation for bandwidth
-        self.total_elements = sum(p.numel() for p in self.params)
-        self.bytes_per_element = self.params[0].element_size()
-        # SGD w/ momentum: read(param, grad, momentum) + write(param, momentum) = 5x
-        # Adam/AdamW: read(param, grad, exp_avg, exp_avg_sq) + write(param, exp_avg, exp_avg_sq) = 7x
-        # Adagrad/RMSprop: read(param, grad, state) + write(param, state) = 5x
-        if op_func in (optim.Adam, optim.AdamW):
-            self.memory_multiplier = 7
-        else:
-            self.memory_multiplier = 5
-
-        self.inputs = {"dummy": self.params[0]}
-
-    def forward(self, dummy):
-        self.optimizer.step()
-        for param in self.params:
-            param.grad = torch.randn_like(param)
-        return self.params[0]
-
-    def get_memory_traffic_bytes(self):
-        return self.total_elements * self.bytes_per_element * self.memory_multiplier
-
-
-op_bench.generate_pt_tests_from_op_list(
-    optimizer_list, optimizer_configs_long, OptimizerBenchmark
-)
-
-
-if __name__ == "__main__":
-    op_bench.benchmark_runner.main()
--- a/c10/core/SafePyObject.h
+++ b/c10/core/SafePyObject.h
@ -44,7 +44,7 @@ struct C10_API SafePyObject {
      (*other.pyinterpreter_)->incref(other.data_);
    }
    if (data_ != nullptr) {
-      (*pyinterpreter_)->decref(data_);
+      (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
    }
    data_ = other.data_;
    pyinterpreter_ = other.pyinterpreter_;
@ -53,7 +53,7 @@ struct C10_API SafePyObject {

  ~SafePyObject() {
    if (data_ != nullptr) {
-      (*pyinterpreter_)->decref(data_);
+      (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
    }
  }

--- a/c10/core/ScalarType.h
+++ b/c10/core/ScalarType.h
@ -92,6 +92,13 @@ inline bool isComplexType(ScalarType t) {
      t == ScalarType::ComplexDouble);
 }

+inline bool isQIntType(ScalarType t) {
+  // Don't forget to extend this when adding new QInt types
+  return t == ScalarType::QInt8 || t == ScalarType::QUInt8 ||
+      t == ScalarType::QInt32 || t == ScalarType::QUInt4x2 ||
+      t == ScalarType::QUInt2x4;
+}
+
 inline bool isBitsType(ScalarType t) {
  return t == ScalarType::Bits1x8 || t == ScalarType::Bits2x4 ||
      t == ScalarType::Bits4x2 || t == ScalarType::Bits8 ||
--- a/c10/core/StorageImpl.cpp
+++ b/c10/core/StorageImpl.cpp
@ -48,30 +48,6 @@ void warnDeprecatedDataPtr() {
  TORCH_CHECK(false, "Cannot access data pointer of Storage that is invalid.");
 }

-void StorageImpl::incref_pyobject() const {
-  // Because intrusive_ptr incref uses relaxed memory order, we need to
-  // do an acquire fence to ensure that the kHasPyObject bit was
-  // observed before the load of the PyObject* below.
-  // NB: This is a no-op on x86/x86-64
-  std::atomic_thread_fence(std::memory_order_acquire);
-
-  PyObject* obj = pyobj_slot_.load_pyobj();
-  (*pyobj_slot_.pyobj_interpreter())->incref(obj);
-}
-
-void StorageImpl::decref_pyobject() const {
-  PyObject* obj = pyobj_slot_.load_pyobj();
-  (*pyobj_slot_.pyobj_interpreter())->decref(obj);
-}
-
-bool StorageImpl::try_incref_pyobject() const {
-  c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
-  if (C10_UNLIKELY(!interp)) {
-    return false;
-  }
-  return (*interp)->try_incref(pyobj_slot_);
-}
-
 void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr) {
  // Allowlist verification.
  // Only if the devicetype is in the allowlist,
--- a/c10/core/StorageImpl.h
+++ b/c10/core/StorageImpl.h
@ -105,12 +105,6 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target {
    data_ptr_.clear();
  }

-  void incref_pyobject() const override final;
-
-  void decref_pyobject() const override final;
-
-  bool try_incref_pyobject() const override final;
-
  size_t nbytes() const {
    // OK to do this instead of maybe_as_int as nbytes is guaranteed positive
    TORCH_CHECK(!size_bytes_is_heap_allocated_);
@ -376,18 +370,4 @@ C10_API c10::intrusive_ptr<c10::StorageImpl> make_storage_impl(
    bool resizable,
    std::optional<at::Device> device_opt);

-namespace detail {
-
-#ifndef C10_MOBILE
-template <class T>
-struct TargetTraits<
-    T,
-    std::enable_if_t<
-        std::is_base_of_v<c10::StorageImpl, std::remove_cv_t<T>>>> {
-  static constexpr bool can_have_pyobject = true;
-};
-#endif
-
-} // namespace detail
-
 } // namespace c10
--- a/c10/core/TensorImpl.cpp
+++ b/c10/core/TensorImpl.cpp
@ -277,6 +277,7 @@ void TensorImpl::release_resources() {
  if (storage_) {
    storage_ = {};
  }
+  pyobj_slot_.maybe_destroy_pyobj();
 }

 #ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY
@ -988,30 +989,6 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) {
  }
 }

-void TensorImpl::incref_pyobject() const {
-  // Because intrusive_ptr incref uses relaxed memory order, we need to
-  // do an acquire fence to ensure that the kHasPyObject bit was
-  // observed before the load of the PyObject* below.
-  // NB: This is a no-op on x86/x86-64
-  std::atomic_thread_fence(std::memory_order_acquire);
-
-  PyObject* obj = pyobj_slot_.load_pyobj();
-  (*pyobj_slot_.pyobj_interpreter())->incref(obj);
-}
-
-void TensorImpl::decref_pyobject() const {
-  PyObject* obj = pyobj_slot_.load_pyobj();
-  (*pyobj_slot_.pyobj_interpreter())->decref(obj);
-}
-
-bool TensorImpl::try_incref_pyobject() const {
-  c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
-  if (C10_UNLIKELY(!interp)) {
-    return false;
-  }
-  return (*interp)->try_incref(pyobj_slot_);
-}
-
 namespace impl {

 namespace {
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@ -2178,12 +2178,6 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
    return &pyobj_slot_;
  }

-  void incref_pyobject() const override final;
-
-  void decref_pyobject() const override final;
-
-  bool try_incref_pyobject() const override final;
-
 private:
  // See NOTE [std::optional operator usage in CUDA]
  // We probably don't want to expose this publicly until
@ -3085,19 +3079,6 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
  friend class C10_TensorImpl_Size_Check_Dummy_Class;
 };

-namespace detail {
-
-#ifndef C10_MOBILE
-template <class T>
-struct TargetTraits<
-    T,
-    std::enable_if_t<std::is_base_of_v<c10::TensorImpl, std::remove_cv_t<T>>>> {
-  static constexpr bool can_have_pyobject = true;
-};
-#endif
-
-} // namespace detail
-
 // Note [TensorImpl size constraints]
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 // Changed the size of TensorImpl?  If the size went down, good for
--- a/c10/core/impl/PyInterpreter.cpp
+++ b/c10/core/impl/PyInterpreter.cpp
@ -11,11 +11,8 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {

  void incref(PyObject* pyobj) const override {} // do nothing

-  void decref(PyObject* pyobj) const override {} // do nothing
-
-  bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const override {
-    return false;
-  }
+  void decref(PyObject* pyobj, bool has_pyobj_slot) const override {
+  } // do nothing

 #define PANIC(m)              \
  TORCH_INTERNAL_ASSERT(      \
@ -23,10 +20,6 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
      "attempted to call " #m \
      " on a Tensor with nontrivial PyObject after corresponding interpreter died")

-  size_t refcnt(PyObject* pyobj) const override {
-    PANIC(refcnt);
-  }
-
  c10::intrusive_ptr<TensorImpl> detach(const TensorImpl* self) const override {
    PANIC(detach);
  }
--- a/c10/core/impl/PyInterpreter.h
+++ b/c10/core/impl/PyInterpreter.h
@ -18,9 +18,6 @@ namespace c10 {
 struct IValue;
 class OperatorHandle;
 struct TensorImpl;
-namespace impl {
-struct PyObjectSlot;
-} // namespace impl
 } // namespace c10

 namespace torch::jit {
@ -129,12 +126,9 @@ struct C10_API PyInterpreterVTable {

  // Run Py_INCREF on a PyObject.
  virtual void incref(PyObject* pyobj) const = 0;
-  // Run Py_DECREF on a PyObject.  We DO NOT assume the GIL is held on call.
-  virtual void decref(PyObject* pyobj) const = 0;
-  // Run PyUnstable_TryIncRef on a PyObject if it's not NULL.
-  virtual bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const = 0;
-  // Run Py_REFCNT on a PyObject.
-  virtual size_t refcnt(PyObject* pyobj) const = 0;
+  // Run Py_DECREF on a PyObject.  We DO NOT assume the GIL is held on call
+  // See NOTE [PyInterpreter::decref takes a `has_pyobj_slot` arg]
+  virtual void decref(PyObject* pyobj, bool has_pyobj_slot) const = 0;

  // Perform a detach by deferring to the __torch_dispatch__ implementation of
  // detach, which will also arrange for the PyObject to get copied in this
--- a/c10/core/impl/PyObjectSlot.cpp
+++ b/c10/core/impl/PyObjectSlot.cpp
@ -0,0 +1,56 @@
+#include <c10/core/impl/PyObjectSlot.h>
+
+namespace c10::impl {
+
+PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
+
+PyObjectSlot::~PyObjectSlot() {
+  maybe_destroy_pyobj();
+}
+
+void PyObjectSlot::maybe_destroy_pyobj() {
+  if (owns_pyobj()) {
+    TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
+    TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
+    (*pyobj_interpreter_.load(std::memory_order_acquire))
+        ->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
+    // NB: this destructor can only be entered when there are no
+    // references to this C++ object (obviously), NOR any references
+    // to the PyObject (if there are references to the PyObject,
+    // then the PyObject holds an owning reference to the tensor).
+    // So it is OK to clear pyobj_ here as it is impossible for it to
+    // be used again (modulo weak reference races)
+    pyobj_ = nullptr; // for safety
+  }
+}
+
+PyInterpreter* PyObjectSlot::pyobj_interpreter() {
+  return pyobj_interpreter_.load(std::memory_order_acquire);
+}
+
+PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
+  // NOLINTNEXTLINE(performance-no-int-to-ptr)
+  return reinterpret_cast<PyObject*>(
+      reinterpret_cast<uintptr_t>(pyobj_) & ~0x1ULL);
+}
+
+PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
+  auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
+  if (interpreter) {
+    return *interpreter;
+  }
+  TORCH_CHECK(false, "cannot access PyObject for Tensor - no interpreter set");
+}
+
+bool PyObjectSlot::owns_pyobj() {
+  // NOLINTNEXTLINE(performance-no-int-to-ptr)
+  return reinterpret_cast<uintptr_t>(pyobj_) & 1;
+}
+
+void PyObjectSlot::set_owns_pyobj(bool b) {
+  // NOLINTNEXTLINE(performance-no-int-to-ptr)
+  pyobj_ = reinterpret_cast<PyObject*>(
+      reinterpret_cast<uintptr_t>(_unchecked_untagged_pyobj()) | b);
+}
+
+} // namespace c10::impl
--- a/c10/core/impl/PyObjectSlot.h
+++ b/c10/core/impl/PyObjectSlot.h
@ -8,58 +8,117 @@

 #include <atomic>

-namespace torch::utils {
-class PyObjectPreservation;
-}
-
 namespace c10::impl {

 struct C10_API PyObjectSlot {
 public:
-  PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
+  PyObjectSlot();
+
+  ~PyObjectSlot();
+
+  void maybe_destroy_pyobj();
+
+  // Associate the TensorImpl with the specified PyObject, and, if necessary,
+  // also tag the interpreter.
+  //
+  // NB: This lives in a header so that we can inline away the switch on status
+  //
+  // NB: THIS FUNCTION CAN RAISE AN EXCEPTION.  Make sure to clean up after
+  // PyObject if necessary!
+  void init_pyobj(PyObject* pyobj) {
+    pyobj_interpreter_.store(
+        getGlobalPyInterpreter(), std::memory_order_relaxed);
+    pyobj_ = pyobj;
+  }

  // Query the PyObject interpreter.  This may return null if there is no
-  // interpreter.
-  PyInterpreter* pyobj_interpreter() const {
-    return pyobj_interpreter_.load(std::memory_order_acquire);
+  // interpreter.  This is racy!
+  PyInterpreter* pyobj_interpreter();
+
+  PyObject* _unchecked_untagged_pyobj() const;
+
+  // Test the interpreter tag.  If tagged for the current interpreter, return
+  // a non-nullopt (but possibly null) PyObject.  If (possibly) untagged,
+  // returns a nullopt.  If it is definitely invalid, raises an error.
+  //
+  // If `ignore_hermetic_tls` is false and this function is called from a
+  // hermetic context (ie, `HermeticPyObjectTLS::get_state()` is true), then
+  // nullopt is returned. If `ignore_hermetic_tls` is true, then the hermetic
+  // context is ignored, allowing you to check the interpreter tag of a
+  // nonhermetic PyObject from within a hermetic context. This is necessary
+  // because there are some cases where the deallocator function of a
+  // nonhermetic PyObject is called from within a hermetic context, so it must
+  // be properly treated as a nonhermetic PyObject.
+  //
+  // NB: this lives in header so that we can avoid actually creating the
+  // std::optional
+
+  // @todo alban: I'm not too sure what's going on here, we can probably delete
+  // it but it's worthwhile making sure
+  std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
+    impl::PyInterpreter* interpreter =
+        pyobj_interpreter_.load(std::memory_order_acquire);
+    if (interpreter == nullptr) {
+      return std::nullopt;
+    }
+
+    if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
+      return std::nullopt;
+    } else {
+      return _unchecked_untagged_pyobj();
+    }
  }

-  PyInterpreter& load_pyobj_interpreter() const {
-    auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
-    TORCH_INTERNAL_ASSERT(
-        interpreter, "cannot access PyObject for Tensor - no interpreter set");
-    return *interpreter;
-  }
+  PyInterpreter& load_pyobj_interpreter() const;

-  PyObject* load_pyobj() const {
-    return pyobj_.load(std::memory_order_acquire);
-  }
+  bool owns_pyobj();

-  void store_pyobj(PyObject* obj) {
-    pyobj_.store(obj, std::memory_order_release);
-  }
-
-  bool has_unique_reference() const {
-    PyObject* pyobj = load_pyobj();
-    return pyobj != nullptr && load_pyobj_interpreter()->refcnt(pyobj) == 1;
-  }
-
-  void clear() {
-    pyobj_.store(nullptr, std::memory_order_relaxed);
-    pyobj_interpreter_.store(nullptr, std::memory_order_relaxed);
-  }
+  void set_owns_pyobj(bool b);

 private:
-  // This is now always the global interpreter if the PyObject is set.
-  // Maybe we can remove this field some day...
+  // This field contains the interpreter tag for this object.  See
+  // Note [Python interpreter tag] for general context
+  //
+  // Note [Memory ordering on Python interpreter tag]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // What memory_order do we need when accessing this atomic?  We don't
+  // need a single total modification order (as provided by
+  // memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
+  // transition from -1 to some positive integer and never changes afterwards.
+  // Because there is only one modification, it trivially already has a total
+  // modification order (e.g., we don't need fences or locked instructions on
+  // x86)
+  //
+  // In fact, one could make a reasonable argument that relaxed reads are OK,
+  // due to the presence of external locking (GIL) to ensure that interactions
+  // with other data structures are still correctly synchronized, so that
+  // we fall in the "Single-Location Data Structures" case as described in
+  // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
+  // However, on x86, it doesn't matter if I use acquire or relaxed on the load
+  // as I get the same assembly in both cases.  So I just use the more
+  // conservative acquire (which will impede compiler optimizations but I don't
+  // care)
  std::atomic<PyInterpreter*> pyobj_interpreter_;

-  // The PyObject representing this Tensor or nullptr. Ownership is managed
-  // by intrusive_ptr. By the time the PyObjectSlot is destroyed, this
-  // reference is already dead.
-  std::atomic<PyObject*> pyobj_;
-
-  friend class torch::utils::PyObjectPreservation;
+  // This field contains a reference to a PyObject representing this Tensor.
+  // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
+  // PyObject for it and set this field.  This field does not have to be
+  // protected by an atomic as it is only allowed to be accessed when you hold
+  // the GIL, or during destruction of the tensor.
+  //
+  // When a PyObject dies, you are obligated to clear this field
+  // (otherwise, you will try to use-after-free the pyobj); this currently
+  // occurs in THPVariable_clear in torch/csrc/autograd/python_variable.cpp
+  //
+  // NB: Ordinarily, this should not be a strong reference, as if the
+  // PyObject owns the Tensor, this would create a reference cycle.
+  // However, sometimes this ownership flips.  To track who owns
+  // who, this has a single pointer tag indicating whether or not the
+  // C++ object owns the PyObject (the common case, zero, means PyObject
+  // owns the C++ object); see _unchecked_untagged_pyobj for raw access
+  // or check_pyobj for checked access.  See references to PyObject
+  // resurrection in torch/csrc/autograd/python_variable.cpp
+  PyObject* pyobj_;
 };

 } // namespace c10::impl
--- a/c10/cuda/driver_api.h
+++ b/c10/cuda/driver_api.h
@ -20,22 +20,6 @@
    }                                                                      \
  } while (0)

-#define C10_CUDA_DRIVER_CHECK_GOTO(EXPR, NEXT)                             \
-  do {                                                                     \
-    CUresult __err = EXPR;                                                 \
-    if (__err != CUDA_SUCCESS) {                                           \
-      const char* err_str;                                                 \
-      CUresult get_error_str_err [[maybe_unused]] =                        \
-          c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
-      if (get_error_str_err != CUDA_SUCCESS) {                             \
-        TORCH_WARN("CUDA driver error: unknown error");                    \
-      } else {                                                             \
-        TORCH_WARN("CUDA driver error: ", err_str);                        \
-      }                                                                    \
-      goto NEXT;                                                           \
-    }                                                                      \
-  } while (0)
-
 // The integer in the second column specifies the requested CUDA Driver API
 // version. The dynamic loader will accept a driver with a newer version, but it
 // ensures that the requested symbol exists in *at least* the specified version
--- a/c10/util/Exception.cpp
+++ b/c10/util/Exception.cpp
@ -1,4 +1,5 @@
 #include <c10/util/Exception.h>
+#include <c10/util/FileSystem.h>
 #include <c10/util/Logging.h>
 #include <c10/util/Type.h>

@ -27,7 +28,7 @@ Error::Error(
    const void* caller)
    : Error(
          str("[enforce fail at ",
-              detail::StripBasename(file),
+              c10::filesystem::path(file).filename(),
              ":",
              line,
              "] ",
--- a/c10/util/Logging.cpp
+++ b/c10/util/Logging.cpp
@ -1,4 +1,5 @@
 #include <c10/util/Backtrace.h>
+#include <c10/util/FileSystem.h>
 #include <c10/util/Flags.h>
 #include <c10/util/Lazy.h>
 #include <c10/util/Logging.h>
@ -478,8 +479,7 @@ MessageLogger::MessageLogger(
          << std::setfill('0') << ' ' << std::setw(2) << timeinfo->tm_hour
          << ':' << std::setw(2) << timeinfo->tm_min << ':' << std::setw(2)
          << timeinfo->tm_sec << '.' << std::setw(9) << ns << ' '
-          << c10::detail::StripBasename(std::string(file)) << ':' << line
-          << "] ";
+          << c10::filesystem::path(file).filename() << ':' << line << "] ";
 }

 // Output the contents of the stream to the proper channel on destruction.
--- a/c10/util/intrusive_ptr.h
+++ b/c10/util/intrusive_ptr.h
@ -12,10 +12,6 @@ template <typename, typename...>
 class class_;
 }

-namespace torch::utils {
-class PyObjectPreservation;
-}
-
 namespace c10 {
 class intrusive_ptr_target;
 namespace raw {
@ -37,8 +33,6 @@ constexpr uint64_t kImpracticallyHugeWeakReferenceCount =
 constexpr uint64_t kReferenceCountOne = 1;
 constexpr uint64_t kWeakReferenceCountOne = (kReferenceCountOne << 32);
 constexpr uint64_t kUniqueRef = (kReferenceCountOne | kWeakReferenceCountOne);
-// Indicates whether the object has a PyObject wrapper.
-constexpr uint64_t kHasPyObject = (uint64_t(1) << 63);

 template <class TTarget>
 struct intrusive_target_default_null_type final {
@ -61,11 +55,7 @@ inline uint32_t refcount(uint64_t combined_refcount) {
 }

 inline uint32_t weakcount(uint64_t combined_refcount) {
-  return static_cast<uint32_t>((combined_refcount & ~kHasPyObject) >> 32);
-}
-
-inline bool has_pyobject(uint64_t combined_refcount) {
-  return (combined_refcount & kHasPyObject) != 0;
+  return static_cast<uint32_t>(combined_refcount >> 32);
 }

 // The only requirement for refcount increment is that it happens-before
@ -76,6 +66,12 @@ inline uint64_t atomic_combined_refcount_increment(
  return combined_refcount.fetch_add(inc, std::memory_order_relaxed) + inc;
 }

+inline uint32_t atomic_refcount_increment(
+    std::atomic<uint64_t>& combined_refcount) {
+  return detail::refcount(atomic_combined_refcount_increment(
+      combined_refcount, kReferenceCountOne));
+}
+
 inline uint32_t atomic_weakcount_increment(
    std::atomic<uint64_t>& combined_refcount) {
  return detail::weakcount(atomic_combined_refcount_increment(
@ -103,11 +99,6 @@ inline uint32_t atomic_weakcount_decrement(
      combined_refcount, kWeakReferenceCountOne));
 }

-template <class T, class = void>
-struct TargetTraits {
-  static constexpr bool can_have_pyobject = false;
-};
-
 } // namespace detail

 /**
@ -164,23 +155,6 @@ class C10_API intrusive_ptr_target {
  // we can atomically operate on both at the same time for performance
  // and defined behaviors.
  //
-  // Note [PyObject preservation for Tensor and Storages]
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  // intrusive_ptr has special support for preserving PyObject wrappers
-  // for TensorImpl and StorageImpl. The most significant bit (kHasPyObject) of
-  // the combined_refcount_ is used to indicate whether the object has a
-  // PyObject wrapper.
-  //
-  //   - The PyObject, if it exists, holds a strong reference to the
-  //     intrusive_ptr_target.
-  //
-  //   - When the refcount goes from 1 to 2, we incref the PyObject.
-  //
-  //   - When the refcount goes from 2 to 1, we decref the PyObject.
-  //
-  // In other words, the intrusive_ptr keeps the PyObject alive as long as there
-  // are other C++ references to the intrusive_ptr_target.
-
  mutable std::atomic<uint64_t> combined_refcount_;
  static_assert(sizeof(std::atomic<uint64_t>) == 8);
  static_assert(alignof(std::atomic<uint64_t>) == 8);
@ -198,8 +172,6 @@ class C10_API intrusive_ptr_target {
  template <typename T>
  friend struct ExclusivelyOwnedTensorTraits;

-  friend class torch::utils::PyObjectPreservation;
-
 protected:
  // protected destructor. We never want to destruct intrusive_ptr_target*
  // directly.
@ -283,16 +255,6 @@ class C10_API intrusive_ptr_target {
   */
  virtual void release_resources() {}

-  /**
-   * These two methods are called when the refcount transitions between one
-   * and two and the object has a PyObject wrapper.
-   */
-  virtual void incref_pyobject() const {}
-  virtual void decref_pyobject() const {}
-  virtual bool try_incref_pyobject() const {
-    return false;
-  }
-
  uint32_t refcount(std::memory_order order = std::memory_order_relaxed) const {
    return detail::refcount(combined_refcount_.load(order));
  }
@ -303,19 +265,6 @@ class C10_API intrusive_ptr_target {
  }
 };

-namespace detail {
-
-#ifndef C10_MOBILE
-template <>
-struct TargetTraits<c10::intrusive_ptr_target> {
-  // A generic intrusive_ptr<intrusive_ptr_target> may actually be a TensorImpl
-  // or StorageImpl, so we have to allow for PyObject support.
-  static constexpr bool can_have_pyobject = true;
-};
-#endif
-
-} // namespace detail
-
 template <class TTarget, class NullType>
 class weak_intrusive_ptr;

@ -365,34 +314,18 @@ class intrusive_ptr final {

  void retain_() {
    if (target_ != NullType::singleton()) {
-      uint64_t combined = detail::atomic_combined_refcount_increment(
-          target_->combined_refcount_, detail::kReferenceCountOne);
-      uint32_t new_refcount = detail::refcount(combined);
+      uint32_t new_refcount =
+          detail::atomic_refcount_increment(target_->combined_refcount_);
      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
          new_refcount != 1,
          "intrusive_ptr: Cannot increase refcount after it reached zero.");
-
-      if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
-        // If the refcount transitioned from 1 to 2, we need to incref the
-        // PyObject. In other words, we need to ensure that the PyObject stays
-        // alive now that we have a C++ reference to this object in addition to
-        // the PyObject itself.
-        if (C10_UNLIKELY(
-                detail::has_pyobject(combined) &&
-                detail::refcount(combined) == 2)) {
-          target_->incref_pyobject();
-        }
-      } else {
-        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-            !detail::has_pyobject(combined),
-            "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
-      }
    }
  }

  void reset_() noexcept {
    if (target_ != NullType::singleton()) {
-      if (is_uniquely_owned()) {
+      if (target_->combined_refcount_.load(std::memory_order_acquire) ==
+          detail::kUniqueRef) {
        // Both counts are 1, so there are no weak references and
        // we are releasing the last strong reference. No other
        // threads can observe the effects of this target_ deletion
@ -404,10 +337,9 @@ class intrusive_ptr final {

      auto combined_refcount = detail::atomic_combined_refcount_decrement(
          target_->combined_refcount_, detail::kReferenceCountOne);
-      uint32_t new_refcount = detail::refcount(combined_refcount);
-      bool has_pyobject = detail::has_pyobject(combined_refcount);
-      if (new_refcount == 0) {
-        bool should_delete = detail::weakcount(combined_refcount) == 1;
+      if (detail::refcount(combined_refcount) == 0) {
+        bool should_delete =
+            (combined_refcount == detail::kWeakReferenceCountOne);
        // See comment above about weakcount. As long as refcount>0,
        // weakcount is one larger than the actual number of weak references.
        // So we need to decrement it here.
@ -424,18 +356,6 @@ class intrusive_ptr final {
        if (should_delete) {
          delete target_;
        }
-      } else if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
-        // If the refcount transitioned from 2 to 1, we need to decref the
-        // PyObject. In other words, we don't want to keep the PyObject alive if
-        // there are no C++ references to this object other than the PyObject
-        // itself.
-        if (C10_UNLIKELY(has_pyobject && new_refcount == 1)) {
-          target_->decref_pyobject();
-        }
-      } else {
-        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-            !has_pyobject,
-            "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
      }
    }
  }
@ -602,16 +522,6 @@ class intrusive_ptr final {
    return use_count() == 1;
  }

-  /**
-   * Stronger than unique() in that it must not have any weakrefs as well.
-   */
-  bool is_uniquely_owned() const noexcept {
-    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(target_ != NullType::singleton());
-    uint64_t combined =
-        target_->combined_refcount_.load(std::memory_order_acquire);
-    return (combined & ~detail::kHasPyObject) == detail::kUniqueRef;
-  }
-
  /**
   * Returns an owning (!) pointer to the underlying object and makes the
   * intrusive_ptr instance invalid. That means the refcount is not decreased.
@ -1022,7 +932,6 @@ class weak_intrusive_ptr final {
    if (target_ == NullType::singleton()) {
      return intrusive_ptr<TTarget, NullType>();
    } else {
-      bool increfed = false;
      auto combined_refcount =
          target_->combined_refcount_.load(std::memory_order_relaxed);
      do {
@ -1031,31 +940,12 @@ class weak_intrusive_ptr final {
          // Return nullptr.
          return intrusive_ptr<TTarget, NullType>();
        }
-        if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
-          if (detail::has_pyobject(combined_refcount) &&
-              detail::refcount(combined_refcount) == 1 && !increfed) {
-            // Object has a python wrapper with no other C++ references.
-            // We need to to incref the Python object before we acquire a
-            // strong reference to the C++ object to avoid a situation
-            // where the Python object is deallocated concurrently.
-            if (!target_->try_incref_pyobject()) {
-              return intrusive_ptr<TTarget, NullType>();
-            }
-            increfed = true;
-          }
-        }
      } while (!target_->combined_refcount_.compare_exchange_weak(
          combined_refcount,
          combined_refcount + detail::kReferenceCountOne,
          std::memory_order_acquire,
          std::memory_order_relaxed));

-      if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
-        if (increfed && detail::refcount(combined_refcount) != 1) {
-          target_->decref_pyobject();
-        }
-      }
-
      return intrusive_ptr<TTarget, NullType>(
          target_, raw::DontIncreaseRefcount{});
    }
@ -1170,18 +1060,7 @@ namespace intrusive_ptr {
 // NullType::singleton to this function
 inline void incref(intrusive_ptr_target* self) {
  if (self) {
-    uint64_t combined = detail::atomic_combined_refcount_increment(
-        self->combined_refcount_, detail::kReferenceCountOne);
-
-#ifndef C10_MOBILE
-    if (C10_UNLIKELY(
-            detail::has_pyobject(combined) &&
-            detail::refcount(combined) == 2)) {
-      self->incref_pyobject();
-    }
-#else
-    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!detail::has_pyobject(combined));
-#endif
+    detail::atomic_refcount_increment(self->combined_refcount_);
  }
 }

--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@ -15,8 +15,6 @@ using namespace c10::CachingDeviceAllocator;
 // newly allocated memory with 512-byte alignment.
 constexpr size_t kDeviceAlignment = 512;

-class XPUAllocator;
-
 namespace {
 using stream_set = ska::flat_hash_set<xpu::XPUStream>;

@ -25,19 +23,14 @@ typedef bool (*Comparison)(const Block*, const Block*);
 bool BlockComparatorSize(const Block* a, const Block* b);
 bool BlockComparatorAddress(const Block* a, const Block* b);

-struct PrivatePool;
-
 struct BlockPool {
-  BlockPool(bool small, PrivatePool* private_pool = nullptr)
+  BlockPool(bool small)
      : blocks(BlockComparatorSize),
        unmapped(BlockComparatorAddress),
-        is_small(small),
-        owner_PrivatePool(private_pool) {}
-
+        is_small(small) {}
  std::set<Block*, Comparison> blocks;
  std::set<Block*, Comparison> unmapped;
  const bool is_small;
-  PrivatePool* owner_PrivatePool;
 };

 struct ExpandableSegment;
@ -356,43 +349,6 @@ struct AllocParams {
  StatTypes stat_types = {};
 };

-// Internal implementation that manages actual memory blocks.
-// high level MemPool interface wraps PrivatePool via MempoolId.
-struct PrivatePool {
-  PrivatePool(MempoolId_t id, XPUAllocator* allocator = nullptr)
-      : id(std::move(id)),
-        allocator_(allocator),
-        large_blocks(/*small=*/false, this),
-        small_blocks(/*small=*/true, this) {}
-  PrivatePool(const PrivatePool&) = delete;
-  PrivatePool(PrivatePool&&) = delete;
-  PrivatePool& operator=(const PrivatePool&) = delete;
-  PrivatePool& operator=(PrivatePool&&) = delete;
-  ~PrivatePool() = default;
-
-  // default Mempool when no Mempool is specified
-  MempoolId_t id{0, 0};
-  // Number of live graphs using this pool
-  int use_count{1};
-  // Number of unfreed allocations made for this pool. When use_count and
-  // allocation_count drop to zero, we can delete this PrivatePool from
-  // graph_pools.
-  int allocation_count{0};
-  XPUAllocator* allocator_;
-  BlockPool large_blocks;
-  BlockPool small_blocks;
-
- public:
-  XPUAllocator* allocator() {
-    return allocator_;
-  }
-};
-struct MempoolIdHash {
-  std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
-    return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
-  }
-};
-
 } // anonymous namespace

 class DeviceCachingAllocator {
@ -409,13 +365,6 @@ class DeviceCachingAllocator {
  bool set_fraction = false;
  std::vector<ExpandableSegment*> expandable_segments;
  std::vector<c10::DeviceIndex> devices_with_peer_access; // reserved
-  std::vector<std::pair<MempoolId_t, std::function<bool(sycl::queue*)>>>
-      captures_underway;
-  ska::flat_hash_map<MempoolId_t, std::unique_ptr<PrivatePool>, MempoolIdHash>
-      graph_pools;
-  // Pools no longer referenced by any graph.
-  ska::flat_hash_map<MempoolId_t, PrivatePool*, MempoolIdHash>
-      graph_pools_freeable;

  size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) {
    if (!src || src->allocated || src->event_count > 0 ||
@ -514,22 +463,7 @@ class DeviceCachingAllocator {
    }
  }

-  BlockPool& get_pool(size_t size, sycl::queue* queue) {
-    if (C10_UNLIKELY(!captures_underway.empty())) {
-      for (auto& entry : captures_underway) {
-        // lookup for mempool id matching current capture graph
-        if (entry.second(queue)) {
-          auto it1 = graph_pools.find(entry.first);
-          // lookup mempool
-          TORCH_INTERNAL_ASSERT(it1 != graph_pools.end());
-          if (size <= kSmallSize) {
-            return it1->second->small_blocks;
-          } else {
-            return it1->second->large_blocks;
-          }
-        }
-      }
-    }
+  BlockPool& get_pool(size_t size) {
    if (size < kSmallSize) {
      return small_blocks;
    } else {
@ -735,10 +669,6 @@ class DeviceCachingAllocator {
    if (!ptr) {
      return false;
    }
-
-    if (p.pool->owner_PrivatePool) {
-      p.pool->owner_PrivatePool->allocation_count++;
-    }
    p.block = new Block(device, p.queue(), size, p.pool, ptr);
    for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
      stats.reserved_bytes[stat_type].increase(size);
@ -747,14 +677,11 @@ class DeviceCachingAllocator {
    return true;
  }

-  void synchronize_and_free_events(PrivatePool* pool = nullptr) {
+  void synchronize_and_free_events() {
    for (auto& xe : xpu_events) {
      for (auto& e : xe.second) {
        auto event = e.first;
        auto* block = e.second;
-        if (pool && block->pool->owner_PrivatePool != pool) {
-          continue;
-        }
        event.wait();
        block->event_count--;
        if (block->event_count == 0) {
@ -858,13 +785,6 @@ class DeviceCachingAllocator {
    for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
      stats.reserved_bytes[stat_type].decrease(unmapped.size);
    });
-
-    if (block->pool->owner_PrivatePool) {
-      // The Freed block belonged to a XPU graph's PrivatePool.
-      TORCH_INTERNAL_ASSERT(
-          block->pool->owner_PrivatePool->allocation_count > 0);
-      block->pool->owner_PrivatePool->allocation_count--;
-    }
  }

  void release_blocks(BlockPool& pool) {
@ -892,41 +812,13 @@ class DeviceCachingAllocator {
    }
  }

-  bool release_cached_blocks(MempoolId_t mempool_id) {
-    if (mempool_id.first == 0 && mempool_id.second == 0 &&
-        captures_underway.empty()) {
-      synchronize_and_free_events();
-      // See Note [Safe to Free Blocks on BlockPool]
-      c10::xpu::syncStreamsOnDevice(device_index);
+  bool release_cached_blocks() {
+    synchronize_and_free_events();
+    // See Note [Safe to Free Blocks on BlockPool]
+    c10::xpu::syncStreamsOnDevice(device_index);

-      release_blocks(large_blocks);
-      release_blocks(small_blocks);
-    }
-
-    for (auto it = graph_pools_freeable.begin();
-         it != graph_pools_freeable.end();) {
-      if (mempool_id.first != 0 || mempool_id.second != 0) {
-        if (it->first == mempool_id) {
-          // If there is an active mempool, we sync only the events
-          // associated with the pool
-          synchronize_and_free_events(it->second);
-        } else {
-          // otherwise we move on
-          ++it;
-          continue;
-        }
-      }
-      TORCH_INTERNAL_ASSERT(it->second->use_count == 0);
-      release_blocks(it->second->small_blocks);
-      release_blocks(it->second->large_blocks);
-      if (it->second->allocation_count == 0) {
-        auto erase_count = graph_pools.erase(it->first);
-        TORCH_INTERNAL_ASSERT(erase_count == 1);
-        it = graph_pools_freeable.erase(it);
-      } else {
-        ++it;
-      }
-    }
+    release_blocks(large_blocks);
+    release_blocks(small_blocks);
    return true;
  }

@ -1011,30 +903,6 @@ class DeviceCachingAllocator {
    }
  }

-  void create_or_incref_pool(
-      MempoolId_t mempool_id,
-      XPUAllocator* allocator = nullptr) {
-    auto it = graph_pools.find(mempool_id);
-    if (it == graph_pools.end()) {
-      // mempool_id does not reference an existing pool.
-      // Make a new pool for XPU graph capture or memory pool usage.
-      graph_pools.emplace(
-          mempool_id, std::make_unique<PrivatePool>(mempool_id, allocator));
-    } else {
-      // mempool_id references an existing pool, which the current XPU graph
-      // capture will share.
-      TORCH_INTERNAL_ASSERT(it->second->use_count > 0);
-      TORCH_INTERNAL_ASSERT(allocator == nullptr);
-      it->second->use_count++;
-    }
-  }
-
-  PrivatePool* get_private_pool(MempoolId_t mempool_id) {
-    auto it = graph_pools.find(mempool_id);
-    TORCH_INTERNAL_ASSERT(it != graph_pools.end());
-    return it->second.get();
-  }
-
 public:
  DeviceCachingAllocator(DeviceIndex device_index)
      : large_blocks(/* small */ false),
@ -1043,11 +911,9 @@ class DeviceCachingAllocator {

  Block* malloc(DeviceIndex device, size_t orig_size, sycl::queue& queue) {
    std::scoped_lock<std::recursive_mutex> lock(mutex);
-    if (C10_LIKELY(captures_underway.empty())) {
-      process_events();
-    }
+    process_events();
    size_t size = round_size(orig_size);
-    auto& pool = get_pool(size, &queue);
+    auto& pool = get_pool(size);
    const size_t alloc_size = get_allocation_size(size);
    AllocParams params(device, size, &queue, &pool, alloc_size);
    params.stat_types = get_stat_types_for_pool(pool);
@ -1057,7 +923,7 @@ class DeviceCachingAllocator {
    // Can't reuse an existing block, try to get a new one.
    if (!block_found) {
      block_found = alloc_block(params, false) ||
-          (release_cached_blocks({0, 0}) && alloc_block(params, true));
+          (release_cached_blocks() && alloc_block(params, true));
    }
    if (!block_found) {
      const auto& raw_device = c10::xpu::get_raw_device(device);
@ -1150,9 +1016,9 @@ class DeviceCachingAllocator {
    block->stream_uses.insert(stream);
  }

-  void emptyCache(MempoolId_t mempool_id) {
+  void emptyCache() {
    std::scoped_lock<std::recursive_mutex> lock(mutex);
-    release_cached_blocks(mempool_id);
+    release_cached_blocks();
  }

  DeviceStats getStats() {
@ -1306,9 +1172,9 @@ class XPUAllocator : public DeviceAllocator {
    }
  }

-  void emptyCache(MempoolId_t mempool_id) override {
+  void emptyCache(MempoolId_t mempool_id [[maybe_unused]] = {0, 0}) override {
    for (auto& da : device_allocators) {
-      da->emptyCache(mempool_id);
+      da->emptyCache();
    }
  }

@ -1424,8 +1290,8 @@ void init(DeviceIndex device_count) {
  return allocator.init(device_count);
 }

-void emptyCache(MempoolId_t mempool_id) {
-  return allocator.emptyCache(mempool_id);
+void emptyCache() {
+  return allocator.emptyCache();
 }

 void resetPeakStats(DeviceIndex device) {
--- a/c10/xpu/XPUCachingAllocator.h
+++ b/c10/xpu/XPUCachingAllocator.h
@ -10,7 +10,7 @@ C10_XPU_API Allocator* get();

 C10_XPU_API void init(DeviceIndex device_count);

-C10_XPU_API void emptyCache(MempoolId_t mempool_id = {0, 0});
+C10_XPU_API void emptyCache();

 C10_XPU_API void resetPeakStats(DeviceIndex device);

--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@ -1643,8 +1643,6 @@ if(USE_CUDA)
  target_link_libraries(torch_cuda PUBLIC c10_cuda)
  if(TARGET torch::nvtx3)
    target_link_libraries(torch_cuda PRIVATE torch::nvtx3)
-  else()
-    target_link_libraries(torch_cuda PUBLIC torch::nvtoolsext)
  endif()

  target_include_directories(
@ -1741,9 +1739,6 @@ if(BUILD_SHARED_LIBS)
  if(USE_CUDA)
    target_link_libraries(torch_global_deps ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
    target_link_libraries(torch_global_deps torch::cudart)
-    if(TARGET torch::nvtoolsext)
-      target_link_libraries(torch_global_deps torch::nvtoolsext)
-    endif()
  endif()
  install(TARGETS torch_global_deps DESTINATION "${TORCH_INSTALL_LIB_DIR}")
 endif()
--- a/cmake/Codegen.cmake
+++ b/cmake/Codegen.cmake
@ -118,12 +118,6 @@ if(INTERN_BUILD_ATEN_OPS)
            list(APPEND _file_compile_flags "-gencode;arch=compute_120a,code=sm_120a")
          endif()
        endif()
-        # We will need to gate against CUDA version, sm_121a was introduced in CUDA 12.9
-        if("${_arch}" STREQUAL "121a" AND CUDA_VERSION VERSION_GREATER_EQUAL 12.9)
-          if(_existing_arch_flags MATCHES ".*compute_120.*")
-            list(APPEND _file_compile_flags "-gencode;arch=compute_121a,code=sm_121a")
-          endif()
-        endif()
      endforeach()
      list(JOIN _file_compile_flags " " _file_compile_flags)

@ -132,7 +126,7 @@ if(INTERN_BUILD_ATEN_OPS)

    _BUILD_FOR_ADDITIONAL_ARCHS(
      "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/RowwiseScaledMM.cu"
-      "89;90a;100a;103a;120a;121a")
+      "89;90a;100a;103a;120a")
    _BUILD_FOR_ADDITIONAL_ARCHS(
      "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/ScaledGroupMM.cu"
      "90a")
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@ -968,11 +968,8 @@ find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir)
 if(nvtx3_FOUND)
  add_library(torch::nvtx3 INTERFACE IMPORTED)
  target_include_directories(torch::nvtx3 INTERFACE "${nvtx3_dir}")
-  target_compile_definitions(torch::nvtx3 INTERFACE TORCH_CUDA_USE_NVTX3)
 else()
-  message(WARNING "Cannot find NVTX3, find old NVTX instead")
-  add_library(torch::nvtoolsext INTERFACE IMPORTED)
-  set_property(TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES CUDA::nvToolsExt)
+  message(FATAL_ERROR "Cannot find NVTX3!")
 endif()


--- a/cmake/External/aotriton.cmake
+++ b/cmake/External/aotriton.cmake
@ -15,14 +15,12 @@ if(NOT __AOTRITON_INCLUDED)
      "manylinux_2_28"  # rocm6.3
      "manylinux_2_28"  # rocm6.4
      "manylinux_2_28"  # rocm7.0
-      "manylinux_2_28"  # rocm7.1
      )
  set(__AOTRITON_ROCM_LIST
      "rocm6.2"
      "rocm6.3"
      "rocm6.4"
      "rocm7.0"
-      "rocm7.1"
      )
  set(__AOTRITON_CI_COMMIT "972223c501ffc22068bb035ac5d64cf54318d895")
  set(__AOTRITON_SHA256_LIST
@ -30,7 +28,6 @@ if(NOT __AOTRITON_INCLUDED)
      "72a153549ea20707331e8a1f1e3d1b8de2913f9d5af2b900c56235d578b57efe"  # rocm6.3
      "c7f319dd7448cbbbab81889dd8a37d47dbc25ebcbd89760f09e6a0904e556393"  # rocm6.4
      "a2a974e0ad929a5e5827c0f896c59bda4872459cbaf8dd8e0a00407f404491cf"  # rocm7.0
-      "d4eb24c9f1a0cfedb35f9292efb41d16589cf5a4b98c3c0940181bbefc49d722"  # rocm7.1
      )
  set(__AOTRITON_IMAGE_LIST
      "amd-gfx90a"
--- a/cmake/TorchConfig.cmake.in
+++ b/cmake/TorchConfig.cmake.in
@ -132,9 +132,6 @@ if(@USE_CUDA@)
  else()
    set(TORCH_CUDA_LIBRARIES ${CUDA_NVRTC_LIB})
  endif()
-  if(TARGET torch::nvtoolsext)
-    list(APPEND TORCH_CUDA_LIBRARIES torch::nvtoolsext)
-  endif()

  if(@BUILD_SHARED_LIBS@)
    find_library(C10_CUDA_LIBRARY c10_cuda PATHS "${TORCH_INSTALL_PREFIX}/lib")
--- a/test/cpp/aoti_abi_check/test_scalartype.cpp
+++ b/test/cpp/aoti_abi_check/test_scalartype.cpp
@ -101,14 +101,3 @@ TEST(TestScalarType, toUnderlying) {
  AT_FORALL_FLOAT8_TYPES(DEFINE_CHECK);
 #undef DEFINE_CHECK
 }
-
-TEST(TestScalarType, isQIntType) {
-  using torch::headeronly::isQIntType;
-  using torch::headeronly::ScalarType;
-#define DEFINE_CHECK(_, name) EXPECT_TRUE(isQIntType(ScalarType::name));
-  AT_FORALL_QINT_TYPES(DEFINE_CHECK);
-#undef DEFINE_CHECK
-#define DEFINE_CHECK(_, name) EXPECT_FALSE(isQIntType(ScalarType::name));
-  AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CHECK);
-#undef DEFINE_CHECK
-}
--- a/test/cpp/jit/test_custom_operators.cpp
+++ b/test/cpp/jit/test_custom_operators.cpp
@ -15,7 +15,7 @@ namespace jit {
 TEST(CustomOperatorTest, InferredSchema) {
  torch::RegisterOperators reg(
      "foo::bar", [](double a, at::Tensor b) { return a + b; });
-  auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar"));
+  auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar"));
  ASSERT_EQ(ops.size(), 1);

  auto& op = ops.front();
@ -43,7 +43,8 @@ TEST(CustomOperatorTest, ExplicitSchema) {
      "foo::bar_with_schema(float a, Tensor b) -> Tensor",
      [](double a, at::Tensor b) { return a + b; });

-  auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema"));
+  auto& ops =
+      getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema"));
  ASSERT_EQ(ops.size(), 1);

  auto& op = ops.front();
@ -76,7 +77,7 @@ TEST(CustomOperatorTest, ListParameters) {
         torch::List<c10::complex<double>> complexdoubles,
         torch::List<at::Tensor> tensors) { return floats; });

-  auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists"));
+  auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists"));
  ASSERT_EQ(ops.size(), 1);

  auto& op = ops.front();
@ -122,7 +123,7 @@ TEST(CustomOperatorTest, ListParameters2) {
      "foo::lists2(Tensor[] tensors) -> Tensor[]",
      [](torch::List<at::Tensor> tensors) { return tensors; });

-  auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2"));
+  auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2"));
  ASSERT_EQ(ops.size(), 1);

  auto& op = ops.front();
@ -212,7 +213,7 @@ TEST(TestCustomOperator, OperatorGeneratorUndeclared) {
      },
      aliasAnalysisFromSchema())});

-  auto ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist"));
+  auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist"));
  ASSERT_EQ(ops.size(), 0);
 }

@ -231,7 +232,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) {
      },
      aliasAnalysisFromSchema())});

-  auto ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar"));
+  auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar"));
  ASSERT_EQ(ops.size(), 1);

  auto& op = ops.front();
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/get_any_data_ptr.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/get_any_data_ptr.cpp
@ -1,20 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/tensor.h>
-
-using torch::stable::Tensor;
-
-uint64_t get_any_data_ptr(Tensor t, bool mutable_) {
-  if (mutable_) {
-    return reinterpret_cast<uint64_t>(t.mutable_data_ptr());
-  } else {
-    return reinterpret_cast<uint64_t>(t.const_data_ptr());
-  }
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("get_any_data_ptr(Tensor t, bool mutable_) -> int");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("get_any_data_ptr", TORCH_BOX(&get_any_data_ptr));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/get_template_any_data_ptr.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/get_template_any_data_ptr.cpp
@ -1,34 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/tensor.h>
-#include <torch/headeronly/core/ScalarType.h>
-
-using torch::stable::Tensor;
-
-uint64_t get_template_any_data_ptr(Tensor t, torch::headeronly::ScalarType dtype, bool mutable_) {
-#define DEFINE_CASE(T, name)                                            \
-  case torch::headeronly::ScalarType::name: {                           \
-    if (mutable_) {                                                     \
-      return reinterpret_cast<uint64_t>(t.mutable_data_ptr<T>());       \
-    } else {                                                            \
-      return reinterpret_cast<uint64_t>(t.const_data_ptr<T>());         \
-    }                                                                   \
-  }
-  switch (dtype) {
-    // per aten/src/ATen/templates/TensorMethods.cpp:
-    AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE)
-    DEFINE_CASE(uint16_t, UInt16)
-    DEFINE_CASE(uint32_t, UInt32)
-    DEFINE_CASE(uint64_t, UInt64)
-  default:
-      return 0;
-  }
-#undef DEFINE_CASE
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("get_template_any_data_ptr(Tensor t, ScalarType dtype, bool mutable_) -> int");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("get_template_any_data_ptr", TORCH_BOX(&get_template_any_data_ptr));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/make_tensor_clones_and_call_foreach.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/make_tensor_clones_and_call_foreach.cpp
@ -1,41 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/ops.h>
-#include <torch/csrc/stable/tensor.h>
-
-#include <vector>
-
-using torch::stable::Tensor;
-
-// Declare my__foreach_mul (defined in my__foreach_mul.cpp)
-extern std::vector<Tensor> my__foreach_mul(
-    torch::headeronly::HeaderOnlyArrayRef<Tensor> self,
-    torch::headeronly::HeaderOnlyArrayRef<Tensor> other);
-
-// Helper function for cloning
-Tensor my_clone(Tensor t) {
-  return clone(t);
-}
-
-std::vector<Tensor> make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) {
-  // This function tests that my__foreach_mul can take in std::initializer_lists
-  // in addition to std::vectors.
-  Tensor t1_1 = my_clone(t1);
-  Tensor t1_2 = my_clone(t1);
-  Tensor t2_1 = my_clone(t2);
-  Tensor t2_2 = my_clone(t2);
-  return my__foreach_mul({t1_1, t2_1}, {t1_2, t2_2});
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def(
-      "make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(
-    libtorch_agnostic_2_10,
-    CompositeExplicitAutograd,
-    m) {
-  m.impl(
-      "make_tensor_clones_and_call_foreach",
-      TORCH_BOX(&make_tensor_clones_and_call_foreach));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cpu.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cpu.cpp
@ -1,40 +0,0 @@
-// This is duplicated from the libtorch_agnostic_2_9_extension
-// as a negative test for test_version_compatibility.py
-
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/tensor.h>
-#include <torch/csrc/stable/ops.h>
-#include <torch/headeronly/util/Exception.h>
-#include <torch/headeronly/core/ScalarType.h>
-#include <torch/headeronly/core/Dispatch_v2.h>
-#include <torch/headeronly/core/TensorAccessor.h>
-
-#include "tensor_accessor_kernel.h"
-
-using torch::stable::Tensor;
-
-Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
-  STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
-  STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
-  STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
-  STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
-  STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device");
-  Tensor res = new_empty(m, {m.size(0)});
-  THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cpu",
-                  AT_WRAP(([&]() {
-                    auto resa = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(res.data_ptr()), res.sizes().data(), res.strides().data());
-                    auto ma = Accessor_cpu<scalar_t, 2>(reinterpret_cast<scalar_t*>(m.data_ptr()), m.sizes().data(), m.strides().data());
-                    auto va = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(v.data_ptr()), v.sizes().data(), v.strides().data());
-                    mv_tensor_accessor_kernel<Accessor_cpu, scalar_t>(resa, ma, va);
-                  })),
-                  AT_FLOATING_TYPES);
-  return res;
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("mv_tensor_accessor_cpu(Tensor res, Tensor m, Tensor v) -> Tensor");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("mv_tensor_accessor_cpu", TORCH_BOX(&mv_tensor_accessor_cpu));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cuda.cu
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/mv_tensor_accessor_cuda.cu
@ -1,47 +0,0 @@
-// This is duplicated from the libtorch_agnostic_2_9_extension
-// as a negative test for test_version_compatibility.py
-
-#include "tensor_accessor_kernel.h"
-
-#include <cuda_runtime.h>
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/ops.h>
-#include <torch/csrc/stable/tensor.h>
-
-using torch::stable::Tensor;
-
-Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
-  STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
-  STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
-  STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
-  STD_TORCH_CHECK(
-      m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
-  STD_TORCH_CHECK(
-      m.device() == v.device(), "m and v must be on the same device");
-  Tensor res = new_empty(m, {m.size(0)});
-  THO_DISPATCH_V2(
-      m.scalar_type(),
-      "mv_tensor_accessor_cuda",
-      AT_WRAP(([&]() {
-        auto resa = Accessor_cuda<scalar_t, 1>(
-            reinterpret_cast<scalar_t*>(res.data_ptr()),
-            res.sizes().data(),
-            res.strides().data());
-        auto ma = Accessor_cuda<scalar_t, 2>(
-            reinterpret_cast<scalar_t*>(m.data_ptr()),
-            m.sizes().data(),
-            m.strides().data());
-        auto va = Accessor_cuda<scalar_t, 1>(
-            reinterpret_cast<scalar_t*>(v.data_ptr()),
-            v.sizes().data(),
-            v.strides().data());
-        mv_tensor_accessor_kernel<Accessor_cuda, scalar_t>
-            <<<1, 1, 0, 0>>>(resa, ma, va);
-      })),
-      AT_FLOATING_TYPES);
-  return res;
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CUDA, m) {
-  m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul.cpp
@ -1,20 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/tensor.h>
-#include <torch/csrc/inductor/aoti_torch/c/shim.h>
-#include <vector>
-
-using torch::stable::Tensor;
-
-std::vector<Tensor> my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
-  std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
-  aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack.data());
-  return torch::stable::detail::to<std::vector<Tensor>>(stack[0]);
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("my__foreach_mul", TORCH_BOX(&my__foreach_mul));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul_.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my__foreach_mul_.cpp
@ -1,19 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/tensor.h>
-#include <torch/csrc/stable/stableivalue_conversions.h>
-#include <torch/csrc/inductor/aoti_torch/c/shim.h>
-
-using torch::stable::Tensor;
-
-void my__foreach_mul_(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
-  std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
-  aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack.data());
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("my__foreach_mul_", TORCH_BOX(&my__foreach_mul_));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_empty.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_empty.cpp
@ -1,25 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/tensor.h>
-#include <torch/csrc/stable/device.h>
-#include <torch/csrc/stable/ops.h>
-
-#include <optional>
-
-using torch::stable::Tensor;
-
-Tensor my_empty(
-    torch::headeronly::HeaderOnlyArrayRef<int64_t> size,
-    std::optional<torch::headeronly::ScalarType> dtype,
-    std::optional<torch::stable::Device> device,
-    std::optional<bool> pin_memory) {
-  return empty(size, dtype, device, pin_memory);
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def(
-      "my_empty(int[] size, ScalarType? dtype=None, Device? device=None, bool? pin_memory=None) -> Tensor");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("my_empty", TORCH_BOX(&my_empty));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_reshape.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_reshape.cpp
@ -1,17 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/tensor.h>
-#include <torch/csrc/stable/ops.h>
-
-using torch::stable::Tensor;
-
-Tensor my_reshape(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> shape) {
-  return reshape(t, shape);
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("my_reshape(Tensor t, int[] shape) -> Tensor");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("my_reshape", TORCH_BOX(&my_reshape));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_view.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/my_view.cpp
@ -1,20 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/tensor.h>
-#include <torch/csrc/stable/ops.h>
-
-using torch::stable::Tensor;
-
-Tensor my_view(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> size) {
-  return view(t, size);
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("my_view(Tensor t, int[] size) -> Tensor");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(
-    libtorch_agnostic_2_10,
-    CompositeExplicitAutograd,
-    m) {
-  m.impl("my_view", TORCH_BOX(&my_view));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/tensor_accessor_kernel.h
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/tensor_accessor_kernel.h
@ -1,31 +0,0 @@
-// This is duplicated from the libtorch_agnostic_2_9_extension
-// as a negative test for test_version_compatibility.py
-
-#pragma once
-
-#include <torch/headeronly/core/Dispatch_v2.h>
-#include <torch/headeronly/core/TensorAccessor.h>
-
-template <typename T, size_t N>
-using Accessor_cpu = torch::headeronly::HeaderOnlyTensorAccessor<T, N>;
-
-#if defined(__CUDACC__) || defined(__HIPCC__)
-#define MAYBE_GLOBAL __global__
-
-template <typename T, size_t N>
-using Accessor_cuda = torch::headeronly::HeaderOnlyGenericPackedTensorAccessor<T, N, torch::headeronly::RestrictPtrTraits>;
-
-#else
-#define MAYBE_GLOBAL
-#endif
-
-template <template <typename, size_t> class Accessor, typename scalar_t>
-MAYBE_GLOBAL void mv_tensor_accessor_kernel(Accessor<scalar_t, 1> resa, Accessor<scalar_t, 2> ma, Accessor<scalar_t, 1> va) {
-  for (int64_t i = 0; i < resa.size(0); i++) {
-    scalar_t val = 0;
-    for (int64_t j = 0; j < ma.size(1); j++) {
-      val += ma[i][j] * va[j];
-    }
-    resa[i] = val;
-  }
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_constructor.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_constructor.cpp
@ -1,37 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/device.h>
-
-#include <string>
-
-torch::stable::Device test_device_constructor(
-    bool is_cuda,
-    torch::stable::DeviceIndex index,
-    bool use_str) {
-  using torch::stable::Device;
-  using torch::stable::DeviceType;
-
-  if (use_str) {
-    std::string device_str;
-    if (is_cuda) {
-      device_str = "cuda:" + std::to_string(index);
-    } else {
-      device_str = "cpu";
-    }
-    return Device(device_str);
-  } else {
-    if (is_cuda) {
-      return Device(DeviceType::CUDA, index);
-    } else {
-      return Device(DeviceType::CPU);
-    }
-  }
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def(
-      "test_device_constructor(bool is_cuda, DeviceIndex index, bool use_str) -> Device");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("test_device_constructor", TORCH_BOX(&test_device_constructor));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_equality.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_equality.cpp
@ -1,14 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/device.h>
-
-bool test_device_equality(torch::stable::Device d1, torch::stable::Device d2) {
-  return d1 == d2;
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("test_device_equality(Device d1, Device d2) -> bool");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("test_device_equality", TORCH_BOX(&test_device_equality));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_index.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_index.cpp
@ -1,14 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/device.h>
-
-torch::stable::DeviceIndex test_device_index(torch::stable::Device device) {
-  return device.index();
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("test_device_index(Device device) -> DeviceIndex");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("test_device_index", TORCH_BOX(&test_device_index));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_is_cpu.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_is_cpu.cpp
@ -1,14 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/device.h>
-
-bool test_device_is_cpu(torch::stable::Device device) {
-  return device.is_cpu();
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("test_device_is_cpu(Device device) -> bool");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("test_device_is_cpu", TORCH_BOX(&test_device_is_cpu));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_is_cuda.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_is_cuda.cpp
@ -1,14 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/device.h>
-
-bool test_device_is_cuda(torch::stable::Device device) {
-  return device.is_cuda();
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("test_device_is_cuda(Device device) -> bool");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("test_device_is_cuda", TORCH_BOX(&test_device_is_cuda));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_set_index.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_device_set_index.cpp
@ -1,17 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/device.h>
-
-torch::stable::Device test_device_set_index(
-    torch::stable::Device device,
-    torch::stable::DeviceIndex index) {
-  device.set_index(index);
-  return device;
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("test_device_set_index(Device device, DeviceIndex index) -> Device");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("test_device_set_index", TORCH_BOX(&test_device_set_index));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_get_num_threads.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_get_num_threads.cpp
@ -1,14 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/ops.h>
-
-uint32_t test_get_num_threads() {
-  return torch::stable::get_num_threads();
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("test_get_num_threads() -> int");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("test_get_num_threads", TORCH_BOX(&test_get_num_threads));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_parallel_for.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_parallel_for.cpp
@ -1,49 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/tensor.h>
-#include <torch/csrc/stable/ops.h>
-#include <torch/csrc/stable/device.h>
-#include <torch/csrc/inductor/aoti_torch/c/shim.h>
-#include <torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h>
-
-using torch::stable::Tensor;
-
-Tensor test_parallel_for(int64_t size, int64_t grain_size) {
-  AtenTensorHandle tensor_handle;
-  int64_t stride = 1;
-
-  aoti_torch_empty_strided(
-      1,
-      &size,
-      &stride,
-      aoti_torch_dtype_int64(),
-      aoti_torch_device_type_cpu(),
-      0,
-      &tensor_handle);
-
-  Tensor tensor(tensor_handle);
-  int64_t* data_ptr = reinterpret_cast<int64_t*>(tensor.data_ptr());
-
-  torch::stable::zero_(tensor);
-
-  // Use parallel_for to fill each element with its index
-  // If using a parallel path, the thread id is encoded in the upper 32 bits
-  torch::stable::parallel_for(
-      0, size, grain_size, [data_ptr](int64_t begin, int64_t end) {
-        for (auto i = begin; i < end; i++) {
-          STD_TORCH_CHECK(i <= UINT32_MAX);
-          uint32_t thread_id;
-          torch_get_thread_idx(&thread_id);
-          data_ptr[i] = i | (static_cast<int64_t>(thread_id) << 32);
-        }
-      });
-
-  return tensor;
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("test_parallel_for(int size, int grain_size) -> Tensor");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("test_parallel_for", TORCH_BOX(&test_parallel_for));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_tensor_device.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/csrc/test_tensor_device.cpp
@ -1,17 +0,0 @@
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/tensor.h>
-#include <torch/csrc/stable/device.h>
-
-using torch::stable::Tensor;
-
-torch::stable::Device test_tensor_device(torch::stable::Tensor tensor) {
-  return tensor.device();
-}
-
-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
-  m.def("test_tensor_device(Tensor t) -> Device");
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
-  m.impl("test_tensor_device", TORCH_BOX(&test_tensor_device));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/ops.py
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/ops.py
@ -1,225 +0,0 @@
-import torch
-from torch import Tensor
-
-
-def my__foreach_mul_(tensors, others) -> ():
-    """
-    Updates tensors to be the result of pointwise multiplying with others.
-
-    Args:
-        tensors: list of tensors
-        others: list of tensors (with the same corresponding shapes as tensors)
-
-    Returns: nothing, tensors is updated in place.
-    """
-    torch.ops.libtorch_agnostic_2_10.my__foreach_mul_.default(tensors, others)
-
-
-def my__foreach_mul(tensors, others) -> list[Tensor]:
-    """
-    Returns a list of tensors that are the results of pointwise multiplying
-    tensors and others.
-
-    Args:
-        tensors: list of tensors
-        others: list of tensors (with the same corresponding shapes as tensors)
-
-    Returns: list of multiplied tensors
-    """
-    return torch.ops.libtorch_agnostic_2_10.my__foreach_mul.default(tensors, others)
-
-
-def make_tensor_clones_and_call_foreach(t1, t2) -> list[Tensor]:
-    """
-    Returns a list of 2 tensors corresponding to the square of the inputs.
-
-    Args:
-        t1: Tensor
-        t2: Tensor
-
-    Returns: list of [t1^2, t2^2]
-    """
-    return torch.ops.libtorch_agnostic_2_10.make_tensor_clones_and_call_foreach.default(
-        t1, t2
-    )
-
-
-def test_tensor_device(t):
-    """
-    Tests Tensor device() method.
-
-    Args:
-        t: Tensor - tensor to get device from
-
-    Returns: Device - device of the tensor
-    """
-    return torch.ops.libtorch_agnostic_2_10.test_tensor_device.default(t)
-
-
-def test_device_constructor(is_cuda, index, use_str):
-    """
-    Tests creating a Device from DeviceType and index, or from a string.
-
-    Args:
-        is_cuda: bool - if True, creates CUDA device; if False, creates CPU device
-        index: int - device index
-        use_str: bool - if True, constructs from string; if False, constructs from DeviceType
-
-    Returns: Device - A device with the specified type and index
-    """
-    return torch.ops.libtorch_agnostic_2_10.test_device_constructor.default(
-        is_cuda, index, use_str
-    )
-
-
-def test_device_equality(d1, d2) -> bool:
-    """
-    Tests Device equality operator.
-
-    Args:
-        d1: Device - first device
-        d2: Device - second device
-
-    Returns: bool - True if devices are equal
-    """
-    return torch.ops.libtorch_agnostic_2_10.test_device_equality.default(d1, d2)
-
-
-def test_device_set_index(device, index):
-    """
-    Tests Device set_index() method.
-
-    Args:
-        device: Device - device to modify
-        index: int - new device index
-
-    Returns: Device - device with updated index
-    """
-    return torch.ops.libtorch_agnostic_2_10.test_device_set_index.default(device, index)
-
-
-def test_device_index(device) -> int:
-    """
-    Tests Device index() method.
-
-    Args:
-        device: Device - device to query
-
-    Returns: int - device index
-    """
-    return torch.ops.libtorch_agnostic_2_10.test_device_index.default(device)
-
-
-def test_device_is_cuda(device) -> bool:
-    """
-    Tests Device is_cuda() method.
-
-    Args:
-        device: Device - device to check
-
-    Returns: bool - True if device is CUDA
-    """
-    return torch.ops.libtorch_agnostic_2_10.test_device_is_cuda.default(device)
-
-
-def test_device_is_cpu(device) -> bool:
-    """
-    Tests Device is_cpu() method.
-
-    Args:
-        device: Device - device to check
-
-    Returns: bool - True if device is CPU
-    """
-    return torch.ops.libtorch_agnostic_2_10.test_device_is_cpu.default(device)
-
-
-def test_parallel_for(size, grain_size) -> Tensor:
-    """
-    Tests the parallel_for functionality by using it to fill a tensor with indices.
-    Args:
-        size: int - size of the tensor to create
-        grain_size: int - grain size for parallel_for
-    Returns: Tensor - a 1D int64 tensor where each element contains its index
-        (if multiple threads are used the threadid will be encoded in the upper 32 bits)
-    """
-    return torch.ops.libtorch_agnostic_2_10.test_parallel_for.default(size, grain_size)
-
-
-def test_get_num_threads() -> int:
-    """
-    Tests the get_num_threads functionality by returning the number of threads
-    for the parallel backend.
-
-    Returns: int - the number of threads for the parallel backend
-    """
-    return torch.ops.libtorch_agnostic_2_10.test_get_num_threads.default()
-
-
-def my_empty(size, dtype=None, device=None, pin_memory=None) -> Tensor:
-    """
-    Creates an empty tensor with the specified size, dtype, device, and pin_memory.
-
-    Args:
-        size: list[int] - size of the tensor to create
-        dtype: ScalarType or None - data type of the tensor
-        device: Device or None - device on which to create the tensor
-        pin_memory: bool or None - whether to use pinned memory
-
-    Returns: Tensor - an uninitialized tensor with the specified properties
-    """
-    return torch.ops.libtorch_agnostic_2_10.my_empty.default(
-        size, dtype, device, pin_memory
-    )
-
-
-def my_reshape(t, shape) -> Tensor:
-    """
-    Returns a tensor with the same data but different shape.
-
-    Args:
-        t: Tensor - tensor to reshape
-        shape: list[int] - new shape for the tensor
-
-    Returns: Tensor - reshaped tensor
-    """
-    return torch.ops.libtorch_agnostic_2_10.my_reshape.default(t, shape)
-
-
-def my_view(t, size) -> Tensor:
-    """
-    Returns a new tensor with the same data as the input tensor but of a different shape.
-
-    Args:
-        t: Tensor - tensor to view
-        size: list[int] - new size for the tensor
-
-    Returns: Tensor - tensor with new view
-    """
-    return torch.ops.libtorch_agnostic_2_10.my_view.default(t, size)
-
-
-def get_any_data_ptr(t, mutable) -> int:
-    """
-    Return data pointer value of the tensor.
-    Args:
-        t: Input tensor
-        mutable: whether data pointer qualifier is mutable or const
-    Returns: int - pointer value
-    """
-    return torch.ops.libtorch_agnostic_2_10.get_any_data_ptr.default(t, mutable)
-
-
-def get_template_any_data_ptr(t, dtype, mutable) -> int:
-    """
-    Return data pointer value of the tensor iff it has dtype.
-    Args:
-        t: Input tensor
-        dtype: Input dtype
-        mutable: whether data pointer qualifier is mutable or const
-    Returns: int - pointer value
-    Raises RuntimeError when t.dtype() != dtype.
-    """
-    return torch.ops.libtorch_agnostic_2_10.get_template_any_data_ptr.default(
-        t, dtype, mutable
-    )
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/test_version_compatibility.py
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/test_version_compatibility.py
@ -1,308 +0,0 @@
-# Owner(s): ["module: cpp"]
-
-"""
-Unit tests to verify that each function file requires PyTorch 2.10+.
-
-This test suite compiles each .cpp file in the csrc directory with
-TORCH_TARGET_VERSION=2.9.0 and expects compilation to fail.
-If compilation succeeds, it means that either
-
-(1) The test function works with 2.9.0 and should not be in this directory.
-(2) The test function tests APIs that do not have proper TORCH_FEATURE_VERSION
-    guards. If this is the case, and you incorrectly move the test function into
-    libtorch_agnostic_2_9_extension the libtorch_agnostic_targetting CI workflow
-    will catch this.
-
-Run this script with VERSION_COMPAT_DEBUG=1 to see compilation errors.
-"""
-
-import os
-import subprocess
-import tempfile
-from pathlib import Path
-
-from torch.testing._internal.common_utils import IS_WINDOWS, run_tests, TestCase
-from torch.utils.cpp_extension import CUDA_HOME, include_paths as torch_include_paths
-
-
-# TODO: Fix this error in Windows:
-# numba.cuda.cudadrv.driver:driver.py:384 Call to cuInit results in CUDA_ERROR_NO_DEVICE
-if not IS_WINDOWS:
-
-    class FunctionVersionCompatibilityTest(TestCase):
-        """Test that all function files require PyTorch 2.10+."""
-
-        @classmethod
-        def setUpClass(cls):
-            """Set up test environment once for all tests."""
-            cls.csrc_dir = Path(__file__).parent / "libtorch_agnostic_2_10" / "csrc"
-            cls.build_dir = Path(tempfile.mkdtemp(prefix="version_check_"))
-
-            cls.pytorch_includes = [
-                f"-I{path}" for path in torch_include_paths(device_type="cpu")
-            ]
-            cls.cuda_includes = []
-            if CUDA_HOME:
-                cuda_include_path = os.path.join(CUDA_HOME, "include")
-                if os.path.exists(cuda_include_path):
-                    cls.cuda_includes = [f"-I{cuda_include_path}"]
-
-            cls.cuda_available = cls._check_cuda_available()
-
-        @classmethod
-        def tearDownClass(cls):
-            """Clean up build directory."""
-            import shutil
-
-            if cls.build_dir.exists():
-                shutil.rmtree(cls.build_dir)
-
-        @staticmethod
-        def _check_cuda_available() -> bool:
-            """Check if CUDA is available."""
-            try:
-                import torch
-
-                return torch.cuda.is_available()
-            except ImportError:
-                return False
-
-        def _compile_cpp_file(
-            self, source_file: Path, output_file: Path
-        ) -> tuple[bool, str]:
-            """
-            Compile a C++ file with TORCH_TARGET_VERSION=2.9.0.
-            Returns (success, error_message).
-            """
-            torch_version_2_9 = "0x0209000000000000"
-
-            cmd = [
-                "g++",
-                "-c",
-                "-std=c++17",
-                f"-DTORCH_TARGET_VERSION={torch_version_2_9}",
-                f"-I{source_file.parent}",  # For includes in same directory
-                *self.pytorch_includes,
-            ]
-
-            # Add CUDA flags if available
-            if self.cuda_available:
-                cmd.extend(self.cuda_includes)
-
-            cmd.extend([str(source_file), "-o", str(output_file)])
-
-            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
-
-            if result.returncode == 0:
-                return True, ""
-            else:
-                return False, result.stderr
-
-        def _compile_cu_file(
-            self, source_file: Path, output_file: Path
-        ) -> tuple[bool, str]:
-            """
-            Compile a CUDA file with TORCH_TARGET_VERSION=2.9.0.
-            Returns (success, error_message).
-            """
-            if not CUDA_HOME:
-                return False, "CUDA_HOME not set"
-
-            torch_version_2_9 = "0x0209000000000000"
-
-            cmd = [
-                os.path.join(CUDA_HOME, "bin", "nvcc"),
-                "-c",
-                "-std=c++17",
-                f"-DTORCH_TARGET_VERSION={torch_version_2_9}",
-                f"-I{source_file.parent}",  # For includes in same directory
-                *self.pytorch_includes,
-                *self.cuda_includes,
-            ]
-
-            cmd.extend([str(source_file), "-o", str(output_file)])
-
-            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
-
-            if result.returncode == 0:
-                return True, ""
-            else:
-                return False, result.stderr
-
-        def _test_function_file(self, source_file: Path):
-            """Test that a function file fails to compile with TORCH_TARGET_VERSION=2.9.0."""
-            func_name = source_file.stem
-            obj_file = self.build_dir / f"{func_name}.o"
-
-            # Choose the appropriate compiler based on file extension
-            if source_file.suffix == ".cu":
-                if not self.cuda_available:
-                    self.skipTest(f"CUDA not available, skipping {source_file.name}")
-                success, error_msg = self._compile_cu_file(source_file, obj_file)
-            else:
-                success, error_msg = self._compile_cpp_file(source_file, obj_file)
-
-            obj_file.unlink(missing_ok=True)
-
-            # Print error details for debugging
-            if not success:
-                relevant_errors = self._extract_relevant_errors(error_msg)
-                if relevant_errors:
-                    print(f"\n  Compilation errors for {func_name} (requires 2.10+):")
-                    for err in relevant_errors:
-                        print(f"    {err}")
-
-            self.assertFalse(
-                success,
-                f"Function {func_name} compiled successfully with TORCH_TARGET_VERSION=2.9.0. "
-                f"This could mean two things.\n\t1. It should run with 2.9.0 and should be "
-                "moved to libtorch_agnostic_2_9_extension\n\t2. The function(s) it tests do not use the "
-                "proper TORCH_FEATURE_VERSION guards\n\nThe libtorch_agnostic_targetting CI workflow will "
-                "verify if you incorrectly move this to the 2_9 extension instead of adding "
-                "the appropriate version guards.",
-            )
-
-        def test_mv_tensor_accessor_cpu_works_with_2_9(self):
-            """Test that mv_tensor_accessor_cpu.cpp compiles successfully with 2.9.0.
-
-            This is a negative test - it ensures that a file we expect to work with 2.9.0
-            actually does compile. This validates that our test infrastructure correctly
-            distinguishes between files that require 2.10+ and those that don't.
-            """
-            cpp_file = self.csrc_dir / "mv_tensor_accessor_cpu.cpp"
-
-            if not cpp_file.exists():
-                self.skipTest(f"{cpp_file} not found - this is a test file only")
-
-            obj_file = self.build_dir / "mv_tensor_accessor_cpu.o"
-            success, error_msg = self._compile_cpp_file(cpp_file, obj_file)
-
-            # Clean up
-            obj_file.unlink(missing_ok=True)
-
-            if not success:
-                relevant_errors = self._extract_relevant_errors(error_msg)
-                if relevant_errors:
-                    print(
-                        "\n  Unexpected compilation errors for mv_tensor_accessor_cpu:"
-                    )
-                    for err in relevant_errors:
-                        print(f"{err}")
-
-            self.assertTrue(
-                success,
-                f"mv_tensor_accessor_cpu.cpp failed to compile with TORCH_TARGET_VERSION=2.9.0. "
-                f"This file is expected to work with 2.9.0 since it doesn't use 2.10+ features. "
-                f"Error: {error_msg}",
-            )
-
-        def test_mv_tensor_accessor_cuda_works_with_2_9(self):
-            """Test that mv_tensor_accessor_cuda.cu compiles successfully with 2.9.0.
-
-            This is a negative test - it ensures that a .cu file we expect to work with 2.9.0
-            actually does compile. This validates that our test infrastructure correctly
-            compiles CUDA files and distinguishes between files that require 2.10+ and those
-            that don't.
-            """
-            if not self.cuda_available:
-                self.skipTest(
-                    "CUDA not available, skipping mv_tensor_accessor_cuda.cu test"
-                )
-
-            cu_file = self.csrc_dir / "mv_tensor_accessor_cuda.cu"
-
-            if not cu_file.exists():
-                self.skipTest(f"{cu_file} not found - this is a test file only")
-
-            obj_file = self.build_dir / "cuda_kernel.o"
-            success, error_msg = self._compile_cu_file(cu_file, obj_file)
-
-            # Clean up
-            obj_file.unlink(missing_ok=True)
-
-            if not success:
-                relevant_errors = self._extract_relevant_errors(error_msg)
-                if relevant_errors:
-                    print(
-                        "\n  Unexpected compilation errors for mv_tensor_accessor_cuda.cu:"
-                    )
-                    for err in relevant_errors:
-                        print(f"{err}")
-
-            self.assertTrue(
-                success,
-                f"mv_tensor_accessor_cuda.cu failed to compile with TORCH_TARGET_VERSION=2.9.0. "
-                f"This file is expected to work with 2.9.0 since it doesn't use 2.10+ features. "
-                f"Error: {error_msg}",
-            )
-
-        @staticmethod
-        def _extract_relevant_errors(error_msg: str) -> list[str]:
-            """Extract the most relevant error messages."""
-            error_lines = error_msg.strip().split("\n")
-            relevant_errors = []
-
-            for line in error_lines:
-                line_lower = line.lower()
-                if (
-                    "error:" in line_lower
-                    or "undefined" in line_lower
-                    or "undeclared" in line_lower
-                    or "no member named" in line_lower
-                ):
-                    relevant_errors.append(line.strip())
-
-            return relevant_errors
-
-    # Dynamically create test methods for each .cpp and .cu file
-
-    def _create_test_method_for_file(source_file: Path):
-        """Create a test method for a specific source file."""
-
-        def test_method_impl(self):
-            self._test_function_file(source_file)
-
-        # Set a descriptive name and docstring
-        func_name = source_file.stem
-        file_ext = source_file.suffix
-        test_method_impl.__name__ = f"test_{func_name}_requires_2_10"
-        test_method_impl.__doc__ = (
-            f"Test that {func_name}{file_ext} requires PyTorch 2.10+"
-        )
-
-        return test_method_impl
-
-    # Test discovery: generate a test for each .cpp and .cu file
-    _csrc_dir = Path(__file__).parent / "libtorch_agnostic_2_10" / "csrc"
-    if _csrc_dir.exists():
-        # Collect both .cpp and .cu files, excluding those used for negative test
-        # already defined above
-        _source_files = sorted(
-            [
-                f
-                for f in _csrc_dir.rglob("*.cpp")
-                if f.name not in ("mv_tensor_accessor_cpu.cpp",)
-            ]
-            + [
-                f
-                for f in _csrc_dir.rglob("*.cu")
-                if f.name not in ("mv_tensor_accessor_cuda.cu",)
-            ]
-        )
-
-        for _source_file in _source_files:
-            _test_method = _create_test_method_for_file(_source_file)
-            setattr(
-                FunctionVersionCompatibilityTest, _test_method.__name__, _test_method
-            )
-
-        del (
-            _create_test_method_for_file,
-            _csrc_dir,
-            _source_files,
-            _source_file,
-            _test_method,
-        )
-
-if __name__ == "__main__":
-    run_tests()
--- a/test/cpp_extensions/libtorch_agnostic_2_9_extension/libtorch_agnostic_2_9/init.py
+++ b/test/cpp_extensions/libtorch_agnostic_2_9_extension/libtorch_agnostic_2_9/init.py
@ -1,21 +0,0 @@
-import ctypes
-from pathlib import Path
-
-import torch
-
-
-so_files = list(Path(__file__).parent.glob("_C*.so"))
-assert len(so_files) == 1, f"Expected one _C*.so file, found {len(so_files)}"
-
-# use ctypes.CDLL instead of load_library to be able to test the unload logic
-# below code is reduced from the load_library code
-with torch._ops.dl_open_guard():
-    loaded_lib = ctypes.CDLL(so_files[0])
-
-from . import ops
-
-
-__all__ = [
-    "loaded_lib",
-    "ops",
-]
--- a/test/cpp_extensions/libtorch_agnostic_2_9_extension/libtorch_agnostic_2_9/csrc/cuda_kernel.cu
+++ b/test/cpp_extensions/libtorch_agnostic_2_9_extension/libtorch_agnostic_2_9/csrc/cuda_kernel.cu
@ -1,44 +0,0 @@
-#include "kernel.h"
-
-#include <cuda_runtime.h>
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/ops.h>
-#include <torch/csrc/stable/tensor.h>
-
-using torch::stable::Tensor;
-
-Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
-  STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
-  STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
-  STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
-  STD_TORCH_CHECK(
-      m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
-  STD_TORCH_CHECK(
-      m.device() == v.device(), "m and v must be on the same device");
-  Tensor res = new_empty(m, {m.size(0)});
-  THO_DISPATCH_V2(
-      m.scalar_type(),
-      "mv_tensor_accessor_cuda",
-      AT_WRAP(([&]() {
-        auto resa = Accessor_cuda<scalar_t, 1>(
-            reinterpret_cast<scalar_t*>(res.data_ptr()),
-            res.sizes().data(),
-            res.strides().data());
-        auto ma = Accessor_cuda<scalar_t, 2>(
-            reinterpret_cast<scalar_t*>(m.data_ptr()),
-            m.sizes().data(),
-            m.strides().data());
-        auto va = Accessor_cuda<scalar_t, 1>(
-            reinterpret_cast<scalar_t*>(v.data_ptr()),
-            v.sizes().data(),
-            v.strides().data());
-        mv_tensor_accessor_kernel<Accessor_cuda, scalar_t>
-            <<<1, 1, 0, 0>>>(resa, ma, va);
-      })),
-      AT_FLOATING_TYPES);
-  return res;
-}
-
-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CUDA, m) {
-  m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
-}
--- a/test/cpp_extensions/libtorch_agnostic_2_9_extension/libtorch_agnostic_2_9/ops.py
+++ b/test/cpp_extensions/libtorch_agnostic_2_9_extension/libtorch_agnostic_2_9/ops.py
@ -1,363 +0,0 @@
-import torch
-from torch import Tensor
-
-
-def sgd_out_of_place(param, grad, weight_decay, lr, maximize) -> Tensor:
-    """
-    Computes a single step of SGD on a single parameter Tensor with grad.
-
-    Assumes:
-    - param and grad are the same shape and are 1D.
-    - param and grad are float and on CPU
-
-    Args:
-        param: a 1D tensor of floats
-        grad: a 1D tensor of floats
-        weight_decay: a python double between 0 and 1
-        lr: a python double
-
-    Returns:
-        a 1D float Tensor the same shape as param
-
-    """
-    return torch.ops.libtorch_agnostic_2_9.sgd_out_of_place.default(
-        param, grad, weight_decay, lr, maximize
-    )
-
-
-def identity(t) -> Tensor:
-    """
-    Returns the input tensor
-
-    Args:
-        t: any Tensor
-
-    Returns:
-        a Tensor, the same as input.
-    """
-    return torch.ops.libtorch_agnostic_2_9.identity.default(t)
-
-
-def my_abs(t) -> Tensor:
-    """
-    Returns abs on the input tensor, outputs a new Tensor
-
-    Args:
-        t: any Tensor
-
-    Returns:
-        a Tensor
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_abs.default(t)
-
-
-def my_is_cpu(t) -> bool:
-    """
-    Returns is_cpu on the input tensor.
-
-    Args:
-        t: any Tensor
-
-    Returns:
-        a bool
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_is_cpu.default(t)
-
-
-def my_ones_like(tensor, device) -> Tensor:
-    """
-    Returns a new Tensor like the input tensor, but with all ones
-
-    Args:
-        tensor: any Tensor
-        device: a device string
-
-    Returns:
-        a ones Tensor with the same dtype and shape and other attributes
-        like the input tensor
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_ones_like.default(tensor, device)
-
-
-def exp_neg_is_leaf(t1, t2, t3) -> tuple[Tensor, Tensor, bool]:
-    """
-    Returns a Tensor, Tensor, bool tuple corresponding to the respective inputs
-    t1, t2, and t3.
-
-    Args:
-        t1: Tensor
-        t2: Tensor
-        t3: Tensor
-
-    Returns:
-        (exp(t1), neg(t2), is_leaf(t3))
-    """
-    return torch.ops.libtorch_agnostic_2_9.exp_neg_is_leaf.default(t1, t2, t3)
-
-
-def neg_exp(t) -> Tensor:
-    """
-    Returns a Tensor composing neg of exp
-
-    Args:
-        t: Tensor
-
-    Returns: neg(exp(t))
-    """
-    return torch.ops.libtorch_agnostic_2_9.neg_exp.default(t)
-
-
-def divide_neg_exp(t) -> Tensor:
-    """
-    Returns a Tensor division of neg and exp
-
-    Args:
-        t: Tensor
-
-    Returns: divide(neg(t), exp(t))
-    """
-    return torch.ops.libtorch_agnostic_2_9.divide_neg_exp.default(t)
-
-
-def is_contiguous(t) -> bool:
-    """
-    Returns a bool indicating if the input tensor is contiguous
-
-    Args:
-        t: Tensor
-
-    Returns: is_contiguous(t)
-    """
-    return torch.ops.libtorch_agnostic_2_9.is_contiguous.default(t)
-
-
-def my_transpose(t, dim0, dim1) -> Tensor:
-    """
-    Returns t.transpose(dim0, dim1)
-
-    Args:
-        t: Tensor
-
-    Returns: my_transpose(t, dim0, dim1)
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_transpose.default(t, dim0, dim1)
-
-
-def my_empty_like(t) -> Tensor:
-    """
-    Returns t.empty_like()
-
-    Args:
-        t: Tensor
-
-    Returns: my_empty_like(t)
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_empty_like.default(t)
-
-
-def my_zero_(t) -> Tensor:
-    """
-    Returns t.zero_()
-
-    Args:
-        t: Tensor
-
-    Returns: my_zero_(t)
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_zero_.default(t)
-
-
-def my_amax(t) -> Tensor:
-    """
-    Returns t.amax()
-
-    Args:
-        t: Tensor
-
-    Returns: amax(t)
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_amax.default(t)
-
-
-def my_amax_vec(t) -> Tensor:
-    """
-    Returns t.amax()
-
-    Args:
-        t: Tensor
-
-    Returns: amax(t)
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_amax_vec.default(t)
-
-
-def fill_infinity(t) -> Tensor:
-    """
-    Fills the tensor with inf.
-
-    Args:
-        t: Tensor to fill
-
-    Returns: The modified tensor (same as input)
-    """
-    return torch.ops.libtorch_agnostic_2_9.fill_infinity.default(t)
-
-
-def test_default_constructor(defined) -> bool:
-    """
-    Tests the default constructor for torch::stable::Tensor.
-
-    Args:
-        defined: bool - if True, tests defined tensor; if False, tests undefined tensor
-
-    Returns: bool - result of calling .defined() on the tensor
-    """
-    return torch.ops.libtorch_agnostic_2_9.test_default_constructor.default(defined)
-
-
-def mv_tensor_accessor(m, v) -> Tensor:
-    """
-    Returns matrix-vector product.
-
-    Args:
-        m: any 2-D Tensor with shape (N, M)
-        v: any 1-D Tensor with shape (M,)
-
-    Returns:
-        a 1-D Tensor with shape (N,)
-    """
-    return torch.ops.libtorch_agnostic_2_9.mv_tensor_accessor.default(m, v)
-
-
-def my_pad(t) -> Tensor:
-    """
-    Pads the input tensor with hardcoded padding parameters.
-
-    Args:
-        t: Input tensor
-
-    Returns: Padded tensor with padding [1, 2, 2, 1], mode "constant", value 0.0
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_pad.default(t)
-
-
-def my_narrow(t, dim, start, length) -> Tensor:
-    """
-    Returns a new tensor that is a narrowed version of the input tensor.
-
-    Args:
-        t: Input tensor
-        dim: Dimension along which to narrow
-        start: Starting position
-        length: Length of the narrowed section
-
-    Returns: Narrowed tensor
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_narrow.default(t, dim, start, length)
-
-
-def my_copy_(dst, src, non_blocking) -> Tensor:
-    """
-    Returns tensor dst that is updated with src elements.
-
-    Args:
-        dst: Destination tensor
-        src: Source tensor
-        non_blocking: bool
-
-    Returns: Updated tensor
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_copy_.default(dst, src, non_blocking)
-
-
-def my_clone(t) -> Tensor:
-    """
-    Returns a clone of input tensor.
-
-    Args:
-        t: Input tensor
-
-    Returns: Cloned tensor
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_clone.default(t)
-
-
-def test_device_guard(device_index) -> int:
-    """
-    Tests the DeviceGuard functionality by creating a device guard and returning an empty tensor.
-
-    Args:
-        device_index: Device index to set the guard to
-
-    Returns: result of cudaGetDevice() as an integer after using the guard
-    """
-    return torch.ops.libtorch_agnostic_2_9.test_device_guard.default(device_index)
-
-
-def test_device_guard_set_index() -> int:
-    """
-    Tests the DeviceGuard set_index functionality by creating a device guard with index 1,
-    then setting it to index 0, and returning the current device.
-
-    Returns: result of cudaGetDevice() as an integer after using set_index
-    """
-    return torch.ops.libtorch_agnostic_2_9.test_device_guard_set_index.default()
-
-
-def test_stream(device_index) -> int:
-    """
-    Tests the Stream functionality by getting the current stream ID for the specified device.
-
-    Args:
-        device_index: Device index to get the stream for
-
-    Returns: Stream ID as an integer
-    """
-    return torch.ops.libtorch_agnostic_2_9.test_stream.default(device_index)
-
-
-def test_get_current_device_index() -> int:
-    """
-    Tests the getCurrentDeviceIndex functionality by getting the current device index.
-
-    Returns: Current device index as an integer
-    """
-    return torch.ops.libtorch_agnostic_2_9.test_get_current_device_index.default()
-
-
-def my_new_empty_dtype_variant(t) -> Tensor:
-    """
-    Returns a new empty tensor with shape [2, 5] and dtype bfloat16
-
-    Args:
-        t: Input tensor used as a reference for device and other properties
-
-    Returns: New empty tensor with shape [2, 5] and dtype bfloat16
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_new_empty_dtype_variant.default(t)
-
-
-def my_new_zeros_dtype_variant(t) -> Tensor:
-    """
-    Returns a new tensor filled with 0s with shape [2, 5] and dtype Float
-
-    Args:
-        t: Input tensor used as a reference for device and other properties
-
-    Returns: New zeros tensor
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_new_zeros_dtype_variant.default(t)
-
-
-def my_flatten(t, start_dim=0, end_dim=-1) -> Tensor:
-    """
-    Flattens the input tensor from start_dim to end_dim into a single dimension.
-
-    Args:
-        t: Tensor - tensor to flatten
-        start_dim: int - first dimension to flatten (default: 0)
-        end_dim: int - last dimension to flatten (default: -1)
-
-    Returns: Tensor - flattened tensor
-    """
-    return torch.ops.libtorch_agnostic_2_9.my_flatten.default(t, start_dim, end_dim)
--- a/test/cpp_extensions/libtorch_agnostic_2_9_extension/setup.py
+++ b/test/cpp_extensions/libtorch_agnostic_2_9_extension/setup.py
@ -1,82 +0,0 @@
-import distutils.command.clean
-import shutil
-from pathlib import Path
-
-from setuptools import find_packages, setup
-
-import torch
-from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension
-
-
-ROOT_DIR = Path(__file__).parent
-CSRC_DIR = ROOT_DIR / "libtorch_agnostic_2_9" / "csrc"
-
-
-class clean(distutils.command.clean.clean):
-    def run(self):
-        # Run default behavior first
-        distutils.command.clean.clean.run(self)
-
-        # Remove extension
-        for path in (ROOT_DIR / "libtorch_agnostic_2_9").glob("**/*.so"):
-            path.unlink()
-        # Remove build and dist and egg-info directories
-        dirs = [
-            ROOT_DIR / "build",
-            ROOT_DIR / "dist",
-            ROOT_DIR / "libtorch_agnostic_2_9.egg-info",
-        ]
-        for path in dirs:
-            if path.exists():
-                shutil.rmtree(str(path), ignore_errors=True)
-
-
-def get_extension():
-    extra_compile_args = {
-        "cxx": [
-            "-fdiagnostics-color=always",
-            "-DTORCH_STABLE_ONLY",
-            "-DTORCH_TARGET_VERSION=0x0209000000000000",
-        ],
-    }
-    sources = list(CSRC_DIR.glob("**/*.cpp"))
-
-    extension = CppExtension
-    # allow including <cuda_runtime.h>
-    if torch.cuda.is_available():
-        extra_compile_args["cxx"].append("-DLAE_USE_CUDA")
-        extra_compile_args["nvcc"] = [
-            "-O2",
-            "-DTORCH_TARGET_VERSION=0x0209000000000000",
-        ]
-        extension = CUDAExtension
-        sources.extend(CSRC_DIR.glob("**/*.cu"))
-
-    return [
-        extension(
-            "libtorch_agnostic_2_9._C",
-            sources=sorted(str(s) for s in sources),
-            py_limited_api=True,
-            extra_compile_args=extra_compile_args,
-            extra_link_args=[],
-        )
-    ]
-
-
-setup(
-    name="libtorch_agnostic_2_9",
-    version="0.0",
-    author="PyTorch Core Team",
-    description="Example of libtorch agnostic extension for PyTorch 2.9",
-    packages=find_packages(exclude=("test",)),
-    package_data={"libtorch_agnostic_2_9": ["*.dll", "*.dylib", "*.so"]},
-    install_requires=[
-        "torch",
-    ],
-    ext_modules=get_extension(),
-    cmdclass={
-        "build_ext": BuildExtension.with_options(no_python_abi_suffix=True),
-        "clean": clean,
-    },
-    options={"bdist_wheel": {"py_limited_api": "cp39"}},
-)
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/init.py
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/libtorch_agnostic_2_10/init.py
--- a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/cuda_kernel.cu
+++ b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/cuda_kernel.cu
@ -0,0 +1,30 @@
+#include "kernel.h"
+
+#include <torch/csrc/stable/library.h>
+#include <torch/csrc/stable/tensor.h>
+#include <torch/csrc/stable/ops.h>
+#include <cuda_runtime.h>
+
+using torch::stable::Tensor;
+
+Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
+  STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
+  STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
+  STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
+  STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
+  STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device");
+  Tensor res = new_empty(m, {m.size(0)});
+  THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cuda",
+                  AT_WRAP(([&]() {
+                    auto resa = Accessor_cuda<scalar_t, 1>(reinterpret_cast<scalar_t*>(res.data_ptr()), res.sizes().data(), res.strides().data());
+                    auto ma = Accessor_cuda<scalar_t, 2>(reinterpret_cast<scalar_t*>(m.data_ptr()), m.sizes().data(), m.strides().data());
+                    auto va = Accessor_cuda<scalar_t, 1>(reinterpret_cast<scalar_t*>(v.data_ptr()), v.sizes().data(), v.strides().data());
+                    mv_tensor_accessor_kernel<Accessor_cuda, scalar_t><<<1, 1, 0, 0>>>(resa, ma, va);
+                  })),
+                  AT_FLOATING_TYPES);
+  return res;
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CUDA, m) {
+  m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
+}
--- a/test/cpp_extensions/libtorch_agnostic_2_9_extension/libtorch_agnostic_2_9/csrc/kernel.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_2_9_extension/libtorch_agnostic_2_9/csrc/kernel.cpp
@ -1,6 +1,8 @@
 #include "kernel.h"

 #include <torch/csrc/inductor/aoti_torch/c/shim.h>
+#include <torch/csrc/stable/accelerator.h>
+#include <torch/csrc/stable/device.h>
 #include <torch/csrc/stable/library.h>
 #include <torch/csrc/stable/tensor.h>
 #include <torch/csrc/stable/ops.h>
@ -9,7 +11,6 @@

 #ifdef LAE_USE_CUDA
 #include <cuda_runtime.h>
-#include <torch/csrc/stable/accelerator.h>
 #endif

 #include <optional>
@ -67,11 +68,11 @@ Tensor sgd_out_of_place(
  return out;
 }

-STABLE_TORCH_LIBRARY(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY(libtorch_agnostic, m) {
  m.def("sgd_out_of_place(Tensor param, Tensor grad, float weight_decay, float lr, bool maximize) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CPU, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
  m.impl("sgd_out_of_place", TORCH_BOX(&sgd_out_of_place));
 }

@ -80,15 +81,15 @@ Tensor identity(Tensor t) {
 }


-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
  m.def("identity(Tensor t) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CUDA, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CUDA, m) {
  m.impl("identity", TORCH_BOX(&identity));
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CPU, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
  m.impl("identity", TORCH_BOX(&identity));
 }

@ -100,11 +101,11 @@ Tensor my_abs(Tensor t) {
  return torch::stable::detail::to<Tensor>(stack[0]);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
  m.def("my_abs(Tensor t) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("my_abs", TORCH_BOX(&my_abs));
 }

@ -126,11 +127,11 @@ Tensor my_ones_like(Tensor t, StableIValue device) {
  return torch::stable::detail::to<Tensor>(stack[0]);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
  m.def("my_ones_like(Tensor t, Device d) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("my_ones_like", TORCH_BOX(&my_ones_like));
 }

@ -153,11 +154,11 @@ std::tuple<Tensor, Tensor, bool> exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3
    torch::stable::detail::to<bool>(stack_is_leaf[0]));
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
  m.def("exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3) -> (Tensor, Tensor, bool)");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("exp_neg_is_leaf", TORCH_BOX(&exp_neg_is_leaf));
 }

@ -169,11 +170,11 @@ Tensor neg_exp(Tensor t) {
  return torch::stable::detail::to<Tensor>(stack[0]);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
  m.def("neg_exp(Tensor t) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("neg_exp", TORCH_BOX(&neg_exp));
 }

@ -193,11 +194,11 @@ Tensor divide_neg_exp(Tensor t) {
  return torch::stable::detail::to<Tensor>(stack_div[0]);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
  m.def("divide_neg_exp(Tensor t) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("divide_neg_exp", TORCH_BOX(&divide_neg_exp));
 }

@ -205,11 +206,11 @@ bool is_contiguous(Tensor t) {
  return t.is_contiguous();
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
  m.def("is_contiguous(Tensor t) -> bool");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("is_contiguous", TORCH_BOX(&is_contiguous));
 }

@ -264,7 +265,7 @@ Tensor my_clone(Tensor t) {
  return clone(t);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
  m.def("my_transpose(Tensor t, int dim0, int dim1) -> Tensor");
  m.def("my_empty_like(Tensor t) -> Tensor");
  m.def("fill_infinity(Tensor(a!) t) -> Tensor(a!)");
@ -276,7 +277,7 @@ STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
  m.def("my_clone(Tensor t) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("my_transpose", TORCH_BOX(&my_transpose));
  m.impl("my_empty_like", TORCH_BOX(&my_empty_like));
  m.impl("fill_infinity", TORCH_BOX(&fill_infinity));
@ -287,7 +288,7 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
  m.impl("my_clone", TORCH_BOX(&my_clone));
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeImplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeImplicitAutograd, m) {
  m.impl("my_pad", TORCH_BOX(&my_pad));
  m.impl("my_narrow", TORCH_BOX(&my_narrow));
 }
@ -304,12 +305,12 @@ Tensor my_amax_vec(Tensor t) {
  return amax(t, {0,1}, false);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
  m.def("my_zero_(Tensor(a!) t) -> Tensor(a!)");
  m.def("my_amax(Tensor a) -> Tensor");
  m.def("my_amax_vec(Tensor a) -> Tensor");
  m.def("my_is_cpu(Tensor t) -> bool");
-  m.def("test_default_constructor(bool undefined) -> bool");
+   m.def("test_default_constructor(bool undefined) -> bool");
 }

 bool test_default_constructor(bool defined) {
@ -331,13 +332,190 @@ bool test_default_constructor(bool defined) {
  return out.defined();
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("my_zero_", TORCH_BOX(&my_zero_));
  m.impl("my_amax", TORCH_BOX(&my_amax));
  m.impl("my_amax_vec", TORCH_BOX(&my_amax_vec));
  m.impl("test_default_constructor", TORCH_BOX(&test_default_constructor));
 }

+std::vector<Tensor> my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
+  std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
+  aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack.data());
+  return torch::stable::detail::to<std::vector<Tensor>>(stack[0]);
+}
+
+void my__foreach_mul_(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
+  std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
+  aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack.data());
+}
+
+std::vector<Tensor> make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) {
+  // This function tests that my__foreach_mul can take in std::initializer_lists
+  // in addition to std::vectors.
+  Tensor t1_1 = my_clone(t1);
+  Tensor t1_2 = my_clone(t1);
+  Tensor t2_1 = my_clone(t2);
+  Tensor t2_2 = my_clone(t2);
+  return my__foreach_mul({t1_1, t2_1}, {t1_2, t2_2});
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
+  m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
+  m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
+  m.def("make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
+  m.impl("my__foreach_mul", TORCH_BOX(&my__foreach_mul));
+  m.impl("my__foreach_mul_", TORCH_BOX(&my__foreach_mul_));
+  m.impl("make_tensor_clones_and_call_foreach", TORCH_BOX(&make_tensor_clones_and_call_foreach));
+}
+
+// Test functions for torch::stable::Tensor device method
+
+torch::stable::Device test_tensor_device(torch::stable::Tensor tensor) {
+  return tensor.device();
+}
+
+void boxed_test_tensor_device(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  torch::stable::Device res = test_tensor_device(
+      torch::stable::detail::to<torch::stable::Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+// Test functions for torch::stable::Device
+
+torch::stable::Device test_device_constructor(
+    bool is_cuda,
+    torch::stable::DeviceIndex index,
+    bool use_str) {
+  using torch::stable::Device;
+  using torch::stable::DeviceType;
+
+  if (use_str) {
+    std::string device_str;
+    if (is_cuda) {
+      device_str = "cuda:" + std::to_string(index);
+    } else {
+      device_str = "cpu";
+    }
+    return Device(device_str);
+  } else {
+    if (is_cuda) {
+      return Device(DeviceType::CUDA, index);
+    } else {
+      return Device(DeviceType::CPU);
+    }
+  }
+}
+
+void boxed_test_device_constructor(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  torch::stable::Device res = test_device_constructor(
+      torch::stable::detail::to<bool>(stack[0]),
+      torch::stable::detail::to<torch::stable::DeviceIndex>(stack[1]),
+      torch::stable::detail::to<bool>(stack[2]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+bool test_device_equality(torch::stable::Device d1, torch::stable::Device d2) {
+  return d1 == d2;
+}
+
+void boxed_test_device_equality(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  bool res = test_device_equality(
+      torch::stable::detail::to<torch::stable::Device>(stack[0]),
+      torch::stable::detail::to<torch::stable::Device>(stack[1]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+torch::stable::Device test_device_set_index(
+    torch::stable::Device device,
+    torch::stable::DeviceIndex index) {
+  device.set_index(index);
+  return device;
+}
+
+void boxed_test_device_set_index(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  torch::stable::Device res = test_device_set_index(
+      torch::stable::detail::to<torch::stable::Device>(stack[0]),
+      torch::stable::detail::to<torch::stable::DeviceIndex>(stack[1]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+torch::stable::DeviceIndex test_device_index(torch::stable::Device device) {
+  return device.index();
+}
+
+void boxed_test_device_index(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  torch::stable::DeviceIndex res = test_device_index(
+      torch::stable::detail::to<torch::stable::Device>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+bool test_device_is_cuda(torch::stable::Device device) {
+  return device.is_cuda();
+}
+
+void boxed_test_device_is_cuda(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  bool res = test_device_is_cuda(
+      torch::stable::detail::to<torch::stable::Device>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+bool test_device_is_cpu(torch::stable::Device device) {
+  return device.is_cpu();
+}
+
+void boxed_test_device_is_cpu(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  bool res = test_device_is_cpu(
+      torch::stable::detail::to<torch::stable::Device>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
+  m.def("test_tensor_device(Tensor t) -> Device");
+  m.def(
+      "test_device_constructor(bool is_cuda, DeviceIndex index, bool use_str) -> Device");
+  m.def("test_device_equality(Device d1, Device d2) -> bool");
+  m.def("test_device_set_index(Device device, DeviceIndex index) -> Device");
+  m.def("test_device_index(Device device) -> DeviceIndex");
+  m.def("test_device_is_cuda(Device device) -> bool");
+  m.def("test_device_is_cpu(Device device) -> bool");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
+  m.impl("test_tensor_device", &boxed_test_tensor_device);
+  m.impl("test_device_constructor", &boxed_test_device_constructor);
+  m.impl("test_device_equality", &boxed_test_device_equality);
+  m.impl("test_device_set_index", &boxed_test_device_set_index);
+  m.impl("test_device_index", &boxed_test_device_index);
+  m.impl("test_device_is_cuda", &boxed_test_device_is_cuda);
+  m.impl("test_device_is_cpu", &boxed_test_device_is_cpu);
+}
+
 Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
  STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
  STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
@ -356,11 +534,11 @@ Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
  return res;
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
  m.def("mv_tensor_accessor(Tensor m, Tensor v) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CPU, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
  m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cpu));
 }

@ -406,14 +584,14 @@ int64_t test_get_current_device_index() {
  return torch::stable::accelerator::getCurrentDeviceIndex();
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
  m.def("test_device_guard(int device_index) -> int");
  m.def("test_device_guard_set_index() -> int");
  m.def("test_stream(int device_index) -> int");
  m.def("test_get_current_device_index() -> int");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("test_device_guard", TORCH_BOX(&test_device_guard));
  m.impl("test_device_guard_set_index", TORCH_BOX(&test_device_guard_set_index));
  m.impl("test_stream", TORCH_BOX(&test_stream));
@ -422,14 +600,100 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {

 #endif // LAE_USE_CUDA

+Tensor test_parallel_for(int64_t size, int64_t grain_size) {
+  AtenTensorHandle tensor_handle;
+  int64_t stride = 1;
+
+  aoti_torch_empty_strided(
+      1,
+      &size,
+      &stride,
+      aoti_torch_dtype_int64(),
+      aoti_torch_device_type_cpu(),
+      0,
+      &tensor_handle);
+
+  Tensor tensor(tensor_handle);
+  int64_t* data_ptr = reinterpret_cast<int64_t*>(tensor.data_ptr());
+
+  torch::stable::zero_(tensor);
+
+  // Use parallel_for to fill each element with its index
+  // If using a parallel path, the thread id is encoded in the upper 32 bits
+  torch::stable::parallel_for(
+      0, size, grain_size, [data_ptr](int64_t begin, int64_t end) {
+        for (auto i = begin; i < end; i++) {
+          STD_TORCH_CHECK(i <= UINT32_MAX);
+          uint32_t thread_id;
+          torch_get_thread_idx(&thread_id);
+          data_ptr[i] = i | (static_cast<int64_t>(thread_id) << 32);
+        }
+      });
+
+  return tensor;
+}
+
+void boxed_test_parallel_for(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  Tensor res = test_parallel_for(to<int64_t>(stack[0]), to<int64_t>(stack[1]));
+  stack[0] = from(res);
+}
+
+uint32_t test_get_num_threads() {
+  return torch::stable::get_num_threads();
+}
+
+void boxed_test_get_num_threads(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  uint32_t res = test_get_num_threads();
+  stack[0] = from(res);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
+  m.def("test_parallel_for(int size, int grain_size) -> Tensor");
+  m.def("test_get_num_threads() -> int");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
+  m.impl("test_parallel_for", &boxed_test_parallel_for);
+  m.impl("test_get_num_threads", &boxed_test_get_num_threads);
+}
+
+Tensor my_empty(
+    torch::headeronly::HeaderOnlyArrayRef<int64_t> size,
+    std::optional<torch::headeronly::ScalarType> dtype,
+    std::optional<torch::stable::Device> device,
+    std::optional<bool> pin_memory) {
+  return empty(size, dtype, device, pin_memory);
+}
+
 Tensor my_flatten(Tensor t, int64_t start_dim, int64_t end_dim) {
  return flatten(t, start_dim, end_dim);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
-  m.def("my_flatten(Tensor t, int start_dim=0, int end_dim=-1) -> Tensor");
+Tensor my_reshape(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> shape) {
+  return reshape(t, shape);
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
-  m.impl("my_flatten", TORCH_BOX(&my_flatten));
+Tensor my_view(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> size) {
+  return view(t, size);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
+  m.def(
+      "my_empty(int[] size, ScalarType? dtype=None, Device? device=None, bool? pin_memory=None) -> Tensor");
+  m.def("my_flatten(Tensor t, int start_dim=0, int end_dim=-1) -> Tensor");
+  m.def("my_reshape(Tensor t, int[] shape) -> Tensor");
+  m.def("my_view(Tensor t, int[] size) -> Tensor");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
+  m.impl("my_empty", TORCH_BOX(&my_empty));
+  m.impl("my_flatten", TORCH_BOX(&my_flatten));
+  m.impl("my_reshape", TORCH_BOX(&my_reshape));
+  m.impl("my_view", TORCH_BOX(&my_view));
 }
--- a/test/cpp_extensions/libtorch_agnostic_2_9_extension/libtorch_agnostic_2_9/csrc/kernel.h
+++ b/test/cpp_extensions/libtorch_agnostic_2_9_extension/libtorch_agnostic_2_9/csrc/kernel.h
--- a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/ops.py
+++ b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/ops.py
@ -0,0 +1,558 @@
+import torch
+from torch import Tensor
+
+
+def sgd_out_of_place(param, grad, weight_decay, lr, maximize) -> Tensor:
+    """
+    Computes a single step of SGD on a single parameter Tensor with grad.
+
+    Assumes:
+    - param and grad are the same shape and are 1D.
+    - param and grad are float and on CPU
+
+    Args:
+        param: a 1D tensor of floats
+        grad: a 1D tensor of floats
+        weight_decay: a python double between 0 and 1
+        lr: a python double
+
+    Returns:
+        a 1D float Tensor the same shape as param
+
+    """
+    return torch.ops.libtorch_agnostic.sgd_out_of_place.default(
+        param, grad, weight_decay, lr, maximize
+    )
+
+
+def identity(t) -> Tensor:
+    """
+    Returns the input tensor
+
+    Args:
+        t: any Tensor
+
+    Returns:
+        a Tensor, the same as input.
+    """
+    return torch.ops.libtorch_agnostic.identity.default(t)
+
+
+def my_abs(t) -> Tensor:
+    """
+    Returns abs on the input tensor, outputs a new Tensor
+
+    Args:
+        t: any Tensor
+
+    Returns:
+        a Tensor
+    """
+    return torch.ops.libtorch_agnostic.my_abs.default(t)
+
+
+def my_is_cpu(t) -> bool:
+    """
+    Returns is_cpu on the input tensor.
+
+    Args:
+        t: any Tensor
+
+    Returns:
+        a bool
+    """
+    return torch.ops.libtorch_agnostic.my_is_cpu.default(t)
+
+
+def my_ones_like(tensor, device) -> Tensor:
+    """
+    Returns a new Tensor like the input tensor, but with all ones
+
+    Args:
+        tensor: any Tensor
+        device: a device string
+
+    Returns:
+        a ones Tensor with the same dtype and shape and other attributes
+        like the input tensor
+    """
+    return torch.ops.libtorch_agnostic.my_ones_like.default(tensor, device)
+
+
+def exp_neg_is_leaf(t1, t2, t3) -> tuple[Tensor, Tensor, bool]:
+    """
+    Returns a Tensor, Tensor, bool tuple corresponding to the respective inputs
+    t1, t2, and t3.
+
+    Args:
+        t1: Tensor
+        t2: Tensor
+        t3: Tensor
+
+    Returns:
+        (exp(t1), neg(t2), is_leaf(t3))
+    """
+    return torch.ops.libtorch_agnostic.exp_neg_is_leaf.default(t1, t2, t3)
+
+
+def neg_exp(t) -> Tensor:
+    """
+    Returns a Tensor composing neg of exp
+
+    Args:
+        t: Tensor
+
+    Returns: neg(exp(t))
+    """
+    return torch.ops.libtorch_agnostic.neg_exp.default(t)
+
+
+def divide_neg_exp(t) -> Tensor:
+    """
+    Returns a Tensor division of neg and exp
+
+    Args:
+        t: Tensor
+
+    Returns: divide(neg(t), exp(t))
+    """
+    return torch.ops.libtorch_agnostic.divide_neg_exp.default(t)
+
+
+def is_contiguous(t) -> bool:
+    """
+    Returns a bool indicating if the input tensor is contiguous
+
+    Args:
+        t: Tensor
+
+    Returns: is_contiguous(t)
+    """
+    return torch.ops.libtorch_agnostic.is_contiguous.default(t)
+
+
+def my_transpose(t, dim0, dim1) -> Tensor:
+    """
+    Returns t.transpose(dim0, dim1)
+
+    Args:
+        t: Tensor
+
+    Returns: my_transpose(t, dim0, dim1)
+    """
+    return torch.ops.libtorch_agnostic.my_transpose.default(t, dim0, dim1)
+
+
+def my_empty_like(t) -> Tensor:
+    """
+    Returns t.empty_like()
+
+    Args:
+        t: Tensor
+
+    Returns: my_empty_like(t)
+    """
+    return torch.ops.libtorch_agnostic.my_empty_like.default(t)
+
+
+def my_zero_(t) -> Tensor:
+    """
+    Returns t.zero_()
+
+    Args:
+        t: Tensor
+
+    Returns: my_zero_(t)
+    """
+    return torch.ops.libtorch_agnostic.my_zero_.default(t)
+
+
+def my_amax(t) -> Tensor:
+    """
+    Returns t.amax()
+
+    Args:
+        t: Tensor
+
+    Returns: amax(t)
+    """
+    return torch.ops.libtorch_agnostic.my_amax.default(t)
+
+
+def my_amax_vec(t) -> Tensor:
+    """
+    Returns t.amax()
+
+    Args:
+        t: Tensor
+
+    Returns: amax(t)
+    """
+    return torch.ops.libtorch_agnostic.my_amax_vec.default(t)
+
+
+def fill_infinity(t) -> Tensor:
+    """
+    Fills the tensor with inf.
+
+    Args:
+        t: Tensor to fill
+
+    Returns: The modified tensor (same as input)
+    """
+    return torch.ops.libtorch_agnostic.fill_infinity.default(t)
+
+
+def test_default_constructor(defined) -> bool:
+    """
+    Tests the default constructor for torch::stable::Tensor.
+
+    Args:
+        defined: bool - if True, tests defined tensor; if False, tests undefined tensor
+
+    Returns: bool - result of calling .defined() on the tensor
+    """
+    return torch.ops.libtorch_agnostic.test_default_constructor.default(defined)
+
+
+def test_tensor_device(t):
+    """
+    Tests Tensor device() method.
+
+    Args:
+        t: Tensor - tensor to get device from
+
+    Returns: Device - device of the tensor
+    """
+    return torch.ops.libtorch_agnostic.test_tensor_device.default(t)
+
+
+def my_pad(t) -> Tensor:
+    """
+    Pads the input tensor with hardcoded padding parameters.
+
+    Args:
+        t: Input tensor
+
+    Returns: Padded tensor with padding [1, 2, 2, 1], mode "constant", value 0.0
+    """
+    return torch.ops.libtorch_agnostic.my_pad.default(t)
+
+
+def my_narrow(t, dim, start, length) -> Tensor:
+    """
+    Returns a new tensor that is a narrowed version of the input tensor.
+
+    Args:
+        t: Input tensor
+        dim: Dimension along which to narrow
+        start: Starting position
+        length: Length of the narrowed section
+
+    Returns: Narrowed tensor
+    """
+    return torch.ops.libtorch_agnostic.my_narrow.default(t, dim, start, length)
+
+
+def my_copy_(dst, src, non_blocking) -> Tensor:
+    """
+    Returns tensor dst that is updated with src elements.
+
+    Args:
+        dst: Destination tensor
+        src: Source tensor
+        non_blocking: bool
+
+    Returns: Updated tensor
+    """
+    return torch.ops.libtorch_agnostic.my_copy_.default(dst, src, non_blocking)
+
+
+def my_clone(t) -> Tensor:
+    """
+    Returns a clone of input tensor.
+
+    Args:
+        t: Input tensor
+
+    Returns: Cloned tensor
+    """
+    return torch.ops.libtorch_agnostic.my_clone.default(t)
+
+
+def test_device_guard(device_index) -> int:
+    """
+    Tests the DeviceGuard functionality by creating a device guard and returning an empty tensor.
+
+    Args:
+        device_index: Device index to set the guard to
+
+    Returns: result of cudaGetDevice() as an integer after using the guard
+    """
+    return torch.ops.libtorch_agnostic.test_device_guard.default(device_index)
+
+
+def test_device_guard_set_index() -> int:
+    """
+    Tests the DeviceGuard set_index functionality by creating a device guard with index 1,
+    then setting it to index 0, and returning the current device.
+
+    Returns: result of cudaGetDevice() as an integer after using set_index
+    """
+    return torch.ops.libtorch_agnostic.test_device_guard_set_index.default()
+
+
+def test_stream(device_index) -> int:
+    """
+    Tests the Stream functionality by getting the current stream ID for the specified device.
+
+    Args:
+        device_index: Device index to get the stream for
+
+    Returns: Stream ID as an integer
+    """
+    return torch.ops.libtorch_agnostic.test_stream.default(device_index)
+
+
+def test_get_current_device_index() -> int:
+    """
+    Tests the getCurrentDeviceIndex functionality by getting the current device index.
+
+    Returns: Current device index as an integer
+    """
+    return torch.ops.libtorch_agnostic.test_get_current_device_index.default()
+
+
+def my_new_empty_dtype_variant(t) -> Tensor:
+    """
+    Returns a new empty tensor with shape [2, 5] and dtype bfloat16
+
+    Args:
+        t: Input tensor used as a reference for device and other properties
+
+    Returns: New empty tensor with shape [2, 5] and dtype bfloat16
+    """
+    return torch.ops.libtorch_agnostic.my_new_empty_dtype_variant.default(t)
+
+
+def my_new_zeros_dtype_variant(t) -> Tensor:
+    """
+    Returns a new tensor filled with 0s with shape [2, 5] and dtype Float
+
+    Args:
+        t: Input tensor used as a reference for device and other properties
+
+    Returns: New zeros tensor
+    """
+    return torch.ops.libtorch_agnostic.my_new_zeros_dtype_variant.default(t)
+
+
+def my__foreach_mul_(tensors, others) -> ():
+    """
+    Updates tensors to be the result of pointwise multiplying with others.
+
+    Args:
+        tensors: list of tensors
+        others: list of tensors (with the same corresponding shapes as tensors)
+
+    Returns: nothing, tensors is updated in place.
+    """
+    torch.ops.libtorch_agnostic.my__foreach_mul_.default(tensors, others)
+
+
+def my__foreach_mul(tensors, others) -> list[Tensor]:
+    """
+    Returns a list of tensors that are the results of pointwise multiplying
+    tensors and others.
+
+    Args:
+        tensors: list of tensors
+        others: list of tensors (with the same corresponding shapes as tensors)
+
+    Returns: list of multiplied tensors
+    """
+    return torch.ops.libtorch_agnostic.my__foreach_mul.default(tensors, others)
+
+
+def make_tensor_clones_and_call_foreach(t1, t2) -> list[Tensor]:
+    """
+    Returns a list of 2 tensors corresponding to the square of the inputs.
+
+    Args:
+        t1: Tensor
+        t2: Tensor
+
+    Returns: list of [t1^2, t2^2]
+    """
+    return torch.ops.libtorch_agnostic.make_tensor_clones_and_call_foreach.default(
+        t1, t2
+    )
+
+
+def test_device_constructor(is_cuda, index, use_str):
+    """
+    Tests creating a Device from DeviceType and index, or from a string.
+
+    Args:
+        is_cuda: bool - if True, creates CUDA device; if False, creates CPU device
+        index: int - device index
+        use_str: bool - if True, constructs from string; if False, constructs from DeviceType
+
+    Returns: Device - A device with the specified type and index
+    """
+    return torch.ops.libtorch_agnostic.test_device_constructor.default(
+        is_cuda, index, use_str
+    )
+
+
+def test_device_equality(d1, d2) -> bool:
+    """
+    Tests Device equality operator.
+
+    Args:
+        d1: Device - first device
+        d2: Device - second device
+
+    Returns: bool - True if devices are equal
+    """
+    return torch.ops.libtorch_agnostic.test_device_equality.default(d1, d2)
+
+
+def test_device_set_index(device, index):
+    """
+    Tests Device set_index() method.
+
+    Args:
+        device: Device - device to modify
+        index: int - new device index
+
+    Returns: Device - device with updated index
+    """
+    return torch.ops.libtorch_agnostic.test_device_set_index.default(device, index)
+
+
+def test_device_index(device) -> int:
+    """
+    Tests Device index() method.
+
+    Args:
+        device: Device - device to query
+
+    Returns: int - device index
+    """
+    return torch.ops.libtorch_agnostic.test_device_index.default(device)
+
+
+def test_device_is_cuda(device) -> bool:
+    """
+    Tests Device is_cuda() method.
+
+    Args:
+        device: Device - device to check
+
+    Returns: bool - True if device is CUDA
+    """
+    return torch.ops.libtorch_agnostic.test_device_is_cuda.default(device)
+
+
+def test_device_is_cpu(device) -> bool:
+    """
+    Tests Device is_cpu() method.
+
+    Args:
+        device: Device - device to check
+
+    Returns: bool - True if device is CPU
+    """
+    return torch.ops.libtorch_agnostic.test_device_is_cpu.default(device)
+
+
+def test_parallel_for(size, grain_size) -> Tensor:
+    """
+    Tests the parallel_for functionality by using it to fill a tensor with indices.
+    Args:
+        size: int - size of the tensor to create
+        grain_size: int - grain size for parallel_for
+    Returns: Tensor - a 1D int64 tensor where each element contains its index
+        (if multiple threads are used the threadid will be encoded in the upper 32 bits)
+    """
+    return torch.ops.libtorch_agnostic.test_parallel_for.default(size, grain_size)
+
+
+def test_get_num_threads() -> int:
+    """
+    Tests the get_num_threads functionality by returning the number of threads
+    for the parallel backend.
+
+    Returns: int - the number of threads for the parallel backend
+    """
+    return torch.ops.libtorch_agnostic.test_get_num_threads.default()
+
+
+def my_empty(size, dtype=None, device=None, pin_memory=None) -> Tensor:
+    """
+    Creates an empty tensor with the specified size, dtype, device, and pin_memory.
+
+    Args:
+        size: list[int] - size of the tensor to create
+        dtype: ScalarType or None - data type of the tensor
+        device: Device or None - device on which to create the tensor
+        pin_memory: bool or None - whether to use pinned memory
+
+    Returns: Tensor - an uninitialized tensor with the specified properties
+    """
+    return torch.ops.libtorch_agnostic.my_empty.default(size, dtype, device, pin_memory)
+
+
+def my_flatten(t, start_dim=0, end_dim=-1) -> Tensor:
+    """
+    Flattens the input tensor from start_dim to end_dim into a single dimension.
+
+    Args:
+        t: Tensor - tensor to flatten
+        start_dim: int - first dimension to flatten (default: 0)
+        end_dim: int - last dimension to flatten (default: -1)
+
+    Returns: Tensor - flattened tensor
+    """
+    return torch.ops.libtorch_agnostic.my_flatten.default(t, start_dim, end_dim)
+
+
+def my_reshape(t, shape) -> Tensor:
+    """
+    Returns a tensor with the same data but different shape.
+
+    Args:
+        t: Tensor - tensor to reshape
+        shape: list[int] - new shape for the tensor
+
+    Returns: Tensor - reshaped tensor
+    """
+    return torch.ops.libtorch_agnostic.my_reshape.default(t, shape)
+
+
+def my_view(t, size) -> Tensor:
+    """
+    Returns a new tensor with the same data as the input tensor but of a different shape.
+
+    Args:
+        t: Tensor - tensor to view
+        size: list[int] - new size for the tensor
+
+    Returns: Tensor - tensor with new view
+    """
+    return torch.ops.libtorch_agnostic.my_view.default(t, size)
+
+
+def mv_tensor_accessor(m, v) -> Tensor:
+    """
+    Returns matrix-vector product.
+
+    Args:
+        m: any 2-D Tensor with shape (N, M)
+        v: any 1-D Tensor with shape (M,)
+
+    Returns:
+        a 1-D Tensor with shape (N,)
+    """
+    return torch.ops.libtorch_agnostic.mv_tensor_accessor.default(m, v)
--- a/test/cpp_extensions/libtorch_agnostic_2_10_extension/setup.py
+++ b/test/cpp_extensions/libtorch_agnostic_2_10_extension/setup.py
@ -9,7 +9,7 @@ from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtensio


 ROOT_DIR = Path(__file__).parent
-CSRC_DIR = ROOT_DIR / "libtorch_agnostic_2_10" / "csrc"
+CSRC_DIR = ROOT_DIR / "libtorch_agnostic" / "csrc"


 class clean(distutils.command.clean.clean):
@ -18,13 +18,13 @@ class clean(distutils.command.clean.clean):
        distutils.command.clean.clean.run(self)

        # Remove extension
-        for path in (ROOT_DIR / "libtorch_agnostic_2_10").glob("**/*.so"):
+        for path in (ROOT_DIR / "libtorch_agnostic").glob("**/*.so"):
            path.unlink()
        # Remove build and dist and egg-info directories
        dirs = [
            ROOT_DIR / "build",
            ROOT_DIR / "dist",
-            ROOT_DIR / "libtorch_agnostic_2_10.egg-info",
+            ROOT_DIR / "libtorch_agnostic.egg-info",
        ]
        for path in dirs:
            if path.exists():
@ -33,11 +33,7 @@ class clean(distutils.command.clean.clean):

 def get_extension():
    extra_compile_args = {
-        "cxx": [
-            "-fdiagnostics-color=always",
-            "-DTORCH_STABLE_ONLY",
-            "-DTORCH_TARGET_VERSION=0x020a000000000000",
-        ],
+        "cxx": ["-fdiagnostics-color=always"],
    }
    sources = list(CSRC_DIR.glob("**/*.cpp"))

@ -51,7 +47,7 @@ def get_extension():

    return [
        extension(
-            "libtorch_agnostic_2_10._C",
+            "libtorch_agnostic._C",
            sources=sorted(str(s) for s in sources),
            py_limited_api=True,
            extra_compile_args=extra_compile_args,
@ -61,12 +57,12 @@ def get_extension():


 setup(
-    name="libtorch_agnostic_2_10",
+    name="libtorch_agnostic",
    version="0.0",
    author="PyTorch Core Team",
-    description="Example of libtorch agnostic extension for PyTorch 2.10+",
+    description="Example of libtorch agnostic extension",
    packages=find_packages(exclude=("test",)),
-    package_data={"libtorch_agnostic_2_10": ["*.dll", "*.dylib", "*.so"]},
+    package_data={"libtorch_agnostic": ["*.dll", "*.dylib", "*.so"]},
    install_requires=[
        "torch",
    ],
--- a/test/cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py
+++ b/test/cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py
@ -1,7 +1,6 @@
 # Owner(s): ["module: cpp"]

 import math
-import unittest
 from pathlib import Path

 import torch
@ -15,110 +14,26 @@ from torch.testing._internal.common_utils import (
    install_cpp_extension,
    IS_WINDOWS,
    run_tests,
-    skipIfTorchDynamo,
    TestCase,
    xfailIfTorchDynamo,
 )


-def get_supported_dtypes():
-    """Return a list of dtypes that are supported by torch stable ABI."""
-    return [
-        torch.int8,
-        torch.int16,
-        torch.int32,
-        torch.int64,
-        torch.uint8,
-        torch.uint16,
-        torch.uint32,
-        torch.uint64,
-        torch.bfloat16,
-        torch.float16,
-        torch.float32,
-        torch.float64,
-        torch.float8_e5m2,
-        torch.float8_e4m3fn,
-        torch.float8_e5m2fnuz,
-        torch.float8_e4m3fnuz,
-        torch.complex32,
-        torch.complex64,
-        torch.complex128,
-        torch.bool,
-    ]
-
-
-def skipIfTorchVersionLessThan(major, minor):
-    """Skip test if PyTorch version is less than specified version."""
-
-    def decorator(func):
-        version_parts = torch.__version__.split(".")
-        current_major = int(version_parts[0])
-        current_minor = int(
-            version_parts[1].split("+")[0].split("a")[0].split("b")[0].split("rc")[0]
-        )
-
-        should_skip = (current_major < major) or (
-            current_major == major and current_minor < minor
-        )
-        reason = f"Test requires PyTorch >= {major}.{minor}, current version is {torch.__version__}"
-
-        return unittest.skipIf(should_skip, reason)(func)
-
-    return decorator
-
-
 # TODO: Fix this error in Windows:
 # LINK : error LNK2001: unresolved external symbol PyInit__C
 if not IS_WINDOWS:

    class TestLibtorchAgnostic(TestCase):
-        """
-        Tests for versioned libtorch_agnostic extensions.
-
-        This test class supports testing both:
-
-        - libtorch_agnostic_2_9: Extension built with TORCH_TARGET_VERSION=2.9.0
-        - libtorch_agnostic_2_10: Extension built with TORCH_TARGET_VERSION=2.10.0
-
-        Tests should be decorated with @skipIfTorchVersionLessThan to indicate the
-        version that they target.
-        """
-
        @classmethod
        def setUpClass(cls):
-            # Build both 2.9 and 2.10 extensions
-            base_dir = Path(__file__).parent
-
            try:
-                import libtorch_agnostic_2_9  # noqa: F401
+                import libtorch_agnostic  # noqa: F401
            except Exception:
-                install_cpp_extension(
-                    extension_root=base_dir / "libtorch_agnostic_2_9_extension"
-                )
-
-            # Only build 2.10 extension if running on PyTorch 2.10+
-            import re
-
-            version_parts = torch.__version__.split(".")
-            current_major = int(version_parts[0])
-            # Extract just the numeric part of the minor version (handles "10+git", "10a1", etc.)
-            current_minor = int(re.match(r"\d+", version_parts[1]).group())
-
-            if (current_major > 2) or (current_major == 2 and current_minor >= 10):
-                try:
-                    import libtorch_agnostic_2_10  # noqa: F401
-                except Exception:
-                    install_cpp_extension(
-                        extension_root=base_dir / "libtorch_agnostic_2_10_extension"
-                    )
-            else:
-                print(
-                    f"Skipping 2.10 extension (running on PyTorch {torch.__version__})"
-                )
+                install_cpp_extension(extension_root=Path(__file__).parent.parent)

        @onlyCPU
        def test_slow_sgd(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            param = torch.rand(5, device=device)
            grad = torch.rand_like(param)
@ -145,7 +60,7 @@ if not IS_WINDOWS:

        @onlyCUDA
        def test_identity_does_not_hog_memory(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            def _run_identity(prior_mem):
                t = torch.rand(32, 32, device=device)
@ -161,7 +76,7 @@ if not IS_WINDOWS:
                self.assertEqual(curr_mem, init_mem)

        def test_exp_neg_is_leaf(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t1 = torch.rand(2, 3, device=device)
            t2 = torch.rand(3, 2, device=device)
@ -173,7 +88,7 @@ if not IS_WINDOWS:
            self.assertEqual(is_leaf, t3.is_leaf)

        def test_my_abs(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.rand(32, 16, device=device) - 0.5
            res = libtorch_agnostic.ops.my_abs(t)
@ -192,7 +107,7 @@ if not IS_WINDOWS:
                    self.assertEqual(curr_mem, init_mem)

        def test_neg_exp(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.rand(32, 16, device=device) - 0.5
            res = libtorch_agnostic.ops.neg_exp(t)
@ -211,7 +126,7 @@ if not IS_WINDOWS:
                    self.assertEqual(curr_mem, init_mem)

        def test_divide_neg_exp(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.zeros(2, 3, device=device) - 0.5
            res = libtorch_agnostic.ops.divide_neg_exp(t)
@ -230,7 +145,7 @@ if not IS_WINDOWS:
                    self.assertEqual(curr_mem, init_mem)

        def test_is_contiguous(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.rand(2, 7, device=device)
            self.assertTrue(libtorch_agnostic.ops.is_contiguous(t))
@ -242,7 +157,7 @@ if not IS_WINDOWS:
        # **{}): got AssertionError("tensor's device must be `meta`, got cpu instead")
        @xfailIfTorchDynamo
        def test_my_ones_like(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.rand(3, 1, device=device) - 0.5
            cpu_t = libtorch_agnostic.ops.my_ones_like(t, "cpu")
@ -261,7 +176,7 @@ if not IS_WINDOWS:
                    self.assertEqual(curr_mem, init_mem)

        def test_my_transpose(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.rand(2, 7, device=device)
            out = libtorch_agnostic.ops.my_transpose(t, 0, 1)
@ -271,7 +186,7 @@ if not IS_WINDOWS:
                libtorch_agnostic.ops.my_transpose(t, 1, 2)

        def test_my_empty_like(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            deterministic = torch.are_deterministic_algorithms_enabled()
            try:
@ -287,7 +202,7 @@ if not IS_WINDOWS:

        @onlyCPU
        def test_my_zero_(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.rand(2, 7, device=device)
            out = libtorch_agnostic.ops.my_zero_(t)
@ -295,28 +210,28 @@ if not IS_WINDOWS:
            self.assertEqual(out, torch.zeros_like(t))

        def test_my_amax(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.rand(2, 7, device=device)
            out = libtorch_agnostic.ops.my_amax(t)
            self.assertEqual(out, torch.amax(t, 0))

        def test_my_amax_vec(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.rand(2, 7, 5, device=device)
            out = libtorch_agnostic.ops.my_amax_vec(t)
            self.assertEqual(out, torch.amax(t, (0, 1)))

        def test_my_is_cpu(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.rand(2, 7, device=device)
            out = libtorch_agnostic.ops.my_is_cpu(t)
            self.assertEqual(out, t.is_cpu)

        def test_fill_infinity(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.rand(3, 4, device=device)
            out = libtorch_agnostic.ops.fill_infinity(t)
@ -327,7 +242,7 @@ if not IS_WINDOWS:

        @onlyCPU
        def test_default_constructor(self):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            defined_tensor_is_defined = libtorch_agnostic.ops.test_default_constructor(
                True
@ -340,7 +255,7 @@ if not IS_WINDOWS:
            self.assertFalse(undefined_tensor_is_defined)

        def test_my_pad(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.rand(2, 3, device=device)
            out = libtorch_agnostic.ops.my_pad(t)
@ -348,7 +263,7 @@ if not IS_WINDOWS:
            self.assertEqual(out, expected)

        def test_my_narrow(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.randn(2, 5, device=device)

@ -362,7 +277,7 @@ if not IS_WINDOWS:
        @onlyCUDA
        @deviceCountAtLeast(2)
        def test_device_guard(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            device_index = 1
            out = libtorch_agnostic.ops.test_device_guard(device_index)
@ -371,7 +286,7 @@ if not IS_WINDOWS:
        @onlyCUDA
        @deviceCountAtLeast(2)
        def test_device_guard_set_index(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            # This test creates a DeviceGuard with index 1, then sets it to index 0
            # and returns the current device (should be 0)
@ -380,7 +295,7 @@ if not IS_WINDOWS:

        @onlyCUDA
        def test_stream(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            stream = torch.cuda.Stream()
            device = torch.cuda.current_device()
@ -394,7 +309,7 @@ if not IS_WINDOWS:
        @onlyCUDA
        @deviceCountAtLeast(2)
        def test_get_current_device_index(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            prev_device = torch.cuda.current_device()

@ -408,7 +323,7 @@ if not IS_WINDOWS:
                torch.cuda.set_device(prev_device)

        def test_my_new_empty_dtype_variant(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            deterministic = torch.are_deterministic_algorithms_enabled()
            try:
@ -423,7 +338,7 @@ if not IS_WINDOWS:
                torch.use_deterministic_algorithms(deterministic)

        def test_my_new_zeros_dtype_variant(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.randn(3, 4, device=device)
            out = libtorch_agnostic.ops.my_new_zeros_dtype_variant(t)
@ -431,7 +346,7 @@ if not IS_WINDOWS:
            self.assertEqual(out, ref_out, exact_device=True)

        def test_my_copy_(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            dst = torch.empty(2, 5, device=device)
            src = torch.randn(2, 5, device=device)
@ -442,7 +357,7 @@ if not IS_WINDOWS:
            self.assertEqual(result.data_ptr(), dst.data_ptr())

        def test_my_clone(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.randn(2, 5, device=device)

@ -452,9 +367,8 @@ if not IS_WINDOWS:
            self.assertNotEqual(result.data_ptr(), expected.data_ptr())
            self.assertEqual(result.stride(), expected.stride())

-        @skipIfTorchVersionLessThan(2, 10)
        def test_my__foreach_mul_(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic

            N = 5
            tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
@ -467,9 +381,8 @@ if not IS_WINDOWS:
            for tensor_t, expected_t in zip(tensors, expected_values):
                self.assertEqual(tensor_t, expected_t)

-        @skipIfTorchVersionLessThan(2, 10)
        def test_my__foreach_mul(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic

            N = 5
            tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
@ -496,9 +409,8 @@ if not IS_WINDOWS:
                    curr_mem = torch.cuda.memory_allocated(device)
                    self.assertEqual(curr_mem, init_mem)

-        @skipIfTorchVersionLessThan(2, 10)
        def test_make_tensor_clones_and_call_foreach(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic

            t1 = torch.rand(2, 5, device=device)
            t2 = torch.rand(3, 4, device=device)
@ -506,10 +418,9 @@ if not IS_WINDOWS:
            self.assertEqual(result[0], t1 * t1)
            self.assertEqual(result[1], t2 * t2)

-        @skipIfTorchVersionLessThan(2, 10)
        @onlyCUDA
        def test_device(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic

            cuda_device = libtorch_agnostic.ops.test_device_constructor(
                is_cuda=True, index=1, use_str=False
@ -562,11 +473,10 @@ if not IS_WINDOWS:
            ):
                libtorch_agnostic.ops.test_device_set_index(cuda_device, 129)

-        @skipIfTorchVersionLessThan(2, 10)
        @onlyCUDA
        @deviceCountAtLeast(2)
        def test_tensor_device(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.randn(2, 3)
            self.assertEqual(libtorch_agnostic.ops.test_tensor_device(t), t.device)
@ -581,7 +491,6 @@ if not IS_WINDOWS:
                libtorch_agnostic.ops.test_tensor_device(t_cuda_1), t_cuda_1.device
            )

-        @skipIfTorchVersionLessThan(2, 10)
        @onlyCPU
        # TODO: Debug this:
        # Dynamo failed to run FX node with fake tensors:
@ -591,7 +500,7 @@ if not IS_WINDOWS:
        # Declaration: libtorch_agnostic::test_parallel_for(int size, int grain_size) -> Tensor')
        @xfailIfTorchDynamo
        def test_parallel_for(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic

            num_threads = torch.get_num_threads()
            size = 100
@ -608,18 +517,16 @@ if not IS_WINDOWS:
            self.assertEqual(result_values, expected)
            self.assertEqual(result_thread_ids, torch.arange(expected_num_threads_used))

-        @skipIfTorchVersionLessThan(2, 10)
        @onlyCPU
        def test_get_num_threads(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic

            num_threads = libtorch_agnostic.ops.test_get_num_threads()
            expected_num_threads = torch.get_num_threads()
            self.assertEqual(num_threads, expected_num_threads)

-        @skipIfTorchVersionLessThan(2, 10)
        def test_my_empty(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic

            deterministic = torch.are_deterministic_algorithms_enabled()
            try:
@ -660,7 +567,7 @@ if not IS_WINDOWS:
                torch.use_deterministic_algorithms(deterministic)

        def test_my_flatten(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.randn(2, 3, 4, device=device)
            result = libtorch_agnostic.ops.my_flatten(t)
@ -675,9 +582,8 @@ if not IS_WINDOWS:
            expected_range = torch.flatten(t, 2, -1)
            self.assertEqual(result_range, expected_range)

-        @skipIfTorchVersionLessThan(2, 10)
        def test_my_reshape(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.randn(2, 3, 4, device=device)

@ -693,9 +599,8 @@ if not IS_WINDOWS:
            expected_flat = torch.reshape(t, [-1])
            self.assertEqual(result_flat, expected_flat)

-        @skipIfTorchVersionLessThan(2, 10)
        def test_my_view(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic

            t = torch.randn(2, 3, 4, device=device)

@ -712,7 +617,7 @@ if not IS_WINDOWS:
            self.assertEqual(result_flat, expected_flat)

        def test_mv_tensor_accessor(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic

            m = torch.rand(3, 5, device=device)
            v = torch.rand(5, device=device)
@ -727,45 +632,6 @@ if not IS_WINDOWS:
            expected = torch.mv(m, v)
            self.assertEqual(result, expected)

-        @skipIfTorchVersionLessThan(2, 10)
-        @skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
-        def test_get_any_data_ptr(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
-
-            t = torch.empty(2, 5, device=device, dtype=torch.float32)
-            expected_p = t.data_ptr()
-
-            for mutable in [True, False]:
-                p = libtorch_agnostic.ops.get_any_data_ptr(t, mutable)
-                self.assertEqual(p, expected_p)
-
-        @skipIfTorchVersionLessThan(2, 10)
-        @skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
-        def test_get_template_any_data_ptr(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
-
-            supported_dtypes = get_supported_dtypes()
-
-            for dtype in supported_dtypes:
-                t = torch.empty(2, 5, device=device, dtype=dtype)
-                expected_p = t.data_ptr()
-
-                for rdtype in supported_dtypes:
-                    if dtype == rdtype:
-                        for mutable in [True, False]:
-                            p = libtorch_agnostic.ops.get_template_any_data_ptr(
-                                t, rdtype, mutable
-                            )
-                            self.assertEqual(p, expected_p)
-                    else:
-                        for mutable in [True, False]:
-                            with self.assertRaisesRegex(
-                                RuntimeError, "expected scalar type.* but found"
-                            ):
-                                libtorch_agnostic.ops.get_template_any_data_ptr(
-                                    t, rdtype, mutable
-                                )
-
    instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)

 if __name__ == "__main__":
--- a/test/custom_operator/test_custom_ops.cpp
+++ b/test/custom_operator/test_custom_ops.cpp
@ -22,7 +22,7 @@ void check_all_parameters(

 template<class Result, class... Args>
 Result get_operator_from_registry_and_execute(const char* op_name, Args&&... args) {
-  auto ops = torch::jit::getAllOperatorsFor(
+  auto& ops = torch::jit::getAllOperatorsFor(
      torch::jit::Symbol::fromQualString(op_name));
  TORCH_INTERNAL_ASSERT(ops.size() == 1);

--- a/test/distributed/_composable/fsdp/test_fully_shard_comm.py
+++ b/test/distributed/_composable/fsdp/test_fully_shard_comm.py
@ -59,12 +59,7 @@ from torch.testing._internal.common_fsdp import (
    patch_reshard,
    patch_unshard,
 )
-from torch.testing._internal.common_utils import (
-    run_tests,
-    TEST_WITH_ROCM,
-    TEST_XPU,
-    xfailIf,
-)
+from torch.testing._internal.common_utils import run_tests, TEST_XPU, xfailIf
 from torch.testing._internal.distributed._tensor.common_dtensor import (
    FeedForward,
    ModelArgs,
@ -1663,17 +1658,10 @@ class TestFullyShardAllocFromPG(FSDPTest):
 class TestFullyShardForceSumReduction(FSDPTest):
    # The messages might change when we move to a different NCCL version.
    # Please update this test if it starts failing.
-
-    if TEST_WITH_ROCM and torch.cuda.nccl.version()[:2] >= (2, 27):
-        COLLECTIVE_RE = (
-            r"NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ acc \(nil\) "
-            "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+"
-        )
-    else:
-        COLLECTIVE_RE = (
-            "NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ "
-            "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+"
-        )
+    COLLECTIVE_RE = (
+        "NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ "
+        "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+"
+    )
    # See here for the numerical values for each reduction op:
    # https://github.com/NVIDIA/nccl/blob/72d2432094d6ae36abd6e511c3a16a2d052dbf94/src/nccl.h.in#L260-L275
    SUM_REDUCTION = 0
--- a/Show More
+++ b/Show More