Compare commits

...

72 Commits

Author SHA1 Message Date
9e229e69ae Update
[ghstack-poisoned]
2025-11-18 06:56:40 -07:00
7c97da6af4 Update (base update)
[ghstack-poisoned]
2025-11-18 06:56:40 -07:00
2f023bf7b9 [ATen][CUDA] Add sm_121a flag for RowwiseScaledMM (#167734)
This PR adds an sm_121a flag for row-wise scaled matmuls on DGX Spark.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167734
Approved by: https://github.com/eqy, https://github.com/cyyever
2025-11-18 08:15:46 +00:00
9760a633ba Test that TORCH_FEATURE_VERSION guards are used where needed (#167962)
Splits each torch library registration in the 2.10 folder into its own file. I had a script that parsed kernel.cpp to do this automatically, but forcing this responsibility onto the user seemed less error prone.

Compiles each file targeting 2.9 and asserts that compilation fails. (There are two 2.9 kernels we use as negative tests, where compilation is expected to succeed.)
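A minimal sketch of what such a guard check could look like, assuming hypothetical file names, a placeholder TORCH_TARGET_VERSION value, and a bare c++ invocation standing in for the real extension build (none of these details are from the PR):

```
# Hypothetical harness: compile each per-kernel file while targeting 2.9 and
# assert the build fails, except for files listed as negative tests.
import pathlib
import subprocess

NEGATIVE_TESTS = {"kernel_29_a.cpp", "kernel_29_b.cpp"}  # placeholder names

def check_guards(src_dir: str) -> None:
    for src in sorted(pathlib.Path(src_dir).glob("*.cpp")):
        proc = subprocess.run(
            ["c++", "-c", str(src), "-o", "/dev/null",
             "-DTORCH_TARGET_VERSION=29"],  # placeholder value/encoding
            capture_output=True,
        )
        expect_failure = src.name not in NEGATIVE_TESTS
        assert (proc.returncode != 0) == expect_failure, src.name

if __name__ == "__main__":
    check_guards("libtorch_agnostic_2_10_extension/csrc")  # placeholder path
```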

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167962
Approved by: https://github.com/janeyx99
ghstack dependencies: #168025, #167802, #167803, #167804
2025-11-18 07:48:54 +00:00
2e907f48cf Test libtorch_agnostic with TORCH_TARGET_VERSION on target pytorch version (#167804)
Adds a CI workflow that tests the wheel built on current main targeting 2.9 with a 2.9 runtime

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167804
Approved by: https://github.com/janeyx99
ghstack dependencies: #168025, #167802, #167803
2025-11-18 07:48:54 +00:00
4c127f1a65 Split libtorch agnostic tests by feature version (#167803)
Tests are split into libtorch_agnostic_2_9_extension and libtorch_agnostic_2_10_extension depending on the minimum version they should compile+run in

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167803
Approved by: https://github.com/janeyx99
ghstack dependencies: #168025, #167802
2025-11-18 07:48:54 +00:00
3beb3786fc Fix TORCH_FEATURE_VERSION guards (#167802)
This is tested by #167962 which ensures we get compilation errors when using functions that convert Device/HeaderOnlyArrayRef to StableIValue and target 2.9

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167802
Approved by: https://github.com/janeyx99
ghstack dependencies: #168025
2025-11-18 07:48:54 +00:00
d2ccb5bc5e Follow up on #161891 move additions to stable shim and use version guards (#168025)
Address https://github.com/pytorch/pytorch/pull/161891#discussion_r2535017918

Pull Request resolved: https://github.com/pytorch/pytorch/pull/168025
Approved by: https://github.com/janeyx99
2025-11-18 07:48:54 +00:00
8cb8b6cbbd [SymmMem] Skip multicast init if any CUDA call fails (#168049)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/168049
Approved by: https://github.com/fduwjj
2025-11-18 07:02:17 +00:00
2b92b31bd6 [simplefsdp] fix DSV3 autobucketing issue (#167797)
Fix for this issue in the DSV3 autobucketing pass: https://github.com/pytorch/torchtitan/issues/2037. Users should now be able to run DSV3 autobucketing end to end.

It fixes three things:

(1) Fixes a bug in NCCL estimation support for all-to-all.

(2) For dynamic token dispatch/combine in MoE, adds a fall_back value hint to all-to-all's collective size estimation.

(3) Previously, for the schedulable-node check, I directly modified `is_wait` in bucketing.py. It is safer to add these criteria in overlap_scheduling.py as a separate function, `_schedulable_wait_node`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167797
Approved by: https://github.com/eellison
2025-11-18 06:58:06 +00:00
db1551bafa [pytree][compile] Slightly faster TreeSpec init (#168024)
Helps reduce Dynamo tracing time. Previously, the generator object would cause more polyfills.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/168024
Approved by: https://github.com/williamwen42
2025-11-18 06:18:52 +00:00
73921060d9 [user-streams] Stash graph created objects in keep_alive list for backwards (#167705)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167705
Approved by: https://github.com/williamwen42
2025-11-18 05:43:04 +00:00
01f94d4096 [xpu][test] [1/N] Enable missing Intel GPU inductor tests (#167047)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167047
Approved by: https://github.com/etaf, https://github.com/jansel

Co-authored-by: xinan.lin <xinan.lin@intel.com>
2025-11-18 05:28:35 +00:00
35dae27a66 [pallas backend] support reductions (#167953)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167953
Approved by: https://github.com/jansel
ghstack dependencies: #167947, #167951
2025-11-18 05:18:43 +00:00
9ff1922397 [pallas backend] implement more ops (#167951)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167951
Approved by: https://github.com/jansel
ghstack dependencies: #167947
2025-11-18 05:18:43 +00:00
5df0e49801 [pallas backend] implement complex numbers (#167947)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167947
Approved by: https://github.com/jansel
2025-11-18 05:18:36 +00:00
e5e94ec65c Introduce HOP for inductor compiled regions to allow torch dispatch (#167844)
This is a cleaned up version of the POC at https://github.com/pytorch/pytorch/pull/167752/files

This PR adds an inductor option, which you can pass into torch.compile, that wraps all inductor-generated code in a HOP, allowing it to be seen by torch dispatches.

This HOP is created in output_code.post_compile, so it is cache safe. The configuration to turn it on is part of `inductor_config`, and therefore already part of the cache key. I've added a test that shows this HOP is cache safe.

Because this wrapper occurs at compile time, there should be little to no CPU overhead from creating it, besides that of actually processing the torch dispatches themselves.

The context here is that we want to be able to support compiled regions such as flex attention in eager mode, while working with other torch dispatch tracers like SAC. Will add more tests for SAC/flex-attention-specific things next.
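For context, a generic torch dispatch tracer looks like the sketch below; the PR's new inductor option (name not shown here) is what lets a mode like this also observe inductor-compiled regions as a single HOP. This is an illustration, not the PR's test code:

```
# A minimal TorchDispatchMode that logs every aten op it sees in eager mode.
import torch
from torch.utils._python_dispatch import TorchDispatchMode

class LoggingMode(TorchDispatchMode):
    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        print("dispatched:", func)
        return func(*args, **(kwargs or {}))

def f(x):
    return torch.relu(x) + 1

with LoggingMode():
    f(torch.randn(4))  # each aten op inside f is printed
```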

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167844
Approved by: https://github.com/ezyang
2025-11-18 04:57:34 +00:00
ef7fa96fbf dist: add list_keys to Store API (#167883)
This adds a `list` Store API and implements it for all backends.

This is intended to be used for debugging and will allow inspecting all keys in a store locally as well as remotely in the case of TCPStore.
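A rough usage sketch for local debugging; the method name (`list_keys`, taken from the commit title) and its exact signature may differ from the final API:

```
# Assumes a single-process TCPStore for illustration.
from datetime import timedelta
import torch.distributed as dist

store = dist.TCPStore("127.0.0.1", 29500, world_size=1, is_master=True,
                      timeout=timedelta(seconds=30))
store.set("step", "42")
store.set("status/rank0", "ok")
print(store.list_keys())  # hypothetical call: inspect all keys in the store
```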

Test plan:

```
pytest test/distributed/test_store.py
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167883
Approved by: https://github.com/fduwjj
2025-11-18 03:28:23 +00:00
7ffeb34a9b [XPU] [Feature] [2/3] add fp8 scaled_mm_v2 implementation for XPU (#167518)
This PR implements `scaled_mm_v2` for XPU, following the work in #164141.
## PR stack:

- https://github.com/pytorch/pytorch/pull/165978 : implementation of XPU scaled_mm and oneDNN kernel
- -> https://github.com/pytorch/pytorch/pull/167518 : implementation of XPU scaled_mm_v2
- https://github.com/pytorch/pytorch/pull/166056 : Op registration

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167518
Approved by: https://github.com/EikanWang, https://github.com/liangan1
2025-11-18 03:26:45 +00:00
63b012a4dc [CI] Remove --no-use-pep517 from .ci/onnx/test.sh (#168026)
Following up on https://github.com/pytorch/pytorch/pull/167096, as it was causing failures in ONNX tests, e.g. https://github.com/pytorch/pytorch/actions/runs/19438276772/job/55617158792#step:27:209
Pull Request resolved: https://github.com/pytorch/pytorch/pull/168026
Approved by: https://github.com/jeffdaily, https://github.com/atalman

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-11-18 03:12:23 +00:00
1a0a19892a Add multiple hiding nodes (#167847)
With smaller aten nodes, we might want to overlap a single collective with multiple nodes. Updates the overlapping and bucketing code so that a collective can be hidden by multiple nodes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167847
Approved by: https://github.com/fmassa
2025-11-18 02:46:12 +00:00
39f5e0e52c [user-streams] Move user object bytecode generation after calling user compiler (#167704)
This move needs to occur in order to allow AOTAutograd to indicate if more streams/events need to be created for the backward.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167704
Approved by: https://github.com/anijain2305
ghstack dependencies: #167513
2025-11-18 02:41:41 +00:00
6eb71ce649 [user-streams] Assign streams to gradient accum in bwd (#167513)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167513
Approved by: https://github.com/soulitzer
2025-11-18 02:41:41 +00:00
2d14e86b94 [HOP][print][dynamo]Add dynamo for hop print (#167571)
Following the previous implementation of HOP print, this continues enabling HOP print for dynamo, supporting the eager fullgraph and aot_eager backends for torch.compile. With this implementation, HOP print supports stateful printing without causing graph breaks. With the prior built-in print, dynamo was able to reduce graph breaks but unable to eliminate them. This enables format-based printing for that purpose in dynamo.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167571
Approved by: https://github.com/angelayi
ghstack dependencies: #167016
2025-11-18 02:41:30 +00:00
8bb11524df [DTensor] Fix convolution ops with bias=None in torch.compile (#167258)
Fixes #167091

DTensor convolution operations crashed when bias=None was passed with torch.compile because the code assumed bias always exists, but the ATen schema defines it as optional (Tensor?).

This fix:
- Handles None bias_spec in convolution_rules (forward pass)
- Handles None bias_shape_opt in convolution_backward_rules
- Returns None for grad_bias_spec when bias is None
- Extends None output handling to indices 0,1,2 in _sharding_prop.py

Added 3 regression tests covering compile mode, backward pass, and nn.Conv2d module API with bias=False.

This is related to issue https://github.com/pytorch/pytorch/issues/159959 and to PR https://github.com/pytorch/pytorch/pull/165438, which resolves it; the two overlap in the `_sharding_prop.py` change.
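A rough regression-style sketch of the failing pattern, assuming a single-rank gloo world and a 1-device CPU mesh (illustrative only, not the PR's actual test code):

```
import os
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Replicate, distribute_module, distribute_tensor

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
dist.init_process_group("gloo", rank=0, world_size=1)
mesh = init_device_mesh("cpu", (1,))

# bias=False means the ATen convolution op receives bias=None
conv = distribute_module(torch.nn.Conv2d(3, 8, 3, bias=False), mesh)
x = distribute_tensor(torch.randn(1, 3, 16, 16), mesh, [Replicate()])

out = torch.compile(conv)(x)  # previously crashed under compile when bias was None
print(out.shape)
dist.destroy_process_group()
```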

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167258
Approved by: https://github.com/XilunWu
2025-11-18 02:24:39 +00:00
bbf39cad67 [inductor][fix] subproc autotuning respect cache dir changes (#167918)
Summary:
Noticed this bug with subproc autotuning while working on async autotuning.

The created subprocesses don't respect changes to cache dirs, specifically the Triton cache dir, which causes subproc autotuning to cache-miss on otherwise cached Triton kernels; the net effect is that precompile in the gemm autotuning path became an expensive no-op.

On the torchbench model I tested with, compile time with subproc autotuning went down from ~1k seconds to ~500 seconds, now matching in-process autotuning.
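The gist of the failure mode, as an illustrative sketch (not the PR's code): a redirected Triton cache dir is only visible to a spawned subprocess if it is propagated explicitly, for example through the environment:

```
import os
import subprocess
import sys

env = os.environ.copy()
env["TRITON_CACHE_DIR"] = "/tmp/my_triton_cache"  # example override in the parent

# Without passing env (or an equivalent mechanism), the child falls back to the
# default cache location and misses the parent's cached kernels.
subprocess.run(
    [sys.executable, "-c",
     "import os; print('child sees:', os.environ.get('TRITON_CACHE_DIR'))"],
    env=env,
    check=True,
)
```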

Test Plan: CI

Differential Revision: D87170069

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167918
Approved by: https://github.com/aorenste
2025-11-18 02:14:47 +00:00
654f3f67d3 Fix: Dynamo log always emits ANSI color codes into torch_compile_debug/torchdynamo/debug.log due to colored=True in lazy_format_graph_code (#167823)
Added ANSI escape sequence handling and a custom logging formatter.

Please refer to https://github.com/pytorch/pytorch/issues/167812 for detailed background explanation.

This PR adds a formatter for log_file_handler in the dynamo logger that filters out ANSI codes.
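A minimal sketch of such an ANSI-stripping formatter, assuming a plain FileHandler and the standard logging module (the PR's actual change lives in the dynamo logging setup):

```
import logging
import re

ANSI_RE = re.compile(r"\x1b\[[0-9;]*m")  # SGR color/style escape sequences

class StripAnsiFormatter(logging.Formatter):
    def format(self, record: logging.LogRecord) -> str:
        return ANSI_RE.sub("", super().format(record))

handler = logging.FileHandler("debug.log")
handler.setFormatter(StripAnsiFormatter("%(message)s"))
logging.getLogger("torch._dynamo").addHandler(handler)
```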

Before this change, log in debug.log:

```
  def forward(self, L_x_: "i64[][]cpu"):
      l_x_ = L_x_

      # File: /Users/bytedance/Downloads/Repo/pytorch/mydebug1.py:11 in forward, code: a = torch.ones(2, x.item())
      item: "Sym(s20 + 5)" = l_x_.item();  l_x_ = None
      a: "f32[2, s20 + 5][Max(1, s20 + 5), 1]cpu" = torch.ones(2, item)

      # File: /Users/bytedance/Downloads/Repo/pytorch/mydebug1.py:12 in forward, code: b = torch.ones(3, y.item() + 5)
      b: "f32[3, s20 + 5][Max(1, s20 + 5), 1]cpu" = torch.ones(3, item);  item = None

      # File: /Users/bytedance/Downloads/Repo/pytorch/mydebug1.py:13 in forward, code: res = torch.cat([a, b], dim=0)
      res: "f32[5, s20 + 5][Max(1, s20 + 5), 1]cpu" = torch.cat([a, b], dim = 0);  a = b = None

      # File: /Users/bytedance/Downloads/Repo/pytorch/mydebug1.py:14 in forward, code: return res.sum()
      sum_1: "f32[][]cpu" = res.sum();  res = None
      return (sum_1,)
```

After this change, log in debug.log:
```
  def forward(self, L_x_: "i64[][]cpu"):
      l_x_ = L_x_

      # File: /Users/bytedance/Downloads/Repo/pytorch/mydebug1.py:11 in forward, code: a = torch.ones(2, x.item())
      item: "Sym(s20 + 5)" = l_x_.item();  l_x_ = None
      a: "f32[2, s20 + 5][Max(1, s20 + 5), 1]cpu" = torch.ones(2, item)

      # File: /Users/bytedance/Downloads/Repo/pytorch/mydebug1.py:12 in forward, code: b = torch.ones(3, y.item() + 5)
      b: "f32[3, s20 + 5][Max(1, s20 + 5), 1]cpu" = torch.ones(3, item);  item = None

      # File: /Users/bytedance/Downloads/Repo/pytorch/mydebug1.py:13 in forward, code: res = torch.cat([a, b], dim=0)
      res: "f32[5, s20 + 5][Max(1, s20 + 5), 1]cpu" = torch.cat([a, b], dim = 0);  a = b = None

      # File: /Users/bytedance/Downloads/Repo/pytorch/mydebug1.py:14 in forward, code: return res.sum()
      sum_1: "f32[][]cpu" = res.sum();  res = None
      return (sum_1,)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167823
Approved by: https://github.com/angelayi
2025-11-18 01:58:41 +00:00
bc30c98b6d [torchfuzz] clean up ignore patterns (#168006)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/168006
Approved by: https://github.com/laithsakka, https://github.com/pianpwk
ghstack dependencies: #167938, #167939, #168005
2025-11-18 01:55:11 +00:00
510cc2e62a [torchfuzz] check in test_fuzzer_issue_167937 (#168005)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/168005
Approved by: https://github.com/laithsakka
ghstack dependencies: #167938, #167939
2025-11-18 01:55:11 +00:00
ee9008a51f [torchfuzz] update IGNORE_PATTERNS (#167939)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167939
Approved by: https://github.com/pianpwk
ghstack dependencies: #167938
2025-11-18 01:55:04 +00:00
66f3e4eddf [torchfuzz] set default device cuda (#167938)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167938
Approved by: https://github.com/pianpwk
2025-11-18 01:54:55 +00:00
8a8c634fe5 Tiling bug fix (#167771)
Fix for https://github.com/pytorch/pytorch/issues/166653.

Two fixes:
- We were inducing a split for broadcasted loads, e.g. (x // 16). While a split of 16 here would make the load coalesced in one of the tile vars, the load is already in cache, so it's not worth splitting; it would also make the other tile var load from memory that isn't in cache.
- Add a slight term for uncoalesced memory. This prevents tiling for loads which are a small % of the overall kernel.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167771
Approved by: https://github.com/v0i0
2025-11-18 01:36:49 +00:00
71f28f4d42 [export] Support module type with only __call__ override. (#167874)
Summary:
as title.

Test Plan:

CI

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167874
Approved by: https://github.com/tugsbayasgalan
2025-11-18 00:17:45 +00:00
9b39276255 Revert "[CD] [aarch64] unify the build.sh to build for aarch64 wheel (#166044)"
This reverts commit f79cdc89db5ec26cba8a2e12140c42e76f79bc44.

Reverted https://github.com/pytorch/pytorch/pull/166044 on behalf of https://github.com/atalman due to Causing https://github.com/pytorch/pytorch/issues/168003 also failing nightly aarch64 cuda validations [pytorch/test-infra/actions/runs/19435158072/job/55604045681](https://github.com/pytorch/test-infra/actions/runs/19435158072/job/55604045681) ([comment](https://github.com/pytorch/pytorch/pull/166044#issuecomment-3544309072))
2025-11-17 23:44:18 +00:00
86f9a9ae76 Revert "[CD] Add libopenblas to dep list for AArch64+CPU whl (#167841)"
This reverts commit 2b69673bbfdadad6a963d37a6d4f1339c1b14048.

Reverted https://github.com/pytorch/pytorch/pull/167841 on behalf of https://github.com/atalman due to Will be reverting https://github.com/pytorch/pytorch/pull/166044 ([comment](https://github.com/pytorch/pytorch/pull/167841#issuecomment-3544301008))
2025-11-17 23:38:39 +00:00
c4f3d7d410 [MPS] remove expected failure for a test (#167922)
Remove the expected failure for a test on the MPS backend, but lower the precision to `1e-4`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167922
Approved by: https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-11-17 22:58:13 +00:00
da98d711b5 Update
[ghstack-poisoned]
2025-11-14 10:53:39 -07:00
a3f01ba365 Update (base update)
[ghstack-poisoned]
2025-11-14 10:53:39 -07:00
28e3b5b3bc Update
[ghstack-poisoned]
2025-10-31 05:42:13 -06:00
9c1f94250e Update (base update)
[ghstack-poisoned]
2025-10-31 05:42:13 -06:00
8f6588f0da Update
[ghstack-poisoned]
2025-10-21 05:32:33 -06:00
be5b4c906a Update (base update)
[ghstack-poisoned]
2025-10-21 05:32:33 -06:00
b9ac1ef64b Update
[ghstack-poisoned]
2025-10-16 05:55:40 -06:00
0c00627304 Update (base update)
[ghstack-poisoned]
2025-10-16 05:55:40 -06:00
7201ff18d3 Update
[ghstack-poisoned]
2025-10-07 01:16:29 -06:00
fccdb8f3b5 Update (base update)
[ghstack-poisoned]
2025-10-07 01:16:29 -06:00
0c1c9fa536 Update
[ghstack-poisoned]
2025-10-07 00:20:22 -06:00
d609959cbe Update (base update)
[ghstack-poisoned]
2025-10-07 00:20:22 -06:00
dbdf832464 Update
[ghstack-poisoned]
2025-09-29 05:34:54 -06:00
528cf18433 Update (base update)
[ghstack-poisoned]
2025-09-29 05:34:54 -06:00
8ab0bc8576 Update
[ghstack-poisoned]
2025-09-16 05:18:55 -06:00
8cfdf0fff1 Update (base update)
[ghstack-poisoned]
2025-09-16 05:18:55 -06:00
683888e8bf Update
[ghstack-poisoned]
2025-09-15 04:26:48 -06:00
d04323d1f3 Update (base update)
[ghstack-poisoned]
2025-09-15 04:26:48 -06:00
e42d86cb27 Update
[ghstack-poisoned]
2025-09-09 03:42:40 -06:00
72541274cf Update (base update)
[ghstack-poisoned]
2025-09-09 03:42:40 -06:00
ad0d9f58e5 Update
[ghstack-poisoned]
2025-09-08 10:04:48 -06:00
06fd6256e2 Update (base update)
[ghstack-poisoned]
2025-09-08 10:04:48 -06:00
7c6704f89e Update
[ghstack-poisoned]
2025-08-26 08:54:35 -06:00
cd026659a2 Update (base update)
[ghstack-poisoned]
2025-08-26 08:54:35 -06:00
fb906ea443 Update
[ghstack-poisoned]
2025-08-26 08:07:06 -06:00
78c8cc1274 Update (base update)
[ghstack-poisoned]
2025-08-26 08:07:06 -06:00
90d905cd84 Update
[ghstack-poisoned]
2025-08-26 03:40:21 -06:00
92e7a49691 Update (base update)
[ghstack-poisoned]
2025-08-26 03:40:21 -06:00
5bcaf2a31f Update
[ghstack-poisoned]
2025-08-23 02:24:30 -06:00
fb9ae4a244 Update (base update)
[ghstack-poisoned]
2025-08-23 02:24:30 -06:00
e1b6188fa6 Update
[ghstack-poisoned]
2025-08-22 09:28:52 -06:00
c01c2de05d Update (base update)
[ghstack-poisoned]
2025-08-22 09:28:52 -06:00
2a9880cdf3 Update
[ghstack-poisoned]
2025-08-21 07:46:24 -06:00
0b06a06bb7 Update (base update)
[ghstack-poisoned]
2025-08-21 07:46:24 -06:00
73e9c17457 Update
[ghstack-poisoned]
2025-08-21 04:55:14 -06:00
e6ffee4f1c Update (base update)
[ghstack-poisoned]
2025-08-21 04:55:14 -06:00
197 changed files with 9405 additions and 1909 deletions


@@ -0,0 +1,19 @@
# Aarch64 (ARM/Graviton) Support Scripts
Scripts for building aarch64 PyTorch PIP Wheels. These scripts build the following wheels:
* torch
* torchvision
* torchaudio
* torchtext
* torchdata
## Aarch64_ci_build.sh
This script is designed to support CD operations within the PyPI manylinux aarch64 container, and is meant to be executed inside the container. It prepares the container and then executes __aarch64_wheel_ci_build.py__ to build the wheels. The script assumes the PyTorch repo is located at ```/pytorch``` and will put the wheels into ```/artifacts```.
### Usage
```DESIRED_PYTHON=<PythonVersion> aarch64_ci_build.sh```
__NOTE:__ CI build is currently __EXPERIMENTAL__
## Build_aarch64_wheel.py
This script builds wheels using AWS EC2 resources and requires the AWS CLI and Boto3 with AWS credentials to launch EC2 instances for the wheel builds. It can be used in a CodeBuild CD pipeline or from a local system.
### Usage
```build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>```


@@ -0,0 +1,53 @@
#!/bin/bash
set -eux -o pipefail
GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
# Set CUDA architecture lists to match x86 build_cuda.sh
if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
fi
# Compress the fatbin with -compress-mode=size for CUDA 13
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
export TORCH_NVCC_FLAGS="-compress-mode=size"
# Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
export BUILD_BUNDLE_PTXAS=1
fi
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source $SCRIPTPATH/aarch64_ci_setup.sh
###############################################################################
# Run aarch64 builder python
###############################################################################
cd /
# adding safe directory for git as the permissions will be
# on the mounted pytorch repo
git config --global --add safe.directory /pytorch
pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
# Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic)
if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
echo "Bundling CUDA libraries with wheel for aarch64."
else
echo "Using nvidia libs from pypi for aarch64."
echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
export USE_NVIDIA_PYPI_LIBS=1
fi
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi


@@ -0,0 +1,21 @@
#!/bin/bash
set -eux -o pipefail
# This script is used to prepare the Docker container for the aarch64_wheel_ci_build.py python script
# By creating symlinks from desired /opt/python to /usr/local/bin/
NUMPY_VERSION=2.0.2
if [[ "$DESIRED_PYTHON" == "3.13" || "$DESIRED_PYTHON" == "3.13t" ]]; then
NUMPY_VERSION=2.1.2
fi
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
source $SCRIPTPATH/../manywheel/set_desired_python.sh
pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2
for tool in python python3 pip pip3 ninja scons patchelf; do
ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
done
python --version


@@ -0,0 +1,333 @@
#!/usr/bin/env python3
# encoding: UTF-8
import os
import shutil
from subprocess import check_call, check_output
def list_dir(path: str) -> list[str]:
"""'
Helper for getting paths for Python
"""
return check_output(["ls", "-1", path]).decode().split("\n")
def replace_tag(filename) -> None:
with open(filename) as f:
lines = f.readlines()
for i, line in enumerate(lines):
if line.startswith("Tag:"):
lines[i] = line.replace("-linux_", "-manylinux_2_28_")
print(f"Updated tag from {line} to {lines[i]}")
break
with open(filename, "w") as f:
f.writelines(lines)
def patch_library_rpath(
folder: str,
lib_name: str,
use_nvidia_pypi_libs: bool = False,
desired_cuda: str = "",
) -> None:
"""Apply patchelf to set RPATH for a library in torch/lib"""
lib_path = f"{folder}/tmp/torch/lib/{lib_name}"
if use_nvidia_pypi_libs:
# For PyPI NVIDIA libraries, construct CUDA RPATH
cuda_rpaths = [
"$ORIGIN/../../nvidia/cudnn/lib",
"$ORIGIN/../../nvidia/nvshmem/lib",
"$ORIGIN/../../nvidia/nccl/lib",
"$ORIGIN/../../nvidia/cusparselt/lib",
]
if "130" in desired_cuda:
cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib")
else:
cuda_rpaths.extend(
[
"$ORIGIN/../../nvidia/cublas/lib",
"$ORIGIN/../../nvidia/cuda_cupti/lib",
"$ORIGIN/../../nvidia/cuda_nvrtc/lib",
"$ORIGIN/../../nvidia/cuda_runtime/lib",
"$ORIGIN/../../nvidia/cufft/lib",
"$ORIGIN/../../nvidia/curand/lib",
"$ORIGIN/../../nvidia/cusolver/lib",
"$ORIGIN/../../nvidia/cusparse/lib",
"$ORIGIN/../../nvidia/nvtx/lib",
"$ORIGIN/../../nvidia/cufile/lib",
]
)
# Add $ORIGIN for local torch libs
rpath = ":".join(cuda_rpaths) + ":$ORIGIN"
else:
# For bundled libraries, just use $ORIGIN
rpath = "$ORIGIN"
if os.path.exists(lib_path):
os.system(
f"cd {folder}/tmp/torch/lib/; "
f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}"
)
def copy_and_patch_library(
src_path: str,
folder: str,
use_nvidia_pypi_libs: bool = False,
desired_cuda: str = "",
) -> None:
"""Copy a library to torch/lib and patch its RPATH"""
if os.path.exists(src_path):
lib_name = os.path.basename(src_path)
shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}")
patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"""
Package the cuda wheel libraries
"""
folder = os.path.dirname(wheel_path)
os.mkdir(f"{folder}/tmp")
os.system(f"unzip {wheel_path} -d {folder}/tmp")
# Delete original wheel since it will be repackaged
os.system(f"rm {wheel_path}")
# Check if we should use PyPI NVIDIA libraries or bundle system libraries
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
if use_nvidia_pypi_libs:
print("Using nvidia libs from pypi - skipping CUDA library bundling")
# For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages
# We only need to bundle non-NVIDIA libraries
minimal_libs_to_copy = [
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
]
# Copy minimal libraries to unzipped_folder/torch/lib
for lib_path in minimal_libs_to_copy:
copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
# Patch torch libraries used for searching libraries
torch_libs_to_patch = [
"libtorch.so",
"libtorch_cpu.so",
"libtorch_cuda.so",
"libtorch_cuda_linalg.so",
"libtorch_global_deps.so",
"libtorch_python.so",
"libtorch_nvshmem.so",
"libc10.so",
"libc10_cuda.so",
"libcaffe2_nvrtc.so",
"libshm.so",
]
for lib_name in torch_libs_to_patch:
patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
else:
print("Bundling CUDA libraries with wheel")
# Original logic for bundling system CUDA libraries
# Common libraries for all CUDA versions
common_libs = [
# Non-NVIDIA system libraries
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
# Common CUDA libraries (same for all versions)
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
"/usr/local/cuda/lib64/libcudnn.so.9",
"/usr/local/cuda/lib64/libcusparseLt.so.0",
"/usr/local/cuda/lib64/libcurand.so.10",
"/usr/local/cuda/lib64/libnccl.so.2",
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
"/usr/local/cuda/lib64/libcudnn_graph.so.9",
"/usr/local/cuda/lib64/libcudnn_ops.so.9",
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
"/usr/local/cuda/lib64/libcusparse.so.12",
]
# CUDA version-specific libraries
if "13" in desired_cuda:
minor_version = desired_cuda[-1]
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
"/usr/local/cuda/lib64/libcublas.so.13",
"/usr/local/cuda/lib64/libcublasLt.so.13",
"/usr/local/cuda/lib64/libcudart.so.13",
"/usr/local/cuda/lib64/libcufft.so.12",
"/usr/local/cuda/lib64/libcusolver.so.12",
"/usr/local/cuda/lib64/libnvJitLink.so.13",
"/usr/local/cuda/lib64/libnvrtc.so.13",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
]
elif "12" in desired_cuda:
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
minor_version = desired_cuda[-1]
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
"/usr/local/cuda/lib64/libcublas.so.12",
"/usr/local/cuda/lib64/libcublasLt.so.12",
"/usr/local/cuda/lib64/libcudart.so.12",
"/usr/local/cuda/lib64/libcufft.so.11",
"/usr/local/cuda/lib64/libcusolver.so.11",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
]
else:
raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")
# Combine all libraries
libs_to_copy = common_libs + version_specific_libs
# Copy libraries to unzipped_folder/torch/lib
for lib_path in libs_to_copy:
copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
# Make sure the wheel is tagged with manylinux_2_28
for f in os.scandir(f"{folder}/tmp/"):
if f.is_dir() and f.name.endswith(".dist-info"):
replace_tag(f"{f.path}/WHEEL")
break
os.system(f"wheel pack {folder}/tmp/ -d {folder}")
os.system(f"rm -rf {folder}/tmp/")
def complete_wheel(folder: str) -> str:
"""
Complete wheel build and put in artifact location
"""
wheel_name = list_dir(f"/{folder}/dist")[0]
# Please note for cuda we don't run auditwheel since we use custom script to package
# the cuda dependencies to the wheel file using the package_cuda_wheel() method.
# However we need to make sure filename reflects the correct Manylinux platform.
if "pytorch" in folder and not enable_cuda:
print("Repairing Wheel with AuditWheel")
check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0]
print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
os.rename(
f"/{folder}/wheelhouse/{repaired_wheel_name}",
f"/{folder}/dist/{repaired_wheel_name}",
)
else:
repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
print(f"Copying {repaired_wheel_name} to artifacts")
shutil.copy2(
f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}"
)
return repaired_wheel_name
def parse_arguments():
"""
Parse inline arguments
"""
from argparse import ArgumentParser
parser = ArgumentParser("AARCH64 wheels python CD")
parser.add_argument("--debug", action="store_true")
parser.add_argument("--build-only", action="store_true")
parser.add_argument("--test-only", type=str)
parser.add_argument("--enable-mkldnn", action="store_true")
parser.add_argument("--enable-cuda", action="store_true")
return parser.parse_args()
if __name__ == "__main__":
"""
Entry Point
"""
args = parse_arguments()
enable_mkldnn = args.enable_mkldnn
enable_cuda = args.enable_cuda
branch = check_output(
["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd="/pytorch"
).decode()
print("Building PyTorch wheel")
build_vars = ""
# MAX_JOBS=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
# Handle PyPI NVIDIA libraries vs bundled libraries
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
if use_nvidia_pypi_libs:
print("Configuring build for PyPI NVIDIA libraries")
# Configure for dynamic linking (matching x86 logic)
build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 "
else:
print("Configuring build for bundled NVIDIA libraries")
# Keep existing static linking approach - already configured above
override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
desired_cuda = os.getenv("DESIRED_CUDA")
if override_package_version is not None:
version = override_package_version
build_vars += (
f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
)
elif branch in ["nightly", "main"]:
build_date = (
check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
.decode()
.replace("-", "")
)
version = (
check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
)
if enable_cuda:
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
else:
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
elif branch.startswith(("v1.", "v2.")):
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
if enable_mkldnn:
print("build pytorch with mkldnn+acl backend")
build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
build_vars += "ACL_ROOT_DIR=/acl "
if enable_cuda:
build_vars += "BLAS=NVPL "
else:
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
else:
print("build pytorch without mkldnn backend")
os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
if enable_cuda:
print("Updating Cuda Dependency")
filename = os.listdir("/pytorch/dist/")
wheel_path = f"/pytorch/dist/{filename[0]}"
package_cuda_wheel(wheel_path, desired_cuda)
pytorch_wheel_name = complete_wheel("/pytorch/")
print(f"Build Complete. Created {pytorch_wheel_name}..")


@@ -0,0 +1,999 @@
#!/usr/bin/env python3
# This script is for building AARCH64 wheels using AWS EC2 instances.
# To generate binaries for the release follow these steps:
# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this:
# "v1.11.0": ("0.11.0", "rc1"),
# 2. Run script with following arguments for each of the supported python versions and required tag, for example:
# build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch v1.11.0-rc3
import os
import subprocess
import sys
import time
from typing import Optional, Union
import boto3
# AMI images for us-east-1, change the following based on your ~/.aws/config
os_amis = {
"ubuntu20_04": "ami-052eac90edaa9d08f", # login_name: ubuntu
"ubuntu22_04": "ami-0c6c29c5125214c77", # login_name: ubuntu
"redhat8": "ami-0698b90665a2ddcf1", # login_name: ec2-user
}
ubuntu20_04_ami = os_amis["ubuntu20_04"]
def compute_keyfile_path(key_name: Optional[str] = None) -> tuple[str, str]:
if key_name is None:
key_name = os.getenv("AWS_KEY_NAME")
if key_name is None:
return os.getenv("SSH_KEY_PATH", ""), ""
homedir_path = os.path.expanduser("~")
default_path = os.path.join(homedir_path, ".ssh", f"{key_name}.pem")
return os.getenv("SSH_KEY_PATH", default_path), key_name
ec2 = boto3.resource("ec2")
def ec2_get_instances(filter_name, filter_value):
return ec2.instances.filter(
Filters=[{"Name": filter_name, "Values": [filter_value]}]
)
def ec2_instances_of_type(instance_type="t4g.2xlarge"):
return ec2_get_instances("instance-type", instance_type)
def ec2_instances_by_id(instance_id):
rc = list(ec2_get_instances("instance-id", instance_id))
return rc[0] if len(rc) > 0 else None
def start_instance(
key_name, ami=ubuntu20_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50
):
inst = ec2.create_instances(
ImageId=ami,
InstanceType=instance_type,
SecurityGroups=["ssh-allworld"],
KeyName=key_name,
MinCount=1,
MaxCount=1,
BlockDeviceMappings=[
{
"DeviceName": "/dev/sda1",
"Ebs": {
"DeleteOnTermination": True,
"VolumeSize": ebs_size,
"VolumeType": "standard",
},
}
],
)[0]
print(f"Create instance {inst.id}")
inst.wait_until_running()
running_inst = ec2_instances_by_id(inst.id)
print(f"Instance started at {running_inst.public_dns_name}")
return running_inst
class RemoteHost:
addr: str
keyfile_path: str
login_name: str
container_id: Optional[str] = None
ami: Optional[str] = None
def __init__(self, addr: str, keyfile_path: str, login_name: str = "ubuntu"):
self.addr = addr
self.keyfile_path = keyfile_path
self.login_name = login_name
def _gen_ssh_prefix(self) -> list[str]:
return [
"ssh",
"-o",
"StrictHostKeyChecking=no",
"-i",
self.keyfile_path,
f"{self.login_name}@{self.addr}",
"--",
]
@staticmethod
def _split_cmd(args: Union[str, list[str]]) -> list[str]:
return args.split() if isinstance(args, str) else args
def run_ssh_cmd(self, args: Union[str, list[str]]) -> None:
subprocess.check_call(self._gen_ssh_prefix() + self._split_cmd(args))
def check_ssh_output(self, args: Union[str, list[str]]) -> str:
return subprocess.check_output(
self._gen_ssh_prefix() + self._split_cmd(args)
).decode("utf-8")
def scp_upload_file(self, local_file: str, remote_file: str) -> None:
subprocess.check_call(
[
"scp",
"-i",
self.keyfile_path,
local_file,
f"{self.login_name}@{self.addr}:{remote_file}",
]
)
def scp_download_file(
self, remote_file: str, local_file: Optional[str] = None
) -> None:
if local_file is None:
local_file = "."
subprocess.check_call(
[
"scp",
"-i",
self.keyfile_path,
f"{self.login_name}@{self.addr}:{remote_file}",
local_file,
]
)
def start_docker(self, image="quay.io/pypa/manylinux2014_aarch64:latest") -> None:
self.run_ssh_cmd("sudo apt-get install -y docker.io")
self.run_ssh_cmd(f"sudo usermod -a -G docker {self.login_name}")
self.run_ssh_cmd("sudo service docker start")
self.run_ssh_cmd(f"docker pull {image}")
self.container_id = self.check_ssh_output(
f"docker run -t -d -w /root {image}"
).strip()
def using_docker(self) -> bool:
return self.container_id is not None
def run_cmd(self, args: Union[str, list[str]]) -> None:
if not self.using_docker():
return self.run_ssh_cmd(args)
assert self.container_id is not None
docker_cmd = self._gen_ssh_prefix() + [
"docker",
"exec",
"-i",
self.container_id,
"bash",
]
p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE)
p.communicate(
input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
"utf-8"
)
)
rc = p.wait()
if rc != 0:
raise subprocess.CalledProcessError(rc, docker_cmd)
def check_output(self, args: Union[str, list[str]]) -> str:
if not self.using_docker():
return self.check_ssh_output(args)
assert self.container_id is not None
docker_cmd = self._gen_ssh_prefix() + [
"docker",
"exec",
"-i",
self.container_id,
"bash",
]
p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
(out, err) = p.communicate(
input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
"utf-8"
)
)
rc = p.wait()
if rc != 0:
raise subprocess.CalledProcessError(rc, docker_cmd, output=out, stderr=err)
return out.decode("utf-8")
def upload_file(self, local_file: str, remote_file: str) -> None:
if not self.using_docker():
return self.scp_upload_file(local_file, remote_file)
tmp_file = os.path.join("/tmp", os.path.basename(local_file))
self.scp_upload_file(local_file, tmp_file)
self.run_ssh_cmd(
["docker", "cp", tmp_file, f"{self.container_id}:/root/{remote_file}"]
)
self.run_ssh_cmd(["rm", tmp_file])
def download_file(self, remote_file: str, local_file: Optional[str] = None) -> None:
if not self.using_docker():
return self.scp_download_file(remote_file, local_file)
tmp_file = os.path.join("/tmp", os.path.basename(remote_file))
self.run_ssh_cmd(
["docker", "cp", f"{self.container_id}:/root/{remote_file}", tmp_file]
)
self.scp_download_file(tmp_file, local_file)
self.run_ssh_cmd(["rm", tmp_file])
def download_wheel(
self, remote_file: str, local_file: Optional[str] = None
) -> None:
if self.using_docker() and local_file is None:
basename = os.path.basename(remote_file)
local_file = basename.replace(
"-linux_aarch64.whl", "-manylinux2014_aarch64.whl"
)
self.download_file(remote_file, local_file)
def list_dir(self, path: str) -> list[str]:
return self.check_output(["ls", "-1", path]).split("\n")
def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
import socket
for i in range(attempt_cnt):
try:
with socket.create_connection((addr, port), timeout=timeout):
return
except (ConnectionRefusedError, TimeoutError): # noqa: PERF203
if i == attempt_cnt - 1:
raise
time.sleep(timeout)
def update_apt_repo(host: RemoteHost) -> None:
time.sleep(5)
host.run_cmd("sudo systemctl stop apt-daily.service || true")
host.run_cmd("sudo systemctl stop unattended-upgrades.service || true")
host.run_cmd(
"while systemctl is-active --quiet apt-daily.service; do sleep 1; done"
)
host.run_cmd(
"while systemctl is-active --quiet unattended-upgrades.service; do sleep 1; done"
)
host.run_cmd("sudo apt-get update")
time.sleep(3)
host.run_cmd("sudo apt-get update")
def install_condaforge(
host: RemoteHost, suffix: str = "latest/download/Miniforge3-Linux-aarch64.sh"
) -> None:
print("Install conda-forge")
host.run_cmd(f"curl -OL https://github.com/conda-forge/miniforge/releases/{suffix}")
host.run_cmd(f"sh -f {os.path.basename(suffix)} -b")
host.run_cmd(f"rm -f {os.path.basename(suffix)}")
if host.using_docker():
host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc")
else:
host.run_cmd(
[
"sed",
"-i",
"'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH'",
".bashrc",
]
)
def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
if python_version == "3.6":
# Python-3.6 EOLed and not compatible with conda-4.11
install_condaforge(
host, suffix="download/4.10.3-10/Miniforge3-4.10.3-10-Linux-aarch64.sh"
)
host.run_cmd(f"conda install -y python={python_version} numpy pyyaml")
else:
install_condaforge(
host, suffix="download/4.11.0-4/Miniforge3-4.11.0-4-Linux-aarch64.sh"
)
# Pytorch-1.10 or older are not compatible with setuptools=59.6 or newer
host.run_cmd(
f"conda install -y python={python_version} numpy pyyaml setuptools>=59.5.0"
)
def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
host.run_cmd("pip3 install auditwheel")
host.run_cmd(
"conda install -y patchelf" if use_conda else "sudo apt-get install -y patchelf"
)
from tempfile import NamedTemporaryFile
with NamedTemporaryFile() as tmp:
tmp.write(embed_library_script.encode("utf-8"))
tmp.flush()
host.upload_file(tmp.name, "embed_library.py")
print("Embedding libgomp into wheel")
if host.using_docker():
host.run_cmd(f"python3 embed_library.py {wheel_name} --update-tag")
else:
host.run_cmd(f"python3 embed_library.py {wheel_name}")
def checkout_repo(
host: RemoteHost,
*,
branch: str = "main",
url: str,
git_clone_flags: str,
mapping: dict[str, tuple[str, str]],
) -> Optional[str]:
for prefix in mapping:
if not branch.startswith(prefix):
continue
tag = f"v{mapping[prefix][0]}-{mapping[prefix][1]}"
host.run_cmd(f"git clone {url} -b {tag} {git_clone_flags}")
return mapping[prefix][0]
host.run_cmd(f"git clone {url} -b {branch} {git_clone_flags}")
return None
def build_torchvision(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str,
run_smoke_tests: bool = True,
) -> str:
print("Checking out TorchVision repo")
build_version = checkout_repo(
host,
branch=branch,
url="https://github.com/pytorch/vision",
git_clone_flags=git_clone_flags,
mapping={
"v1.7.1": ("0.8.2", "rc2"),
"v1.8.0": ("0.9.0", "rc3"),
"v1.8.1": ("0.9.1", "rc1"),
"v1.9.0": ("0.10.0", "rc1"),
"v1.10.0": ("0.11.1", "rc1"),
"v1.10.1": ("0.11.2", "rc1"),
"v1.10.2": ("0.11.3", "rc1"),
"v1.11.0": ("0.12.0", "rc1"),
"v1.12.0": ("0.13.0", "rc4"),
"v1.12.1": ("0.13.1", "rc6"),
"v1.13.0": ("0.14.0", "rc4"),
"v1.13.1": ("0.14.1", "rc2"),
"v2.0.0": ("0.15.1", "rc2"),
"v2.0.1": ("0.15.2", "rc2"),
},
)
print("Building TorchVision wheel")
# Please note libpng and jpeg are required to build image.so extension
if use_conda:
host.run_cmd("conda install -y libpng jpeg")
# Remove .so files to force static linking
host.run_cmd(
"rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so"
)
# And patch setup.py to include libz dependency for libpng
host.run_cmd(
[
'sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py'
]
)
build_vars = ""
if branch == "nightly":
version = host.check_output(
["if [ -f vision/version.txt ]; then cat vision/version.txt; fi"]
).strip()
if len(version) == 0:
# In older revisions, version was embedded in setup.py
version = (
host.check_output(["grep", '"version = \'"', "vision/setup.py"])
.strip()
.split("'")[1][:-2]
)
build_date = (
host.check_output("cd vision && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
vision_wheel_name = host.list_dir("vision/dist")[0]
embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))
print("Copying TorchVision wheel")
host.download_wheel(os.path.join("vision", "dist", vision_wheel_name))
if run_smoke_tests:
host.run_cmd(
f"pip3 install {os.path.join('vision', 'dist', vision_wheel_name)}"
)
host.run_cmd("python3 vision/test/smoke_test.py")
print("Delete vision checkout")
host.run_cmd("rm -rf vision")
return vision_wheel_name
def build_torchdata(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str = "",
) -> str:
print("Checking out TorchData repo")
git_clone_flags += " --recurse-submodules"
build_version = checkout_repo(
host,
branch=branch,
url="https://github.com/pytorch/data",
git_clone_flags=git_clone_flags,
mapping={
"v1.13.1": ("0.5.1", ""),
"v2.0.0": ("0.6.0", "rc5"),
"v2.0.1": ("0.6.1", "rc1"),
},
)
print("Building TorchData wheel")
build_vars = ""
if branch == "nightly":
version = host.check_output(
["if [ -f data/version.txt ]; then cat data/version.txt; fi"]
).strip()
build_date = (
host.check_output("cd data && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")
wheel_name = host.list_dir("data/dist")[0]
embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))
print("Copying TorchData wheel")
host.download_wheel(os.path.join("data", "dist", wheel_name))
return wheel_name
def build_torchtext(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str = "",
) -> str:
print("Checking out TorchText repo")
git_clone_flags += " --recurse-submodules"
build_version = checkout_repo(
host,
branch=branch,
url="https://github.com/pytorch/text",
git_clone_flags=git_clone_flags,
mapping={
"v1.9.0": ("0.10.0", "rc1"),
"v1.10.0": ("0.11.0", "rc2"),
"v1.10.1": ("0.11.1", "rc1"),
"v1.10.2": ("0.11.2", "rc1"),
"v1.11.0": ("0.12.0", "rc1"),
"v1.12.0": ("0.13.0", "rc2"),
"v1.12.1": ("0.13.1", "rc5"),
"v1.13.0": ("0.14.0", "rc3"),
"v1.13.1": ("0.14.1", "rc1"),
"v2.0.0": ("0.15.1", "rc2"),
"v2.0.1": ("0.15.2", "rc2"),
},
)
print("Building TorchText wheel")
build_vars = ""
if branch == "nightly":
version = host.check_output(
["if [ -f text/version.txt ]; then cat text/version.txt; fi"]
).strip()
build_date = (
host.check_output("cd text && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")
wheel_name = host.list_dir("text/dist")[0]
embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))
print("Copying TorchText wheel")
host.download_wheel(os.path.join("text", "dist", wheel_name))
return wheel_name
def build_torchaudio(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str = "",
) -> str:
print("Checking out TorchAudio repo")
git_clone_flags += " --recurse-submodules"
build_version = checkout_repo(
host,
branch=branch,
url="https://github.com/pytorch/audio",
git_clone_flags=git_clone_flags,
mapping={
"v1.9.0": ("0.9.0", "rc2"),
"v1.10.0": ("0.10.0", "rc5"),
"v1.10.1": ("0.10.1", "rc1"),
"v1.10.2": ("0.10.2", "rc1"),
"v1.11.0": ("0.11.0", "rc1"),
"v1.12.0": ("0.12.0", "rc3"),
"v1.12.1": ("0.12.1", "rc5"),
"v1.13.0": ("0.13.0", "rc4"),
"v1.13.1": ("0.13.1", "rc2"),
"v2.0.0": ("2.0.1", "rc3"),
"v2.0.1": ("2.0.2", "rc2"),
},
)
print("Building TorchAudio wheel")
build_vars = ""
if branch == "nightly":
version = (
host.check_output(["grep", '"version = \'"', "audio/setup.py"])
.strip()
.split("'")[1][:-2]
)
build_date = (
host.check_output("cd audio && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
host.run_cmd(
f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
&& ./packaging/ffmpeg/build.sh \
&& {build_vars} python3 -m build --wheel --no-isolation"
)
wheel_name = host.list_dir("audio/dist")[0]
embed_libgomp(host, use_conda, os.path.join("audio", "dist", wheel_name))
print("Copying TorchAudio wheel")
host.download_wheel(os.path.join("audio", "dist", wheel_name))
return wheel_name
def configure_system(
host: RemoteHost,
*,
compiler: str = "gcc-8",
use_conda: bool = True,
python_version: str = "3.8",
) -> None:
if use_conda:
install_condaforge_python(host, python_version)
print("Configuring the system")
if not host.using_docker():
update_apt_repo(host)
host.run_cmd("sudo apt-get install -y ninja-build g++ git cmake gfortran unzip")
else:
host.run_cmd("yum install -y sudo")
host.run_cmd("conda install -y ninja scons")
if not use_conda:
host.run_cmd(
"sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
)
host.run_cmd("pip3 install dataclasses typing-extensions")
if not use_conda:
print("Installing Cython + numpy from PyPy")
host.run_cmd("sudo pip3 install Cython")
host.run_cmd("sudo pip3 install numpy")
def build_domains(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str = "",
) -> tuple[str, str, str, str]:
vision_wheel_name = build_torchvision(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
audio_wheel_name = build_torchaudio(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
data_wheel_name = build_torchdata(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
text_wheel_name = build_torchtext(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
return (vision_wheel_name, audio_wheel_name, data_wheel_name, text_wheel_name)
def start_build(
host: RemoteHost,
*,
branch: str = "main",
compiler: str = "gcc-8",
use_conda: bool = True,
python_version: str = "3.8",
pytorch_only: bool = False,
pytorch_build_number: Optional[str] = None,
shallow_clone: bool = True,
enable_mkldnn: bool = False,
) -> tuple[str, str, str, str, str]:
git_clone_flags = " --depth 1 --shallow-submodules" if shallow_clone else ""
if host.using_docker() and not use_conda:
print("Auto-selecting conda option for docker images")
use_conda = True
if not host.using_docker():
print("Disable mkldnn for host builds")
enable_mkldnn = False
configure_system(
host, compiler=compiler, use_conda=use_conda, python_version=python_version
)
if host.using_docker():
print("Move libgfortant.a into a standard location")
# HACK: pypa gforntran.a is compiled without PIC, which leads to the following error
# libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17' # noqa: E501, B950
# Workaround by copying gfortran library from the host
host.run_ssh_cmd("sudo apt-get install -y gfortran-8")
host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8")
host.run_ssh_cmd(
[
"docker",
"cp",
"/usr/lib/gcc/aarch64-linux-gnu/8/libgfortran.a",
f"{host.container_id}:/opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/",
]
)
print("Checking out PyTorch repo")
host.run_cmd(
f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
)
host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")
print("Building PyTorch wheel")
build_opts = ""
if pytorch_build_number is not None:
build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
# Breakpad build fails on aarch64
build_vars = "USE_BREAKPAD=0 "
if branch == "nightly":
build_date = (
host.check_output("cd pytorch && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
version = host.check_output("cat pytorch/version.txt").strip()[:-2]
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
if branch.startswith(("v1.", "v2.")):
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
if enable_mkldnn:
host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
print("build pytorch with mkldnn+acl backend")
build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
build_vars += " BLAS=OpenBLAS"
build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
build_vars += " ACL_ROOT_DIR=/acl"
host.run_cmd(
f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
)
print("Repair the wheel")
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
host.run_cmd(
f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
)
print("replace the original wheel with the repaired one")
pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0]
host.run_cmd(
f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}"
)
else:
print("build pytorch without mkldnn backend")
host.run_cmd(
f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
)
print("Deleting build folder")
host.run_cmd("cd pytorch && rm -rf build")
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
embed_libgomp(host, use_conda, os.path.join("pytorch", "dist", pytorch_wheel_name))
print("Copying the wheel")
host.download_wheel(os.path.join("pytorch", "dist", pytorch_wheel_name))
print("Installing PyTorch wheel")
host.run_cmd(f"pip3 install pytorch/dist/{pytorch_wheel_name}")
if pytorch_only:
return (pytorch_wheel_name, None, None, None, None)
domain_wheels = build_domains(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
return (pytorch_wheel_name, *domain_wheels)
embed_library_script = """
#!/usr/bin/env python3
from auditwheel.patcher import Patchelf
from auditwheel.wheeltools import InWheelCtx
from auditwheel.elfutils import elf_file_filter
from auditwheel.repair import copylib
from auditwheel.lddtree import lddtree
from subprocess import check_call
import os
import shutil
import sys
from tempfile import TemporaryDirectory
def replace_tag(filename):
with open(filename, 'r') as f:
lines = f.read().split("\\n")
for i,line in enumerate(lines):
if not line.startswith("Tag: "):
continue
lines[i] = line.replace("-linux_", "-manylinux2014_")
print(f'Updated tag from {line} to {lines[i]}')
with open(filename, 'w') as f:
f.write("\\n".join(lines))
class AlignedPatchelf(Patchelf):
def set_soname(self, file_name: str, new_soname: str) -> None:
check_call(['patchelf', '--page-size', '65536', '--set-soname', new_soname, file_name])
def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
check_call(['patchelf', '--page-size', '65536', '--replace-needed', soname, new_soname, file_name])
def embed_library(whl_path, lib_soname, update_tag=False):
patcher = AlignedPatchelf()
out_dir = TemporaryDirectory()
whl_name = os.path.basename(whl_path)
tmp_whl_name = os.path.join(out_dir.name, whl_name)
with InWheelCtx(whl_path) as ctx:
torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib')
ctx.out_wheel=tmp_whl_name
new_lib_path, new_lib_soname = None, None
for filename, elf in elf_file_filter(ctx.iter_files()):
if not filename.startswith('torch/lib'):
continue
libtree = lddtree(filename)
if lib_soname not in libtree['needed']:
continue
lib_path = libtree['libs'][lib_soname]['path']
if lib_path is None:
print(f"Can't embed {lib_soname} as it could not be found")
break
if lib_path.startswith(torchlib_path):
continue
if new_lib_path is None:
new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
patcher.replace_needed(filename, lib_soname, new_lib_soname)
print(f'Replacing {lib_soname} with {new_lib_soname} for {filename}')
if update_tag:
# Add manylinux2014 tag
for filename in ctx.iter_files():
if os.path.basename(filename) != 'WHEEL':
continue
replace_tag(filename)
shutil.move(tmp_whl_name, whl_path)
if __name__ == '__main__':
embed_library(sys.argv[1], 'libgomp.so.1', len(sys.argv) > 2 and sys.argv[2] == '--update-tag')
"""
def run_tests(host: RemoteHost, whl: str, branch="main") -> None:
print("Configuring the system")
update_apt_repo(host)
host.run_cmd("sudo apt-get install -y python3-pip git")
host.run_cmd("sudo pip3 install Cython")
host.run_cmd("sudo pip3 install numpy")
host.upload_file(whl, ".")
host.run_cmd(f"sudo pip3 install {whl}")
host.run_cmd("python3 -c 'import torch;print(torch.rand((3,3))'")
host.run_cmd(f"git clone -b {branch} https://github.com/pytorch/pytorch")
host.run_cmd("cd pytorch/test; python3 test_torch.py -v")
def get_instance_name(instance) -> Optional[str]:
if instance.tags is None:
return None
for tag in instance.tags:
if tag["Key"] == "Name":
return tag["Value"]
return None
def list_instances(instance_type: str) -> None:
print(f"All instances of type {instance_type}")
for instance in ec2_instances_of_type(instance_type):
ifaces = instance.network_interfaces
az = ifaces[0].subnet.availability_zone if len(ifaces) > 0 else None
print(
f"{instance.id} {get_instance_name(instance)} {instance.public_dns_name} {instance.state['Name']} {az}"
)
def terminate_instances(instance_type: str) -> None:
print(f"Terminating all instances of type {instance_type}")
instances = list(ec2_instances_of_type(instance_type))
for instance in instances:
print(f"Terminating {instance.id}")
instance.terminate()
print("Waiting for termination to complete")
for instance in instances:
instance.wait_until_terminated()
def parse_arguments():
from argparse import ArgumentParser
parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
parser.add_argument("--key-name", type=str)
parser.add_argument("--debug", action="store_true")
parser.add_argument("--build-only", action="store_true")
parser.add_argument("--test-only", type=str)
group = parser.add_mutually_exclusive_group()
group.add_argument("--os", type=str, choices=list(os_amis.keys()))
group.add_argument("--ami", type=str)
parser.add_argument(
"--python-version",
type=str,
choices=[f"3.{d}" for d in range(6, 12)],
default=None,
)
parser.add_argument("--alloc-instance", action="store_true")
parser.add_argument("--list-instances", action="store_true")
parser.add_argument("--pytorch-only", action="store_true")
parser.add_argument("--keep-running", action="store_true")
parser.add_argument("--terminate-instances", action="store_true")
parser.add_argument("--instance-type", type=str, default="t4g.2xlarge")
parser.add_argument("--ebs-size", type=int, default=50)
parser.add_argument("--branch", type=str, default="main")
parser.add_argument("--use-docker", action="store_true")
parser.add_argument(
"--compiler",
type=str,
choices=["gcc-7", "gcc-8", "gcc-9", "clang"],
default="gcc-8",
)
parser.add_argument("--use-torch-from-pypi", action="store_true")
parser.add_argument("--pytorch-build-number", type=str, default=None)
parser.add_argument("--disable-mkldnn", action="store_true")
return parser.parse_args()
if __name__ == "__main__":
args = parse_arguments()
ami = (
args.ami
if args.ami is not None
else os_amis[args.os]
if args.os is not None
else ubuntu20_04_ami
)
keyfile_path, key_name = compute_keyfile_path(args.key_name)
if args.list_instances:
list_instances(args.instance_type)
sys.exit(0)
if args.terminate_instances:
terminate_instances(args.instance_type)
sys.exit(0)
if len(key_name) == 0:
raise RuntimeError("""
Cannot start build without key_name, please specify
--key-name argument or AWS_KEY_NAME environment variable.""")
if len(keyfile_path) == 0 or not os.path.exists(keyfile_path):
raise RuntimeError(f"""
Cannot find keyfile with name: [{key_name}] in path: [{keyfile_path}], please
check `~/.ssh/` folder or manually set SSH_KEY_PATH environment variable.""")
# Starting the instance
inst = start_instance(
key_name, ami=ami, instance_type=args.instance_type, ebs_size=args.ebs_size
)
instance_name = f"{args.key_name}-{args.os}"
if args.python_version is not None:
instance_name += f"-py{args.python_version}"
inst.create_tags(
DryRun=False,
Tags=[
{
"Key": "Name",
"Value": instance_name,
}
],
)
addr = inst.public_dns_name
wait_for_connection(addr, 22)
host = RemoteHost(addr, keyfile_path)
host.ami = ami
if args.use_docker:
update_apt_repo(host)
host.start_docker()
if args.test_only:
run_tests(host, args.test_only)
sys.exit(0)
if args.alloc_instance:
if args.python_version is None:
sys.exit(0)
install_condaforge_python(host, args.python_version)
sys.exit(0)
python_version = args.python_version if args.python_version is not None else "3.10"
if args.use_torch_from_pypi:
configure_system(host, compiler=args.compiler, python_version=python_version)
print("Installing PyTorch wheel")
host.run_cmd("pip3 install torch")
build_domains(
host, branch=args.branch, git_clone_flags=" --depth 1 --shallow-submodules"
)
else:
start_build(
host,
branch=args.branch,
compiler=args.compiler,
python_version=python_version,
pytorch_only=args.pytorch_only,
pytorch_build_number=args.pytorch_build_number,
enable_mkldnn=not args.disable_mkldnn,
)
if not args.keep_running:
print(f"Waiting for instance {inst.id} to terminate")
inst.terminate()
inst.wait_until_terminated()

View File

@ -0,0 +1,87 @@
#!/usr/bin/env python3
import os
import shutil
import sys
from subprocess import check_call
from tempfile import TemporaryDirectory
from auditwheel.elfutils import elf_file_filter
from auditwheel.lddtree import lddtree
from auditwheel.patcher import Patchelf
from auditwheel.repair import copylib
from auditwheel.wheeltools import InWheelCtx
def replace_tag(filename):
with open(filename) as f:
lines = f.read().split("\n")
for i, line in enumerate(lines):
if not line.startswith("Tag: "):
continue
lines[i] = line.replace("-linux_", "-manylinux2014_")
print(f"Updated tag from {line} to {lines[i]}")
with open(filename, "w") as f:
f.write("\\n".join(lines))
class AlignedPatchelf(Patchelf):
def set_soname(self, file_name: str, new_soname: str) -> None:
check_call(
["patchelf", "--page-size", "65536", "--set-soname", new_soname, file_name]
)
def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
check_call(
[
"patchelf",
"--page-size",
"65536",
"--replace-needed",
soname,
new_soname,
file_name,
]
)
def embed_library(whl_path, lib_soname, update_tag=False):
patcher = AlignedPatchelf()
out_dir = TemporaryDirectory()
whl_name = os.path.basename(whl_path)
tmp_whl_name = os.path.join(out_dir.name, whl_name)
with InWheelCtx(whl_path) as ctx:
torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib")
ctx.out_wheel = tmp_whl_name
new_lib_path, new_lib_soname = None, None
for filename, _ in elf_file_filter(ctx.iter_files()):
if not filename.startswith("torch/lib"):
continue
libtree = lddtree(filename)
if lib_soname not in libtree["needed"]:
continue
lib_path = libtree["libs"][lib_soname]["path"]
if lib_path is None:
print(f"Can't embed {lib_soname} as it could not be found")
break
if lib_path.startswith(torchlib_path):
continue
if new_lib_path is None:
new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
patcher.replace_needed(filename, lib_soname, new_lib_soname)
print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}")
if update_tag:
# Add manylinux2014 tag
for filename in ctx.iter_files():
if os.path.basename(filename) != "WHEEL":
continue
replace_tag(filename)
shutil.move(tmp_whl_name, whl_path)
if __name__ == "__main__":
embed_library(
sys.argv[1], "libgomp.so.1", len(sys.argv) > 2 and sys.argv[2] == "--update-tag"
)
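A minimal usage sketch for the helper above; the module/file name embed_library and the wheel path are assumptions for illustration, not part of this change. It embeds libgomp.so.1 into a locally built aarch64 wheel and, with update_tag set, rewrites the WHEEL tag to manylinux2014:
# illustrative only; module name and wheel filename are assumed
from embed_library import embed_library
embed_library(
    "dist/torch-2.10.0.dev20251118-cp310-cp310-linux_aarch64.whl",
    "libgomp.so.1",
    update_tag=True,
)
The equivalent command-line form, per the __main__ block above, is python3 embed_library.py <wheel> --update-tag.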

View File

@ -4,17 +4,14 @@ set -ex
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
# Source the common build script for architecture-specific configurations (MKLDNN, ACL, etc.)
source "${SCRIPTPATH}/../pytorch/build.sh" || true
case "${GPU_ARCH_TYPE:-BLANK}" in
cuda | cuda-aarch64)
cuda)
bash "${SCRIPTPATH}/build_cuda.sh"
;;
rocm)
bash "${SCRIPTPATH}/build_rocm.sh"
;;
cpu | cpu-cxx11-abi | cpu-aarch64 | cpu-s390x)
cpu | cpu-cxx11-abi | cpu-s390x)
bash "${SCRIPTPATH}/build_cpu.sh"
;;
xpu)

View File

@ -18,31 +18,12 @@ retry () {
$* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}
# Detect architecture first
ARCH=$(uname -m)
echo "Detected architecture: $ARCH"
PLATFORM=""
# TODO move this into the Docker images
OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
retry yum install -q -y zip openssl
# Set platform based on architecture
case $ARCH in
x86_64)
PLATFORM="manylinux_2_28_x86_64"
;;
aarch64)
PLATFORM="manylinux_2_28_aarch64"
;;
s390x)
PLATFORM="manylinux_2_28_s390x"
;;
*)
echo "Unsupported architecture: $ARCH"
exit 1
;;
esac
PLATFORM="manylinux_2_28_x86_64"
elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
retry dnf install -q -y zip openssl
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
@ -57,8 +38,6 @@ else
exit 1
fi
echo "Platform set to: $PLATFORM"
# We use the package name to test the package by passing this to 'pip install'
# This is the env variable that setup.py uses to name the package. Note that
# pip 'normalizes' the name first by changing all - to _
@ -320,8 +299,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
# ROCm workaround for roctracer dlopens
if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
patchedpath=$(fname_without_so_number $destpath)
# Keep the so number for XPU dependencies, libgomp.so.1, ACL libraries, and NVPL libraries to avoid twice load
elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" || "$filename" == libarm_compute* || "$filename" == libnvpl* || "$filename" == "libgfortran.so.5" ]]; then
# Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load
elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
patchedpath=$destpath
else
patchedpath=$(fname_with_sha256 $destpath)
@ -367,22 +346,9 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
done
# create the manylinux 2_28 tag; this needs to happen before regenerating the RECORD
# Support all architectures (x86_64, aarch64, s390x)
if [[ "$IS_MANYLINUX2_28" == "1" && $GPU_ARCH_TYPE != "xpu" ]]; then
if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
wheel_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/WHEEL/g')
echo "Updating wheel tag for $ARCH architecture"
# Replace linux_* with manylinux_2_28_* based on architecture
case $ARCH in
x86_64)
sed -i -e 's#linux_x86_64#manylinux_2_28_x86_64#g' $wheel_file
;;
aarch64)
sed -i -e 's#linux_aarch64#manylinux_2_28_aarch64#g' $wheel_file
;;
s390x)
sed -i -e 's#linux_s390x#manylinux_2_28_s390x#g' $wheel_file
;;
esac
sed -i -e s#linux_x86_64#"${PLATFORM}"# $wheel_file;
fi
# regenerate the RECORD file with new hashes

View File

@ -15,10 +15,6 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
EXTRA_CAFFE2_CMAKE_FLAGS=()
fi
# Detect architecture
ARCH=$(uname -m)
echo "Building CPU wheel for architecture: $ARCH"
WHEELHOUSE_DIR="wheelhousecpu"
LIBTORCH_HOUSE_DIR="libtorch_housecpu"
if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
@ -38,10 +34,8 @@ elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
if [[ "$ARCH" == "s390x" ]]; then
if [[ "$(uname -m)" == "s390x" ]]; then
LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
elif [[ "$ARCH" == "aarch64" ]]; then
LIBGOMP_PATH="/usr/lib/aarch64-linux-gnu/libgomp.so.1"
else
LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
fi
@ -55,34 +49,6 @@ DEPS_SONAME=(
"libgomp.so.1"
)
# Add ARM-specific library dependencies for CPU builds
if [[ "$ARCH" == "aarch64" ]]; then
echo "Adding ARM-specific CPU library dependencies"
# ARM Compute Library (if available)
if [[ -d "/acl/build" ]]; then
echo "Adding ARM Compute Library for CPU"
DEPS_LIST+=(
"/acl/build/libarm_compute.so"
"/acl/build/libarm_compute_graph.so"
)
DEPS_SONAME+=(
"libarm_compute.so"
"libarm_compute_graph.so"
)
fi
# ARM system libraries
DEPS_LIST+=(
"/usr/lib64/libgfortran.so.5"
"/opt/OpenBLAS/lib/libopenblas.so.0"
)
DEPS_SONAME+=(
"libgfortran.so.5"
"libopenblas.so.0"
)
fi
rm -rf /usr/local/cuda*
SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"

View File

@ -29,10 +29,6 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
EXTRA_CAFFE2_CMAKE_FLAGS=()
fi
# Detect architecture
ARCH=$(uname -m)
echo "Building for architecture: $ARCH"
# Determine CUDA version and architectures to build for
#
# NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
@ -57,60 +53,34 @@ fi
cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
# Function to remove architectures from a list
remove_archs() {
local result="$1"
shift
for arch in "$@"; do
result="${result//${arch};/}"
done
echo "$result"
}
# Function to filter CUDA architectures for aarch64
# aarch64 ARM GPUs only support certain compute capabilities
# Keep: 8.0 (A100), 9.0+ (Hopper, Grace Hopper, newer)
# Remove: < 8.0 (no ARM GPUs), 8.6 (x86_64 RTX 3090/A6000 only)
filter_aarch64_archs() {
local arch_list="$1"
# Explicitly remove architectures not needed on aarch64
arch_list=$(remove_archs "$arch_list" "5.0" "6.0" "7.0" "7.5" "8.6")
echo "$arch_list"
}
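# Example (illustrative): with the common base list below,
#   filter_aarch64_archs "7.0;7.5;8.0;8.6;9.0"
# prints "8.0;9.0", since remove_archs strips each "<arch>;" token it is given.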
# Base: Common architectures across all modern CUDA versions
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0"
case ${CUDA_VERSION} in
12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;${TORCH_CUDA_ARCH_LIST}" ;; # Only 12.6 includes Legacy Maxwell/Pascal that will be removed in future releases
12.8) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0" ;; # +Hopper/Blackwell support
12.9) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0+PTX" # +Hopper/Blackwell support + PTX for forward compatibility
# Removing sm_50-sm_60 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases.
# However, we would like to keep the sm_70 architecture; see: https://github.com/pytorch/pytorch/issues/157517
12.8)
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0"
;;
12.9)
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX"
# WAR to resolve the ld error in libtorch build with CUDA 12.9
if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//7.0;/}" # Remove 7.0 to resolve the ld error
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//8.6;/}" # Remove 8.6 for libtorch
TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
fi
;;
13.0)
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0+PTX"
export TORCH_NVCC_FLAGS="-compress-mode=size"
export BUILD_BUNDLE_PTXAS=1
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
;;
12.6)
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
;;
*)
echo "unknown cuda version $CUDA_VERSION"
exit 1
;;
*) echo "unknown cuda version $CUDA_VERSION"; exit 1 ;;
esac
# Filter for aarch64: Remove < 8.0 and 8.6
[[ "$ARCH" == "aarch64" ]] && TORCH_CUDA_ARCH_LIST=$(filter_aarch64_archs "$TORCH_CUDA_ARCH_LIST")
echo "TORCH_CUDA_ARCH_LIST set to: $TORCH_CUDA_ARCH_LIST"
export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
echo "${TORCH_CUDA_ARCH_LIST}"
# Disable MAGMA for aarch64 as pre-built libraries are x86-64 only
if [[ "$ARCH" == "aarch64" ]]; then
echo "Disabling MAGMA for aarch64 architecture"
export USE_MAGMA=0
fi
# Package directories
WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot"
LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot"
@ -274,51 +244,6 @@ else
exit 1
fi
# Add ARM-specific library dependencies
if [[ "$ARCH" == "aarch64" ]]; then
echo "Adding ARM-specific library dependencies"
# ARM Compute Library (if available)
if [[ -d "/acl/build" ]]; then
echo "Adding ARM Compute Library"
DEPS_LIST+=(
"/acl/build/libarm_compute.so"
"/acl/build/libarm_compute_graph.so"
)
DEPS_SONAME+=(
"libarm_compute.so"
"libarm_compute_graph.so"
)
fi
# ARM system libraries
DEPS_LIST+=(
"/lib64/libgomp.so.1"
"/usr/lib64/libgfortran.so.5"
)
DEPS_SONAME+=(
"libgomp.so.1"
"libgfortran.so.5"
)
# NVPL libraries (ARM optimized BLAS/LAPACK)
if [[ -d "/usr/local/lib" && -f "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0" ]]; then
echo "Adding NVPL libraries for ARM"
DEPS_LIST+=(
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0"
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0"
"/usr/local/lib/libnvpl_lapack_core.so.0"
"/usr/local/lib/libnvpl_blas_core.so.0"
)
DEPS_SONAME+=(
"libnvpl_lapack_lp64_gomp.so.0"
"libnvpl_blas_lp64_gomp.so.0"
"libnvpl_lapack_core.so.0"
"libnvpl_blas_core.so.0"
)
fi
fi
# run_tests.sh requires DESIRED_CUDA to know what tests to exclude
export DESIRED_CUDA="$cuda_version_nodot"
@ -326,11 +251,9 @@ export DESIRED_CUDA="$cuda_version_nodot"
rm -rf /usr/local/cuda || true
ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda
# Switch `/usr/local/magma` to the desired CUDA version (skip for aarch64)
if [[ "$ARCH" != "aarch64" ]]; then
rm -rf /usr/local/magma || true
ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
fi
# Switch `/usr/local/magma` to the desired CUDA version
rm -rf /usr/local/magma || true
ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0

View File

@ -21,3 +21,87 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
fi
mkdir -p "$pytest_reports_dir" || true
##########################################
# copied from .ci/pytorch/common_utils.sh
##########################################
function get_pinned_commit() {
cat .github/ci_commit_pins/"${1}".txt
}
function pip_install_whl() {
# This is used to install the PyTorch wheel and other build-artifact wheels locally
# without using any network connection
# Convert the input arguments into an array
local args=("$@")
# Check if the first argument contains multiple paths separated by spaces
if [[ "${args[0]}" == *" "* ]]; then
# Split the string by spaces into an array
IFS=' ' read -r -a paths <<< "${args[0]}"
# Loop through each path and install individually
for path in "${paths[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
done
else
# Loop through each argument and install individually
for path in "${args[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
done
fi
}
function pip_build_and_install() {
local build_target=$1
local wheel_dir=$2
local found_whl=0
for file in "${wheel_dir}"/*.whl
do
if [[ -f "${file}" ]]; then
found_whl=1
break
fi
done
# Build the wheel if it doesn't exist
if [ "${found_whl}" == "0" ]; then
python3 -m pip wheel \
--no-build-isolation \
--no-deps \
-w "${wheel_dir}" \
"${build_target}"
fi
for file in "${wheel_dir}"/*.whl
do
pip_install_whl "${file}"
done
}
function install_torchvision() {
local orig_preload
local commit
commit=$(get_pinned_commit vision)
orig_preload=${LD_PRELOAD}
if [ -n "${LD_PRELOAD}" ]; then
# Silence dlerror to work-around glibc ASAN bug, see https://sourceware.org/bugzilla/show_bug.cgi?id=27653#c9
echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c -
LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so
fi
if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
# Not sure if both are needed, but why not
export FORCE_CUDA=1
export WITH_CUDA=1
fi
pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision
if [ -n "${LD_PRELOAD}" ]; then
LD_PRELOAD=${orig_preload}
fi
}

View File

@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace
if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# TODO: This can be removed later once vision is also part of the Docker image
pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
install_torchvision
# JIT C++ extensions require ninja, so put it into PATH.
export PATH="/var/lib/jenkins/.local/bin:$PATH"
# NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we

View File

@ -86,20 +86,14 @@ else
fi
fi
# Enable MKLDNN with ARM Compute Library for ARM builds
if [[ "$BUILD_ENVIRONMENT" == *zen* ]]; then
export USE_ZENDNN=1
fi
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
export USE_MKLDNN=1
# ACL is required for aarch64 builds
if [[ ! -d "/acl" ]]; then
echo "ERROR: ARM Compute Library not found at /acl"
echo "ACL is required for aarch64 builds. Check Docker image setup."
exit 1
fi
export USE_MKLDNN_ACL=1
export ACL_ROOT_DIR=/acl
echo "ARM Compute Library enabled for MKLDNN: ACL_ROOT_DIR=/acl"
fi
if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then

View File

@ -1250,6 +1250,97 @@ test_custom_script_ops() {
assert_git_not_dirty
}
test_libtorch_agnostic_targetting() {
echo "Testing libtorch_agnostic runs correctly on TORCH_TARGET_VERSION"
REPO_DIR=$(pwd)
WHEEL_DIR="${REPO_DIR}/test/cpp_extensions/.wheels"
# Build wheel with current PyTorch (this has TORCH_TARGET_VERSION 2_9_0)
echo "Building 2.9 extension wheel with current PyTorch..."
pushd test/cpp_extensions/libtorch_agnostic_2_9_extension
time python setup.py bdist_wheel
# Save the wheel
mkdir -p "$WHEEL_DIR"
cp dist/*.whl "$WHEEL_DIR/"
WHEEL_FILE=$(find "$WHEEL_DIR" -maxdepth 1 -name "*.whl" -type f | head -1)
echo "Built wheel: $(basename "$WHEEL_FILE")"
popd
# Create venv and install PyTorch 2.9
python -m venv venv_pytorch_2_9
# shellcheck disable=SC1091
. venv_pytorch_2_9/bin/activate
# Clear PYTHONPATH to avoid using the development PyTorch
echo "Clearing PYTHONPATH to use only venv packages..."
unset PYTHONPATH
# Upgrade pip to latest version
echo "Upgrading pip to latest version..."
pip install --upgrade pip
pip --version
echo "Installing PyTorch 2.9..."
# Install from release channel only
PYTORCH_VERSION="2.9.0"
# Extract CUDA version from BUILD_ENVIRONMENT (e.g., "cuda12.1" -> "cu121")
if [[ "$BUILD_ENVIRONMENT" =~ cuda([0-9]+)\.([0-9]+) ]]; then
CUDA_MAJOR="${BASH_REMATCH[1]}"
CUDA_MINOR="${BASH_REMATCH[2]}"
CUDA_VERSION="cu${CUDA_MAJOR}${CUDA_MINOR}"
echo " Detected CUDA ${CUDA_MAJOR}.${CUDA_MINOR} from BUILD_ENVIRONMENT, using ${CUDA_VERSION}"
else
# Default to CPU build
CUDA_VERSION="cpu"
echo " No CUDA detected in BUILD_ENVIRONMENT, using CPU build"
fi
if pip install torch=="${PYTORCH_VERSION}" --index-url https://download.pytorch.org/whl/${CUDA_VERSION}/; then
echo "Installed PyTorch ${PYTORCH_VERSION} from release channel (${CUDA_VERSION})"
else
echo " FAILED to install PyTorch 2.9.0 from release channel"
echo " URL: https://download.pytorch.org/whl/${CUDA_VERSION}/"
deactivate
rm -rf venv_pytorch_2_9
return 1
fi
INSTALLED_VERSION=$(python -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown")
echo " Installed version: $INSTALLED_VERSION"
# Install test dependencies
echo "Installing test dependencies..."
pip install expecttest numpy unittest-xml-reporting
# Install the pre-built wheel
echo ""
echo "Installing pre-built 2.9 extension wheel (built with PyTorch 2.10)..."
pip install "$WHEEL_FILE"
echo "Installed $(basename "$WHEEL_FILE") into PyTorch 2.9 environment"
# Run tests with PyTorch 2.9 runtime (2.10 tests will be skipped automatically)
echo ""
echo "Running tests with PyTorch 2.9 runtime (using wheel built on PyTorch 2.10)..."
if time python test/cpp_extensions/test_libtorch_agnostic.py -v; then
echo ""
echo " Wheel built with current torch and TORCH_TARGET_VERSION 2_9_0 works with PyTorch 2.9 runtime!"
else
echo "targeting test failed"
deactivate
rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
return 1
fi
deactivate
rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
assert_git_not_dirty
}
test_jit_hooks() {
echo "Testing jit hooks in cpp"
HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
@ -1722,6 +1813,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]];
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
test_forward_backward_compatibility
# Do NOT add tests after bc check tests, see its comment.
elif [[ "${TEST_CONFIG}" == *libtorch_agnostic_targetting* ]]; then
test_libtorch_agnostic_targetting
elif [[ "${TEST_CONFIG}" == *xla* ]]; then
install_torchvision
build_xla

View File

@ -260,8 +260,11 @@ jobs:
"${DOCKER_IMAGE}"
)
docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
# Unified build script for all architectures (x86_64, aarch64, s390x)
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
else
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
fi
- name: Chown artifacts
if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}

View File

@ -80,7 +80,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-zen-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -106,7 +106,7 @@ jobs:
needs: inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-zen-py3.10-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
@ -122,7 +122,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-zen-py3.10-gcc11-build
dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}

View File

@ -70,6 +70,7 @@ jobs:
{ config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit

View File

@ -83,6 +83,7 @@ jobs:
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
{ config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit

3
.gitmodules vendored
View File

@ -132,3 +132,6 @@
[submodule "third_party/aiter"]
path = third_party/aiter
url = https://github.com/ROCm/aiter.git
[submodule "third_party/ZenDNN"]
path = third_party/ZenDNN
url = https://github.com/amd/ZenDNN.git

View File

@ -82,6 +82,7 @@ include_patterns = [
'aten/src/ATen/native/mkldnn/xpu/**/*.cpp',
'aten/src/ATen/native/Tensor*.h',
'aten/src/ATen/native/Tensor*.cpp',
'aten/src/ATen/native/zendnn/*.*',
'c10/**/*.h',
'c10/**/*.cpp',
'torch/csrc/**/*.h',

View File

@ -205,6 +205,11 @@ filegroup(
srcs = glob(["aten/src/ATen/native/xnnpack/*.cpp"]),
)
filegroup(
name = "aten_native_zendnn_cpp",
srcs = glob(["aten/src/ATen/native/zendnn/*.cpp"]),
)
filegroup(
name = "aten_base_vulkan",
srcs = glob(["aten/src/ATen/vulkan/*.cpp"]),
@ -285,6 +290,7 @@ header_template_rule(
"@AT_BLAS_USE_CBLAS_DOT@": "1",
"@AT_KLEIDIAI_ENABLED@": "0",
"@AT_USE_EIGEN_SPARSE@": "0",
"@AT_ZENDNN_ENABLED@": "0",
},
)
@ -365,6 +371,7 @@ cc_library(
":aten_native_sparse_cpp",
":aten_native_transformers_cpp",
":aten_native_xnnpack",
":aten_native_zendnn_cpp",
":aten_src_ATen_config",
] + generated_cpu_cpp + aten_ufunc_generated_cpu_sources("aten/src/ATen/{}"),
copts = ATEN_COPTS,

View File

@ -336,6 +336,21 @@ set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN})
cmake_dependent_option(USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF "USE_MKLDNN"
OFF)
option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF)
# ZenDNN is off by default and is only enabled through an explicit user setting on x86_64/AMD64
option(USE_ZENDNN
"Build with ZENDNN support"
OFF)
if(USE_ZENDNN AND NOT CPU_INTEL)
message(WARNING
"USE_ZENDNN was requested, but the target processor "
"(${CMAKE_SYSTEM_PROCESSOR}) is not AMD64/x86_64. "
"ZENDNN support will be disabled.")
# Switch it off in the cache so the GUI / subsequent runs see the change
set(USE_ZENDNN OFF CACHE BOOL "Build with ZENDNN support" FORCE)
endif()
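# Example (illustrative, not part of this change): configure with -DUSE_ZENDNN=ON,
# or export USE_ZENDNN=1 before a setup.py build so the flag is forwarded to CMake.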
cmake_dependent_option(
USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON
"USE_DISTRIBUTED" OFF)
@ -1385,6 +1400,7 @@ if(BUILD_SHARED_LIBS)
${PROJECT_SOURCE_DIR}/cmake/public/gflags.cmake
${PROJECT_SOURCE_DIR}/cmake/public/mkl.cmake
${PROJECT_SOURCE_DIR}/cmake/public/mkldnn.cmake
${PROJECT_SOURCE_DIR}/cmake/public/zendnn.cmake
${PROJECT_SOURCE_DIR}/cmake/public/protobuf.cmake
${PROJECT_SOURCE_DIR}/cmake/public/utils.cmake
${PROJECT_SOURCE_DIR}/cmake/public/LoadHIP.cmake

View File

@ -93,6 +93,7 @@ file(GLOB mkldnn_xpu_cpp "native/mkldnn/xpu/*.cpp" "native/mkldnn/xpu/detail/*.c
file(GLOB native_cpp "native/*.cpp")
file(GLOB native_mkl_cpp "native/mkl/*.cpp")
file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp")
file(GLOB native_zendnn_cpp "native/zendnn/*.cpp")
file(GLOB vulkan_cpp "vulkan/*.cpp")
file(GLOB native_vulkan_cpp "native/vulkan/*.cpp" "native/vulkan/api/*.cpp" "native/vulkan/impl/*.cpp" "native/vulkan/ops/*.cpp")
@ -378,7 +379,7 @@ if(BUILD_LITE_INTERPRETER)
append_filelist("aten_native_source_non_codegen_list" all_cpu_cpp)
else()
set(
all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp}
all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_zendnn_cpp}
${native_ao_sparse_cpp} ${native_sparse_cpp} ${native_nested_cpp}
${native_quantized_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp}
${native_transformers_cpp}

View File

@ -21,3 +21,4 @@
#define AT_BLAS_USE_CBLAS_DOT() @AT_BLAS_USE_CBLAS_DOT@
#define AT_KLEIDIAI_ENABLED() @AT_KLEIDIAI_ENABLED@
#define AT_USE_EIGEN_SPARSE() @AT_USE_EIGEN_SPARSE@
#define AT_ZENDNN_ENABLED() @AT_ZENDNN_ENABLED@

View File

@ -664,6 +664,14 @@ bool Context::hasEigenSparse() {
#endif
}
bool Context::hasZenDNN() {
#if AT_ZENDNN_ENABLED()
return true;
#else
return false;
#endif
}
at::QEngine Context::qEngine() const {
static auto _quantized_engine = []() {
at::QEngine qengine = at::kNoQEngine;

View File

@ -150,6 +150,7 @@ class TORCH_API Context {
static bool hasMKL();
static bool hasKleidiAI();
static bool hasLAPACK();
static bool hasZenDNN();
static bool hasMKLDNN();
static bool ckSupported();
static bool hasEigenSparse();
@ -639,6 +640,10 @@ inline bool hasEigenSparse() {
return globalContext().hasEigenSparse();
}
inline bool hasZenDNN() {
return globalContext().hasZenDNN();
}
inline bool hasMAGMA() {
return globalContext().hasMAGMA();
}

View File

@ -130,4 +130,29 @@ uint32_t L2_cache_size() {
return get_cache_size(2);
}
bool is_amd_cpu() {
#if !defined(__s390x__) && !defined(__powerpc__)
auto check_amd_vendor = []() -> bool {
if (!cpuinfo_initialize()) {
return false;
}
const uint32_t num_cores = cpuinfo_get_cores_count();
if (num_cores <= 0) {
return false;
}
// Get first core information
const struct cpuinfo_core* core = cpuinfo_get_core(0);
if (!core) {
return false;
}
// Check AMD vendor support
return (core->vendor == cpuinfo_vendor_amd);
};
static bool is_amd = check_amd_vendor();
return is_amd;
#else
return false;
#endif
}
} // namespace at::cpu

View File

@ -9,6 +9,9 @@ namespace at::cpu {
TORCH_API bool is_avx2_supported();
TORCH_API bool is_avx512_supported();
// Detect if CPU is AMD Zen4 or newer.
TORCH_API bool is_amd_cpu();
// Detect if CPU support Vector Neural Network Instruction.
TORCH_API bool is_avx512_vnni_supported();
@ -30,4 +33,7 @@ TORCH_API uint32_t L1d_cache_size();
// Get the L2 cache size per core in Byte
TORCH_API uint32_t L2_cache_size();
// Detect if CPU is AMD.
TORCH_API bool is_amd_cpu();
} // namespace at::cpu

View File

@ -20,6 +20,7 @@
#include <ATen/native/Resize.h>
#include <ATen/native/mkldnn/Matmul.h>
#include <ATen/native/mkldnn/Utils.h>
#include <ATen/native/zendnn/Matmul.h>
#include <ATen/cpu/Utils.h>
#include <c10/core/GradMode.h>
#include <c10/util/accumulate.h>
@ -1396,6 +1397,7 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) {
return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size;
}
#endif
static void addmm_impl_cpu_(
Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) {
TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2);
@ -1728,7 +1730,6 @@ static void baddbmm_with_gemm_(const Tensor &result, const Tensor &mat1, const T
result.data_ptr<scalar_t>(), ldc, result_strides[0]);
});
}
// This tries to apply some optimizations to bmm/baddbmm:
// - When the operand size is small, computation are parallelized over the batch
// dimension using OMP and naive matrix multiplication is applied.
@ -1751,6 +1752,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
int64_t res_rows = batch1_sizes[1];
int64_t res_cols = batch2_sizes[2];
// handle pathological cases that blas may not like
if (self_or_result.numel() == 0) {
return;
@ -1771,6 +1773,19 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) ||
(strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1]));
};
#if AT_ZENDNN_ENABLED()
if(at::cpu::is_amd_cpu()
&& at::cpu::is_avx512_supported()
&& self_or_result.scalar_type() == kBFloat16
&& self_or_result.is_contiguous()
&& self_or_result.sizes()[0] > 1)
{
zendnn_baddbmm(self_or_result, batch1, batch2, beta.to<float>(), alpha.to<float>());
return;
}
#endif
#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
// Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL
bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]);
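The AT_ZENDNN_ENABLED() branch above only fires for batched, contiguous bf16 matmuls with batch size greater than 1 on AMD CPUs with AVX-512. A hedged sketch of a call that would satisfy that guard (plain torch.baddbmm; a USE_ZENDNN=1 build and an AMD AVX-512 host are assumptions):
import torch
# bf16, contiguous, batch > 1: matches the zendnn_baddbmm guard above
M = torch.zeros(8, 64, 32, dtype=torch.bfloat16)
b1 = torch.randn(8, 64, 16, dtype=torch.bfloat16)
b2 = torch.randn(8, 16, 32, dtype=torch.bfloat16)
out = torch.baddbmm(M, b1, b2, beta=0.0, alpha=1.0)  # shape (8, 64, 32)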

View File

@ -17,6 +17,15 @@
#else
#include <ATen/ops/empty.h>
#endif
#if AT_ZENDNN_ENABLED()
#include <zendnnl.hpp>
#include <ATen/cpu/Utils.h>
#include <ATen/native/zendnn/ZenDNN_utils.hpp>
using namespace zendnnl::lowoha;
#endif
namespace at::native {
namespace {
@ -440,7 +449,29 @@ void cpu_flash_attention(
accum_t* buf_data = buf.data_ptr<accum_t>();
scalar_t* buf_reduced_data = is_reduced_type ? buf_reduced.data_ptr<scalar_t>() : nullptr;
// Buffer to store padding query and packing key/value
bool enable_zen_matmul = false;
#if AT_ZENDNN_ENABLED()
enable_zen_matmul = at::cpu::is_amd_cpu()
&& at::cpu::is_avx512_supported()
&& output.scalar_type() == kBFloat16;
data_type_t out_type = get_zendnn_dtype(buf);
data_type_t inp_dtype = get_zendnn_dtype(query);
data_type_t wgt_dtype = get_zendnn_dtype(key);
data_types matmul_dtype;
matmul_dtype.src = inp_dtype;
matmul_dtype.wei = wgt_dtype;
matmul_dtype.dst = out_type;
matmul_dtype.bias = data_type_t::none;
matmul_dtype.compute = data_type_t::none;
lowoha_params params;
params.dtypes = matmul_dtype;
params.lowoha_algo= matmul_algo_t::libxsmm;
#endif
// Buffer to store padding query and packing key/value
scalar_t* key_reorder_ptr = nullptr;
scalar_t* value_reorder_ptr = nullptr;
scalar_t* query_padding_ptr = nullptr;
@ -575,22 +606,55 @@ void cpu_flash_attention(
qk_data);
}
} else {
cpublas::gemm(
TransposeType::Transpose,
TransposeType::NoTranspose,
kvBlockSize,
qBlockSize,
headSize,
static_cast<accum_t>(1),
k_data + i * kStrideB + kv_j * kStrideH +
n * kStrideN,
kStrideN,
q_data + i * qStrideB + j * qStrideH +
if(enable_zen_matmul)
{
#if AT_ZENDNN_ENABLED()
// Limit OpenMP nesting to prevent over-subscription and
// ensure optimal thread performance
omp_set_max_active_levels(1);
zendnnl::lowoha::matmul_direct(
'r', // row major
false, // transA
true, // transB
qBlockSize,
kvBlockSize,
headSize,
static_cast<accum_t>(1), // alpha
q_data + i * qStrideB + j * qStrideH +
m * qStrideM,
qStrideM,
static_cast<accum_t>(0),
qk_data,
kvBlockSize);
qStrideM,
k_data + i * kStrideB + kv_j * kStrideH +
n * kStrideN,
kStrideN,
nullptr,
static_cast<accum_t>(0), // beta
qk_data,
kvBlockSize,
params,
1, // batch size 1
1); // batch size 1
#endif
}
else
{
cpublas::gemm(
TransposeType::Transpose,
TransposeType::NoTranspose,
kvBlockSize,
qBlockSize,
headSize,
static_cast<accum_t>(1),
k_data + i * kStrideB + kv_j * kStrideH +
n * kStrideN,
kStrideN,
q_data + i * qStrideB + j * qStrideH +
m * qStrideM,
qStrideM,
static_cast<accum_t>(0),
qk_data,
kvBlockSize);
}
}
// Apply causal mask, fill unused with -inf
if (is_causal && num_keys - n <= kvSplitSize) {
@ -706,21 +770,52 @@ void cpu_flash_attention(
dst_data);
}
} else {
cpublas::gemm(
TransposeType::NoTranspose,
TransposeType::NoTranspose,
headSize,
qBlockSize,
kvBlockSize,
static_cast<accum_t>(1),
v_data + i * vStrideB + kv_j * vStrideH +
n * vStrideN,
vStrideN,
conditional_data_ptr(qk_data, qk_reduced_data),
kvBlockSize,
n == 0 ? static_cast<accum_t>(0) : static_cast<accum_t>(1),
dst_data,
headSize);
if(enable_zen_matmul)
{
#if AT_ZENDNN_ENABLED()
// Limit OpenMP nesting to prevent over-subscription and
// ensure optimal thread performance
omp_set_max_active_levels(1);
zendnnl::lowoha::matmul_direct(
'r', // row major
false, // transA
false, // transB
qBlockSize,
headSize,
kvBlockSize,
static_cast<accum_t>(1), // alpha
conditional_data_ptr(qk_data, qk_reduced_data),
kvBlockSize,
v_data + i * vStrideB + kv_j * vStrideH +
n * vStrideN,
vStrideN,
nullptr,
n == 0 ? static_cast<accum_t>(0) : static_cast<accum_t>(1), // beta
dst_data,
headSize,
params,
1, // batch size 1
1); // batch size 1
#endif
}
else
{
cpublas::gemm(
TransposeType::NoTranspose,
TransposeType::NoTranspose,
headSize,
qBlockSize,
kvBlockSize,
static_cast<accum_t>(1),
v_data + i * vStrideB + kv_j * vStrideH +
n * vStrideN,
vStrideN,
conditional_data_ptr(qk_data, qk_reduced_data),
kvBlockSize,
n == 0 ? static_cast<accum_t>(0) : static_cast<accum_t>(1),
dst_data,
headSize);
}
}
}
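The matmul_direct branches above are reached from the regular CPU flash-attention kernel; a hedged sketch of a bf16 CPU call that would typically exercise them (an AMD AVX-512 host and a USE_ZENDNN=1 build are assumptions):
import torch
import torch.nn.functional as F
# (batch, heads, seq_len, head_dim) in bf16 on CPU
q = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
k = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
v = torch.randn(1, 8, 128, 64, dtype=torch.bfloat16)
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)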

View File

@ -5,6 +5,7 @@
#include <ATen/native/Resize.h>
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include <ATen/native/xpu/Blas.h>
#include <ATen/xpu/XPUScaledBlas.h>
#include <torch/library.h>
#ifndef AT_PER_OPERATOR_HEADERS
@ -339,4 +340,399 @@ Tensor _scaled_mm_xpu(
out);
}
using acceptance_fn = std::function<bool(
c10::ScalarType,
std::vector<ScalingType>&,
ArrayRef<Tensor>&,
c10::ScalarType,
std::vector<ScalingType>&,
ArrayRef<Tensor>&)>;
using namespace std::placeholders;
namespace scaled_blas = at::native::onednn::scaled;
using scaled_blas::convert_int_to_enum;
using scaled_blas::ScaledGemmImplementation;
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2>
scale_kernel_dispatch = {{
{"tensorwise_tensorwise",
scaled_blas::check_tensorwise_recipe,
ScaledGemmImplementation::TENSORWISE_TENSORWISE},
{"rowwise_rowwise",
scaled_blas::check_rowwise_recipe,
ScaledGemmImplementation::ROWWISE_ROWWISE},
}};
Tensor& _scaled_tensorwise_tensorwise(
const Tensor& mat_a,
const Tensor& mat_b,
const Tensor& scale_a,
const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32
TORCH_CHECK_VALUE(
isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
"mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(),
mat_b.scalar_type());
TORCH_CHECK_VALUE(
scale_a.numel() == 1 && scale_a.scalar_type() == kFloat,
"scale_a must have 1 Float element")
TORCH_CHECK_VALUE(
scale_b.numel() == 1 && scale_b.scalar_type() == kFloat,
"scale_b must have 1 Float element")
auto scaling_choice_a = ScalingType::TensorWise;
auto scaling_choice_b = ScalingType::TensorWise;
_scaled_gemm(
mat_a,
mat_b,
scale_a,
scale_b,
scaling_choice_a,
scaling_choice_b,
bias,
use_fast_accum,
out);
return out;
}
Tensor& _scaled_rowwise_rowwise(
const Tensor& mat_a,
const Tensor& mat_b,
const Tensor& scale_a,
const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32, shape M/N for A/B
TORCH_CHECK_VALUE(
isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
"mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(),
mat_b.scalar_type());
TORCH_CHECK_VALUE(
scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1,
"scale_a must have shape [",
mat_a.size(0),
", 1], got [",
scale_a.sizes(),
"]");
TORCH_CHECK_VALUE(
scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat,
"scale_a must have ",
mat_a.size(0),
" Float elements, got ",
scale_a.numel())
TORCH_CHECK_VALUE(
scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat,
"scale_b must have ",
mat_b.size(1),
" Float elements, got ",
scale_b.numel())
TORCH_CHECK_VALUE(
scale_a.stride(1) == 1,
"expected scale_a.stride(1) to be 1, but got ",
scale_a.stride(1));
TORCH_CHECK_VALUE(
scale_b.stride(1) == 1,
"expected scale_b.stride(1) to be 1, but got ",
scale_b.stride(1));
auto scaling_choice_a = ScalingType::RowWise;
auto scaling_choice_b = ScalingType::RowWise;
_scaled_gemm(
mat_a,
mat_b,
scale_a,
scale_b,
scaling_choice_a,
scaling_choice_b,
bias,
use_fast_accum,
out);
return out;
}
// V2: Computes matrix multiply + bias while applying scaling to input and
// output matrices. Scales are only applicable when matrices are of Float8 type
// and assumed to be equal to 1.0 by default. If output matrix type is 16 or
// 32-bit type, scale_result is not applied. Known limitations:
// - Only works if mat1 is row-major and mat2 is column-major
// - Only works if matrices sizes are divisible by 32
// - If 1-dimensional tensors are used then scale_a should be size =
// mat1.size(0)
// and scale_b should have size = to mat2.size(1)
// Arguments:
// - `mat_a`: the first operand of the matrix multiply, can be type
// `torch.float8_e4m3fn` or `torch.float8_e5m2`
// - `mat_b`: the second operand of the matrix multiply, can be type
// `torch.float8_e4m3fn` or `torch.float8_e5m2`
// - `scale_a`: a tensor with the inverse scale of `mat1`, whose
// shape/strides/dtype depend on the scaling scheme
// - `scale_recipe_a`: An integer corresponding to an enum describing the
// scaling scheme used for `scale_a`
// - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing
// the swizzling scheme for `scale_a`.
// Not supported for XPU for now.
// - `scale_b`: a tensor with the inverse scale of `mat2`, whose
// shape/strides/dtype depend on the scaling scheme
// - `scale_recipe_b`: An integer corresponding to an enum describing the
// scaling scheme used for `scale_b`
// - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing
// the swizzling scheme for `scale_b`.
// Not supported for XPU for now.
// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
// - `out_dtype`: the output dtype, can either be a float8 or a higher
// precision floating point type
// - `contraction_dim`: describe which dimensions are `K` in the matmul.
// Not supported for XPU. Should always be empty.
// - `use_fast_accum`: Not supported for XPU, should always be false.
// - `out`: a reference to the output tensor
Tensor& _scaled_mm_xpu_v2_out(
const Tensor& mat_a,
const Tensor& mat_b,
ArrayRef<Tensor> scale_a,
IntArrayRef scale_recipe_a,
IntArrayRef swizzle_a,
ArrayRef<Tensor> scale_b,
IntArrayRef scale_recipe_b,
IntArrayRef swizzle_b,
const std::optional<Tensor>& bias,
const std::optional<c10::ScalarType> out_dtype,
IntArrayRef contraction_dim,
bool use_fast_accum,
Tensor& out) {
TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix");
TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix");
// If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm
// kernels do not support this case).
if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) {
// `out` was created with `at::empty`. In the case where we are multiplying
// MxK by KxN and K is the zero dim, we need to initialize here to properly
// return a tensor of zeros.
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
if (mat_a.size(1) == 0) {
out.zero_();
}
return out;
}
// Note: The `contraction_dim` is not actually used for now. We will need to
// align this code with the upstream CUDA code once that lands. For now the
// check below is kept only for validation.
// Check if the input matrix sizes can be multiplied
// - if optional contraction dims are provided, use those
// -- mostly for < 1B formats (i.e. nvfp4x2) where cheap .t() is not
// available.
if (contraction_dim.size() > 0) {
TORCH_CHECK_VALUE(
contraction_dim.size() == 2,
"contraction_dim must have exactly 2 elements");
auto mat_a_dim = contraction_dim[0];
auto mat_b_dim = contraction_dim[1];
TORCH_CHECK_VALUE(
mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim),
"mat_a and mat_b shapes cannot be multiplied (",
mat_a.size(0),
"x",
mat_a.size(1),
" and ",
mat_b.size(0),
"x",
mat_b.size(1),
") ",
"with contraction dims mat_a: ",
mat_a_dim,
", mat_b: ",
mat_b_dim);
} else {
TORCH_CHECK_VALUE(
mat_a.size(1) == mat_b.size(0),
"mat_a and mat_b shapes cannot be multiplied (",
mat_a.size(0),
"x",
mat_a.size(1),
" and ",
mat_b.size(0),
"x",
mat_b.size(1),
")");
}
TORCH_CHECK_VALUE(
!bias || bias->numel() == mat_b.sizes()[1],
"Bias must be size ",
mat_b.sizes()[1],
" but got ",
bias->numel());
TORCH_CHECK_VALUE(
!out_dtype || *out_dtype == out.scalar_type(),
"out_dtype must match output matrix type");
if (bias) {
TORCH_CHECK_VALUE(
bias->scalar_type() == kFloat ||
bias->scalar_type() == c10::ScalarType::BFloat16 ||
bias->scalar_type() == c10::ScalarType::Half,
"Bias must be Float32 or BFloat16 or Half, but got ",
bias->scalar_type());
}
{
auto bias_ = bias.value_or(Tensor());
// NOLINTNEXTLINE(*c-array*)
TensorArg targs[]{
{out, "out", 0},
{mat_a, "mat_a", 1},
{mat_b, "mat_b", 2},
{bias_, "bias", 3},
{scale_a[0], "scale_a", 4},
{scale_b[0], "scale_b", 5}};
checkAllSameGPU(__func__, targs);
}
// Align with CUDA's default out to be bf16
auto out_dtype_ = out_dtype.value_or(c10::ScalarType::BFloat16);
// Conversion of implicitly-defined enums to explicit
auto scale_recipe_a_enum = convert_int_to_enum<ScalingType>(scale_recipe_a);
auto swizzle_a_enum = convert_int_to_enum<SwizzleType>(swizzle_a);
auto scale_recipe_b_enum = convert_int_to_enum<ScalingType>(scale_recipe_b);
auto swizzle_b_enum = convert_int_to_enum<SwizzleType>(swizzle_b);
// XPU does not support swizzle for now, so reject any swizzled scales.
TORCH_CHECK_VALUE(
swizzle_a_enum[0] == at::blas::SwizzleType::NO_SWIZZLE &&
swizzle_b_enum[0] == at::blas::SwizzleType::NO_SWIZZLE,
"XPU does not support swizzle yet.");
// at this point we can start working out what we want to be doing
// Try to do as few steps as possible.
// NOTE: support is deliberately sparse, can explicitly enumerate all
// combinations allowed. Do this via a list of defined (name, acceptance,
// concrete_impl) tuples.
bool found_impl = false;
ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE;
for (const auto& fn_entry : scale_kernel_dispatch) {
const auto [name, accept_fn, scaled_gemm_impl] = fn_entry;
bool ok = accept_fn(
mat_a.scalar_type(),
scale_recipe_a_enum,
scale_a,
mat_b.scalar_type(),
scale_recipe_b_enum,
scale_b);
if (ok) {
gemm_impl = scaled_gemm_impl;
found_impl = true;
break;
}
}
TORCH_CHECK_VALUE(
found_impl,
"Invalid scaling configuration.\n"
"- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n"
"- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (",
mat_a.size(0),
", 1) and scale_b should be (1, ",
mat_b.size(1),
"), and both should be contiguous.\n"
"Got mat_a.dtype()=",
mat_a.scalar_type(),
", scale_a[0].dtype()=",
scale_a[0].scalar_type(),
", scale_a[0].size()=",
scale_a[0].sizes(),
", scale_a[0].stride()=",
scale_a[0].strides(),
", ",
"mat_b.dtype()=",
mat_b.scalar_type(),
", scale_b[0].dtype()=",
scale_b[0].scalar_type(),
", scale_b[0].size()=",
scale_b[0].sizes(),
" and scale_b[0].stride()=",
scale_b[0].strides());
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
auto bias_ = bias.value_or(Tensor());
// dispatch to appropriate lower-level calls for error checking & execution
if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) {
return _scaled_tensorwise_tensorwise(
mat_a,
mat_b,
scale_a[0],
scale_b[0],
bias,
out_dtype_,
use_fast_accum,
out);
} else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) {
return _scaled_rowwise_rowwise(
mat_a,
mat_b,
scale_a[0],
scale_b[0],
bias,
out_dtype_,
use_fast_accum,
out);
} else {
TORCH_CHECK_VALUE(
false, "Invalid state - found an implementation, but not really");
}
}
Tensor _scaled_mm_xpu_v2(
const Tensor& mat_a,
const Tensor& mat_b,
ArrayRef<Tensor> scale_a,
IntArrayRef scale_recipe_a,
IntArrayRef swizzle_a,
ArrayRef<Tensor> scale_b,
IntArrayRef scale_recipe_b,
IntArrayRef swizzle_b,
const std::optional<Tensor>& bias,
const std::optional<c10::ScalarType> out_dtype,
IntArrayRef contraction_dim,
bool use_fast_accum) {
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
return _scaled_mm_xpu_v2_out(
mat_a,
mat_b,
scale_a,
scale_recipe_a,
swizzle_a,
scale_b,
scale_recipe_b,
swizzle_b,
bias,
out_dtype,
contraction_dim,
use_fast_accum,
out);
}
} // namespace at::native
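A plain dequantize-then-matmul reference for the tensorwise recipe accepted above; this is only a numerical sketch of what the op computes (scales taken as the dequantization multipliers for the fp8 operands), not the XPU code path, and the concrete shapes and scale values are assumptions:
import torch
a = torch.randn(64, 128).to(torch.float8_e4m3fn)        # row-major fp8 A
b = torch.randn(32, 128).to(torch.float8_e4m3fn).t()    # column-major fp8 B (K x N)
scale_a = torch.tensor(0.5)                              # one Float element each,
scale_b = torch.tensor(0.25)                             # as checked above
ref = (a.float() * scale_a) @ (b.float() * scale_b)
ref = ref.to(torch.bfloat16)                             # default out dtype on the v2 path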

View File

@ -3403,6 +3403,14 @@
dispatch:
CompositeExplicitAutograd: linear_out
- func: zendnn_linear_unary(Tensor input, Tensor weight, Tensor? bias=None, *, bool is_weight_prepacked=False, str post_op="none") -> Tensor
dispatch:
CPU: zendnn_linear_unary
- func: zendnn_weight_prepack_for_linear(Tensor weight) -> Tensor
dispatch:
CPU: zendnn_weight_prepack_for_linear
- func: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor
python_module: nn
dispatch:

View File

@ -0,0 +1,96 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/zendnn/Linear_utils.hpp>
#include <string_view>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/zendnn_linear_unary_native.h>
#endif
#if !AT_ZENDNN_ENABLED()
namespace at::native {
at::Tensor zendnn_linear_unary(
const at::Tensor& input,
const at::Tensor& weight,
const std::optional<at::Tensor>& bias,
bool is_weight_prepacked,
std::string_view post_op) {
TORCH_CHECK(
false, "zendnn_linear_unary: ATen is not compiled with ZenDNN support");
}
} // namespace at::native
#else // !AT_ZENDNN_ENABLED()
namespace at::native {
using namespace zendnnl::interface;
inline void zendnn_linear_impl(
const at::Tensor& input,
const at::Tensor& weight,
const at::Tensor& bias,
at::Tensor& result,
bool is_weight_prepacked) {
// Get appropriately processed tensors (2D input, transposed weight, 2D
// result)
check_args_for_linear(input, weight);
data_type_t datatype = get_zendnn_dtype(input);
auto input_2d = get_2d_view(input);
auto weight_transposed = weight.t();
auto result_2d = result.view(get_2d_size_for_tensor(result));
check_tensor_dtypes_for_linear(input_2d, weight_transposed, bias, result_2d);
check_tensor_sizes_for_linear(input_2d, weight_transposed, bias, result_2d);
// declare linear tensors
matmul_context_t matmul_context;
tensor_t input_tensor, weight_tensor, output_tensor, bias_tensor;
create_zendnn_tensor(input_2d, input_tensor, "matmul_input", datatype);
create_zendnn_tensor(
weight_transposed,
weight_tensor,
"weights",
datatype,
is_weight_prepacked);
create_zendnn_tensor(result_2d, output_tensor, "matmul_output", datatype);
if (bias.defined()) {
// adds dimension at dim=0 -> [1, n]
auto bias_unsqueezed = bias.unsqueeze(0);
create_zendnn_tensor(bias_unsqueezed, bias_tensor, "bias", datatype);
set_linear_context_attributes(matmul_context, weight_tensor, bias_tensor);
} else {
set_linear_context_attributes(matmul_context, weight_tensor);
}
matmul_context.create();
// define matmul operator
matmul_operator_t matmul_operator;
matmul_operator.set_name("matmul_operator")
.set_context(matmul_context)
.create();
TORCH_CHECK(
matmul_operator.check(),
"operator ",
matmul_operator.get_name(),
" creation failed.");
matmul_operator.set_input("matmul_input", input_tensor)
.set_output("matmul_output", output_tensor);
matmul_operator.execute();
}
at::Tensor zendnn_linear_unary(
const at::Tensor& input,
const at::Tensor& weight,
const std::optional<at::Tensor>& bias,
bool is_weight_prepacked,
std::string_view post_op) {
c10::MaybeOwned<at::Tensor> bias_maybe_owned =
at::borrow_from_optional_tensor(bias);
const at::Tensor& bias_t = *bias_maybe_owned;
// Create output tensor with appropriate size and strides
at::Tensor result = create_linear_output_tensor(input, weight);
// Perform ZENDNN linear operation
zendnn_linear_impl(input, weight, bias_t, result, is_weight_prepacked);
return result;
}
} // namespace at::native
#endif // !AT_ZENDNN_ENABLED()
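With the native_functions.yaml registration earlier in this diff, the kernel above should be reachable through the dispatcher; a hedged sketch, assuming a USE_ZENDNN=1 x86_64 build and the usual torch.ops.aten exposure for registered ops:
import torch
x = torch.randn(4, 128)    # fp32 input
w = torch.randn(64, 128)   # weight, out_features x in_features
b = torch.randn(64)
y = torch.ops.aten.zendnn_linear_unary(x, w, b, is_weight_prepacked=False, post_op="none")
# y.shape == (4, 64); on builds without ZenDNN the stub above raises instead.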

View File

@ -0,0 +1,136 @@
#pragma once
#include <ATen/native/zendnn/ZenDNN_utils.hpp>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty.h>
#endif
#include <c10/util/Logging.h>
#include <cstdint>
#include <functional> // For std::reference_wrapper, std::ref, std::cref
#include <iostream>
#include <optional> // For std::optional, std::nullopt
#include <unordered_map>
#if AT_ZENDNN_ENABLED()
namespace at::native {
using namespace zendnnl::interface;
inline std::vector<int64_t> get_2d_size_for_tensor(
const at::Tensor& inp_tensor) {
const int64_t dim = inp_tensor.dim();
std::vector<int64_t> output_size(2);
output_size[0] = inp_tensor.numel() / inp_tensor.size(dim - 1);
output_size[1] = inp_tensor.size(dim - 1);
return output_size;
}
inline at::Tensor get_2d_view(const at::Tensor& tensor) {
auto stride = tensor.strides();
if (!std::is_sorted(stride.begin(), stride.end(), std::greater<int64_t>())) {
auto new_tensor = tensor.clone(at::MemoryFormat::Contiguous)
.view(get_2d_size_for_tensor(tensor));
return new_tensor;
}
return tensor.view(get_2d_size_for_tensor(tensor));
}
inline std::vector<int64_t> compute_linear_output_sizes(
const at::Tensor& input,
const at::Tensor& weights) {
auto input_size = input.sizes();
std::vector<int64_t> output_size(input_size.begin(), input_size.end() - 1);
auto weights_last_dim_size = weights.size(weights.dim() - 1);
output_size.emplace_back(weights_last_dim_size);
return output_size;
}
// Returns output strides for linear (input @ weights) and linear operations
inline std::vector<int64_t> compute_linear_output_strides(
const std::vector<int64_t>& output_size) {
std::vector<int64_t> output_strides(output_size.size(), 1);
for (int i = output_size.size() - 2; i >= 0; --i) {
output_strides[i] = output_strides[i + 1] * output_size[i + 1];
}
return output_strides;
}
inline at::Tensor create_linear_output_tensor(
const at::Tensor input,
const at::Tensor weight) {
auto output_size = compute_linear_output_sizes(input, weight.t());
auto output_strides = compute_linear_output_strides(output_size);
at::Tensor result = at::detail::empty_strided_cpu(
output_size, output_strides, input.options());
return result.is_contiguous() ? result : result.contiguous();
}
inline void check_args_for_linear(
const at::Tensor& input,
const at::Tensor& weights) {
TORCH_CHECK(
(input.dim() != 1 && weights.dim() != 1),
"1d dims are not supported yet.");
get_zendnn_dtype(input);
}
inline void check_tensor_sizes_for_linear(
const at::Tensor& input,
const at::Tensor& weights,
const at::Tensor& bias,
const at::Tensor& result) {
const int input_dim = input.dim();
const int weights_dim = weights.dim();
TORCH_CHECK(
(input_dim == 2 && weights_dim == 2),
"unsupported dims for input and weights");
const auto input_sizes = input.sizes();
const auto weights_sizes = weights.sizes();
TORCH_CHECK(
input_sizes[input_dim - 1] == weights_sizes[input_dim - 2],
"Tensor shapes incompatible for linear");
if (bias.defined()) {
TORCH_CHECK(
bias.dim() == 1 && bias.size(0) == weights_sizes[1],
"bias shape incompatible with linear");
}
}
inline void check_tensor_dtypes_for_linear(
const at::Tensor& input,
const at::Tensor& weights,
const at::Tensor& bias,
const at::Tensor& result) {
auto is_fp32 = [](const at::Tensor& t) {
return t.scalar_type() == c10::ScalarType::Float;
};
auto is_bf16 = [](const at::Tensor& t) {
return t.scalar_type() == c10::ScalarType::BFloat16;
};
bool all_fp32 = is_fp32(input) && is_fp32(weights) && is_fp32(result) &&
(!bias.defined() || is_fp32(bias));
bool all_bf16 = is_bf16(input) && is_bf16(weights) && is_bf16(result) &&
(!bias.defined() || is_bf16(bias));
TORCH_CHECK(
all_fp32 ^ all_bf16,
"All tensors must have consistent dtype and zendnn linear only supports Float and BFloat16");
if (all_bf16) {
TORCH_CHECK(
zendnn_bf16_device_check(),
"zendnn linear bf16 path needs cpu support avx512bf16");
}
}
inline void set_linear_context_attributes(
matmul_context_t& matmul_context,
tensor_t& weights,
std::optional<std::reference_wrapper<tensor_t>> bias_opt_ref =
std::nullopt) {
matmul_context.set_param("weights", weights);
if (bias_opt_ref.has_value()) {
tensor_t& bias = bias_opt_ref->get();
matmul_context.set_param("bias", bias);
}
}
} // namespace at::native
#endif // AT_ZENDNN_ENABLED()
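As a worked example of the helpers in this header, a small pure-Python mirror (illustrative only) of get_2d_size_for_tensor, compute_linear_output_sizes, and compute_linear_output_strides:
# Pure-Python mirror of the shape/stride helpers above, for illustration.
from math import prod

def get_2d_size(sizes):
    return [prod(sizes) // sizes[-1], sizes[-1]]

def linear_output_sizes(input_sizes, weights_sizes):
    return list(input_sizes[:-1]) + [weights_sizes[-1]]

def linear_output_strides(output_size):
    strides = [1] * len(output_size)
    for i in range(len(output_size) - 2, -1, -1):
        strides[i] = strides[i + 1] * output_size[i + 1]
    return strides

# input [2, 3, 8] @ weight^T [8, 5] -> output [2, 3, 5], contiguous strides [15, 5, 1]
assert get_2d_size([2, 3, 8]) == [6, 8]
assert linear_output_sizes([2, 3, 8], [8, 5]) == [2, 3, 5]
assert linear_output_strides([2, 3, 5]) == [15, 5, 1]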

View File

@ -0,0 +1,104 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/Config.h>
#include <ATen/Context.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/zendnn/Matmul.h>
#include <ATen/native/zendnn/ZenDNN_utils.hpp>
#include <ATen/record_function.h>
#if AT_ZENDNN_ENABLED()
#include <zendnnl.hpp>
namespace at::native {
using namespace zendnnl::lowoha;
void zendnn_baddbmm(
const Tensor& self,
const Tensor& batch1,
const Tensor& batch2,
float beta,
float alpha) {
RECORD_FUNCTION(
"zendnn::zendnn_baddbmm",
std::vector<c10::IValue>({batch1, batch2, self}));
Tensor b1 = batch1;
Tensor b2 = batch2;
// Infer matrix dimensions from 3D inputs:
// [B, M, K] x [B, K, N] -> [B, M, N]
const int64_t M = b1.size(1);
const int64_t N = b2.size(2);
const int64_t K = b1.size(2);
// Check if a 3D tensor is transposed (transposed version of a contiguous
// tensor) in the last two dimensions.
// For a transposed tensor
// [B, M, K] -> [B, K, M]:
// - stride[0] should be M*K (batch stride unchanged)
// - stride[1] should be 1 (innermost dimension after transpose)
// - stride[2] should be M (step size for original rows, now columns)
auto is_transposed = [](const Tensor& t) {
const auto sizes = t.sizes();
const auto strides = t.strides();
return strides[0] == sizes[1] * sizes[2] && strides[1] == 1 &&
strides[2] == sizes[1];
};
// check if tensor is transposed
bool transa = is_transposed(b1);
bool transb = is_transposed(b2);
// make a copy of tensor when tensor is neither contiguous nor transposed
b1 = (transa || b1.is_contiguous()) ? b1 : b1.contiguous();
b2 = (transb || b2.is_contiguous()) ? b2 : b2.contiguous();
auto strideA = b1.strides();
auto strideB = b2.strides();
auto strideC = self.strides();
const int64_t lda = transa ? strideA[2] : strideA[1];
const int64_t ldb = transb ? strideB[2] : strideB[1];
const int64_t ldc = strideC[1];
data_type_t out_type = get_zendnn_dtype(self);
data_type_t inp_dtype = get_zendnn_dtype(b1);
data_type_t wgt_dtype = get_zendnn_dtype(b2);
TORCH_CHECK(
(b1.scalar_type() == b2.scalar_type()),
"zendnn_baddbmm: batch1 and batch2 data types should be same");
data_types matmul_dtype;
matmul_dtype.src = inp_dtype;
matmul_dtype.wei = wgt_dtype;
matmul_dtype.dst = out_type;
matmul_dtype.bias = data_type_t::none;
matmul_dtype.compute = data_type_t::none;
lowoha_params params;
params.dtypes = matmul_dtype;
// Execute batched matmul directly for LoA path
matmul_direct(
'r',
transa,
transb,
M,
N,
K,
alpha,
b1.data_ptr(),
lda,
b2.data_ptr(),
ldb,
nullptr,
beta,
self.data_ptr(),
ldc,
params,
b1.size(0),
b2.size(0));
return;
}
} // namespace at::native
#endif // AT_ZENDNN_ENABLED()
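The is_transposed lambda above keys on a specific stride pattern; a short Python sketch (illustrative only) of what it accepts:
# Stride pattern recognized by is_transposed above, reproduced with torch.
import torch

t = torch.empty(2, 3, 4).transpose(1, 2)   # sizes (2, 4, 3), strides (12, 1, 4)
sizes, strides = t.shape, t.stride()
is_transposed = (
    strides[0] == sizes[1] * sizes[2]   # batch stride unchanged: 4 * 3 = 12
    and strides[1] == 1                 # innermost dimension after transpose
    and strides[2] == sizes[1]          # original row step, now the column step: 4
)
assert is_transposed and not t.is_contiguous()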

View File

@ -0,0 +1,18 @@
#pragma once
#include <ATen/Config.h>
#include <ATen/core/Tensor.h>
#if AT_ZENDNN_ENABLED()
namespace at::native {
TORCH_API void zendnn_baddbmm(
const Tensor& self,
const Tensor& batch1,
const Tensor& batch2,
float beta,
float alpha);
} // namespace at::native
#endif // AT_ZENDNN_ENABLED()

View File

@ -0,0 +1,82 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/zendnn/ZenDNN_utils.hpp>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/as_strided.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/zendnn_weight_prepack_for_linear_native.h>
#endif
#if !AT_ZENDNN_ENABLED()
namespace at::native {
at::Tensor zendnn_weight_prepack_for_linear(const at::Tensor& weight) {
TORCH_CHECK(
false,
"zendnn_weight_prepack_for_linear: ATen is not compiled with ZenDNN support");
}
} // namespace at::native
#else // !AT_ZENDNN_ENABLED()
namespace at::native {
using namespace zendnnl::interface;
at::Tensor zendnn_weight_prepack_for_linear(const at::Tensor& weight) {
TORCH_CHECK(
weight.dim() == 2,
"Weight tensor must be 2D for linear layer prepacking, got ",
weight.dim(),
"D tensor.");
TORCH_CHECK(
weight.scalar_type() == c10::ScalarType::Float ||
weight.scalar_type() == c10::ScalarType::BFloat16,
"Currently weight prepacking only supports float32 or bfloat16 dtype for weight tensor");
data_type_t datatype = get_zendnn_dtype(weight);
// The linear op internally works on the transposed weight tensor, so we
// prepack the transposed weight.
auto reorder_input = weight.t();
tensor_t zen_reorder_input;
create_zendnn_tensor(
reorder_input, zen_reorder_input, "reorder_input", datatype);
// Currently, ZenDNN only supports blocked layout with AOCL kernels.
auto context = reorder_context_t().set_algo_format("aocl").create();
auto reorder_op =
reorder_operator_t().set_name("reorder_op").set_context(context).create();
// Check if reorder operation creation is successful.
TORCH_CHECK(
reorder_op.check(),
"operator ",
reorder_op.get_name(),
" creation failed.");
reorder_op.set_input("reorder_input", zen_reorder_input);
size_t reorder_bytes = reorder_op.get_reorder_size();
int64_t num_elements = reorder_bytes / weight.element_size();
// Create 1d tensor to hold the reordered weights with
// a stride of 1 to ensure contiguous memory layout.
at::Tensor reorder_output = at::detail::empty_strided_cpu(
/*size*/ {num_elements}, /*stride*/ {1}, weight.options());
tensor_t zen_reorder_output;
std::vector<long unsigned int> reorder_output_sizes(
reorder_input.sizes().begin(), reorder_input.sizes().end());
void* reorder_output_ptr = reorder_output.data_ptr();
zen_reorder_output.set_name("reorder_output")
.set_size(reorder_output_sizes)
.set_data_type(datatype)
.set_storage(reorder_output_ptr, reorder_output.nbytes());
if (is_tensor_2d_and_transposed(reorder_input)) {
zen_reorder_output.set_order("ba");
}
zen_reorder_output.set_layout(tensor_layout_t::blocked);
zen_reorder_output.create();
// Check if reorder output tensor creation is successful.
TORCH_CHECK(
zen_reorder_output.check(),
"tensor creation of ",
zen_reorder_output.get_name(),
" failed.");
reorder_op.set_output("reorder_output", zen_reorder_output);
reorder_op.execute();
return at::as_strided(reorder_output, weight.sizes(), weight.strides());
}
} // namespace at::native
#endif // !AT_ZENDNN_ENABLED()
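A hedged Python sketch of the bookkeeping in this prepack path; the ZenDNN reorder itself is not modeled, and reorder_bytes below merely stands in for reorder_op.get_reorder_size():
# Illustrative bookkeeping only; `reorder_bytes` is a placeholder value here.
import torch

weight = torch.randn(64, 128)                                 # 2D fp32 weight
reorder_bytes = weight.numel() * weight.element_size()        # assumed size
num_elements = reorder_bytes // weight.element_size()

flat = torch.empty(num_elements, dtype=weight.dtype)          # 1D backing buffer
# ... the blocked reorder would write the prepacked weight into `flat` here ...
prepacked = flat.as_strided(weight.size(), weight.stride())   # original logical view
assert prepacked.shape == weight.shape and prepacked.stride() == weight.stride()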

View File

@ -0,0 +1,69 @@
#pragma once
#include <ATen/Config.h>
#include <ATen/core/Tensor.h>
#include <cpuinfo.h>
#if AT_ZENDNN_ENABLED()
#include <zendnnl.hpp>
namespace at::native {
using namespace zendnnl::interface;
inline bool zendnn_bf16_device_check() {
return cpuinfo_initialize() && cpuinfo_has_x86_avx512bf16();
}
inline data_type_t get_zendnn_dtype(const at::Tensor& tensor) {
if (tensor.scalar_type() == c10::ScalarType::Float) {
return data_type_t::f32;
} else if (tensor.scalar_type() == c10::ScalarType::BFloat16) {
return data_type_t::bf16;
}
TORCH_CHECK(false, "ZenDNN only supports Float32 and BFloat16.");
}
inline bool is_tensor_2d_and_transposed(const at::Tensor& t) {
if (t.dim() == 2) {
return t.strides()[0] == 1 && t.strides()[1] == t.sizes()[0];
}
return false;
}
inline void set_zendnn_tensor_attributes(
const at::Tensor& at_tensor,
tensor_t& zendnn_tensor,
const std::string& tensor_name,
const data_type_t& tensor_datatype,
const bool is_tensor_prepacked = false) {
std::vector<long unsigned int> at_tensor_sizes_vec(
at_tensor.sizes().begin(), at_tensor.sizes().end());
void* at_tensor_ptr = at_tensor.data_ptr();
zendnn_tensor.set_name(tensor_name)
.set_size(at_tensor_sizes_vec)
.set_data_type(tensor_datatype)
.set_storage(at_tensor_ptr, at_tensor.nbytes());
if (is_tensor_2d_and_transposed(at_tensor)) {
zendnn_tensor.set_order("ba");
}
if (is_tensor_prepacked && tensor_name == "weights") {
zendnn_tensor.set_layout(tensor_layout_t::blocked);
}
}
inline void create_zendnn_tensor(
const at::Tensor& source_tensor,
tensor_t& target_tensor,
const std::string& tensor_name,
const data_type_t datatype,
const bool is_tensor_prepacked = false) {
set_zendnn_tensor_attributes(
source_tensor, target_tensor, tensor_name, datatype, is_tensor_prepacked);
target_tensor.create();
TORCH_CHECK(
target_tensor.check(),
"tensor creation of ",
target_tensor.get_name(),
" failed.");
}
} // namespace at::native
#endif // AT_ZENDNN_ENABLED()
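A small Python analogue (illustrative only) of get_zendnn_dtype and is_tensor_2d_and_transposed above:
# Illustrative mirror of the dtype mapping and 2D-transposed check.
import torch

def zendnn_dtype(t):
    if t.dtype == torch.float32:
        return "f32"
    if t.dtype == torch.bfloat16:
        return "bf16"
    raise RuntimeError("ZenDNN only supports Float32 and BFloat16.")

def is_2d_and_transposed(t):
    return t.dim() == 2 and t.stride(0) == 1 and t.stride(1) == t.size(0)

w = torch.randn(4, 6, dtype=torch.bfloat16)
assert zendnn_dtype(w) == "bf16"
assert not is_2d_and_transposed(w) and is_2d_and_transposed(w.t())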

View File

@ -0,0 +1,122 @@
#include <c10/core/Scalar.h>
#include <c10/core/ScalarType.h>
#include <c10/util/Exception.h>
#include <c10/util/SmallVector.h>
#include <c10/util/typeid.h>
#include <cstdint>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/BlasBackend.h>
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/OpMathType.h>
#include <ATen/TensorUtils.h>
#include <ATen/core/NamedTensor.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/GroupedMMUtils.h>
#include <ATen/native/Resize.h>
#include <c10/util/MaybeOwned.h>
#include <ATen/ceil_div.h>
#include <ATen/xpu/XPUScaledBlas.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_addmm_activation_native.h>
#include <ATen/ops/_efficientzerotensor.h>
#include <ATen/ops/_scaled_mm_native.h>
#include <ATen/ops/_unsafe_view_native.h>
#include <ATen/ops/abs.h>
#include <ATen/ops/addmm_native.h>
#include <ATen/ops/addmv_native.h>
#include <ATen/ops/baddbmm_native.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/copy_native.h>
#include <ATen/ops/dot_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_strided.h>
#include <ATen/ops/gelu.h>
#include <ATen/ops/max.h>
#include <ATen/ops/mm_native.h>
#include <ATen/ops/mul.h>
#include <ATen/ops/ones.h>
#include <ATen/ops/relu.h>
#include <ATen/ops/scalar_tensor_native.h>
#include <ATen/ops/vdot_native.h>
#endif
using at::blas::ScalingType;
namespace at::native::onednn::scaled {
/**
* Both inputs must be fp8,
* Each needs a single scale, {Tensorwise (float)}
*/
bool check_tensorwise_recipe(
c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp8
if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
return false;
}
// 1 scale each, {Tensorwise, float}
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 ||
recipe_b.size() != 1) {
return false;
}
// Need {TensorWise, float} for A & B
if (recipe_a[0] != ScalingType::TensorWise)
return false;
if (scales_a[0].scalar_type() != ScalarType::Float)
return false;
if (recipe_b[0] != ScalingType::TensorWise)
return false;
if (scales_b[0].scalar_type() != ScalarType::Float)
return false;
return true;
}
/**
* Both inputs must be fp8,
* Each needs scales, {Rowwise (float)}
*/
bool check_rowwise_recipe(
c10::ScalarType type_a,
std::vector<ScalingType>& recipe_a,
ArrayRef<Tensor>& scales_a,
c10::ScalarType type_b,
std::vector<ScalingType>& recipe_b,
ArrayRef<Tensor>& scales_b) {
// both types must be fp8
if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
return false;
}
// 1 scale each, {RowWise, float}
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 ||
recipe_b.size() != 1) {
return false;
}
// Need {RowWise, fp32} for A & B
if (recipe_a[0] != ScalingType::RowWise)
return false;
if (scales_a[0].scalar_type() != ScalarType::Float)
return false;
if (recipe_b[0] != ScalingType::RowWise)
return false;
if (scales_b[0].scalar_type() != ScalarType::Float)
return false;
return true;
}
} // namespace at::native::onednn::scaled
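To make the two recipes concrete, a hedged Python sketch of scale tensors that would satisfy these checks; note the functions above validate only the operand dtypes, the scale count, the recipe enum, and the scale dtype, so the shapes below are conventional rather than enforced here.
# Illustrative only; shapes are conventional, the checks above do not inspect them.
import torch

a = torch.randn(16, 32).to(torch.float8_e4m3fn)   # fp8 operand A
b = torch.randn(32, 8).to(torch.float8_e4m3fn)    # fp8 operand B

# Tensorwise recipe: one float32 scale per operand (commonly a scalar).
scale_a_tw = torch.tensor(1.0, dtype=torch.float32)
scale_b_tw = torch.tensor(1.0, dtype=torch.float32)

# Rowwise recipe: one float32 scale tensor per operand (commonly one value per row/column).
scale_a_rw = torch.ones(16, 1, dtype=torch.float32)
scale_b_rw = torch.ones(1, 8, dtype=torch.float32)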

View File

@ -0,0 +1,95 @@
#include <c10/core/Scalar.h>
#include <c10/core/ScalarType.h>
#include <c10/util/Exception.h>
#include <c10/util/SmallVector.h>
#include <c10/util/typeid.h>
#include <cstdint>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/OpMathType.h>
#include <ATen/TensorUtils.h>
#include <ATen/core/NamedTensor.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/Resize.h>
#include <c10/util/MaybeOwned.h>
#include <ATen/BlasBackend.h>
#include <ATen/ceil_div.h>
#ifdef USE_FBGEMM_GENAI
#include <fbgemm_gpu/torch_ops.h>
#endif
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_addmm_activation_native.h>
#include <ATen/ops/_efficientzerotensor.h>
#include <ATen/ops/_scaled_mm_native.h>
#include <ATen/ops/_unsafe_view_native.h>
#include <ATen/ops/abs.h>
#include <ATen/ops/addmm_native.h>
#include <ATen/ops/addmv_native.h>
#include <ATen/ops/baddbmm_native.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/copy_native.h>
#include <ATen/ops/dot_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_strided.h>
#include <ATen/ops/gelu.h>
#include <ATen/ops/max.h>
#include <ATen/ops/mm_native.h>
#include <ATen/ops/mul.h>
#include <ATen/ops/ones.h>
#include <ATen/ops/relu.h>
#include <ATen/ops/scalar_tensor_native.h>
#include <ATen/ops/vdot_native.h>
#endif
using at::blas::ScalingType;
namespace at::native::onednn::scaled {
/**
* Track concrete implementations available
*/
enum class ScaledGemmImplementation {
NONE = 0,
TENSORWISE_TENSORWISE = 1,
ROWWISE_ROWWISE = 2,
};
/**
* Convert passed int (enum) from python back into a
* strictly-typed enum
*/
template <class EnumType, class ArrayType>
std::vector<EnumType> convert_int_to_enum(ArrayType& v) {
std::vector<EnumType> converted;
converted.reserve(v.size());
for (auto vi : v) {
converted.push_back(static_cast<EnumType>(vi));
}
return converted;
}
bool check_tensorwise_recipe(
c10::ScalarType,
std::vector<ScalingType>&,
ArrayRef<Tensor>&,
c10::ScalarType,
std::vector<ScalingType>&,
ArrayRef<Tensor>&);
bool check_rowwise_recipe(
c10::ScalarType,
std::vector<ScalingType>&,
ArrayRef<Tensor>&,
c10::ScalarType,
std::vector<ScalingType>&,
ArrayRef<Tensor>&);
} // namespace at::native::onednn::scaled

View File

@ -1169,6 +1169,9 @@ def define_buck_targets(
"--replace",
"@AT_USE_EIGEN_SPARSE@",
"0",
"--replace",
"@AT_ZENDNN_ENABLED@",
"0",
]),
outs = {
"Config.h": ["Config.h"],

View File

@ -1184,6 +1184,9 @@ aten_cpu_source_non_codegen_list = [
"aten/src/ATen/native/ComparisonUtils.cpp",
"aten/src/ATen/native/DispatchStub.cpp",
"aten/src/ATen/native/UpSample.cpp",
"aten/src/ATen/native/zendnn/Matmul.cpp",
"aten/src/ATen/native/zendnn/Linear.cpp",
"aten/src/ATen/native/zendnn/WeightPrepack.cpp",
"aten/src/ATen/native/mkldnn/BinaryOps.cpp",
"aten/src/ATen/native/mkldnn/Conv.cpp",
"aten/src/ATen/native/mkldnn/ConvPrepack.cpp",

View File

@ -20,6 +20,22 @@
} \
} while (0)
#define C10_CUDA_DRIVER_CHECK_GOTO(EXPR, NEXT) \
do { \
CUresult __err = EXPR; \
if (__err != CUDA_SUCCESS) { \
const char* err_str; \
CUresult get_error_str_err [[maybe_unused]] = \
c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
if (get_error_str_err != CUDA_SUCCESS) { \
TORCH_WARN("CUDA driver error: unknown error"); \
} else { \
TORCH_WARN("CUDA driver error: ", err_str); \
} \
goto NEXT; \
} \
} while (0)
// The integer in the second column specifies the requested CUDA Driver API
// version. The dynamic loader will accept a driver with a newer version, but it
// ensures that the requested symbol exists in *at least* the specified version

View File

@ -67,4 +67,5 @@
{"USE_CUSPARSELT", "${USE_CUSPARSELT}"}, \
{"USE_XPU", "${USE_XPU}"}, \
{"USE_XCCL", "${USE_XCCL}"}, \
{"USE_ZENDNN", "${USE_ZENDNN}"} \
}

View File

@ -117,6 +117,10 @@ if(@USE_MKLDNN@)
include("${CMAKE_CURRENT_LIST_DIR}/public/mkldnn.cmake")
endif()
if(@USE_ZENDNN@)
include("${CMAKE_CURRENT_LIST_DIR}/public/zendnn.cmake")
endif()
# import targets
include ("${CMAKE_CURRENT_LIST_DIR}/Caffe2Targets.cmake")

View File

@ -118,6 +118,12 @@ if(INTERN_BUILD_ATEN_OPS)
list(APPEND _file_compile_flags "-gencode;arch=compute_120a,code=sm_120a")
endif()
endif()
# We need to gate on the CUDA version, since sm_121a was introduced in CUDA 12.9
if("${_arch}" STREQUAL "121a" AND CUDA_VERSION VERSION_GREATER_EQUAL 12.9)
if(_existing_arch_flags MATCHES ".*compute_120.*")
list(APPEND _file_compile_flags "-gencode;arch=compute_121a,code=sm_121a")
endif()
endif()
endforeach()
list(JOIN _file_compile_flags " " _file_compile_flags)
@ -126,7 +132,7 @@ if(INTERN_BUILD_ATEN_OPS)
_BUILD_FOR_ADDITIONAL_ARCHS(
"${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/RowwiseScaledMM.cu"
"89;90a;100a;103a;120a")
"89;90a;100a;103a;120a;121a")
_BUILD_FOR_ADDITIONAL_ARCHS(
"${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/ScaledGroupMM.cu"
"90a")

View File

@ -162,6 +162,7 @@ set(AT_MKLDNN_ENABLED 0)
set(AT_MKL_ENABLED 0)
set(AT_KLEIDIAI_ENABLED 0)
set(AT_USE_EIGEN_SPARSE 0)
set(AT_ZENDNN_ENABLED 0)
# setting default preferred BLAS options if not already present.
if(NOT INTERN_BUILD_MOBILE)
set(BLAS "MKL" CACHE STRING "Selected BLAS library")
@ -1509,6 +1510,32 @@ if(NOT INTERN_BUILD_MOBILE)
message("disabling MKLDNN because USE_MKLDNN is not set")
endif()
if(USE_ZENDNN)
if(NOT (CMAKE_SYSTEM_NAME MATCHES "Linux"))
message(WARNING
"USE_ZENDNN is currently only supported on Linux. Detected platform: ${CMAKE_SYSTEM_NAME}. Disabling ZenDNN support.")
set(USE_ZENDNN OFF)
elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
message(WARNING
"x64 operating system is required for ZenDNN. "
"ZenDNN codebase will not be compiled."
"Turn this warning off by USE_ZENDNN=OFF.")
set(USE_ZENDNN OFF)
else()
include(${CMAKE_CURRENT_LIST_DIR}/public/zendnn.cmake)
if(ZENDNN_FOUND)
set(AT_ZENDNN_ENABLED 1)
# Add to Caffe2 private dependencies
list(APPEND Caffe2_DEPENDENCY_LIBS zendnnl::zendnnl_archive)
else()
message(WARNING "ZENDNN could not be found.")
caffe2_update_option(USE_ZENDNN OFF)
endif()
endif()
else()
message(STATUS "disabling ZENDNN because USE_ZENDNN is not set")
endif()
if(USE_KLEIDIAI)
set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE)

View File

@ -0,0 +1,402 @@
include_guard(GLOBAL)
include(ExternalProject)
# declare a zendnnl dependency
macro(zendnnl_add_dependency )
set(options INCLUDE_ONLY)
set(oneValueArgs NAME PATH LIB_SUFFIX INCLUDE_SUFFIX ARCHIVE_FILE ALIAS)
set(multiValueArgs DEPENDS)
cmake_parse_arguments(_zad "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
string(TOUPPER ${_zad_NAME} _ZAD_UNAME)
if(DEFINED _zad_INCLUDE_SUFFIX)
set(ZENDNNL_${_ZAD_UNAME}_INC_DIR "${_zad_PATH}/${_zad_INCLUDE_SUFFIX}")
else()
set(ZENDNNL_${_ZAD_UNAME}_INC_DIR "${_zad_PATH}/include")
endif()
if(DEFINED _zad_LIB_SUFFIX)
set(ZENDNNL_${_ZAD_UNAME}_LIB_DIR "${_zad_PATH}/${_zad_LIB_SUFFIX}")
else()
set(ZENDNNL_${_ZAD_UNAME}_LIB_DIR "${_zad_PATH}/lib")
endif()
if(NOT EXISTS ${ZENDNNL_${_ZAD_UNAME}_INC_DIR})
file(MAKE_DIRECTORY ${ZENDNNL_${_ZAD_UNAME}_INC_DIR})
endif()
if(${_zad_INCLUDE_ONLY})
add_library(zendnnl_${_zad_NAME}_deps INTERFACE IMPORTED GLOBAL)
#add_dependencies(zendnnl_${_zad_NAME}_deps ${_zad_DEPENDS})
set_target_properties(zendnnl_${_zad_NAME}_deps
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${ZENDNNL_${_ZAD_UNAME}_INC_DIR}")
else()
add_library(zendnnl_${_zad_NAME}_deps STATIC IMPORTED GLOBAL)
#add_dependencies(zendnnl_${_zad_NAME}_deps ${_zad_DEPENDS})
set_target_properties(zendnnl_${_zad_NAME}_deps
PROPERTIES
IMPORTED_LOCATION "${ZENDNNL_${_ZAD_UNAME}_LIB_DIR}/${_zad_ARCHIVE_FILE}"
INCLUDE_DIRECTORIES "${ZENDNNL_${_ZAD_UNAME}_INC_DIR}"
INTERFACE_INCLUDE_DIRECTORIES "${ZENDNNL_${_ZAD_UNAME}_INC_DIR}")
endif()
add_library(${_zad_ALIAS} ALIAS zendnnl_${_zad_NAME}_deps)
list(APPEND ZNL_BYPRODUCTS "${ZENDNNL_${_ZAD_UNAME}_LIB_DIR}/${_zad_ARCHIVE_FILE}")
endmacro()
macro(zendnnl_add_option )
set(options EXECLUDE_FROM_COMMAND_LIST FORCE)
set(oneValueArgs NAME VALUE TYPE CACHE_STRING COMMAND_LIST)
set(multiValueArgs "")
cmake_parse_arguments(_zao "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(${_zao_FORCE})
set(${_zao_NAME} ${_zao_VALUE} CACHE ${_zao_TYPE} ${_zao_CACHE_STRING} FORCE)
else()
set(${_zao_NAME} ${_zao_VALUE} CACHE ${_zao_TYPE} ${_zao_CACHE_STRING})
endif()
if (NOT ${_zao_EXECLUDE_FROM_COMMAND_LIST})
list(APPEND ${_zao_COMMAND_LIST} "-D${_zao_NAME}:${_zao_TYPE}=${_zao_VALUE}")
endif()
endmacro()
message(AUTHOR_WARNING "(ZENDNNL) please ensure all zendnnl variables are set properly.")
if(NOT ZENDNN_FOUND)
# find openmp
find_package(OpenMP REQUIRED QUIET)
# set zendnnl source dir, where zendnnl has been downloaded.
zendnnl_add_option(NAME ZENDNNL_SOURCE_DIR
VALUE ${PROJECT_SOURCE_DIR}/third_party/ZenDNN
TYPE PATH
CACHE_STRING "zendnnl_source_dir"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set zendnnl binary dir, if unsure set ${CMAKE_CURRENT_BINARY_DIR}/zendnnl.
zendnnl_add_option(NAME ZENDNNL_BINARY_DIR
VALUE ${CMAKE_BINARY_DIR}/third_party/ZenDNN
TYPE PATH
CACHE_STRING "zendnnl_binary_dir"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set zendnnl install dir, if unsure set ${CMAKE_INSTALL_PREFIX}/zendnnl.
zendnnl_add_option(NAME ZENDNNL_INSTALL_PREFIX
VALUE ${ZENDNNL_BINARY_DIR}/install
TYPE PATH
CACHE_STRING "zendnnl_install_dir"
COMMAND_LIST ZNL_CMAKE_ARGS)
## general zendnnl options
# set ZenDNNL framework build; this should be ON to avoid a standalone build.
zendnnl_add_option(NAME ZENDNNL_FWK_BUILD
VALUE ON
TYPE BOOL
CACHE_STRING "zendnnl framework build"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set zendnnl build option, default is Release.
zendnnl_add_option(NAME ZENDNNL_BUILD_TYPE
VALUE "Release"
TYPE STRING
CACHE_STRING "zendnnl build type"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set zendnnl log level.
zendnnl_add_option(NAME ZENDNNL_MESSAGE_LOG_LEVEL
VALUE "DEBUG"
TYPE STRING
CACHE_STRING "zendnnl message log level"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set zendnnl verbose makefile option.
zendnnl_add_option(NAME ZENDNNL_VERBOSE_MAKEFILE
VALUE ON
TYPE BOOL
CACHE_STRING "zendnnl verbose makefile"
COMMAND_LIST ZNL_CMAKE_ARGS)
## components options
# set building zendnnl examples, default is OFF.
zendnnl_add_option(NAME ZENDNNL_BUILD_EXAMPLES
VALUE OFF
TYPE BOOL
CACHE_STRING "build zendnnl examples"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set building zendnnl gtests, default is OFF.
zendnnl_add_option(NAME ZENDNNL_BUILD_GTEST
VALUE OFF
TYPE BOOL
CACHE_STRING "build zendnnl gtests"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set building zendnnl doxygen documentation, default is OFF.
zendnnl_add_option(NAME ZENDNNL_BUILD_DOXYGEN
VALUE OFF
TYPE BOOL
CACHE_STRING "build zendnnl doxygen documentation"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set building zendnnl benchmarking tool, default is OFF.
zendnnl_add_option(NAME ZENDNNL_BUILD_BENCHDNN
VALUE OFF
TYPE BOOL
CACHE_STRING "build zendnnl benchdnn"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set zendnnl code coverage option, default is OFF.
zendnnl_add_option(NAME ZENDNNL_CODE_COVERAGE
VALUE OFF
TYPE BOOL
CACHE_STRING "build zendnnl code coverage"
COMMAND_LIST ZNL_CMAKE_ARGS)
## dependencies
# set if zendnnl depends on amdblis. This should be OFF only if
# aocldlp dependency is ON.
zendnnl_add_option(NAME ZENDNNL_DEPENDS_AMDBLIS
VALUE OFF
TYPE BOOL
CACHE_STRING "zendnnl amdblis dependency"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set if zendnnl depends on aocldlp. This should be ON only if
# amdblis dependency is OFF.
zendnnl_add_option(NAME ZENDNNL_DEPENDS_AOCLDLP
VALUE ON
TYPE BOOL
CACHE_STRING "zendnnl aocldlp dependency"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set if zendnnl depends on onednn, default is OFF.
zendnnl_add_option(NAME ZENDNNL_DEPENDS_ONEDNN
VALUE OFF
TYPE BOOL
CACHE_STRING "zendnnl onednn dependency"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set if zendnnl depends on libxsmm, default is ON.
zendnnl_add_option(NAME ZENDNNL_DEPENDS_LIBXSMM
VALUE ON
TYPE BOOL
CACHE_STRING "zendnnl libxsmm dependency"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set path of amdblis if amdblis is injected. if the framework
# does not inject it, set it to "" (empty string).
zendnnl_add_option(NAME ZENDNNL_AMDBLIS_FWK_DIR
VALUE ""
TYPE PATH
CACHE_STRING "zendnnl amdblis framework path"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set path of aocldlp if aocldlp is injected. if the framework
# does not inject it, set it to "" (empty string).
zendnnl_add_option(NAME ZENDNNL_AOCLDLP_FWK_DIR
VALUE ""
TYPE PATH
CACHE_STRING "zendnnl aocldlp framework path"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set path of onednn if onednn is injected. if the framework
# does not inject it, set it to "" (empty string).
zendnnl_add_option(NAME ZENDNNL_ONEDNN_FWK_DIR
VALUE ""
TYPE PATH
CACHE_STRING "zendnnl onednnn framework path"
COMMAND_LIST ZNL_CMAKE_ARGS)
# set path of libxsmm if libxsmm is injected. if the framework
# does not inject it, set it to "" (empty string).
zendnnl_add_option(NAME ZENDNNL_LIBXSMM_FWK_DIR
VALUE ""
TYPE PATH
CACHE_STRING "zendnnl libxsmm framework path"
COMMAND_LIST ZNL_CMAKE_ARGS)
# try to find pre-built package
set(zendnnl_ROOT "${ZENDNNL_INSTALL_PREFIX}/zendnnl")
set(zendnnl_DIR "${zendnnl_ROOT}/lib/cmake")
find_package(zendnnl QUIET)
if(zendnnl_FOUND)
message(STATUS "(ZENDNNL) ZENDNNL FOUND AT ${zendnnl_ROOT}")
message(STATUS "(ZENDNNL) if zendnnl options are changed from previous build,")
message(STATUS "(ZENDNNL) they will not be reflected")
message(STATUS "(ZENDNNL) If options are changed, please do a clean build.")
if(TARGET zendnnl::zendnnl_archive)
set_target_properties(zendnnl::zendnnl_archive
PROPERTIES IMPORTED_GLOBAL ON)
else()
message(FATAL_ERROR "(ZENDNNL) zendnnl installation does not have imported target zendnnl::zendnnl_archive")
endif()
else()
message(STATUS "(ZENDNNL) ZENDNNL NOT FOUND, will be built as an external project.")
# declare zendnnl library
set(ZENDNNL_LIBRARY_INC_DIR "${ZENDNNL_INSTALL_PREFIX}/zendnnl/include")
set(ZENDNNL_LIBRARY_LIB_DIR "${ZENDNNL_INSTALL_PREFIX}/zendnnl/lib")
if(NOT EXISTS ${ZENDNNL_LIBRARY_INC_DIR})
file(MAKE_DIRECTORY ${ZENDNNL_LIBRARY_INC_DIR})
endif()
add_library(zendnnl_library STATIC IMPORTED GLOBAL)
add_dependencies(zendnnl_library fwk_zendnnl)
set_target_properties(zendnnl_library
PROPERTIES
IMPORTED_LOCATION "${ZENDNNL_LIBRARY_LIB_DIR}/libzendnnl_archive.a"
INCLUDE_DIRECTORIES "${ZENDNNL_LIBRARY_INC_DIR}"
INTERFACE_INCLUDE_DIRECTORIES "${ZENDNNL_LIBRARY_INC_DIR}")
target_link_options(zendnnl_library INTERFACE "-fopenmp")
target_link_libraries(zendnnl_library
INTERFACE OpenMP::OpenMP_CXX
INTERFACE ${CMAKE_DL_LIBS})
add_library(zendnnl::zendnnl_archive ALIAS zendnnl_library)
list(APPEND ZNL_BYPRODUCTS "${ZENDNNL_LIBRARY_LIB_DIR}/libzendnnl_archive.a")
# declare all dependencies
# json dependency
zendnnl_add_dependency(NAME json
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/json"
ALIAS "nlohmann_json::nlohmann_json"
INCLUDE_ONLY)
target_link_libraries(zendnnl_library INTERFACE nlohmann_json::nlohmann_json)
# aoclutils dependency
if (DEFINED ENV{ZENDNNL_MANYLINUX_BUILD})
zendnnl_add_dependency(NAME aoclutils
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/aoclutils"
LIB_SUFFIX lib64
ARCHIVE_FILE "libaoclutils.a"
ALIAS "au::aoclutils")
target_link_libraries(zendnnl_library INTERFACE au::aoclutils)
zendnnl_add_dependency(NAME aucpuid
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/aoclutils"
LIB_SUFFIX lib64
ARCHIVE_FILE "libau_cpuid.a"
ALIAS "au::au_cpuid")
target_link_libraries(zendnnl_library INTERFACE au::au_cpuid)
else()
zendnnl_add_dependency(NAME aoclutils
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/aoclutils"
ARCHIVE_FILE "libaoclutils.a"
ALIAS "au::aoclutils")
target_link_libraries(zendnnl_library INTERFACE au::aoclutils)
zendnnl_add_dependency(NAME aucpuid
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/aoclutils"
ARCHIVE_FILE "libau_cpuid.a"
ALIAS "au::au_cpuid")
target_link_libraries(zendnnl_library INTERFACE au::au_cpuid)
endif()
# amdblis dependency
if (ZENDNNL_DEPENDS_AMDBLIS)
zendnnl_add_dependency(NAME amdblis
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/amdblis"
ARCHIVE_FILE "libblis-mt.a"
ALIAS "amdblis::amdblis_archive")
target_link_libraries(zendnnl_library INTERFACE amdblis::amdblis_archive)
endif()
if (ZENDNNL_DEPENDS_AOCLDLP)
zendnnl_add_dependency(NAME aocldlp
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/aocldlp"
ARCHIVE_FILE "libaocl-dlp.a"
ALIAS "aocldlp::aocl_dlp_static")
target_link_libraries(zendnnl_library INTERFACE aocldlp::aocl_dlp_static)
endif()
if (ZENDNNL_DEPENDS_ONEDNN)
zendnnl_add_dependency(NAME onednn
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/onednn"
ARCHIVE_FILE "libdnnl.a"
ALIAS "DNNL::dnnl")
target_link_libraries(zendnnl_library INTERFACE DNNL::dnnl)
endif()
# libxsmm dependency
if (ZENDNNL_DEPENDS_LIBXSMM)
zendnnl_add_dependency(NAME libxsmm
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/libxsmm"
ARCHIVE_FILE "libxsmm.a"
ALIAS "libxsmm::libxsmm_archive")
target_link_libraries(zendnnl_library INTERFACE libxsmm::libxsmm_archive)
endif()
message(STATUS "(ZENDNNL) ZNL_BYPRODUCTS=${ZNL_BYPRODUCTS}")
message(STATUS "(ZENDNNL) ZNL_CMAKE_ARGS=${ZNL_CMAKE_ARGS}")
ExternalProject_ADD(fwk_zendnnl
SOURCE_DIR "${ZENDNNL_SOURCE_DIR}"
BINARY_DIR "${ZENDNNL_BINARY_DIR}"
CMAKE_ARGS "${ZNL_CMAKE_ARGS}"
BUILD_COMMAND cmake --build . --target all -j
INSTALL_COMMAND ""
BUILD_BYPRODUCTS ${ZNL_BYPRODUCTS})
list(APPEND ZENDNNL_CLEAN_FILES "${ZENDNNL_BINARY_DIR}")
list(APPEND ZENDNNL_CLEAN_FILES "${ZENDNNL_INSTALL_PREFIX}")
set_target_properties(fwk_zendnnl
PROPERTIES
ADDITIONAL_CLEAN_FILES "${ZENDNNL_CLEAN_FILES}")
# framework dependencies
# add_dependencies(fwk_zendnnl <injected dependency targets>)
get_target_property(FWK_ZENDNNL_DEPENDS fwk_zendnnl MANUALLY_ADDED_DEPENDENCIES)
if(${FWK_ZENDNNL_DEPENDS} STREQUAL "FWK_ZENDNNL_DEPENDS-NOTFOUND")
message(AUTHOR_WARNING "(ZENDNNL) please ensure fwk_zendnnl depends on injected dependencies targets")
else()
message(STATUS "fwk_zendnnl dependencies : ${FWK_ZENDNNL_DEPENDS}")
endif()
# make library and its dependencies depend on fwk_zendnnl
add_dependencies(zendnnl_library fwk_zendnnl)
add_dependencies(zendnnl_json_deps fwk_zendnnl)
add_dependencies(zendnnl_aoclutils_deps fwk_zendnnl)
add_dependencies(zendnnl_aucpuid_deps fwk_zendnnl)
if(ZENDNNL_DEPENDS_AMDBLIS)
add_dependencies(zendnnl_amdblis_deps fwk_zendnnl)
endif()
if(ZENDNNL_DEPENDS_AOCLDLP)
add_dependencies(zendnnl_aocldlp_deps fwk_zendnnl)
endif()
if(ZENDNNL_DEPENDS_ONEDNN)
add_dependencies(zendnnl_onednn_deps fwk_zendnnl)
endif()
if(ZENDNNL_DEPENDS_LIBXSMM)
add_dependencies(zendnnl_libxsmm_deps fwk_zendnnl)
endif()
endif()
set(ZENDNN_FOUND TRUE)
endif(NOT ZENDNN_FOUND)

View File

@ -148,6 +148,7 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_PYTORCH_METAL_EXPORT : ${USE_PYTORCH_METAL_EXPORT}")
message(STATUS " USE_MPS : ${USE_MPS}")
message(STATUS " CAN_COMPILE_METAL : ${CAN_COMPILE_METAL}")
message(STATUS " USE_ZENDNN : ${USE_ZENDNN}")
message(STATUS " USE_MKL : ${CAFFE2_USE_MKL}")
if(${CAFFE2_USE_MKL})
message(STATUS " USE_STATIC_MKL : ${USE_STATIC_MKL}")

View File

@ -0,0 +1,8 @@
if(NOT EXISTS ${PROJECT_SOURCE_DIR}/third_party/ZenDNN)
message(WARNING "(ZENDNNL) Library not found at ${PROJECT_SOURCE_DIR}/third_party/ZenDNN")
else()
find_package(ZENDNN QUIET)
if(ZENDNN_FOUND)
message(STATUS "(ZENDNN) ZenDNN library found.")
endif(ZENDNN_FOUND)
endif()

View File

@ -67,6 +67,9 @@
# USE_NUMPY=0
# disables the NumPy build
#
# USE_ZENDNN=0
# disables the ZenDNN build
#
# BUILD_TEST=0
# disables the test build
#
@ -1221,6 +1224,10 @@ class build_ext(setuptools.command.build_ext.build_ext):
report("-- Not using CBLAS in MKLDNN")
else:
report("-- Not using MKLDNN")
if cmake_cache_vars["USE_ZENDNN"]:
report("-- Using ZENDNN")
else:
report("-- Not using ZENDNN")
if cmake_cache_vars["USE_NCCL"] and cmake_cache_vars["USE_SYSTEM_NCCL"]:
report(
"-- Using system provided NCCL library at "

View File

@ -0,0 +1,20 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
using torch::stable::Tensor;
uint64_t get_any_data_ptr(Tensor t, bool mutable_) {
if (mutable_) {
return reinterpret_cast<uint64_t>(t.mutable_data_ptr());
} else {
return reinterpret_cast<uint64_t>(t.const_data_ptr());
}
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("get_any_data_ptr(Tensor t, bool mutable_) -> int");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("get_any_data_ptr", TORCH_BOX(&get_any_data_ptr));
}

View File

@ -0,0 +1,34 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/headeronly/core/ScalarType.h>
using torch::stable::Tensor;
uint64_t get_template_any_data_ptr(Tensor t, torch::headeronly::ScalarType dtype, bool mutable_) {
#define DEFINE_CASE(T, name) \
case torch::headeronly::ScalarType::name: { \
if (mutable_) { \
return reinterpret_cast<uint64_t>(t.mutable_data_ptr<T>()); \
} else { \
return reinterpret_cast<uint64_t>(t.const_data_ptr<T>()); \
} \
}
switch (dtype) {
// per aten/src/ATen/templates/TensorMethods.cpp:
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE)
DEFINE_CASE(uint16_t, UInt16)
DEFINE_CASE(uint32_t, UInt32)
DEFINE_CASE(uint64_t, UInt64)
default:
return 0;
}
#undef DEFINE_CASE
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("get_template_any_data_ptr(Tensor t, ScalarType dtype, bool mutable_) -> int");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("get_template_any_data_ptr", TORCH_BOX(&get_template_any_data_ptr));
}

View File

@ -0,0 +1,41 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/ops.h>
#include <torch/csrc/stable/tensor.h>
#include <vector>
using torch::stable::Tensor;
// Declare my__foreach_mul (defined in my__foreach_mul.cpp)
extern std::vector<Tensor> my__foreach_mul(
torch::headeronly::HeaderOnlyArrayRef<Tensor> self,
torch::headeronly::HeaderOnlyArrayRef<Tensor> other);
// Helper function for cloning
Tensor my_clone(Tensor t) {
return clone(t);
}
std::vector<Tensor> make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) {
// This function tests that my__foreach_mul can take in std::initializer_lists
// in addition to std::vectors.
Tensor t1_1 = my_clone(t1);
Tensor t1_2 = my_clone(t1);
Tensor t2_1 = my_clone(t2);
Tensor t2_2 = my_clone(t2);
return my__foreach_mul({t1_1, t2_1}, {t1_2, t2_2});
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def(
"make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]");
}
STABLE_TORCH_LIBRARY_IMPL(
libtorch_agnostic_2_10,
CompositeExplicitAutograd,
m) {
m.impl(
"make_tensor_clones_and_call_foreach",
TORCH_BOX(&make_tensor_clones_and_call_foreach));
}

View File

@ -0,0 +1,40 @@
// This is duplicated from the libtorch_agnostic_2_9_extension
// as a negative test for test_version_compatibility.py
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>
#include <torch/headeronly/util/Exception.h>
#include <torch/headeronly/core/ScalarType.h>
#include <torch/headeronly/core/Dispatch_v2.h>
#include <torch/headeronly/core/TensorAccessor.h>
#include "tensor_accessor_kernel.h"
using torch::stable::Tensor;
Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device");
Tensor res = new_empty(m, {m.size(0)});
THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cpu",
AT_WRAP(([&]() {
auto resa = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(res.data_ptr()), res.sizes().data(), res.strides().data());
auto ma = Accessor_cpu<scalar_t, 2>(reinterpret_cast<scalar_t*>(m.data_ptr()), m.sizes().data(), m.strides().data());
auto va = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(v.data_ptr()), v.sizes().data(), v.strides().data());
mv_tensor_accessor_kernel<Accessor_cpu, scalar_t>(resa, ma, va);
})),
AT_FLOATING_TYPES);
return res;
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("mv_tensor_accessor_cpu(Tensor res, Tensor m, Tensor v) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("mv_tensor_accessor_cpu", TORCH_BOX(&mv_tensor_accessor_cpu));
}

View File

@ -0,0 +1,47 @@
// This is duplicated from the libtorch_agnostic_2_9_extension
// as a negative test for test_version_compatibility.py
#include "tensor_accessor_kernel.h"
#include <cuda_runtime.h>
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/ops.h>
#include <torch/csrc/stable/tensor.h>
using torch::stable::Tensor;
Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
STD_TORCH_CHECK(
m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
STD_TORCH_CHECK(
m.device() == v.device(), "m and v must be on the same device");
Tensor res = new_empty(m, {m.size(0)});
THO_DISPATCH_V2(
m.scalar_type(),
"mv_tensor_accessor_cuda",
AT_WRAP(([&]() {
auto resa = Accessor_cuda<scalar_t, 1>(
reinterpret_cast<scalar_t*>(res.data_ptr()),
res.sizes().data(),
res.strides().data());
auto ma = Accessor_cuda<scalar_t, 2>(
reinterpret_cast<scalar_t*>(m.data_ptr()),
m.sizes().data(),
m.strides().data());
auto va = Accessor_cuda<scalar_t, 1>(
reinterpret_cast<scalar_t*>(v.data_ptr()),
v.sizes().data(),
v.strides().data());
mv_tensor_accessor_kernel<Accessor_cuda, scalar_t>
<<<1, 1, 0, 0>>>(resa, ma, va);
})),
AT_FLOATING_TYPES);
return res;
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CUDA, m) {
m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
}

View File

@ -0,0 +1,20 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <vector>
using torch::stable::Tensor;
std::vector<Tensor> my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack.data());
return torch::stable::detail::to<std::vector<Tensor>>(stack[0]);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("my__foreach_mul", TORCH_BOX(&my__foreach_mul));
}

View File

@ -0,0 +1,19 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/stableivalue_conversions.h>
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
using torch::stable::Tensor;
void my__foreach_mul_(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack.data());
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("my__foreach_mul_", TORCH_BOX(&my__foreach_mul_));
}

View File

@ -0,0 +1,25 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/device.h>
#include <torch/csrc/stable/ops.h>
#include <optional>
using torch::stable::Tensor;
Tensor my_empty(
torch::headeronly::HeaderOnlyArrayRef<int64_t> size,
std::optional<torch::headeronly::ScalarType> dtype,
std::optional<torch::stable::Device> device,
std::optional<bool> pin_memory) {
return empty(size, dtype, device, pin_memory);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def(
"my_empty(int[] size, ScalarType? dtype=None, Device? device=None, bool? pin_memory=None) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("my_empty", TORCH_BOX(&my_empty));
}

View File

@ -0,0 +1,17 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>
using torch::stable::Tensor;
Tensor my_reshape(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> shape) {
return reshape(t, shape);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("my_reshape(Tensor t, int[] shape) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("my_reshape", TORCH_BOX(&my_reshape));
}

View File

@ -0,0 +1,20 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>
using torch::stable::Tensor;
Tensor my_view(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> size) {
return view(t, size);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("my_view(Tensor t, int[] size) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(
libtorch_agnostic_2_10,
CompositeExplicitAutograd,
m) {
m.impl("my_view", TORCH_BOX(&my_view));
}

View File

@ -0,0 +1,31 @@
// This is duplicated from the libtorch_agnostic_2_9_extension
// as a negative test for test_version_compatibility.py
#pragma once
#include <torch/headeronly/core/Dispatch_v2.h>
#include <torch/headeronly/core/TensorAccessor.h>
template <typename T, size_t N>
using Accessor_cpu = torch::headeronly::HeaderOnlyTensorAccessor<T, N>;
#if defined(__CUDACC__) || defined(__HIPCC__)
#define MAYBE_GLOBAL __global__
template <typename T, size_t N>
using Accessor_cuda = torch::headeronly::HeaderOnlyGenericPackedTensorAccessor<T, N, torch::headeronly::RestrictPtrTraits>;
#else
#define MAYBE_GLOBAL
#endif
template <template <typename, size_t> class Accessor, typename scalar_t>
MAYBE_GLOBAL void mv_tensor_accessor_kernel(Accessor<scalar_t, 1> resa, Accessor<scalar_t, 2> ma, Accessor<scalar_t, 1> va) {
for (int64_t i = 0; i < resa.size(0); i++) {
scalar_t val = 0;
for (int64_t j = 0; j < ma.size(1); j++) {
val += ma[i][j] * va[j];
}
resa[i] = val;
}
}
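The kernel above is a plain matrix-vector product over accessors; for reference, the same computation in a Python sketch (illustrative only):
# Reference semantics of mv_tensor_accessor_kernel: res[i] = sum_j m[i, j] * v[j].
import torch

m = torch.randn(3, 4)
v = torch.randn(4)
res = torch.empty(3)
for i in range(m.size(0)):
    val = 0.0
    for j in range(m.size(1)):
        val += float(m[i, j]) * float(v[j])
    res[i] = val
assert torch.allclose(res, m.mv(v), atol=1e-5)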

View File

@ -0,0 +1,37 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/device.h>
#include <string>
torch::stable::Device test_device_constructor(
bool is_cuda,
torch::stable::DeviceIndex index,
bool use_str) {
using torch::stable::Device;
using torch::stable::DeviceType;
if (use_str) {
std::string device_str;
if (is_cuda) {
device_str = "cuda:" + std::to_string(index);
} else {
device_str = "cpu";
}
return Device(device_str);
} else {
if (is_cuda) {
return Device(DeviceType::CUDA, index);
} else {
return Device(DeviceType::CPU);
}
}
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def(
"test_device_constructor(bool is_cuda, DeviceIndex index, bool use_str) -> Device");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("test_device_constructor", TORCH_BOX(&test_device_constructor));
}

View File

@ -0,0 +1,14 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/device.h>
bool test_device_equality(torch::stable::Device d1, torch::stable::Device d2) {
return d1 == d2;
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("test_device_equality(Device d1, Device d2) -> bool");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("test_device_equality", TORCH_BOX(&test_device_equality));
}

View File

@ -0,0 +1,14 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/device.h>
torch::stable::DeviceIndex test_device_index(torch::stable::Device device) {
return device.index();
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("test_device_index(Device device) -> DeviceIndex");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("test_device_index", TORCH_BOX(&test_device_index));
}

View File

@ -0,0 +1,14 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/device.h>
bool test_device_is_cpu(torch::stable::Device device) {
return device.is_cpu();
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("test_device_is_cpu(Device device) -> bool");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("test_device_is_cpu", TORCH_BOX(&test_device_is_cpu));
}

View File

@ -0,0 +1,14 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/device.h>
bool test_device_is_cuda(torch::stable::Device device) {
return device.is_cuda();
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("test_device_is_cuda(Device device) -> bool");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("test_device_is_cuda", TORCH_BOX(&test_device_is_cuda));
}

View File

@ -0,0 +1,17 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/device.h>
torch::stable::Device test_device_set_index(
torch::stable::Device device,
torch::stable::DeviceIndex index) {
device.set_index(index);
return device;
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("test_device_set_index(Device device, DeviceIndex index) -> Device");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("test_device_set_index", TORCH_BOX(&test_device_set_index));
}

View File

@ -0,0 +1,14 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/ops.h>
uint32_t test_get_num_threads() {
return torch::stable::get_num_threads();
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("test_get_num_threads() -> int");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("test_get_num_threads", TORCH_BOX(&test_get_num_threads));
}

View File

@ -0,0 +1,49 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>
#include <torch/csrc/stable/device.h>
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h>
using torch::stable::Tensor;
Tensor test_parallel_for(int64_t size, int64_t grain_size) {
AtenTensorHandle tensor_handle;
int64_t stride = 1;
aoti_torch_empty_strided(
1,
&size,
&stride,
aoti_torch_dtype_int64(),
aoti_torch_device_type_cpu(),
0,
&tensor_handle);
Tensor tensor(tensor_handle);
int64_t* data_ptr = reinterpret_cast<int64_t*>(tensor.data_ptr());
torch::stable::zero_(tensor);
// Use parallel_for to fill each element with its index
// If using a parallel path, the thread id is encoded in the upper 32 bits
torch::stable::parallel_for(
0, size, grain_size, [data_ptr](int64_t begin, int64_t end) {
for (auto i = begin; i < end; i++) {
STD_TORCH_CHECK(i <= UINT32_MAX);
uint32_t thread_id;
torch_get_thread_idx(&thread_id);
data_ptr[i] = i | (static_cast<int64_t>(thread_id) << 32);
}
});
return tensor;
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("test_parallel_for(int size, int grain_size) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("test_parallel_for", TORCH_BOX(&test_parallel_for));
}
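Each element packs its index in the low 32 bits and the worker thread id in the high 32 bits, so a caller can decode the result as below (a sketch assuming the built libtorch_agnostic_2_10 extension is loaded):
# Hypothetical decode of test_parallel_for's output; requires the built extension.
import torch

out = torch.ops.libtorch_agnostic_2_10.test_parallel_for.default(1000, 10)
indices = out & 0xFFFFFFFF    # low 32 bits: element index
thread_ids = out >> 32        # high 32 bits: worker thread id
assert (indices == torch.arange(1000)).all()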

View File

@ -0,0 +1,17 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/device.h>
using torch::stable::Tensor;
torch::stable::Device test_tensor_device(torch::stable::Tensor tensor) {
return tensor.device();
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
m.def("test_tensor_device(Tensor t) -> Device");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
m.impl("test_tensor_device", TORCH_BOX(&test_tensor_device));
}

View File

@ -0,0 +1,225 @@
import torch
from torch import Tensor
def my__foreach_mul_(tensors, others) -> ():
"""
Updates tensors to be the result of pointwise multiplying with others.
Args:
tensors: list of tensors
others: list of tensors (with the same corresponding shapes as tensors)
Returns: nothing, tensors is updated in place.
"""
torch.ops.libtorch_agnostic_2_10.my__foreach_mul_.default(tensors, others)
def my__foreach_mul(tensors, others) -> list[Tensor]:
"""
Returns a list of tensors that are the results of pointwise multiplying
tensors and others.
Args:
tensors: list of tensors
others: list of tensors (with the same corresponding shapes as tensors)
Returns: list of multiplied tensors
"""
return torch.ops.libtorch_agnostic_2_10.my__foreach_mul.default(tensors, others)
def make_tensor_clones_and_call_foreach(t1, t2) -> list[Tensor]:
"""
Returns a list of 2 tensors corresponding to the square of the inputs.
Args:
t1: Tensor
t2: Tensor
Returns: list of [t1^2, t2^2]
"""
return torch.ops.libtorch_agnostic_2_10.make_tensor_clones_and_call_foreach.default(
t1, t2
)
def test_tensor_device(t):
"""
Tests Tensor device() method.
Args:
t: Tensor - tensor to get device from
Returns: Device - device of the tensor
"""
return torch.ops.libtorch_agnostic_2_10.test_tensor_device.default(t)
def test_device_constructor(is_cuda, index, use_str):
"""
Tests creating a Device from DeviceType and index, or from a string.
Args:
is_cuda: bool - if True, creates CUDA device; if False, creates CPU device
index: int - device index
use_str: bool - if True, constructs from string; if False, constructs from DeviceType
Returns: Device - A device with the specified type and index
"""
return torch.ops.libtorch_agnostic_2_10.test_device_constructor.default(
is_cuda, index, use_str
)
def test_device_equality(d1, d2) -> bool:
"""
Tests Device equality operator.
Args:
d1: Device - first device
d2: Device - second device
Returns: bool - True if devices are equal
"""
return torch.ops.libtorch_agnostic_2_10.test_device_equality.default(d1, d2)
def test_device_set_index(device, index):
"""
Tests Device set_index() method.
Args:
device: Device - device to modify
index: int - new device index
Returns: Device - device with updated index
"""
return torch.ops.libtorch_agnostic_2_10.test_device_set_index.default(device, index)
def test_device_index(device) -> int:
"""
Tests Device index() method.
Args:
device: Device - device to query
Returns: int - device index
"""
return torch.ops.libtorch_agnostic_2_10.test_device_index.default(device)
def test_device_is_cuda(device) -> bool:
"""
Tests Device is_cuda() method.
Args:
device: Device - device to check
Returns: bool - True if device is CUDA
"""
return torch.ops.libtorch_agnostic_2_10.test_device_is_cuda.default(device)
def test_device_is_cpu(device) -> bool:
"""
Tests Device is_cpu() method.
Args:
device: Device - device to check
Returns: bool - True if device is CPU
"""
return torch.ops.libtorch_agnostic_2_10.test_device_is_cpu.default(device)
def test_parallel_for(size, grain_size) -> Tensor:
"""
Tests the parallel_for functionality by using it to fill a tensor with indices.
Args:
size: int - size of the tensor to create
grain_size: int - grain size for parallel_for
Returns: Tensor - a 1D int64 tensor where each element contains its index
(if multiple threads are used, the thread id will be encoded in the upper 32 bits)
"""
return torch.ops.libtorch_agnostic_2_10.test_parallel_for.default(size, grain_size)
def test_get_num_threads() -> int:
"""
Tests the get_num_threads functionality by returning the number of threads
for the parallel backend.
Returns: int - the number of threads for the parallel backend
"""
return torch.ops.libtorch_agnostic_2_10.test_get_num_threads.default()
def my_empty(size, dtype=None, device=None, pin_memory=None) -> Tensor:
"""
Creates an empty tensor with the specified size, dtype, device, and pin_memory.
Args:
size: list[int] - size of the tensor to create
dtype: ScalarType or None - data type of the tensor
device: Device or None - device on which to create the tensor
pin_memory: bool or None - whether to use pinned memory
Returns: Tensor - an uninitialized tensor with the specified properties
"""
return torch.ops.libtorch_agnostic_2_10.my_empty.default(
size, dtype, device, pin_memory
)
def my_reshape(t, shape) -> Tensor:
"""
Returns a tensor with the same data but different shape.
Args:
t: Tensor - tensor to reshape
shape: list[int] - new shape for the tensor
Returns: Tensor - reshaped tensor
"""
return torch.ops.libtorch_agnostic_2_10.my_reshape.default(t, shape)
def my_view(t, size) -> Tensor:
"""
Returns a new tensor with the same data as the input tensor but of a different shape.
Args:
t: Tensor - tensor to view
size: list[int] - new size for the tensor
Returns: Tensor - tensor with new view
"""
return torch.ops.libtorch_agnostic_2_10.my_view.default(t, size)
def get_any_data_ptr(t, mutable) -> int:
"""
Return data pointer value of the tensor.
Args:
t: Input tensor
mutable: whether data pointer qualifier is mutable or const
Returns: int - pointer value
"""
return torch.ops.libtorch_agnostic_2_10.get_any_data_ptr.default(t, mutable)
def get_template_any_data_ptr(t, dtype, mutable) -> int:
"""
Return the data pointer value of the tensor iff its dtype matches `dtype`.
Args:
t: Input tensor
dtype: Input dtype
mutable: whether data pointer qualifier is mutable or const
Returns: int - pointer value
Raises RuntimeError when t.dtype() != dtype.
"""
return torch.ops.libtorch_agnostic_2_10.get_template_any_data_ptr.default(
t, dtype, mutable
)
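A short usage sketch of a few of the wrappers above; it assumes the compiled libtorch_agnostic_2_10 extension has been built and its shared library loaded so the ops are registered:
# Usage sketch; assumes the compiled extension is importable and loaded.
import torch

a, b = torch.rand(3), torch.rand(3)
squares = make_tensor_clones_and_call_foreach(a, b)       # [a*a, b*b]
assert torch.allclose(squares[0], a * a)

e = my_empty([2, 3], dtype=torch.float32, device=torch.device("cpu"))
r = my_reshape(torch.arange(6), [2, 3])
v = my_view(torch.arange(6), [3, 2])
assert e.shape == (2, 3) and r.shape == (2, 3) and v.shape == (3, 2)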

View File

@ -9,7 +9,7 @@ from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtensio
ROOT_DIR = Path(__file__).parent
CSRC_DIR = ROOT_DIR / "libtorch_agnostic" / "csrc"
CSRC_DIR = ROOT_DIR / "libtorch_agnostic_2_10" / "csrc"
class clean(distutils.command.clean.clean):
@ -18,13 +18,13 @@ class clean(distutils.command.clean.clean):
distutils.command.clean.clean.run(self)
# Remove extension
for path in (ROOT_DIR / "libtorch_agnostic").glob("**/*.so"):
for path in (ROOT_DIR / "libtorch_agnostic_2_10").glob("**/*.so"):
path.unlink()
# Remove build and dist and egg-info directories
dirs = [
ROOT_DIR / "build",
ROOT_DIR / "dist",
ROOT_DIR / "libtorch_agnostic.egg-info",
ROOT_DIR / "libtorch_agnostic_2_10.egg-info",
]
for path in dirs:
if path.exists():
@ -33,7 +33,11 @@ class clean(distutils.command.clean.clean):
def get_extension():
extra_compile_args = {
"cxx": ["-fdiagnostics-color=always"],
"cxx": [
"-fdiagnostics-color=always",
"-DTORCH_STABLE_ONLY",
"-DTORCH_TARGET_VERSION=0x020a000000000000",
],
}
sources = list(CSRC_DIR.glob("**/*.cpp"))
@ -47,7 +51,7 @@ def get_extension():
return [
extension(
"libtorch_agnostic._C",
"libtorch_agnostic_2_10._C",
sources=sorted(str(s) for s in sources),
py_limited_api=True,
extra_compile_args=extra_compile_args,
@ -57,12 +61,12 @@ def get_extension():
setup(
name="libtorch_agnostic",
name="libtorch_agnostic_2_10",
version="0.0",
author="PyTorch Core Team",
description="Example of libtorch agnostic extension",
description="Example of libtorch agnostic extension for PyTorch 2.10+",
packages=find_packages(exclude=("test",)),
package_data={"libtorch_agnostic": ["*.dll", "*.dylib", "*.so"]},
package_data={"libtorch_agnostic_2_10": ["*.dll", "*.dylib", "*.so"]},
install_requires=[
"torch",
],

View File

@ -0,0 +1,308 @@
# Owner(s): ["module: cpp"]
"""
Unit tests to verify that each function file requires PyTorch 2.10+.
This test suite compiles each .cpp/.cu file in the csrc directory with
TORCH_TARGET_VERSION=2.9.0 and expects compilation to fail.
If compilation succeeds, it means that either
(1) the test function works with 2.9.0 and should not be in this directory, or
(2) the test function tests APIs that do not have proper TORCH_FEATURE_VERSION
guards. If this is the case and you incorrectly move the test function into
libtorch_agnostic_2_9_extension, the libtorch_agnostic_targetting CI workflow
will catch this.
Run this script with VERSION_COMPAT_DEBUG=1 to see compilation errors.
"""
import os
import subprocess
import tempfile
from pathlib import Path
from torch.testing._internal.common_utils import IS_WINDOWS, run_tests, TestCase
from torch.utils.cpp_extension import CUDA_HOME, include_paths as torch_include_paths
# TODO: Fix this error in Windows:
# numba.cuda.cudadrv.driver:driver.py:384 Call to cuInit results in CUDA_ERROR_NO_DEVICE
if not IS_WINDOWS:
class FunctionVersionCompatibilityTest(TestCase):
"""Test that all function files require PyTorch 2.10+."""
@classmethod
def setUpClass(cls):
"""Set up test environment once for all tests."""
cls.csrc_dir = Path(__file__).parent / "libtorch_agnostic_2_10" / "csrc"
cls.build_dir = Path(tempfile.mkdtemp(prefix="version_check_"))
cls.pytorch_includes = [
f"-I{path}" for path in torch_include_paths(device_type="cpu")
]
cls.cuda_includes = []
if CUDA_HOME:
cuda_include_path = os.path.join(CUDA_HOME, "include")
if os.path.exists(cuda_include_path):
cls.cuda_includes = [f"-I{cuda_include_path}"]
cls.cuda_available = cls._check_cuda_available()
@classmethod
def tearDownClass(cls):
"""Clean up build directory."""
import shutil
if cls.build_dir.exists():
shutil.rmtree(cls.build_dir)
@staticmethod
def _check_cuda_available() -> bool:
"""Check if CUDA is available."""
try:
import torch
return torch.cuda.is_available()
except ImportError:
return False
def _compile_cpp_file(
self, source_file: Path, output_file: Path
) -> tuple[bool, str]:
"""
Compile a C++ file with TORCH_TARGET_VERSION=2.9.0.
Returns (success, error_message).
"""
torch_version_2_9 = "0x0209000000000000"
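# The high bytes appear to encode major.minor (0x02, 0x09 -> 2.9); the 2.10
# extension's setup.py passes 0x020a000000000000 for the same macro.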
cmd = [
"g++",
"-c",
"-std=c++17",
f"-DTORCH_TARGET_VERSION={torch_version_2_9}",
f"-I{source_file.parent}", # For includes in same directory
*self.pytorch_includes,
]
# Add CUDA flags if available
if self.cuda_available:
cmd.extend(self.cuda_includes)
cmd.extend([str(source_file), "-o", str(output_file)])
result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
if result.returncode == 0:
return True, ""
else:
return False, result.stderr
def _compile_cu_file(
self, source_file: Path, output_file: Path
) -> tuple[bool, str]:
"""
Compile a CUDA file with TORCH_TARGET_VERSION=2.9.0.
Returns (success, error_message).
"""
if not CUDA_HOME:
return False, "CUDA_HOME not set"
torch_version_2_9 = "0x0209000000000000"
cmd = [
os.path.join(CUDA_HOME, "bin", "nvcc"),
"-c",
"-std=c++17",
f"-DTORCH_TARGET_VERSION={torch_version_2_9}",
f"-I{source_file.parent}", # For includes in same directory
*self.pytorch_includes,
*self.cuda_includes,
]
cmd.extend([str(source_file), "-o", str(output_file)])
result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
if result.returncode == 0:
return True, ""
else:
return False, result.stderr
def _test_function_file(self, source_file: Path):
"""Test that a function file fails to compile with TORCH_TARGET_VERSION=2.9.0."""
func_name = source_file.stem
obj_file = self.build_dir / f"{func_name}.o"
# Choose the appropriate compiler based on file extension
if source_file.suffix == ".cu":
if not self.cuda_available:
self.skipTest(f"CUDA not available, skipping {source_file.name}")
success, error_msg = self._compile_cu_file(source_file, obj_file)
else:
success, error_msg = self._compile_cpp_file(source_file, obj_file)
obj_file.unlink(missing_ok=True)
# Print error details for debugging
if not success:
relevant_errors = self._extract_relevant_errors(error_msg)
if relevant_errors:
print(f"\n Compilation errors for {func_name} (requires 2.10+):")
for err in relevant_errors:
print(f" {err}")
self.assertFalse(
success,
f"Function {func_name} compiled successfully with TORCH_TARGET_VERSION=2.9.0. "
f"This could mean two things.\n\t1. It should run with 2.9.0 and should be "
"moved to libtorch_agnostic_2_9_extension\n\t2. The function(s) it tests do not use the "
"proper TORCH_FEATURE_VERSION guards\n\nThe libtorch_agnostic_targetting CI workflow will "
"verify if you incorrectly move this to the 2_9 extension instead of adding "
"the appropriate version guards.",
)
def test_mv_tensor_accessor_cpu_works_with_2_9(self):
"""Test that mv_tensor_accessor_cpu.cpp compiles successfully with 2.9.0.
This is a negative test - it ensures that a file we expect to work with 2.9.0
actually does compile. This validates that our test infrastructure correctly
distinguishes between files that require 2.10+ and those that don't.
"""
cpp_file = self.csrc_dir / "mv_tensor_accessor_cpu.cpp"
if not cpp_file.exists():
self.skipTest(f"{cpp_file} not found - this is a test file only")
obj_file = self.build_dir / "mv_tensor_accessor_cpu.o"
success, error_msg = self._compile_cpp_file(cpp_file, obj_file)
# Clean up
obj_file.unlink(missing_ok=True)
if not success:
relevant_errors = self._extract_relevant_errors(error_msg)
if relevant_errors:
print(
"\n Unexpected compilation errors for mv_tensor_accessor_cpu:"
)
for err in relevant_errors:
print(f"{err}")
self.assertTrue(
success,
f"mv_tensor_accessor_cpu.cpp failed to compile with TORCH_TARGET_VERSION=2.9.0. "
f"This file is expected to work with 2.9.0 since it doesn't use 2.10+ features. "
f"Error: {error_msg}",
)
def test_mv_tensor_accessor_cuda_works_with_2_9(self):
"""Test that mv_tensor_accessor_cuda.cu compiles successfully with 2.9.0.
This is a negative test - it ensures that a .cu file we expect to work with 2.9.0
actually does compile. This validates that our test infrastructure correctly
compiles CUDA files and distinguishes between files that require 2.10+ and those
that don't.
"""
if not self.cuda_available:
self.skipTest(
"CUDA not available, skipping mv_tensor_accessor_cuda.cu test"
)
cu_file = self.csrc_dir / "mv_tensor_accessor_cuda.cu"
if not cu_file.exists():
self.skipTest(f"{cu_file} not found - this is a test file only")
obj_file = self.build_dir / "cuda_kernel.o"
success, error_msg = self._compile_cu_file(cu_file, obj_file)
# Clean up
obj_file.unlink(missing_ok=True)
if not success:
relevant_errors = self._extract_relevant_errors(error_msg)
if relevant_errors:
print(
"\n Unexpected compilation errors for mv_tensor_accessor_cuda.cu:"
)
for err in relevant_errors:
print(f"{err}")
self.assertTrue(
success,
f"mv_tensor_accessor_cuda.cu failed to compile with TORCH_TARGET_VERSION=2.9.0. "
f"This file is expected to work with 2.9.0 since it doesn't use 2.10+ features. "
f"Error: {error_msg}",
)
@staticmethod
def _extract_relevant_errors(error_msg: str) -> list[str]:
"""Extract the most relevant error messages."""
error_lines = error_msg.strip().split("\n")
relevant_errors = []
for line in error_lines:
line_lower = line.lower()
if (
"error:" in line_lower
or "undefined" in line_lower
or "undeclared" in line_lower
or "no member named" in line_lower
):
relevant_errors.append(line.strip())
return relevant_errors
# Dynamically create test methods for each .cpp and .cu file
def _create_test_method_for_file(source_file: Path):
"""Create a test method for a specific source file."""
def test_method_impl(self):
self._test_function_file(source_file)
# Set a descriptive name and docstring
func_name = source_file.stem
file_ext = source_file.suffix
test_method_impl.__name__ = f"test_{func_name}_requires_2_10"
test_method_impl.__doc__ = (
f"Test that {func_name}{file_ext} requires PyTorch 2.10+"
)
return test_method_impl
# Test discovery: generate a test for each .cpp and .cu file
_csrc_dir = Path(__file__).parent / "libtorch_agnostic_2_10" / "csrc"
if _csrc_dir.exists():
# Collect both .cpp and .cu files, excluding those used for negative test
# already defined above
_source_files = sorted(
[
f
for f in _csrc_dir.rglob("*.cpp")
if f.name not in ("mv_tensor_accessor_cpu.cpp",)
]
+ [
f
for f in _csrc_dir.rglob("*.cu")
if f.name not in ("mv_tensor_accessor_cuda.cu",)
]
)
for _source_file in _source_files:
_test_method = _create_test_method_for_file(_source_file)
setattr(
FunctionVersionCompatibilityTest, _test_method.__name__, _test_method
)
del (
_create_test_method_for_file,
_csrc_dir,
_source_files,
_source_file,
_test_method,
)
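# Each discovered source file ends up with its own generated test named
# test_<filename>_requires_2_10, so newly added .cpp/.cu files are covered
# without editing this module.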
if __name__ == "__main__":
run_tests()

View File

@ -0,0 +1,21 @@
import ctypes
from pathlib import Path
import torch
so_files = list(Path(__file__).parent.glob("_C*.so"))
assert len(so_files) == 1, f"Expected one _C*.so file, found {len(so_files)}"
# use ctypes.CDLL instead of load_library to be able to test the unload logic
# below code is reduced from the load_library code
with torch._ops.dl_open_guard():
loaded_lib = ctypes.CDLL(so_files[0])
from . import ops
__all__ = [
"loaded_lib",
"ops",
]
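# Importing this package loads the compiled _C library, which registers the
# custom ops with the dispatcher; the wrappers in ops.py then call them via
# torch.ops.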

View File

@ -0,0 +1,44 @@
#include "kernel.h"
#include <cuda_runtime.h>
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/ops.h>
#include <torch/csrc/stable/tensor.h>
using torch::stable::Tensor;
Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
STD_TORCH_CHECK(
m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
STD_TORCH_CHECK(
m.device() == v.device(), "m and v must be on the same device");
Tensor res = new_empty(m, {m.size(0)});
THO_DISPATCH_V2(
m.scalar_type(),
"mv_tensor_accessor_cuda",
AT_WRAP(([&]() {
auto resa = Accessor_cuda<scalar_t, 1>(
reinterpret_cast<scalar_t*>(res.data_ptr()),
res.sizes().data(),
res.strides().data());
auto ma = Accessor_cuda<scalar_t, 2>(
reinterpret_cast<scalar_t*>(m.data_ptr()),
m.sizes().data(),
m.strides().data());
auto va = Accessor_cuda<scalar_t, 1>(
reinterpret_cast<scalar_t*>(v.data_ptr()),
v.sizes().data(),
v.strides().data());
mv_tensor_accessor_kernel<Accessor_cuda, scalar_t>
<<<1, 1, 0, 0>>>(resa, ma, va);
})),
AT_FLOATING_TYPES);
return res;
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CUDA, m) {
m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
}

View File

@ -1,8 +1,6 @@
#include "kernel.h"
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <torch/csrc/stable/accelerator.h>
#include <torch/csrc/stable/device.h>
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>
@ -11,6 +9,7 @@
#ifdef LAE_USE_CUDA
#include <cuda_runtime.h>
#include <torch/csrc/stable/accelerator.h>
#endif
#include <optional>
@ -68,11 +67,11 @@ Tensor sgd_out_of_place(
return out;
}
STABLE_TORCH_LIBRARY(libtorch_agnostic, m) {
STABLE_TORCH_LIBRARY(libtorch_agnostic_2_9, m) {
m.def("sgd_out_of_place(Tensor param, Tensor grad, float weight_decay, float lr, bool maximize) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CPU, m) {
m.impl("sgd_out_of_place", TORCH_BOX(&sgd_out_of_place));
}
@ -81,15 +80,15 @@ Tensor identity(Tensor t) {
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
m.def("identity(Tensor t) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CUDA, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CUDA, m) {
m.impl("identity", TORCH_BOX(&identity));
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CPU, m) {
m.impl("identity", TORCH_BOX(&identity));
}
@ -101,11 +100,11 @@ Tensor my_abs(Tensor t) {
return torch::stable::detail::to<Tensor>(stack[0]);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
m.def("my_abs(Tensor t) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
m.impl("my_abs", TORCH_BOX(&my_abs));
}
@ -127,11 +126,11 @@ Tensor my_ones_like(Tensor t, StableIValue device) {
return torch::stable::detail::to<Tensor>(stack[0]);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
m.def("my_ones_like(Tensor t, Device d) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
m.impl("my_ones_like", TORCH_BOX(&my_ones_like));
}
@ -154,11 +153,11 @@ std::tuple<Tensor, Tensor, bool> exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3
torch::stable::detail::to<bool>(stack_is_leaf[0]));
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
m.def("exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3) -> (Tensor, Tensor, bool)");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
m.impl("exp_neg_is_leaf", TORCH_BOX(&exp_neg_is_leaf));
}
@ -170,11 +169,11 @@ Tensor neg_exp(Tensor t) {
return torch::stable::detail::to<Tensor>(stack[0]);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
m.def("neg_exp(Tensor t) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
m.impl("neg_exp", TORCH_BOX(&neg_exp));
}
@ -194,11 +193,11 @@ Tensor divide_neg_exp(Tensor t) {
return torch::stable::detail::to<Tensor>(stack_div[0]);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
m.def("divide_neg_exp(Tensor t) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
m.impl("divide_neg_exp", TORCH_BOX(&divide_neg_exp));
}
@ -206,11 +205,11 @@ bool is_contiguous(Tensor t) {
return t.is_contiguous();
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
m.def("is_contiguous(Tensor t) -> bool");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
m.impl("is_contiguous", TORCH_BOX(&is_contiguous));
}
@ -265,7 +264,7 @@ Tensor my_clone(Tensor t) {
return clone(t);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
m.def("my_transpose(Tensor t, int dim0, int dim1) -> Tensor");
m.def("my_empty_like(Tensor t) -> Tensor");
m.def("fill_infinity(Tensor(a!) t) -> Tensor(a!)");
@ -277,7 +276,7 @@ STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("my_clone(Tensor t) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
m.impl("my_transpose", TORCH_BOX(&my_transpose));
m.impl("my_empty_like", TORCH_BOX(&my_empty_like));
m.impl("fill_infinity", TORCH_BOX(&fill_infinity));
@ -288,7 +287,7 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("my_clone", TORCH_BOX(&my_clone));
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeImplicitAutograd, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeImplicitAutograd, m) {
m.impl("my_pad", TORCH_BOX(&my_pad));
m.impl("my_narrow", TORCH_BOX(&my_narrow));
}
@ -305,7 +304,7 @@ Tensor my_amax_vec(Tensor t) {
return amax(t, {0,1}, false);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
m.def("my_zero_(Tensor(a!) t) -> Tensor(a!)");
m.def("my_amax(Tensor a) -> Tensor");
m.def("my_amax_vec(Tensor a) -> Tensor");
@ -332,223 +331,11 @@ bool test_default_constructor(bool defined) {
return out.defined();
}
uint64_t get_any_data_ptr(Tensor t, bool mutable_) {
if (mutable_) {
return reinterpret_cast<uint64_t>(t.mutable_data_ptr());
} else {
return reinterpret_cast<uint64_t>(t.const_data_ptr());
}
}
uint64_t get_template_any_data_ptr(Tensor t, c10::ScalarType dtype, bool mutable_) {
#define DEFINE_CASE(T, name) \
case torch::headeronly::ScalarType::name: { \
if (mutable_) { \
return reinterpret_cast<uint64_t>(t.mutable_data_ptr<T>()); \
} else { \
return reinterpret_cast<uint64_t>(t.const_data_ptr<T>()); \
} \
}
switch (dtype) {
// per aten/src/ATen/templates/TensorMethods.cpp:
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE)
DEFINE_CASE(uint16_t, UInt16)
DEFINE_CASE(uint32_t, UInt32)
DEFINE_CASE(uint64_t, UInt64)
default:
return 0;
}
#undef DEFINE_CASE
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("get_any_data_ptr(Tensor t, bool mutable_) -> int");
m.def("get_template_any_data_ptr(Tensor t, ScalarType dtype, bool mutable_) -> int");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
m.impl("my_zero_", TORCH_BOX(&my_zero_));
m.impl("my_amax", TORCH_BOX(&my_amax));
m.impl("my_amax_vec", TORCH_BOX(&my_amax_vec));
m.impl("test_default_constructor", TORCH_BOX(&test_default_constructor));
m.impl("get_any_data_ptr", TORCH_BOX(&get_any_data_ptr));
m.impl("get_template_any_data_ptr", TORCH_BOX(&get_template_any_data_ptr));
}
std::vector<Tensor> my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack.data());
return torch::stable::detail::to<std::vector<Tensor>>(stack[0]);
}
void my__foreach_mul_(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack.data());
}
std::vector<Tensor> make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) {
// This function tests that my__foreach_mul can take in std::initializer_lists
// in addition to std::vectors.
Tensor t1_1 = my_clone(t1);
Tensor t1_2 = my_clone(t1);
Tensor t2_1 = my_clone(t2);
Tensor t2_2 = my_clone(t2);
return my__foreach_mul({t1_1, t2_1}, {t1_2, t2_2});
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
m.def("make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("my__foreach_mul", TORCH_BOX(&my__foreach_mul));
m.impl("my__foreach_mul_", TORCH_BOX(&my__foreach_mul_));
m.impl("make_tensor_clones_and_call_foreach", TORCH_BOX(&make_tensor_clones_and_call_foreach));
}
// Test functions for torch::stable::Tensor device method
torch::stable::Device test_tensor_device(torch::stable::Tensor tensor) {
return tensor.device();
}
void boxed_test_tensor_device(
StableIValue* stack,
uint64_t num_args,
uint64_t num_outputs) {
torch::stable::Device res = test_tensor_device(
torch::stable::detail::to<torch::stable::Tensor>(stack[0]));
stack[0] = torch::stable::detail::from(res);
}
// Test functions for torch::stable::Device
torch::stable::Device test_device_constructor(
bool is_cuda,
torch::stable::DeviceIndex index,
bool use_str) {
using torch::stable::Device;
using torch::stable::DeviceType;
if (use_str) {
std::string device_str;
if (is_cuda) {
device_str = "cuda:" + std::to_string(index);
} else {
device_str = "cpu";
}
return Device(device_str);
} else {
if (is_cuda) {
return Device(DeviceType::CUDA, index);
} else {
return Device(DeviceType::CPU);
}
}
}
void boxed_test_device_constructor(
StableIValue* stack,
uint64_t num_args,
uint64_t num_outputs) {
torch::stable::Device res = test_device_constructor(
torch::stable::detail::to<bool>(stack[0]),
torch::stable::detail::to<torch::stable::DeviceIndex>(stack[1]),
torch::stable::detail::to<bool>(stack[2]));
stack[0] = torch::stable::detail::from(res);
}
bool test_device_equality(torch::stable::Device d1, torch::stable::Device d2) {
return d1 == d2;
}
void boxed_test_device_equality(
StableIValue* stack,
uint64_t num_args,
uint64_t num_outputs) {
bool res = test_device_equality(
torch::stable::detail::to<torch::stable::Device>(stack[0]),
torch::stable::detail::to<torch::stable::Device>(stack[1]));
stack[0] = torch::stable::detail::from(res);
}
torch::stable::Device test_device_set_index(
torch::stable::Device device,
torch::stable::DeviceIndex index) {
device.set_index(index);
return device;
}
void boxed_test_device_set_index(
StableIValue* stack,
uint64_t num_args,
uint64_t num_outputs) {
torch::stable::Device res = test_device_set_index(
torch::stable::detail::to<torch::stable::Device>(stack[0]),
torch::stable::detail::to<torch::stable::DeviceIndex>(stack[1]));
stack[0] = torch::stable::detail::from(res);
}
torch::stable::DeviceIndex test_device_index(torch::stable::Device device) {
return device.index();
}
void boxed_test_device_index(
StableIValue* stack,
uint64_t num_args,
uint64_t num_outputs) {
torch::stable::DeviceIndex res = test_device_index(
torch::stable::detail::to<torch::stable::Device>(stack[0]));
stack[0] = torch::stable::detail::from(res);
}
bool test_device_is_cuda(torch::stable::Device device) {
return device.is_cuda();
}
void boxed_test_device_is_cuda(
StableIValue* stack,
uint64_t num_args,
uint64_t num_outputs) {
bool res = test_device_is_cuda(
torch::stable::detail::to<torch::stable::Device>(stack[0]));
stack[0] = torch::stable::detail::from(res);
}
bool test_device_is_cpu(torch::stable::Device device) {
return device.is_cpu();
}
void boxed_test_device_is_cpu(
StableIValue* stack,
uint64_t num_args,
uint64_t num_outputs) {
bool res = test_device_is_cpu(
torch::stable::detail::to<torch::stable::Device>(stack[0]));
stack[0] = torch::stable::detail::from(res);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("test_tensor_device(Tensor t) -> Device");
m.def(
"test_device_constructor(bool is_cuda, DeviceIndex index, bool use_str) -> Device");
m.def("test_device_equality(Device d1, Device d2) -> bool");
m.def("test_device_set_index(Device device, DeviceIndex index) -> Device");
m.def("test_device_index(Device device) -> DeviceIndex");
m.def("test_device_is_cuda(Device device) -> bool");
m.def("test_device_is_cpu(Device device) -> bool");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("test_tensor_device", &boxed_test_tensor_device);
m.impl("test_device_constructor", &boxed_test_device_constructor);
m.impl("test_device_equality", &boxed_test_device_equality);
m.impl("test_device_set_index", &boxed_test_device_set_index);
m.impl("test_device_index", &boxed_test_device_index);
m.impl("test_device_is_cuda", &boxed_test_device_is_cuda);
m.impl("test_device_is_cpu", &boxed_test_device_is_cpu);
}
Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
@ -569,11 +356,11 @@ Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
return res;
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
m.def("mv_tensor_accessor(Tensor m, Tensor v) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CPU, m) {
m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cpu));
}
@ -619,14 +406,14 @@ int64_t test_get_current_device_index() {
return torch::stable::accelerator::getCurrentDeviceIndex();
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
m.def("test_device_guard(int device_index) -> int");
m.def("test_device_guard_set_index() -> int");
m.def("test_stream(int device_index) -> int");
m.def("test_get_current_device_index() -> int");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
m.impl("test_device_guard", TORCH_BOX(&test_device_guard));
m.impl("test_device_guard_set_index", TORCH_BOX(&test_device_guard_set_index));
m.impl("test_stream", TORCH_BOX(&test_stream));
@ -635,100 +422,14 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
#endif // LAE_USE_CUDA
Tensor test_parallel_for(int64_t size, int64_t grain_size) {
AtenTensorHandle tensor_handle;
int64_t stride = 1;
aoti_torch_empty_strided(
1,
&size,
&stride,
aoti_torch_dtype_int64(),
aoti_torch_device_type_cpu(),
0,
&tensor_handle);
Tensor tensor(tensor_handle);
int64_t* data_ptr = reinterpret_cast<int64_t*>(tensor.data_ptr());
torch::stable::zero_(tensor);
// Use parallel_for to fill each element with its index
// If using a parallel path, the thread id is encoded in the upper 32 bits
torch::stable::parallel_for(
0, size, grain_size, [data_ptr](int64_t begin, int64_t end) {
for (auto i = begin; i < end; i++) {
STD_TORCH_CHECK(i <= UINT32_MAX);
uint32_t thread_id;
torch_get_thread_idx(&thread_id);
data_ptr[i] = i | (static_cast<int64_t>(thread_id) << 32);
}
});
return tensor;
}
void boxed_test_parallel_for(
StableIValue* stack,
uint64_t num_args,
uint64_t num_outputs) {
Tensor res = test_parallel_for(to<int64_t>(stack[0]), to<int64_t>(stack[1]));
stack[0] = from(res);
}
uint32_t test_get_num_threads() {
return torch::stable::get_num_threads();
}
void boxed_test_get_num_threads(
StableIValue* stack,
uint64_t num_args,
uint64_t num_outputs) {
uint32_t res = test_get_num_threads();
stack[0] = from(res);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("test_parallel_for(int size, int grain_size) -> Tensor");
m.def("test_get_num_threads() -> int");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("test_parallel_for", &boxed_test_parallel_for);
m.impl("test_get_num_threads", &boxed_test_get_num_threads);
}
Tensor my_empty(
torch::headeronly::HeaderOnlyArrayRef<int64_t> size,
std::optional<torch::headeronly::ScalarType> dtype,
std::optional<torch::stable::Device> device,
std::optional<bool> pin_memory) {
return empty(size, dtype, device, pin_memory);
}
Tensor my_flatten(Tensor t, int64_t start_dim, int64_t end_dim) {
return flatten(t, start_dim, end_dim);
}
Tensor my_reshape(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> shape) {
return reshape(t, shape);
}
Tensor my_view(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> size) {
return view(t, size);
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def(
"my_empty(int[] size, ScalarType? dtype=None, Device? device=None, bool? pin_memory=None) -> Tensor");
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
m.def("my_flatten(Tensor t, int start_dim=0, int end_dim=-1) -> Tensor");
m.def("my_reshape(Tensor t, int[] shape) -> Tensor");
m.def("my_view(Tensor t, int[] size) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("my_empty", TORCH_BOX(&my_empty));
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
m.impl("my_flatten", TORCH_BOX(&my_flatten));
m.impl("my_reshape", TORCH_BOX(&my_reshape));
m.impl("my_view", TORCH_BOX(&my_view));
}

View File

@ -0,0 +1,363 @@
import torch
from torch import Tensor
def sgd_out_of_place(param, grad, weight_decay, lr, maximize) -> Tensor:
"""
Computes a single step of SGD on a single parameter Tensor with grad.
Assumes:
- param and grad are the same shape and are 1D.
- param and grad are float and on CPU
Args:
param: a 1D tensor of floats
grad: a 1D tensor of floats
weight_decay: a python double between 0 and 1
lr: a python double
maximize: a python bool
Returns:
a 1D float Tensor the same shape as param
"""
return torch.ops.libtorch_agnostic_2_9.sgd_out_of_place.default(
param, grad, weight_decay, lr, maximize
)
def identity(t) -> Tensor:
"""
Returns the input tensor
Args:
t: any Tensor
Returns:
a Tensor, the same as input.
"""
return torch.ops.libtorch_agnostic_2_9.identity.default(t)
def my_abs(t) -> Tensor:
"""
Returns abs on the input tensor, outputs a new Tensor
Args:
t: any Tensor
Returns:
a Tensor
"""
return torch.ops.libtorch_agnostic_2_9.my_abs.default(t)
def my_is_cpu(t) -> bool:
"""
Returns is_cpu on the input tensor.
Args:
t: any Tensor
Returns:
a bool
"""
return torch.ops.libtorch_agnostic_2_9.my_is_cpu.default(t)
def my_ones_like(tensor, device) -> Tensor:
"""
Returns a new Tensor like the input tensor, but with all ones
Args:
tensor: any Tensor
device: a device string
Returns:
a ones Tensor with the same dtype and shape and other attributes
like the input tensor
"""
return torch.ops.libtorch_agnostic_2_9.my_ones_like.default(tensor, device)
def exp_neg_is_leaf(t1, t2, t3) -> tuple[Tensor, Tensor, bool]:
"""
Returns a Tensor, Tensor, bool tuple corresponding to the respective inputs
t1, t2, and t3.
Args:
t1: Tensor
t2: Tensor
t3: Tensor
Returns:
(exp(t1), neg(t2), is_leaf(t3))
"""
return torch.ops.libtorch_agnostic_2_9.exp_neg_is_leaf.default(t1, t2, t3)
def neg_exp(t) -> Tensor:
"""
Returns a Tensor composing neg of exp
Args:
t: Tensor
Returns: neg(exp(t))
"""
return torch.ops.libtorch_agnostic_2_9.neg_exp.default(t)
def divide_neg_exp(t) -> Tensor:
"""
Returns a Tensor division of neg and exp
Args:
t: Tensor
Returns: divide(neg(t), exp(t))
"""
return torch.ops.libtorch_agnostic_2_9.divide_neg_exp.default(t)
def is_contiguous(t) -> bool:
"""
Returns a bool indicating if the input tensor is contiguous
Args:
t: Tensor
Returns: is_contiguous(t)
"""
return torch.ops.libtorch_agnostic_2_9.is_contiguous.default(t)
def my_transpose(t, dim0, dim1) -> Tensor:
"""
Returns t.transpose(dim0, dim1)
Args:
t: Tensor
Returns: my_transpose(t, dim0, dim1)
"""
return torch.ops.libtorch_agnostic_2_9.my_transpose.default(t, dim0, dim1)
def my_empty_like(t) -> Tensor:
"""
Returns t.empty_like()
Args:
t: Tensor
Returns: my_empty_like(t)
"""
return torch.ops.libtorch_agnostic_2_9.my_empty_like.default(t)
def my_zero_(t) -> Tensor:
"""
Returns t.zero_()
Args:
t: Tensor
Returns: my_zero_(t)
"""
return torch.ops.libtorch_agnostic_2_9.my_zero_.default(t)
def my_amax(t) -> Tensor:
"""
Returns the amax of t over dim 0.
Args:
t: Tensor
Returns: amax(t, dim=0)
"""
return torch.ops.libtorch_agnostic_2_9.my_amax.default(t)
def my_amax_vec(t) -> Tensor:
"""
Returns the amax of t over dims (0, 1).
Args:
t: Tensor
Returns: amax(t, dim=(0, 1))
"""
return torch.ops.libtorch_agnostic_2_9.my_amax_vec.default(t)
def fill_infinity(t) -> Tensor:
"""
Fills the tensor with inf.
Args:
t: Tensor to fill
Returns: The modified tensor (same as input)
"""
return torch.ops.libtorch_agnostic_2_9.fill_infinity.default(t)
def test_default_constructor(defined) -> bool:
"""
Tests the default constructor for torch::stable::Tensor.
Args:
defined: bool - if True, tests defined tensor; if False, tests undefined tensor
Returns: bool - result of calling .defined() on the tensor
"""
return torch.ops.libtorch_agnostic_2_9.test_default_constructor.default(defined)
def mv_tensor_accessor(m, v) -> Tensor:
"""
Returns matrix-vector product.
Args:
m: any 2-D Tensor with shape (N, M)
v: any 1-D Tensor with shape (M,)
Returns:
a 1-D Tensor with shape (N,)
"""
return torch.ops.libtorch_agnostic_2_9.mv_tensor_accessor.default(m, v)
def my_pad(t) -> Tensor:
"""
Pads the input tensor with hardcoded padding parameters.
Args:
t: Input tensor
Returns: Padded tensor with padding [1, 2, 2, 1], mode "constant", value 0.0
"""
return torch.ops.libtorch_agnostic_2_9.my_pad.default(t)
def my_narrow(t, dim, start, length) -> Tensor:
"""
Returns a new tensor that is a narrowed version of the input tensor.
Args:
t: Input tensor
dim: Dimension along which to narrow
start: Starting position
length: Length of the narrowed section
Returns: Narrowed tensor
"""
return torch.ops.libtorch_agnostic_2_9.my_narrow.default(t, dim, start, length)
def my_copy_(dst, src, non_blocking) -> Tensor:
"""
Returns tensor dst that is updated with src elements.
Args:
dst: Destination tensor
src: Source tensor
non_blocking: bool
Returns: Updated tensor
"""
return torch.ops.libtorch_agnostic_2_9.my_copy_.default(dst, src, non_blocking)
def my_clone(t) -> Tensor:
"""
Returns a clone of input tensor.
Args:
t: Input tensor
Returns: Cloned tensor
"""
return torch.ops.libtorch_agnostic_2_9.my_clone.default(t)
def test_device_guard(device_index) -> int:
"""
Tests the DeviceGuard functionality by creating a device guard, creating an empty tensor under it, and returning the current device index.
Args:
device_index: Device index to set the guard to
Returns: result of cudaGetDevice() as an integer after using the guard
"""
return torch.ops.libtorch_agnostic_2_9.test_device_guard.default(device_index)
def test_device_guard_set_index() -> int:
"""
Tests the DeviceGuard set_index functionality by creating a device guard with index 1,
then setting it to index 0, and returning the current device.
Returns: result of cudaGetDevice() as an integer after using set_index
"""
return torch.ops.libtorch_agnostic_2_9.test_device_guard_set_index.default()
def test_stream(device_index) -> int:
"""
Tests the Stream functionality by getting the current stream ID for the specified device.
Args:
device_index: Device index to get the stream for
Returns: Stream ID as an integer
"""
return torch.ops.libtorch_agnostic_2_9.test_stream.default(device_index)
def test_get_current_device_index() -> int:
"""
Tests the getCurrentDeviceIndex functionality by getting the current device index.
Returns: Current device index as an integer
"""
return torch.ops.libtorch_agnostic_2_9.test_get_current_device_index.default()
def my_new_empty_dtype_variant(t) -> Tensor:
"""
Returns a new empty tensor with shape [2, 5] and dtype bfloat16
Args:
t: Input tensor used as a reference for device and other properties
Returns: New empty tensor with shape [2, 5] and dtype bfloat16
"""
return torch.ops.libtorch_agnostic_2_9.my_new_empty_dtype_variant.default(t)
def my_new_zeros_dtype_variant(t) -> Tensor:
"""
Returns a new tensor filled with 0s with shape [2, 5] and dtype Float
Args:
t: Input tensor used as a reference for device and other properties
Returns: New zeros tensor
"""
return torch.ops.libtorch_agnostic_2_9.my_new_zeros_dtype_variant.default(t)
def my_flatten(t, start_dim=0, end_dim=-1) -> Tensor:
"""
Flattens the input tensor from start_dim to end_dim into a single dimension.
Args:
t: Tensor - tensor to flatten
start_dim: int - first dimension to flatten (default: 0)
end_dim: int - last dimension to flatten (default: -1)
Returns: Tensor - flattened tensor
"""
return torch.ops.libtorch_agnostic_2_9.my_flatten.default(t, start_dim, end_dim)

View File

@ -0,0 +1,82 @@
import distutils.command.clean
import shutil
from pathlib import Path
from setuptools import find_packages, setup
import torch
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension
ROOT_DIR = Path(__file__).parent
CSRC_DIR = ROOT_DIR / "libtorch_agnostic_2_9" / "csrc"
class clean(distutils.command.clean.clean):
def run(self):
# Run default behavior first
distutils.command.clean.clean.run(self)
# Remove extension
for path in (ROOT_DIR / "libtorch_agnostic_2_9").glob("**/*.so"):
path.unlink()
# Remove build and dist and egg-info directories
dirs = [
ROOT_DIR / "build",
ROOT_DIR / "dist",
ROOT_DIR / "libtorch_agnostic_2_9.egg-info",
]
for path in dirs:
if path.exists():
shutil.rmtree(str(path), ignore_errors=True)
def get_extension():
extra_compile_args = {
"cxx": [
"-fdiagnostics-color=always",
"-DTORCH_STABLE_ONLY",
"-DTORCH_TARGET_VERSION=0x0209000000000000",
],
}
sources = list(CSRC_DIR.glob("**/*.cpp"))
extension = CppExtension
# allow including <cuda_runtime.h>
if torch.cuda.is_available():
extra_compile_args["cxx"].append("-DLAE_USE_CUDA")
extra_compile_args["nvcc"] = [
"-O2",
"-DTORCH_TARGET_VERSION=0x0209000000000000",
]
extension = CUDAExtension
sources.extend(CSRC_DIR.glob("**/*.cu"))
return [
extension(
"libtorch_agnostic_2_9._C",
sources=sorted(str(s) for s in sources),
py_limited_api=True,
extra_compile_args=extra_compile_args,
extra_link_args=[],
)
]
setup(
name="libtorch_agnostic_2_9",
version="0.0",
author="PyTorch Core Team",
description="Example of libtorch agnostic extension for PyTorch 2.9",
packages=find_packages(exclude=("test",)),
package_data={"libtorch_agnostic_2_9": ["*.dll", "*.dylib", "*.so"]},
install_requires=[
"torch",
],
ext_modules=get_extension(),
cmdclass={
"build_ext": BuildExtension.with_options(no_python_abi_suffix=True),
"clean": clean,
},
options={"bdist_wheel": {"py_limited_api": "cp39"}},
)
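# py_limited_api=True plus the cp39 bdist_wheel tag should produce a single
# abi3 wheel that works on CPython 3.9+, so only one build per torch target
# version is needed.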

View File

@ -1,30 +0,0 @@
#include "kernel.h"
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>
#include <cuda_runtime.h>
using torch::stable::Tensor;
Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device");
Tensor res = new_empty(m, {m.size(0)});
THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cuda",
AT_WRAP(([&]() {
auto resa = Accessor_cuda<scalar_t, 1>(reinterpret_cast<scalar_t*>(res.data_ptr()), res.sizes().data(), res.strides().data());
auto ma = Accessor_cuda<scalar_t, 2>(reinterpret_cast<scalar_t*>(m.data_ptr()), m.sizes().data(), m.strides().data());
auto va = Accessor_cuda<scalar_t, 1>(reinterpret_cast<scalar_t*>(v.data_ptr()), v.sizes().data(), v.strides().data());
mv_tensor_accessor_kernel<Accessor_cuda, scalar_t><<<1, 1, 0, 0>>>(resa, ma, va);
})),
AT_FLOATING_TYPES);
return res;
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CUDA, m) {
m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
}

View File

@ -1,589 +0,0 @@
import torch
from torch import Tensor
def sgd_out_of_place(param, grad, weight_decay, lr, maximize) -> Tensor:
"""
Computes a single step of SGD on a single parameter Tensor with grad.
Assumes:
- param and grad are the same shape and are 1D.
- param and grad are float and on CPU
Args:
param: a 1D tensor of floats
grad: a 1D tensor of floats
weight_decay: a python double between 0 and 1
lr: a python double
Returns:
a 1D float Tensor the same shape as param
"""
return torch.ops.libtorch_agnostic.sgd_out_of_place.default(
param, grad, weight_decay, lr, maximize
)
def identity(t) -> Tensor:
"""
Returns the input tensor
Args:
t: any Tensor
Returns:
a Tensor, the same as input.
"""
return torch.ops.libtorch_agnostic.identity.default(t)
def my_abs(t) -> Tensor:
"""
Returns abs on the input tensor, outputs a new Tensor
Args:
t: any Tensor
Returns:
a Tensor
"""
return torch.ops.libtorch_agnostic.my_abs.default(t)
def my_is_cpu(t) -> bool:
"""
Returns is_cpu on the input tensor.
Args:
t: any Tensor
Returns:
a bool
"""
return torch.ops.libtorch_agnostic.my_is_cpu.default(t)
def my_ones_like(tensor, device) -> Tensor:
"""
Returns a new Tensor like the input tensor, but with all ones
Args:
tensor: any Tensor
device: a device string
Returns:
a ones Tensor with the same dtype and shape and other attributes
like the input tensor
"""
return torch.ops.libtorch_agnostic.my_ones_like.default(tensor, device)
def exp_neg_is_leaf(t1, t2, t3) -> tuple[Tensor, Tensor, bool]:
"""
Returns a Tensor, Tensor, bool tuple corresponding to the respective inputs
t1, t2, and t3.
Args:
t1: Tensor
t2: Tensor
t3: Tensor
Returns:
(exp(t1), neg(t2), is_leaf(t3))
"""
return torch.ops.libtorch_agnostic.exp_neg_is_leaf.default(t1, t2, t3)
def neg_exp(t) -> Tensor:
"""
Returns a Tensor composing neg of exp
Args:
t: Tensor
Returns: neg(exp(t))
"""
return torch.ops.libtorch_agnostic.neg_exp.default(t)
def divide_neg_exp(t) -> Tensor:
"""
Returns a Tensor division of neg and exp
Args:
t: Tensor
Returns: divide(neg(t), exp(t))
"""
return torch.ops.libtorch_agnostic.divide_neg_exp.default(t)
def is_contiguous(t) -> bool:
"""
Returns a bool indicating if the input tensor is contiguous
Args:
t: Tensor
Returns: is_contiguous(t)
"""
return torch.ops.libtorch_agnostic.is_contiguous.default(t)
def my_transpose(t, dim0, dim1) -> Tensor:
"""
Returns t.transpose(dim0, dim1)
Args:
t: Tensor
Returns: my_transpose(t, dim0, dim1)
"""
return torch.ops.libtorch_agnostic.my_transpose.default(t, dim0, dim1)
def my_empty_like(t) -> Tensor:
"""
Returns t.empty_like()
Args:
t: Tensor
Returns: my_empty_like(t)
"""
return torch.ops.libtorch_agnostic.my_empty_like.default(t)
def my_zero_(t) -> Tensor:
"""
Returns t.zero_()
Args:
t: Tensor
Returns: my_zero_(t)
"""
return torch.ops.libtorch_agnostic.my_zero_.default(t)
def my_amax(t) -> Tensor:
"""
Returns t.amax()
Args:
t: Tensor
Returns: amax(t)
"""
return torch.ops.libtorch_agnostic.my_amax.default(t)
def my_amax_vec(t) -> Tensor:
"""
Returns t.amax()
Args:
t: Tensor
Returns: amax(t)
"""
return torch.ops.libtorch_agnostic.my_amax_vec.default(t)
def fill_infinity(t) -> Tensor:
"""
Fills the tensor with inf.
Args:
t: Tensor to fill
Returns: The modified tensor (same as input)
"""
return torch.ops.libtorch_agnostic.fill_infinity.default(t)
def test_default_constructor(defined) -> bool:
"""
Tests the default constructor for torch::stable::Tensor.
Args:
defined: bool - if True, tests defined tensor; if False, tests undefined tensor
Returns: bool - result of calling .defined() on the tensor
"""
return torch.ops.libtorch_agnostic.test_default_constructor.default(defined)
def test_tensor_device(t):
"""
Tests Tensor device() method.
Args:
t: Tensor - tensor to get device from
Returns: Device - device of the tensor
"""
return torch.ops.libtorch_agnostic.test_tensor_device.default(t)
def get_any_data_ptr(t, mutable) -> int:
"""
Return data pointer value of the tensor.
Args:
t: Input tensor
mutable: whether data pointer qualifier is mutable or const
Returns: int - pointer value
"""
return torch.ops.libtorch_agnostic.get_any_data_ptr.default(t, mutable)
def get_template_any_data_ptr(t, dtype, mutable) -> int:
"""
Return data pointer value of the tensor iff it has dtype.
Args:
t: Input tensor
dtype: Input dtype
mutable: whether data pointer qualifier is mutable or const
Returns: int - pointer value
Raises RuntimeError when t.dtype() != dtype.
"""
return torch.ops.libtorch_agnostic.get_template_any_data_ptr.default(
t, dtype, mutable
)
def my_pad(t) -> Tensor:
"""
Pads the input tensor with hardcoded padding parameters.
Args:
t: Input tensor
Returns: Padded tensor with padding [1, 2, 2, 1], mode "constant", value 0.0
"""
return torch.ops.libtorch_agnostic.my_pad.default(t)
def my_narrow(t, dim, start, length) -> Tensor:
"""
Returns a new tensor that is a narrowed version of the input tensor.
Args:
t: Input tensor
dim: Dimension along which to narrow
start: Starting position
length: Length of the narrowed section
Returns: Narrowed tensor
"""
return torch.ops.libtorch_agnostic.my_narrow.default(t, dim, start, length)
def my_copy_(dst, src, non_blocking) -> Tensor:
"""
Returns tensor dst that is updated with src elements.
Args:
dst: Destination tensor
src: Source tensor
non_blocking: bool
Returns: Updated tensor
"""
return torch.ops.libtorch_agnostic.my_copy_.default(dst, src, non_blocking)
def my_clone(t) -> Tensor:
"""
Returns a clone of input tensor.
Args:
t: Input tensor
Returns: Cloned tensor
"""
return torch.ops.libtorch_agnostic.my_clone.default(t)
def test_device_guard(device_index) -> int:
"""
Tests the DeviceGuard functionality by creating a device guard and returning an empty tensor.
Args:
device_index: Device index to set the guard to
Returns: result of cudaGetDevice() as an integer after using the guard
"""
return torch.ops.libtorch_agnostic.test_device_guard.default(device_index)
def test_device_guard_set_index() -> int:
"""
Tests the DeviceGuard set_index functionality by creating a device guard with index 1,
then setting it to index 0, and returning the current device.
Returns: result of cudaGetDevice() as an integer after using set_index
"""
return torch.ops.libtorch_agnostic.test_device_guard_set_index.default()
def test_stream(device_index) -> int:
"""
Tests the Stream functionality by getting the current stream ID for the specified device.
Args:
device_index: Device index to get the stream for
Returns: Stream ID as an integer
"""
return torch.ops.libtorch_agnostic.test_stream.default(device_index)
def test_get_current_device_index() -> int:
"""
Tests the getCurrentDeviceIndex functionality by getting the current device index.
Returns: Current device index as an integer
"""
return torch.ops.libtorch_agnostic.test_get_current_device_index.default()
def my_new_empty_dtype_variant(t) -> Tensor:
"""
Returns a new empty tensor with shape [2, 5] and dtype bfloat16
Args:
t: Input tensor used as a reference for device and other properties
Returns: New empty tensor with shape [2, 5] and dtype bfloat16
"""
return torch.ops.libtorch_agnostic.my_new_empty_dtype_variant.default(t)
def my_new_zeros_dtype_variant(t) -> Tensor:
"""
Returns a new tensor filled with 0s with shape [2, 5] and dtype Float
Args:
t: Input tensor used as a reference for device and other properties
Returns: New zeros tensor
"""
return torch.ops.libtorch_agnostic.my_new_zeros_dtype_variant.default(t)
def my__foreach_mul_(tensors, others) -> ():
"""
Updates tensors to be the result of pointwise multiplying with others.
Args:
tensors: list of tensors
others: list of tensors (with the same corresponding shapes as tensors)
Returns: nothing, tensors is updated in place.
"""
torch.ops.libtorch_agnostic.my__foreach_mul_.default(tensors, others)
def my__foreach_mul(tensors, others) -> list[Tensor]:
"""
Returns a list of tensors that are the results of pointwise multiplying
tensors and others.
Args:
tensors: list of tensors
others: list of tensors (with the same corresponding shapes as tensors)
Returns: list of multiplied tensors
"""
return torch.ops.libtorch_agnostic.my__foreach_mul.default(tensors, others)
def make_tensor_clones_and_call_foreach(t1, t2) -> list[Tensor]:
"""
Returns a list of 2 tensors corresponding to the square of the inputs.
Args:
t1: Tensor
t2: Tensor
Returns: list of [t1^2, t2^2]
"""
return torch.ops.libtorch_agnostic.make_tensor_clones_and_call_foreach.default(
t1, t2
)
def test_device_constructor(is_cuda, index, use_str):
"""
Tests creating a Device from DeviceType and index, or from a string.
Args:
is_cuda: bool - if True, creates CUDA device; if False, creates CPU device
index: int - device index
use_str: bool - if True, constructs from string; if False, constructs from DeviceType
Returns: Device - A device with the specified type and index
"""
return torch.ops.libtorch_agnostic.test_device_constructor.default(
is_cuda, index, use_str
)
def test_device_equality(d1, d2) -> bool:
"""
Tests Device equality operator.
Args:
d1: Device - first device
d2: Device - second device
Returns: bool - True if devices are equal
"""
return torch.ops.libtorch_agnostic.test_device_equality.default(d1, d2)
def test_device_set_index(device, index):
"""
Tests Device set_index() method.
Args:
device: Device - device to modify
index: int - new device index
Returns: Device - device with updated index
"""
return torch.ops.libtorch_agnostic.test_device_set_index.default(device, index)
def test_device_index(device) -> int:
"""
Tests Device index() method.
Args:
device: Device - device to query
Returns: int - device index
"""
return torch.ops.libtorch_agnostic.test_device_index.default(device)
def test_device_is_cuda(device) -> bool:
"""
Tests Device is_cuda() method.
Args:
device: Device - device to check
Returns: bool - True if device is CUDA
"""
return torch.ops.libtorch_agnostic.test_device_is_cuda.default(device)
def test_device_is_cpu(device) -> bool:
"""
Tests Device is_cpu() method.
Args:
device: Device - device to check
Returns: bool - True if device is CPU
"""
return torch.ops.libtorch_agnostic.test_device_is_cpu.default(device)
def test_parallel_for(size, grain_size) -> Tensor:
"""
Tests the parallel_for functionality by using it to fill a tensor with indices.
Args:
size: int - size of the tensor to create
grain_size: int - grain size for parallel_for
Returns: Tensor - a 1D int64 tensor where each element contains its index
(if multiple threads are used the threadid will be encoded in the upper 32 bits)
"""
return torch.ops.libtorch_agnostic.test_parallel_for.default(size, grain_size)
def test_get_num_threads() -> int:
"""
Tests the get_num_threads functionality by returning the number of threads
for the parallel backend.
Returns: int - the number of threads for the parallel backend
"""
return torch.ops.libtorch_agnostic.test_get_num_threads.default()
def my_empty(size, dtype=None, device=None, pin_memory=None) -> Tensor:
"""
Creates an empty tensor with the specified size, dtype, device, and pin_memory.
Args:
size: list[int] - size of the tensor to create
dtype: ScalarType or None - data type of the tensor
device: Device or None - device on which to create the tensor
pin_memory: bool or None - whether to use pinned memory
Returns: Tensor - an uninitialized tensor with the specified properties
"""
return torch.ops.libtorch_agnostic.my_empty.default(size, dtype, device, pin_memory)
def my_flatten(t, start_dim=0, end_dim=-1) -> Tensor:
"""
Flattens the input tensor from start_dim to end_dim into a single dimension.
Args:
t: Tensor - tensor to flatten
start_dim: int - first dimension to flatten (default: 0)
end_dim: int - last dimension to flatten (default: -1)
Returns: Tensor - flattened tensor
"""
return torch.ops.libtorch_agnostic.my_flatten.default(t, start_dim, end_dim)
def my_reshape(t, shape) -> Tensor:
"""
Returns a tensor with the same data but different shape.
Args:
t: Tensor - tensor to reshape
shape: list[int] - new shape for the tensor
Returns: Tensor - reshaped tensor
"""
return torch.ops.libtorch_agnostic.my_reshape.default(t, shape)
def my_view(t, size) -> Tensor:
"""
Returns a new tensor with the same data as the input tensor but of a different shape.
Args:
t: Tensor - tensor to view
size: list[int] - new size for the tensor
Returns: Tensor - tensor with new view
"""
return torch.ops.libtorch_agnostic.my_view.default(t, size)
def mv_tensor_accessor(m, v) -> Tensor:
"""
Returns matrix-vector product.
Args:
m: any 2-D Tensor with shape (N, M)
v: any 1-D Tensor with shape (M,)
Returns:
a 1-D Tensor with shape (N,)
"""
return torch.ops.libtorch_agnostic.mv_tensor_accessor.default(m, v)

View File

@ -1,6 +1,7 @@
# Owner(s): ["module: cpp"]
import math
import unittest
from pathlib import Path
import torch
@ -46,21 +47,78 @@ def get_supported_dtypes():
]
def skipIfTorchVersionLessThan(major, minor):
"""Skip test if PyTorch version is less than specified version."""
def decorator(func):
version_parts = torch.__version__.split(".")
current_major = int(version_parts[0])
current_minor = int(
version_parts[1].split("+")[0].split("a")[0].split("b")[0].split("rc")[0]
)
should_skip = (current_major < major) or (
current_major == major and current_minor < minor
)
reason = f"Test requires PyTorch >= {major}.{minor}, current version is {torch.__version__}"
return unittest.skipIf(should_skip, reason)(func)
return decorator
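# Illustrative usage (assumed): a test that exercises 2.10-only ops would be
# decorated with @skipIfTorchVersionLessThan(2, 10) so it is skipped on a 2.9
# runtime.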
# TODO: Fix this error in Windows:
# LINK : error LNK2001: unresolved external symbol PyInit__C
if not IS_WINDOWS:
class TestLibtorchAgnostic(TestCase):
"""
Tests for versioned libtorch_agnostic extensions.
This test class supports testing both:
- libtorch_agnostic_2_9: Extension built with TORCH_TARGET_VERSION=2.9.0
- libtorch_agnostic_2_10: Extension built with TORCH_TARGET_VERSION=2.10.0
Tests should be decorated with @skipIfTorchVersionLessThan to indicate the
version that they target.
"""
@classmethod
def setUpClass(cls):
# Build both 2.9 and 2.10 extensions
base_dir = Path(__file__).parent
try:
import libtorch_agnostic # noqa: F401
import libtorch_agnostic_2_9 # noqa: F401
except Exception:
install_cpp_extension(extension_root=Path(__file__).parent.parent)
install_cpp_extension(
extension_root=base_dir / "libtorch_agnostic_2_9_extension"
)
# Only build 2.10 extension if running on PyTorch 2.10+
import re
version_parts = torch.__version__.split(".")
current_major = int(version_parts[0])
# Extract just the numeric part of the minor version (handles "10+git", "10a1", etc.)
current_minor = int(re.match(r"\d+", version_parts[1]).group())
if (current_major > 2) or (current_major == 2 and current_minor >= 10):
try:
import libtorch_agnostic_2_10 # noqa: F401
except Exception:
install_cpp_extension(
extension_root=base_dir / "libtorch_agnostic_2_10_extension"
)
else:
print(
f"Skipping 2.10 extension (running on PyTorch {torch.__version__})"
)
@onlyCPU
def test_slow_sgd(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
param = torch.rand(5, device=device)
grad = torch.rand_like(param)
@ -87,7 +145,7 @@ if not IS_WINDOWS:
@onlyCUDA
def test_identity_does_not_hog_memory(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
def _run_identity(prior_mem):
t = torch.rand(32, 32, device=device)
@ -103,7 +161,7 @@ if not IS_WINDOWS:
self.assertEqual(curr_mem, init_mem)
def test_exp_neg_is_leaf(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t1 = torch.rand(2, 3, device=device)
t2 = torch.rand(3, 2, device=device)
@ -115,7 +173,7 @@ if not IS_WINDOWS:
self.assertEqual(is_leaf, t3.is_leaf)
def test_my_abs(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.rand(32, 16, device=device) - 0.5
res = libtorch_agnostic.ops.my_abs(t)
@ -134,7 +192,7 @@ if not IS_WINDOWS:
self.assertEqual(curr_mem, init_mem)
def test_neg_exp(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.rand(32, 16, device=device) - 0.5
res = libtorch_agnostic.ops.neg_exp(t)
@ -153,7 +211,7 @@ if not IS_WINDOWS:
self.assertEqual(curr_mem, init_mem)
def test_divide_neg_exp(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.zeros(2, 3, device=device) - 0.5
res = libtorch_agnostic.ops.divide_neg_exp(t)
@ -172,7 +230,7 @@ if not IS_WINDOWS:
self.assertEqual(curr_mem, init_mem)
def test_is_contiguous(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.rand(2, 7, device=device)
self.assertTrue(libtorch_agnostic.ops.is_contiguous(t))
@ -184,7 +242,7 @@ if not IS_WINDOWS:
# **{}): got AssertionError("tensor's device must be `meta`, got cpu instead")
@xfailIfTorchDynamo
def test_my_ones_like(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.rand(3, 1, device=device) - 0.5
cpu_t = libtorch_agnostic.ops.my_ones_like(t, "cpu")
@ -203,7 +261,7 @@ if not IS_WINDOWS:
self.assertEqual(curr_mem, init_mem)
def test_my_transpose(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.rand(2, 7, device=device)
out = libtorch_agnostic.ops.my_transpose(t, 0, 1)
@ -213,7 +271,7 @@ if not IS_WINDOWS:
libtorch_agnostic.ops.my_transpose(t, 1, 2)
def test_my_empty_like(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
deterministic = torch.are_deterministic_algorithms_enabled()
try:
@ -229,7 +287,7 @@ if not IS_WINDOWS:
@onlyCPU
def test_my_zero_(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.rand(2, 7, device=device)
out = libtorch_agnostic.ops.my_zero_(t)
@ -237,28 +295,28 @@ if not IS_WINDOWS:
self.assertEqual(out, torch.zeros_like(t))
def test_my_amax(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.rand(2, 7, device=device)
out = libtorch_agnostic.ops.my_amax(t)
self.assertEqual(out, torch.amax(t, 0))
def test_my_amax_vec(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.rand(2, 7, 5, device=device)
out = libtorch_agnostic.ops.my_amax_vec(t)
self.assertEqual(out, torch.amax(t, (0, 1)))
def test_my_is_cpu(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.rand(2, 7, device=device)
out = libtorch_agnostic.ops.my_is_cpu(t)
self.assertEqual(out, t.is_cpu)
def test_fill_infinity(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.rand(3, 4, device=device)
out = libtorch_agnostic.ops.fill_infinity(t)
@ -269,7 +327,7 @@ if not IS_WINDOWS:
@onlyCPU
def test_default_constructor(self):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
defined_tensor_is_defined = libtorch_agnostic.ops.test_default_constructor(
True
@ -282,7 +340,7 @@ if not IS_WINDOWS:
self.assertFalse(undefined_tensor_is_defined)
def test_my_pad(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.rand(2, 3, device=device)
out = libtorch_agnostic.ops.my_pad(t)
@ -290,7 +348,7 @@ if not IS_WINDOWS:
self.assertEqual(out, expected)
def test_my_narrow(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.randn(2, 5, device=device)
@ -301,47 +359,10 @@ if not IS_WINDOWS:
expected0 = torch.narrow(t, dim0, start0, length0)
self.assertEqual(out0, expected0)
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
def test_get_any_data_ptr(self, device):
import libtorch_agnostic
t = torch.empty(2, 5, device=device, dtype=torch.float32)
expected_p = t.data_ptr()
for mutable in [True, False]:
p = libtorch_agnostic.ops.get_any_data_ptr(t, mutable)
self.assertEqual(p, expected_p)
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
def test_get_template_any_data_ptr(self, device):
import libtorch_agnostic
supported_dtypes = get_supported_dtypes()
for dtype in supported_dtypes:
t = torch.empty(2, 5, device=device, dtype=dtype)
expected_p = t.data_ptr()
for rdtype in supported_dtypes:
if dtype == rdtype:
for mutable in [True, False]:
p = libtorch_agnostic.ops.get_template_any_data_ptr(
t, rdtype, mutable
)
self.assertEqual(p, expected_p)
else:
for mutable in [True, False]:
with self.assertRaisesRegex(
RuntimeError, "expected scalar type.* but found"
):
libtorch_agnostic.ops.get_template_any_data_ptr(
t, rdtype, mutable
)
@onlyCUDA
@deviceCountAtLeast(2)
def test_device_guard(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
device_index = 1
out = libtorch_agnostic.ops.test_device_guard(device_index)
@ -350,7 +371,7 @@ if not IS_WINDOWS:
@onlyCUDA
@deviceCountAtLeast(2)
def test_device_guard_set_index(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
# This test creates a DeviceGuard with index 1, then sets it to index 0
# and returns the current device (should be 0)
@ -359,7 +380,7 @@ if not IS_WINDOWS:
@onlyCUDA
def test_stream(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
stream = torch.cuda.Stream()
device = torch.cuda.current_device()
@ -373,7 +394,7 @@ if not IS_WINDOWS:
@onlyCUDA
@deviceCountAtLeast(2)
def test_get_current_device_index(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
prev_device = torch.cuda.current_device()
@ -387,7 +408,7 @@ if not IS_WINDOWS:
torch.cuda.set_device(prev_device)
def test_my_new_empty_dtype_variant(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
deterministic = torch.are_deterministic_algorithms_enabled()
try:
@ -402,7 +423,7 @@ if not IS_WINDOWS:
torch.use_deterministic_algorithms(deterministic)
def test_my_new_zeros_dtype_variant(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.randn(3, 4, device=device)
out = libtorch_agnostic.ops.my_new_zeros_dtype_variant(t)
@ -410,7 +431,7 @@ if not IS_WINDOWS:
self.assertEqual(out, ref_out, exact_device=True)
def test_my_copy_(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
dst = torch.empty(2, 5, device=device)
src = torch.randn(2, 5, device=device)
@ -421,7 +442,7 @@ if not IS_WINDOWS:
self.assertEqual(result.data_ptr(), dst.data_ptr())
def test_my_clone(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.randn(2, 5, device=device)
@ -431,8 +452,9 @@ if not IS_WINDOWS:
self.assertNotEqual(result.data_ptr(), expected.data_ptr())
self.assertEqual(result.stride(), expected.stride())
@skipIfTorchVersionLessThan(2, 10)
def test_my__foreach_mul_(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_10 as libtorch_agnostic
N = 5
tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
@ -445,8 +467,9 @@ if not IS_WINDOWS:
for tensor_t, expected_t in zip(tensors, expected_values):
self.assertEqual(tensor_t, expected_t)
@skipIfTorchVersionLessThan(2, 10)
def test_my__foreach_mul(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_10 as libtorch_agnostic
N = 5
tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
@ -473,8 +496,9 @@ if not IS_WINDOWS:
curr_mem = torch.cuda.memory_allocated(device)
self.assertEqual(curr_mem, init_mem)
@skipIfTorchVersionLessThan(2, 10)
def test_make_tensor_clones_and_call_foreach(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_10 as libtorch_agnostic
t1 = torch.rand(2, 5, device=device)
t2 = torch.rand(3, 4, device=device)
@ -482,9 +506,10 @@ if not IS_WINDOWS:
self.assertEqual(result[0], t1 * t1)
self.assertEqual(result[1], t2 * t2)
@skipIfTorchVersionLessThan(2, 10)
@onlyCUDA
def test_device(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_10 as libtorch_agnostic
cuda_device = libtorch_agnostic.ops.test_device_constructor(
is_cuda=True, index=1, use_str=False
@ -537,10 +562,11 @@ if not IS_WINDOWS:
):
libtorch_agnostic.ops.test_device_set_index(cuda_device, 129)
@skipIfTorchVersionLessThan(2, 10)
@onlyCUDA
@deviceCountAtLeast(2)
def test_tensor_device(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_10 as libtorch_agnostic
t = torch.randn(2, 3)
self.assertEqual(libtorch_agnostic.ops.test_tensor_device(t), t.device)
@ -555,6 +581,7 @@ if not IS_WINDOWS:
libtorch_agnostic.ops.test_tensor_device(t_cuda_1), t_cuda_1.device
)
@skipIfTorchVersionLessThan(2, 10)
@onlyCPU
# TODO: Debug this:
# Dynamo failed to run FX node with fake tensors:
@ -564,7 +591,7 @@ if not IS_WINDOWS:
# Declaration: libtorch_agnostic::test_parallel_for(int size, int grain_size) -> Tensor')
@xfailIfTorchDynamo
def test_parallel_for(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_10 as libtorch_agnostic
num_threads = torch.get_num_threads()
size = 100
@ -581,16 +608,18 @@ if not IS_WINDOWS:
self.assertEqual(result_values, expected)
self.assertEqual(result_thread_ids, torch.arange(expected_num_threads_used))
@skipIfTorchVersionLessThan(2, 10)
@onlyCPU
def test_get_num_threads(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_10 as libtorch_agnostic
num_threads = libtorch_agnostic.ops.test_get_num_threads()
expected_num_threads = torch.get_num_threads()
self.assertEqual(num_threads, expected_num_threads)
@skipIfTorchVersionLessThan(2, 10)
def test_my_empty(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_10 as libtorch_agnostic
deterministic = torch.are_deterministic_algorithms_enabled()
try:
@ -631,7 +660,7 @@ if not IS_WINDOWS:
torch.use_deterministic_algorithms(deterministic)
def test_my_flatten(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
t = torch.randn(2, 3, 4, device=device)
result = libtorch_agnostic.ops.my_flatten(t)
@ -646,8 +675,9 @@ if not IS_WINDOWS:
expected_range = torch.flatten(t, 2, -1)
self.assertEqual(result_range, expected_range)
@skipIfTorchVersionLessThan(2, 10)
def test_my_reshape(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_10 as libtorch_agnostic
t = torch.randn(2, 3, 4, device=device)
@ -663,8 +693,9 @@ if not IS_WINDOWS:
expected_flat = torch.reshape(t, [-1])
self.assertEqual(result_flat, expected_flat)
@skipIfTorchVersionLessThan(2, 10)
def test_my_view(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_10 as libtorch_agnostic
t = torch.randn(2, 3, 4, device=device)
@ -681,7 +712,7 @@ if not IS_WINDOWS:
self.assertEqual(result_flat, expected_flat)
def test_mv_tensor_accessor(self, device):
import libtorch_agnostic
import libtorch_agnostic_2_9 as libtorch_agnostic
m = torch.rand(3, 5, device=device)
v = torch.rand(5, device=device)
@ -696,6 +727,45 @@ if not IS_WINDOWS:
expected = torch.mv(m, v)
self.assertEqual(result, expected)
@skipIfTorchVersionLessThan(2, 10)
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
def test_get_any_data_ptr(self, device):
import libtorch_agnostic_2_10 as libtorch_agnostic
t = torch.empty(2, 5, device=device, dtype=torch.float32)
expected_p = t.data_ptr()
for mutable in [True, False]:
p = libtorch_agnostic.ops.get_any_data_ptr(t, mutable)
self.assertEqual(p, expected_p)
@skipIfTorchVersionLessThan(2, 10)
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
def test_get_template_any_data_ptr(self, device):
import libtorch_agnostic_2_10 as libtorch_agnostic
supported_dtypes = get_supported_dtypes()
for dtype in supported_dtypes:
t = torch.empty(2, 5, device=device, dtype=dtype)
expected_p = t.data_ptr()
for rdtype in supported_dtypes:
if dtype == rdtype:
for mutable in [True, False]:
p = libtorch_agnostic.ops.get_template_any_data_ptr(
t, rdtype, mutable
)
self.assertEqual(p, expected_p)
else:
for mutable in [True, False]:
with self.assertRaisesRegex(
RuntimeError, "expected scalar type.* but found"
):
libtorch_agnostic.ops.get_template_any_data_ptr(
t, rdtype, mutable
)
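The 2.10-only tests above are gated with skipIfTorchVersionLessThan(2, 10). A minimal sketch of such a version gate, assuming it only needs to compare the installed torch.__version__ against a (major, minor) floor; the helper actually used by this suite may differ:
import unittest

import torch


def skip_if_torch_version_less_than(major: int, minor: int):
    """Skip a test unless the running torch is at least major.minor."""
    installed = tuple(int(part) for part in torch.__version__.split(".")[:2])
    return unittest.skipIf(
        installed < (major, minor),
        f"requires torch >= {major}.{minor}, found {torch.__version__}",
    )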
instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)
if __name__ == "__main__":

View File

@ -230,6 +230,98 @@ class DistConvolutionOpsTest(DTensorTestBase):
out_dt, out = self._run_single_arg_fwd(model, x, [Shard(0)])
self.assertEqual(out_dt, out)
@with_comms
def test_conv2d_no_bias_compile(self):
"""Test Conv2d with bias=False in compile mode (Issue #167091)
Regression test: Previously this would fail during torch.compile
tracing with AssertionError when bias_spec was None.
"""
device_mesh = self.build_device_mesh()
def conv_fn(x, w):
return F.conv2d(x, w, bias=None, padding=1)
compiled_fn = torch.compile(conv_fn)
# Create tensors
x = torch.randn(1, 4, 5, 5, device=self.device_type)
w = torch.randn(8, 4, 3, 3, device=self.device_type)
# Distribute tensors
x_dt = distribute_tensor(x, device_mesh, [Replicate()])
w_dt = distribute_tensor(w, device_mesh, [Replicate()])
# Test eager mode for comparison
result_eager = conv_fn(x_dt, w_dt)
# Test compiled mode - this should not crash
result_compiled = compiled_fn(x_dt, w_dt)
# Verify shape is correct (the key regression test)
self.assertEqual(result_compiled.shape, torch.Size([1, 8, 5, 5]))
# Verify numerical correctness
torch.testing.assert_close(result_compiled.to_local(), result_eager.to_local())
@with_comms
def test_conv2d_no_bias_backward(self):
"""Test Conv2d backward pass with bias=False (Issue #167091)
Regression test: Previously backward pass would fail when
grad_bias_spec was None.
"""
device_mesh = self.build_device_mesh()
# Create tensors with requires_grad
x = torch.randn(1, 4, 5, 5, device=self.device_type)
w = torch.randn(8, 4, 3, 3, device=self.device_type, requires_grad=True)
# Distribute tensors
x_dt = distribute_tensor(x, device_mesh, [Replicate()])
w_dt = torch.nn.Parameter(distribute_tensor(w, device_mesh, [Replicate()]))
# Forward pass
result = F.conv2d(x_dt, w_dt, bias=None, padding=1)
# Backward pass - this should not crash
grad_output = torch.randn_like(result)
result.backward(grad_output)
# Check weight gradient exists (the key regression test)
self.assertIsNotNone(w_dt.grad)
self.assertEqual(w_dt.grad.shape, torch.Size([8, 4, 3, 3]))
@with_comms
def test_conv2d_module_no_bias(self):
"""Test nn.Conv2d module with bias=False (Issue #167091)
Regression test: Ensures nn.Conv2d with bias=False works with DTensor.
"""
device_mesh = self.build_device_mesh()
# Create model with bias=False
model = nn.Conv2d(4, 8, kernel_size=3, padding=1, bias=False).to(
self.device_type
)
nn.init.ones_(model.weight)
# Distribute model
model_dt = distribute_module(model, device_mesh, _conv_fn)
# Create input
x = torch.randn(1, 4, 5, 5, device=self.device_type)
x_dt = distribute_tensor(x, device_mesh, [Replicate()])
# Forward pass - this should not crash
output_dt = model_dt(x_dt)
# Check output shape is correct
self.assertEqual(output_dt.shape, torch.Size([1, 8, 5, 5]))
# Check that model.bias is None
self.assertIsNone(model.bias)
DistConvolutionOpsTestWithLocalTensor = create_local_tensor_test_class(
DistConvolutionOpsTest,
@ -238,6 +330,10 @@ DistConvolutionOpsTestWithLocalTensor = create_local_tensor_test_class(
"test_conv_backward_none_grad_inp",
"test_depthwise_convolution",
"test_downsampling_convolution",
# New tests for Issue #167091 - use send/recv via tp_convolution
"test_conv2d_no_bias_compile",
"test_conv2d_no_bias_backward",
"test_conv2d_module_no_bias",
],
)
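As a standalone illustration of the bias=None DTensor convolution path these regression tests cover, here is a hedged sketch using the public torch.distributed.tensor API on a single-rank mesh; the real tests run under the multi-process distributed harness, so setup details differ:
import torch
import torch.nn.functional as F
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor, Replicate


def conv_no_bias_repro(device_type: str = "cuda"):
    # Single-rank mesh purely for illustration.
    mesh = init_device_mesh(device_type, (1,))
    x = distribute_tensor(torch.randn(1, 4, 5, 5, device=device_type), mesh, [Replicate()])
    w = distribute_tensor(torch.randn(8, 4, 3, 3, device=device_type), mesh, [Replicate()])
    compiled = torch.compile(lambda a, b: F.conv2d(a, b, bias=None, padding=1))
    out = compiled(x, w)  # previously crashed under compile when bias_spec was None
    return out.shape      # torch.Size([1, 8, 5, 5])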

View File

@ -10,6 +10,7 @@ import torch._dynamo.test_case
# for some reason importing functional collectives after dynamo breaks collectives handling!
import torch.distributed._functional_collectives as _functional_collectives
import torch.fx as fx
from torch._C import FileCheck
from torch._dynamo.utils import counters, same
from torch._inductor.utils import run_and_get_code, run_and_get_triton_code
@ -238,6 +239,49 @@ graph():
self.assertTrue(same(out, correct))
self.assertEqual(counters["inductor"]["overlap_scheduling_exposed"], 0)
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@torch._inductor.config.patch(get_patches())
def test_schedulable_wait(self):
"""Test that if a wait node is scheduable or not."""
from torch._inductor.fx_passes.bucketing import _schedulable_wait_node
def test_graph():
graph = fx.Graph()
inp = graph.placeholder("inp")
group_size = graph.placeholder("group_size")
group_name = graph.placeholder("group_name")
ag_0_out = graph.call_function(
torch.ops._c10d_functional.all_gather_into_tensor.default,
args=(inp, group_size, group_name),
)
ag_0_wait = graph.call_function(
torch.ops._c10d_functional.wait_tensor.default,
args=(ag_0_out,),
)
ag_1_out = graph.call_function(
torch.ops._c10d_functional.all_gather_into_tensor.default,
args=(ag_0_wait, group_size, group_name),
)
ag_1_wait = graph.call_function(
torch.ops._c10d_functional.wait_tensor.default,
args=(ag_1_out,),
)
ag_2_wait = graph.call_function(
torch.ops._c10d_functional.wait_tensor.default,
args=(ag_1_wait,),
)
graph.output(ag_2_wait)
return graph
graph = test_graph()
schedulable = {"wait_tensor_default", "wait_tensor_default_1"}
for node in list(graph.nodes):
expected = node.name in schedulable
assert _schedulable_wait_node(node) is expected
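For reference, a hedged sketch of the predicate this test pins down: a wait is schedulable when it directly consumes a collective start, but not when it waits on another wait. The real _schedulable_wait_node in torch._inductor.fx_passes.bucketing may apply additional criteria:
import torch
import torch.fx as fx

_COLLECTIVE_STARTS = {
    torch.ops._c10d_functional.all_gather_into_tensor.default,
    torch.ops._c10d_functional.reduce_scatter_tensor.default,
    torch.ops._c10d_functional.all_reduce.default,
    torch.ops._c10d_functional.all_to_all_single.default,
}


def is_schedulable_wait(node: fx.Node) -> bool:
    """A wait_tensor node counts as schedulable only if its producer is a collective start."""
    if node.op != "call_function":
        return False
    if node.target is not torch.ops._c10d_functional.wait_tensor.default:
        return False
    producer = node.args[0]
    return isinstance(producer, fx.Node) and producer.target in _COLLECTIVE_STARTS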
@torch._inductor.config.patch(get_patches())
def test_reorder_compute_for_overlap_mul(self):
def func(a, *, tag, ranks, group_size):
@ -1061,6 +1105,63 @@ class TestComputeCommReorderingBucketing(TestComputeCommReorderingMultiProc):
correct = func(a, b, c)
self.assertTrue(same(out, correct))
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
@torch._inductor.config.patch(get_bucket_patches())
def test_multiple_hiding_nodes_bucketing(self):
"""Test that collectives hidden by multiple compute ops can bucket together."""
# Use 0.5 compute multiplier so each collective needs 2 matmuls to be fully hidden
def estimate_with_half_compute(fx_node, override_size=None):
return estimate_aten_runtime(fx_node, compute_multiplier=0.5)
def func(a, b, *, ranks):
# Two all_gathers that will be hidden by multiple compute operations
ag1 = _functional_collectives.all_gather_tensor(a, 0, ranks)
ag2 = _functional_collectives.all_gather_tensor(b, 0, ranks)
# Multiple compute operations that can hide the collectives
# With 0.5 multiplier: mm1 and mm2 together hide ag1, mm2 and mm3 together hide ag2
mm1 = torch.matmul(a, a.T)
mm2 = torch.matmul(b, b.T)
mm3 = torch.matmul(a + b, (a + b).T)
return ag1.sum() + ag2.sum() + mm1.sum() + mm2.sum() + mm3.sum()
with _dynamo_dist_per_rank_init(
self.rank,
self.world_size,
self.backend(device_type),
fake_pg=not at_least_x_gpu(2),
):
a = torch.ones(8, 8, dtype=torch.float, device=device_type)
b = torch.ones(8, 8, dtype=torch.float, device=device_type) * 2
ranks = list(range(self.world_size))
func_c = functools.partial(func, ranks=ranks)
# Patch with custom estimation that uses 0.5 multiplier
with torch._inductor.config.patch(
{
"aten_distributed_optimizations.custom_runtime_estimation": estimate_with_half_compute
}
):
compiled = torch.compile(func_c)
out, aten_graph_str = run_and_get_aten_graph(compiled, a, b)
# Should have 1 bucketed all_gather (both ag1 and ag2 bucketed together)
FileCheck().check_count(
"torch.ops._c10d_functional.wait_tensor.default", 1, exactly=True
).run(aten_graph_str)
# Verify bucketed collective is scheduled before all matmuls
FileCheck().check("functional.all_gather_into_tensor").check(
"aten.mm"
).check("aten.mm").check("aten.mm").check("wait_tensor").run(aten_graph_str)
# Verify correctness
correct = func(a, b, ranks=ranks)
self.assertTrue(same(out, correct))
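The custom estimator hook patched above can also be exercised in isolation. A minimal sketch, assuming only the aten_distributed_optimizations.custom_runtime_estimation config key and the (fx_node, override_size=None) callable signature shown in the test:
import torch


def constant_runtime_estimator(fx_node, override_size=None):
    # Pretend every op costs 0.1 ms; a real estimator would inspect fx_node.
    return 0.1


def fn(a, b):
    return (a @ b).sum()


with torch._inductor.config.patch(
    {"aten_distributed_optimizations.custom_runtime_estimation": constant_runtime_estimator}
):
    compiled = torch.compile(fn)
    out = compiled(torch.ones(8, 8), torch.ones(8, 8))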
def get_toy_model(device_type: str):
"""

View File

@ -23,7 +23,12 @@ from torch._inductor.comms import (
sink_waits_iterative,
)
from torch._inductor.compile_fx import compile_fx as inductor_compile_fx
from torch._inductor.fx_passes.bucketing import is_all_gather_into_tensor
from torch._inductor.fx_passes.bucketing import (
is_all_gather_into_tensor,
is_all_reduce_tensor,
is_all_to_all_tensor,
is_reduce_scatter_tensor,
)
from torch._inductor.scheduler import (
_get_mm_like_fn,
BaseSchedulerNode,
@ -2188,7 +2193,7 @@ class TestSyncDecisionCrossRanks(MultiProcessTestCase):
self.assertEqual(saved_values, [wt1])
@skip_if_lt_x_gpu(2)
def test_comm_analysis(self):
def test_all_gather_comm_analysis(self):
store = c10d.FileStore(self.file_name, self.world_size)
torch.cuda.set_device(self.rank)
c10d.init_process_group(
@ -2229,6 +2234,140 @@ class TestSyncDecisionCrossRanks(MultiProcessTestCase):
)
assert est_ms_nccl > 0
@skip_if_lt_x_gpu(2)
def test_reduce_scatter_comm_analysis(self):
store = c10d.FileStore(self.file_name, self.world_size)
torch.cuda.set_device(self.rank)
c10d.init_process_group(
backend="nccl", store=store, rank=self.rank, world_size=self.world_size
)
group = c10d.distributed_c10d._get_default_group()
group_name = "default"
torch._C._distributed_c10d._register_process_group(
group_name, torch.distributed.group.WORLD
)
group_size = group.size()
def func(inp, group_size, group_name):
rs_0_out = torch.ops._c10d_functional.reduce_scatter_tensor(
inp, "sum", group_size, group_name
)
rs_0_wait = torch.ops.c10d_functional.wait_tensor(rs_0_out)
rs_1_out = torch.ops._c10d_functional.reduce_scatter_tensor(
rs_0_wait, "sum", group_size, group_name
)
rs_1_wait = torch.ops.c10d_functional.wait_tensor(rs_1_out)
return rs_1_wait
gm = make_fx(func)(torch.ones(4, 4, device=self.device), group_size, group_name)
g = gm.graph
for n in g.nodes:
if is_reduce_scatter_tensor(n):
from torch._inductor.comm_analysis import (
estimate_nccl_collective_runtime_from_fx_node,
)
est_ms = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=False
)
assert est_ms > 0
est_ms_nccl = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=True
)
assert est_ms_nccl > 0
@skip_if_lt_x_gpu(2)
def test_all_reduce_comm_analysis(self):
store = c10d.FileStore(self.file_name, self.world_size)
torch.cuda.set_device(self.rank)
c10d.init_process_group(
backend="nccl", store=store, rank=self.rank, world_size=self.world_size
)
group = c10d.distributed_c10d._get_default_group()
group_name = "default"
torch._C._distributed_c10d._register_process_group(
group_name, torch.distributed.group.WORLD
)
group_size = group.size()
def func(inp, group_size, group_name):
ar_0_out = torch.ops._c10d_functional.all_reduce(inp, "sum", group_name)
ar_0_wait = torch.ops.c10d_functional.wait_tensor(ar_0_out)
ar_1_out = torch.ops._c10d_functional.all_reduce(
ar_0_wait, "sum", group_name
)
ar_1_wait = torch.ops.c10d_functional.wait_tensor(ar_1_out)
return ar_1_wait
gm = make_fx(func)(torch.ones(4, 4, device=self.device), group_size, group_name)
g = gm.graph
for n in g.nodes:
if is_all_reduce_tensor(n):
from torch._inductor.comm_analysis import (
estimate_nccl_collective_runtime_from_fx_node,
)
est_ms = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=False
)
assert est_ms > 0
est_ms_nccl = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=True
)
assert est_ms_nccl > 0
@skip_if_lt_x_gpu(2)
def test_all_to_all_comm_analysis(self):
store = c10d.FileStore(self.file_name, self.world_size)
torch.cuda.set_device(self.rank)
c10d.init_process_group(
backend="nccl", store=store, rank=self.rank, world_size=self.world_size
)
group = c10d.distributed_c10d._get_default_group()
group_name = "default"
torch._C._distributed_c10d._register_process_group(
group_name, torch.distributed.group.WORLD
)
group_size = group.size()
def func(inp, group_size, group_name):
chunk = inp.numel() // self.world_size
split_sizes = [chunk] * self.world_size
a2a_0_out = torch.ops._c10d_functional.all_to_all_single(
inp,
split_sizes,
split_sizes,
group_name,
)
a2a_0_wait = torch.ops.c10d_functional.wait_tensor(a2a_0_out)
a2a_1_out = torch.ops._c10d_functional.all_to_all_single(
a2a_0_wait,
split_sizes,
split_sizes,
group_name,
)
a2a_1_wait = torch.ops.c10d_functional.wait_tensor(a2a_1_out)
return a2a_1_wait
gm = make_fx(func)(
torch.ones(group_size * 4, 1, device=self.device), group_size, group_name
)
g = gm.graph
for n in g.nodes:
if is_all_to_all_tensor(n):
from torch._inductor.comm_analysis import (
estimate_nccl_collective_runtime_from_fx_node,
)
est_ms = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=False
)
assert est_ms > 0
est_ms_nccl = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=True
)
assert est_ms_nccl > 0
@skip_if_lt_x_gpu(2)
@requires_gloo()
def test_regression_use_nccl_estimate_with_gloo(self):

View File

@ -49,7 +49,8 @@ def build_collective_info(graph, hiding_annotations):
"""
Build CollectiveInfo dict from manual hiding annotations.
hiding_annotations: dict mapping collective_start -> hiding_compute_node
hiding_annotations: dict mapping collective_start -> hiding_compute_node(s)
Can be a single node or a list/OrderedSet of nodes
"""
from torch._inductor.fx_passes.overlap_scheduling import CollectiveInfo
@ -65,12 +66,20 @@ def build_collective_info(graph, hiding_annotations):
# Build CollectiveInfo for each collective
for start_node, wait_node in start_to_wait.items():
hiding_node = hiding_annotations.get(start_node)
hiding_annotation = hiding_annotations.get(start_node)
# Convert to OrderedSet
hiding_nodes = OrderedSet()
if hiding_annotation is not None:
if isinstance(hiding_annotation, list | OrderedSet):
hiding_nodes = OrderedSet(hiding_annotation)
else:
hiding_nodes = OrderedSet([hiding_annotation])
# Estimate size and time
size_bytes = 16 * 4 # 4x4 tensor of floats
estimated_time_ms = 1.0 # Dummy time
exposed_time_ms = 0.0 if hiding_node else 1.0 # Hidden if has hiding_node
exposed_time_ms = 0.0 if hiding_nodes else 1.0 # Hidden if has hiding_nodes
collective_info[start_node] = CollectiveInfo(
start_node=start_node,
@ -78,7 +87,7 @@ def build_collective_info(graph, hiding_annotations):
size_bytes=size_bytes,
estimated_time_ms=estimated_time_ms,
exposed_time_ms=exposed_time_ms,
hiding_node=hiding_node,
hiding_nodes=hiding_nodes,
)
return collective_info
@ -567,6 +576,97 @@ class TestOverlapPreservingBucketing(InductorTestCase):
graph_str
)
def test_can_bucket_with_multiple_hiding_nodes(self):
"""
Test that collectives with multiple hiding nodes CAN bucket.
Graph structure:
ag1_start -> ag2_start -> mm1 -> mm2 -> mm3 -> ag1_wait -> ag2_wait
Where:
- ag1 is hidden by mm1 and mm2
- ag2 is hidden by mm2 and mm3
- Both collectives share mm2 as a hiding node
"""
def func(a, b):
group_name = "0"
group_size = 1
# Start both collectives
ag1 = torch.ops._c10d_functional.all_gather_into_tensor(
a, group_size, group_name
)
ag2 = torch.ops._c10d_functional.all_gather_into_tensor(
b, group_size, group_name
)
# Three compute operations that hide the collectives
mm1 = torch.mm(a, a)
mm2 = torch.mm(b, b)
mm3 = torch.mm(a + b, a + b)
# Wait for both
ag1_out = torch.ops._c10d_functional.wait_tensor(ag1)
ag2_out = torch.ops._c10d_functional.wait_tensor(ag2)
return ag1_out.sum() + ag2_out.sum() + mm1.sum() + mm2.sum() + mm3.sum()
# Use fake mode to trace without executing
with FakeTensorMode():
a = torch.ones(4, 4, device=self.device)
b = torch.ones(4, 4, device=self.device) * 2
# Trace with make_fx
traced = make_fx(func)(a, b)
# Find nodes using find_nodes
ag1, ag2 = traced.graph.find_nodes(
op="call_function",
target=torch.ops._c10d_functional.all_gather_into_tensor.default,
)
mm1, mm2, mm3 = traced.graph.find_nodes(
op="call_function", target=torch.ops.aten.mm.default
)
# Manually annotate hiding relationships with multiple hiding nodes
hiding_annotations = {
ag1: [mm1, mm2], # ag1 is hidden by mm1 and mm2
ag2: [mm2, mm3], # ag2 is hidden by mm2 and mm3
}
# Build collective info and ancestors
collective_info = build_collective_info(traced.graph, hiding_annotations)
node_ancestors = compute_ancestors(traced.graph)
scheduled = OrderedSet(traced.graph.nodes)
# Verify hiding_nodes are correctly set
self.assertEqual(len(collective_info[ag1].hiding_nodes), 2)
self.assertIn(mm1, collective_info[ag1].hiding_nodes)
self.assertIn(mm2, collective_info[ag1].hiding_nodes)
self.assertEqual(len(collective_info[ag2].hiding_nodes), 2)
self.assertIn(mm2, collective_info[ag2].hiding_nodes)
self.assertIn(mm3, collective_info[ag2].hiding_nodes)
# Run bucketing
from torch._inductor.fx_passes.overlap_preserving_bucketer import (
OverlapPreservingBucketer,
)
bucketer = OverlapPreservingBucketer(
traced.graph,
collective_info,
node_ancestors,
scheduled,
)
bucketer.bucket_collectives()
FileCheck().check_count(
"all_gather_into_tensor_out", 1, exactly=False
).check_count("torch.ops.aten.mm.default", 3, exactly=True).run(
str(traced.graph)
)
if __name__ == "__main__":
run_tests()

View File

@ -253,6 +253,14 @@ class StoreTestBase:
a.set("foo", "bar")
self.assertEqual(b.get("foo"), b"bar")
def test_list_keys(self):
a = self._create_store()
a.set("foo", "bar")
a.set("baz", "qux")
keys = a.list_keys()
self.assertIn("foo", keys)
self.assertIn("baz", keys)
# This is the number of keys used in test_set_get. Adding this as a class
# property instead of hardcoding in the test since some Store
# implementations will have differing number of keys. In the base case,

View File

@ -39,7 +39,10 @@ from torch.testing._internal.common_utils import (
)
from torch.testing._internal.hop_db import hop_db
from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
from torch.testing._internal.triton_utils import requires_cuda_and_triton
from torch.testing._internal.triton_utils import (
requires_cuda_and_triton,
requires_gpu_and_triton,
)
def count_ops(gm, args, freq, op):
@ -3395,6 +3398,91 @@ class GraphModule(torch.nn.Module):
with self.assertRaisesRegex(RuntimeError, msg):
fn_with_hints(x, y)
@requires_cuda_and_triton
def test_wrap_inductor_compiled_regions_option(self):
"""
Test that wrap_inductor_compiled_regions option wraps compiled regions
in inductor_compiled_code HOP, making them visible to DebugMode.
"""
from torch.utils._debug_mode import DebugMode
# Test with wrapping enabled
@torch.compile(
backend="inductor",
options={"wrap_inductor_compiled_regions": True},
fullgraph=True,
)
def fn_wrapped(x, y):
return torch.matmul(x, y)
# Test with wrapping disabled (default)
@torch.compile(backend="inductor", fullgraph=True)
def fn_not_wrapped(x, y):
return torch.matmul(x, y)
x = torch.randn(4, 4, device="cuda")
y = torch.randn(4, 4, device="cuda")
# Test wrapped version - HOP should be visible in DebugMode
with DebugMode() as debug_mode_wrapped:
result_wrapped = fn_wrapped(x, y)
debug_string_wrapped = debug_mode_wrapped.debug_string()
self.assertIn("inductor_compiled_code", debug_string_wrapped)
# Test non-wrapped version - HOP should NOT be visible
with DebugMode() as debug_mode_not_wrapped:
result_not_wrapped = fn_not_wrapped(x, y)
debug_string_not_wrapped = debug_mode_not_wrapped.debug_string()
self.assertNotIn("inductor_compiled_code", debug_string_not_wrapped)
# Both should produce correct results
expected = torch.matmul(x, y)
self.assertEqual(result_wrapped, expected)
self.assertEqual(result_not_wrapped, expected)
@requires_cuda_and_triton
def test_wrap_inductor_compiled_regions_with_backward(self):
"""
Test that wrap_inductor_compiled_regions works correctly with autograd.
"""
from torch.utils._debug_mode import DebugMode
@torch.compile(
backend="inductor",
options={"wrap_inductor_compiled_regions": True},
fullgraph=True,
)
def fn(x, y):
return torch.matmul(x, y)
x = torch.randn(4, 4, device="cuda", requires_grad=True)
y = torch.randn(4, 4, device="cuda", requires_grad=True)
# Clone for eager comparison
x_eager = x.detach().clone().requires_grad_(True)
y_eager = y.detach().clone().requires_grad_(True)
# Compiled forward and backward
with DebugMode() as debug_mode:
result = fn(x, y)
loss = result.sum()
loss.backward()
# HOP should be visible in forward pass
self.assertIn("inductor_compiled_code", debug_mode.debug_string())
# Eager forward and backward for comparison
expected = torch.matmul(x_eager, y_eager)
expected_loss = expected.sum()
expected_loss.backward()
# Check correctness
self.assertEqual(result, expected)
self.assertEqual(x.grad, x_eager.grad)
self.assertEqual(y.grad, y_eager.grad)
class HigherOrderOpVmapGuardTests(
torch._dynamo.test_case.TestCaseWithNestedGraphBreaks, LoggingTestCase
@ -6895,7 +6983,7 @@ class ActivationCheckpointingTests(
fn, backend, x, y, skip_check=True
) # dropout decomp is known to diverge with eager
@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._functorch.config.patch(functionalize_rng_ops=True)
def test_fallback(self):
def gn(x, y):

View File

@ -470,7 +470,7 @@ class <lambda>(torch.nn.Module):
)
@requires_cuda
def test_stream_backward(self) -> None:
def test_stream_backward_simple(self) -> None:
def fn(x, y):
s2 = torch.Stream()
s0 = torch.Stream()
@ -524,7 +524,68 @@ class GraphModule(torch.nn.Module):
# Annotation: {'stream': 1}
mul_3: "f32[2, 2]" = torch.ops.aten.mul.Tensor(tangents_1, 2); tangents_1 = None
# Annotation: {'stream': 0}
add_3: "f32[2, 2]" = torch.ops.aten.add.Tensor(mul_2, mul_3); mul_2 = mul_3 = None
return (add_3, add_2)
""",
)
@requires_cuda
def test_stream_backward_sync(self) -> None:
def fn(x, y):
s2 = torch.Stream()
s0 = torch.Stream()
with s0:
y0 = 2 * x + y
with s2:
z = 2 * x + y
return y0, z
inp = (
torch.ones(2, 2, device="cuda:0", requires_grad=True) + 1,
torch.ones(2, 2, device="cuda:0", requires_grad=True),
)
expected = fn(*inp)
(
actual,
_,
fw_graphs,
bw_graphs,
) = extract_graph(fn, *inp)
self.assertEqual(len(fw_graphs), 1)
self.assertEqual(expected, actual)
self.assertExpectedInline(
print_graph(fw_graphs[0]),
"""\
class GraphModule(torch.nn.Module):
def forward(self, primals_1: "f32[2, 2]", primals_2: "f32[2, 2]"):
# Annotation: {'stream': 1}
mul: "f32[2, 2]" = torch.ops.aten.mul.Tensor(primals_1, 2); primals_1 = None
add: "f32[2, 2]" = torch.ops.aten.add.Tensor(mul, primals_2)
# Annotation: {'stream': 0}
add_1: "f32[2, 2]" = torch.ops.aten.add.Tensor(mul, primals_2); mul = primals_2 = None
return (add, add_1)
""",
)
actual[1].sum().backward()
self.assertExpectedInline(
print_graph(bw_graphs[0]),
"""\
class GraphModule(torch.nn.Module):
def forward(self, tangents_1: "f32[2, 2]", tangents_2: "f32[2, 2]"):
# Annotation: {'stream': 0}
mul_2: "f32[2, 2]" = torch.ops.aten.mul.Tensor(tangents_2, 2)
#
add_2: "f32[2, 2]" = torch.ops.aten.add.Tensor(tangents_2, tangents_1); tangents_2 = None
# Annotation: {'stream': 1}
mul_3: "f32[2, 2]" = torch.ops.aten.mul.Tensor(tangents_1, 2); tangents_1 = None
# Annotation: {'stream': 0}
add_3: "f32[2, 2]" = torch.ops.aten.add.Tensor(mul_2, mul_3); mul_2 = mul_3 = None
return (add_3, add_2)
""",

File diff suppressed because it is too large

View File

@ -1372,6 +1372,8 @@ aten::view_as_complex_copy.out
aten::view_as_real
aten::view_as_real_copy
aten::view_as_real_copy.out
aten::zendnn_linear_unary
aten::zendnn_weight_prepack_for_linear
aten::zeros.names
aten::zeros.names_out
aten::zeros.out

View File

@ -456,6 +456,31 @@ def forward(self, x):
test_inputs = make_inputs()
self.assertEqual(gm(*test_inputs), foo(*test_inputs))
def test_dynamo_graph_capture_with_call_override(self):
class _InterestingModule(torch.nn.Module):
def __init__(self, module):
super().__init__()
self._module = module
def __call__(self, *args, **kwargs):
return self._module(*args, **kwargs)
class MyModel(torch.nn.Module):
def forward(self, x):
return x + 1
foo = _InterestingModule(MyModel())
def make_inputs():
return (torch.randn(2, 3),)
trace_inputs = make_inputs()
gm = dynamo_graph_capture_for_export(foo)(*trace_inputs)
test_inputs = make_inputs()
self.assertEqual(gm(*test_inputs), foo(*test_inputs))
self.assertEqual(len(list(gm.buffers())), len(list(foo.buffers())))
self.assertEqual(len(list(gm.parameters())), len(list(foo.parameters())))
def test_dynamo_graph_capture_custom_pytree_type(self):
import torch.utils._pytree as pytree

View File

@ -3,12 +3,17 @@ import io
from unittest.mock import patch
import torch
from torch._dynamo.utils import counters
from torch._functorch.aot_autograd import aot_export_module
from torch.fx.experimental.proxy_tensor import make_fx
from torch.testing._internal.common_utils import run_tests, TestCase
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
run_tests,
TestCase,
)
@instantiate_parametrized_tests
class TestHopPrint(TestCase):
def test_base_print(self):
def f(x):
@ -18,7 +23,6 @@ class TestHopPrint(TestCase):
torch._higher_order_ops.print("moo")
return x
counters.clear()
x = torch.randn(3, 3)
with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
f(x)
@ -33,7 +37,6 @@ class TestHopPrint(TestCase):
x = x * x
return x
counters.clear()
x = torch.randn(3, 3)
with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
f(x)
@ -184,6 +187,62 @@ x = add_1, y = add_2); getitem = None
"""print(str format_str) -> ()""",
)
@parametrize("backend", ["eager", "aot_eager"])
def test_reorder_print_no_graph_break(self, backend):
def f(x):
x1 = x + x
torch._higher_order_ops.print("moo {x}", x=x1)
x2 = x1 * x1
torch._higher_order_ops.print("moo {x}", x=x2)
x3 = x2 + x2
return (x1, x3)
# Eager and aot_eager backend for dynamo tracing testing
x = torch.randn(3, 3)
opt_f = torch.compile(backend=backend, fullgraph=True)(f)
with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
opt_out = opt_f(x)
printed_output = mock_stdout.getvalue().strip()
orig_out = f(x)
self.assertEqual(
printed_output,
f"moo {x * 2}\nmoo {x * 2 * x * 2}",
)
self.assertEqual(orig_out, opt_out)
x_new = torch.randn(2, 2)
with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
opt_out = opt_f(x_new)
printed_output = mock_stdout.getvalue().strip()
self.assertEqual(
printed_output,
f"moo {x_new * 2}\nmoo {x_new * 2 * x_new * 2}",
)
@parametrize("backend", ["eager", "aot_eager"])
def test_constant_mutation(self, backend):
def f(x):
alist = [x]
alist.append(x + 1)
torch._higher_order_ops.print("moo {x}", x=alist[-1])
alist[0].sum().item() # graph break
res = alist.pop()
torch._higher_order_ops.print("moo {x}", x=alist[-1])
res.sum().item() # graph break
return res
inputs = (torch.tensor([1]),)
opt_f = torch.compile(backend=backend, fullgraph=True)(f)
with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
opt_out = opt_f(*inputs)
printed_output = mock_stdout.getvalue().strip()
orig_out = f(*inputs)
self.assertEqual(printed_output, "moo tensor([2])\nmoo tensor([1])")
self.assertEqual(orig_out, opt_out)
if __name__ == "__main__":
run_tests()

View File

@ -1554,7 +1554,8 @@ class AOTInductorTestsTemplate:
# scaled_dot_product_flash_attention
@unittest.skipIf(
not HAS_XPU_AND_TRITON and not SM80OrLater, "bfloat16 only supported in sm80+"
not SM80OrLater and not HAS_XPU_AND_TRITON,
"bfloat16 only supported in sm80+ or XPU",
)
def test_sdpa(self):
class Model(torch.nn.Module):
@ -1571,7 +1572,10 @@ class AOTInductorTestsTemplate:
)
self.check_model(Model(), example_inputs)
@unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+")
@unittest.skipIf(
not SM80OrLater and not HAS_XPU_AND_TRITON,
"bfloat16 only supported in sm80+ or XPU",
)
@unittest.skipIf(
# for archs where this isn't lowered to flash attention, the math
# backend will be used and it doesn't work for bfloat16
@ -5926,8 +5930,8 @@ class AOTInductorTestsTemplate:
@requires_gpu
def test_d2h_copy(self):
# device to host copy should always have the same stride
if "cuda" not in self.device:
raise unittest.SkipTest("This test is only for CUDA")
if self.device not in ["cuda", "xpu"]:
raise unittest.SkipTest("This test is only for CUDA or XPU")
class ToCpuModel(nn.Module):
def forward(self, x):

View File

@ -28,7 +28,7 @@ from torch.export.pt2_archive._package import (
load_weights_to_pt2_contents,
)
from torch.testing._internal.common_cuda import _get_torch_cuda_version
from torch.testing._internal.common_utils import IS_FBCODE, skipIfXpu
from torch.testing._internal.common_utils import IS_FBCODE, skipIfXpu, TEST_CUDA
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
@ -267,9 +267,9 @@ class TestAOTInductorPackage(TestCase):
@unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode")
@unittest.skipIf(
_get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+"
TEST_CUDA and _get_torch_cuda_version() < (12, 6),
"Test is only supported on CUDA 12.6+",
)
@skipIfXpu # build system may be different
def test_compile_after_package(self):
self.check_package_cpp_only()

View File

@ -11,19 +11,19 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
TestCase,
)
from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA_AND_TRITON
from torch.testing._internal.triton_utils import requires_cuda_and_triton
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU_AND_TRITON
from torch.testing._internal.triton_utils import requires_gpu_and_triton
aten = torch.ops.aten
try:
try:
from .test_torchinductor import check_model, check_model_cuda
from .test_torchinductor import check_model, check_model_gpu
except ImportError:
from test_torchinductor import ( # @manual=fbcode//caffe2/test/inductor:test_inductor-library
check_model,
check_model_cuda,
check_model_gpu,
)
except (unittest.SkipTest, ImportError) as e:
sys.stderr.write(f"{type(e)}: {e}\n")
@ -34,7 +34,7 @@ except (unittest.SkipTest, ImportError) as e:
@instantiate_parametrized_tests
class ComboKernelTests(TestCase):
check_model_cuda = check_model_cuda
check_model_gpu = check_model_gpu
check_model_cpu = check_model
check_kernel_count = True
@ -56,7 +56,7 @@ class ComboKernelTests(TestCase):
torch._inductor.metrics.reset()
super().tearDown()
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_activation_functions(self):
def test_activations(a, b, c):
a1 = torch.nn.functional.relu(a)
@ -65,9 +65,9 @@ class ComboKernelTests(TestCase):
return a1, b1, c1
inps = [
torch.rand(10, 10, device="cuda"),
torch.rand(20, 20, device="cuda"),
torch.rand(10, 10, device="cuda"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
]
out_eager = test_activations(*inps)
@ -76,7 +76,7 @@ class ComboKernelTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_reduce_functions(self):
def test_reduce(a, b, c, d):
a1 = torch.sum(a, dim=0)
@ -87,10 +87,10 @@ class ComboKernelTests(TestCase):
return a1, b1, c1, d1
inps = [
torch.rand(10, 10, device="cuda"),
torch.rand(20, 20, device="cuda"),
torch.rand(10, 10, device="cuda"),
torch.rand(30, 8, device="cuda"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(30, 8, device=GPU_TYPE),
]
out_eager = test_reduce(*inps)
@ -99,7 +99,7 @@ class ComboKernelTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertTrue(torch._inductor.metrics.generated_kernel_count <= 2)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_mutated_args(self):
def test_mutated(a, b, c, d):
a.add_(1)
@ -110,10 +110,10 @@ class ComboKernelTests(TestCase):
return a, b, c, d
inps = [
torch.rand(10, 10, device="cuda"),
torch.rand(20, 20, device="cuda"),
torch.rand(10, 10, device="cuda"),
torch.rand(30, 8, device="cuda"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(30, 8, device=GPU_TYPE),
]
out_eager = test_mutated(*inps)
@ -122,7 +122,7 @@ class ComboKernelTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_reduce_split(self):
def fn(a, b):
a1 = torch.linalg.vector_norm(a)
@ -130,15 +130,15 @@ class ComboKernelTests(TestCase):
return a1, b1
inps = [
torch.rand(2048, 512, device="cuda"),
torch.rand(20, 20, device="cuda"),
torch.rand(2048, 512, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
]
out_eager = fn(*inps)
out_compiled = torch.compile(fn)(*inps)
self.assertEqual(out_eager, out_compiled)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_2d_blocking_partitioning(self):
def fn(a0, a1, a2, b0, b1, b2):
c0 = torch.add(a0, b0)
@ -146,15 +146,15 @@ class ComboKernelTests(TestCase):
c2 = torch.add(a2, b2)
return c0, c1, c2
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(30, 20, device="cuda"),
torch.rand(40, 30, device="cuda"),
torch.rand(36, 40, device="cuda"),
torch.rand(30, 20, device="cuda"),
torch.rand(30, 40, device="cuda").t(),
torch.rand(40, 36, device="cuda").t(),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(40, 30, device=GPU_TYPE),
torch.rand(36, 40, device=GPU_TYPE),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(30, 40, device=GPU_TYPE).t(),
torch.rand(40, 36, device=GPU_TYPE).t(),
),
)
@ -163,7 +163,7 @@ class ComboKernelTests(TestCase):
@instantiate_parametrized_tests
class ComboKernelBenchmarkTests(TestCase):
check_model_cuda = check_model_cuda
check_model_gpu = check_model_gpu
check_model_cpu = check_model
check_kernel_count = True
@ -185,7 +185,7 @@ class ComboKernelBenchmarkTests(TestCase):
torch._inductor.metrics.reset()
super().tearDown()
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_activation_benchmark(self):
def test_activations(a, b, c):
a1 = torch.nn.functional.relu(a)
@ -194,9 +194,9 @@ class ComboKernelBenchmarkTests(TestCase):
return a1, b1, c1
inps = [
torch.rand(10, 10, device="cuda"),
torch.rand(20, 20, device="cuda"),
torch.rand(10, 10, device="cuda"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
]
out_eager = test_activations(*inps)
@ -205,7 +205,7 @@ class ComboKernelBenchmarkTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_reduce_benchmark(self):
def test_reduce(a, b, c, d):
a1 = torch.sum(a, dim=0)
@ -216,10 +216,10 @@ class ComboKernelBenchmarkTests(TestCase):
return a1, b1, c1, d1
inps = [
torch.rand(10, 10, device="cuda"),
torch.rand(20, 20, device="cuda"),
torch.rand(10, 10, device="cuda"),
torch.rand(30, 8, device="cuda"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(30, 8, device=GPU_TYPE),
]
out_eager = test_reduce(*inps)
@ -228,7 +228,7 @@ class ComboKernelBenchmarkTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertTrue(4 < torch._inductor.metrics.generated_kernel_count <= 10)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_mutated_benchmark(self):
def test_mutated(a, b, c, d):
a.add_(1)
@ -239,10 +239,10 @@ class ComboKernelBenchmarkTests(TestCase):
return a, b, c, d
inps = [
torch.rand(10, 10, device="cuda"),
torch.rand(20, 20, device="cuda"),
torch.rand(10, 10, device="cuda"),
torch.rand(30, 8, device="cuda"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(30, 8, device=GPU_TYPE),
]
out_eager = test_mutated(*inps)
@ -251,7 +251,7 @@ class ComboKernelBenchmarkTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertTrue(torch._inductor.metrics.generated_kernel_count in [6, 9])
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_round_robin_dispatch(self):
# combo kernel dispatch strategy: round robin
def test_mutated(a, b, c, d):
@ -263,10 +263,10 @@ class ComboKernelBenchmarkTests(TestCase):
return a, b, c, d
inps = [
torch.rand(10, 10, device="cuda"),
torch.rand(20, 5, device="cuda"),
torch.rand(10, 10, device="cuda"),
torch.rand(5, 18, device="cuda"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 5, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(5, 18, device=GPU_TYPE),
]
out_eager = test_mutated(*inps)
@ -275,7 +275,7 @@ class ComboKernelBenchmarkTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 6)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_2d_blocking_benchmark(self):
def fn(a0, a1, a2, b0, b1, b2):
c0 = torch.add(a0, b0)
@ -283,28 +283,28 @@ class ComboKernelBenchmarkTests(TestCase):
c2 = torch.add(a2, b2)
return c0, c1, c2
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(30, 20, device="cuda"),
torch.rand(40, 30, device="cuda"),
torch.rand(36, 40, device="cuda"),
torch.rand(30, 20, device="cuda"),
torch.rand(30, 40, device="cuda").t(),
torch.rand(40, 36, device="cuda").t(),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(40, 30, device=GPU_TYPE),
torch.rand(36, 40, device=GPU_TYPE),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(30, 40, device=GPU_TYPE).t(),
torch.rand(40, 36, device=GPU_TYPE).t(),
),
)
self.assertTrue(7 <= torch._inductor.metrics.generated_kernel_count <= 8)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_persistent_reduction_no_x_dim(self):
def fn(x, y):
return x.sum(1), y.sum(1)
inps = (
torch.rand(16, 256, device="cuda"),
torch.rand(32, 256, device="cuda"),
torch.rand(16, 256, device=GPU_TYPE),
torch.rand(32, 256, device=GPU_TYPE),
)
torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256)
torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256)
@ -317,7 +317,7 @@ class ComboKernelBenchmarkTests(TestCase):
@instantiate_parametrized_tests
class ComboKernelDynamicShapesTests(TestCase):
check_model_cuda = check_model_cuda
check_model_gpu = check_model_gpu
check_model_cpu = check_model
check_kernel_count = True
@ -347,7 +347,7 @@ class ComboKernelDynamicShapesTests(TestCase):
torch._inductor.metrics.reset()
super().tearDown()
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_dynamic_shapes_activations(self):
def test_activations(a, b, c):
a1 = torch.nn.functional.relu(a)
@ -356,9 +356,9 @@ class ComboKernelDynamicShapesTests(TestCase):
return a1, b1, c1
inps = [
torch.rand(10, 10, device="cuda"),
torch.rand(20, 20, device="cuda"),
torch.rand(10, 10, device="cuda"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
]
out_eager = test_activations(*inps)
@ -367,7 +367,7 @@ class ComboKernelDynamicShapesTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_dynamic_shapes_2d_blocking(self):
def fn(a0, a1, a2, b0, b1, b2):
c0 = torch.add(a0, b0)
@ -375,21 +375,21 @@ class ComboKernelDynamicShapesTests(TestCase):
c2 = torch.add(a2, b2)
return c0, c1, c2
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(30, 20, device="cuda"),
torch.rand(40, 30, device="cuda"),
torch.rand(36, 40, device="cuda"),
torch.rand(30, 20, device="cuda"),
torch.rand(30, 40, device="cuda").t(),
torch.rand(40, 36, device="cuda").t(),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(40, 30, device=GPU_TYPE),
torch.rand(36, 40, device=GPU_TYPE),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(30, 40, device=GPU_TYPE).t(),
torch.rand(40, 36, device=GPU_TYPE).t(),
),
)
self.assertTrue(7 <= torch._inductor.metrics.generated_kernel_count <= 8)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_dynamic_shapes_reduce(self):
def test_reduce(a, b, c, d):
a1 = torch.sum(a, dim=0)
@ -400,10 +400,10 @@ class ComboKernelDynamicShapesTests(TestCase):
return a1, b1, c1, d1
inps = [
torch.rand(10, 10, device="cuda"),
torch.rand(20, 20, device="cuda"),
torch.rand(10, 10, device="cuda"),
torch.rand(30, 8, device="cuda"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(30, 8, device=GPU_TYPE),
]
out_eager = test_reduce(*inps)
@ -412,7 +412,7 @@ class ComboKernelDynamicShapesTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertTrue(4 < torch._inductor.metrics.generated_kernel_count <= 10)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_dynamic_shapes_mutated(self):
# combo kernel dispatch strategy: round robin
def test_mutated(a, b, c, d):
@ -424,10 +424,10 @@ class ComboKernelDynamicShapesTests(TestCase):
return a, b, c, d
inps = [
torch.rand(10, 10, device="cuda"),
torch.rand(20, 5, device="cuda"),
torch.rand(10, 10, device="cuda"),
torch.rand(5, 18, device="cuda"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 5, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(5, 18, device=GPU_TYPE),
]
out_eager = test_mutated(*inps)
@ -436,7 +436,7 @@ class ComboKernelDynamicShapesTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 6)
@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._inductor.config.patch("combo_kernels_autotune", 0)
def test_dynamic_shapes_activations_no_autotune(self):
def test_activations(a, b, c):
@ -446,9 +446,9 @@ class ComboKernelDynamicShapesTests(TestCase):
return a1, b1, c1
inps = [
torch.rand(10, 10, device="cuda"),
torch.rand(20, 20, device="cuda"),
torch.rand(10, 10, device="cuda"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
]
out_eager = test_activations(*inps)
@ -457,7 +457,7 @@ class ComboKernelDynamicShapesTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
@torch._dynamo.config.patch("assume_static_by_default", True)
def test_dynamic_shapes_persistent_reduction_no_x_dim(self):
@ -465,8 +465,8 @@ class ComboKernelDynamicShapesTests(TestCase):
return x.sum(1), y.sum(1)
inps = (
torch.rand(16, 256, device="cuda"),
torch.rand(32, 256, device="cuda"),
torch.rand(16, 256, device=GPU_TYPE),
torch.rand(32, 256, device=GPU_TYPE),
)
torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256)
torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256)
@ -476,7 +476,7 @@ class ComboKernelDynamicShapesTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4)
@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
@torch._dynamo.config.patch("assume_static_by_default", True)
def test_dynamic_shapes_persistent_reduction_no_x_dim_2(self):
@ -484,8 +484,8 @@ class ComboKernelDynamicShapesTests(TestCase):
return x.sum(2), y.sum(2)
inps = (
torch.rand(8, 16, 256, device="cuda"),
torch.rand(8, 32, 256, device="cuda"),
torch.rand(8, 16, 256, device=GPU_TYPE),
torch.rand(8, 32, 256, device=GPU_TYPE),
)
torch._dynamo.mark_dynamic(inps[0], (0, 1), min=1, max=256)
torch._dynamo.mark_dynamic(inps[1], (0, 1), min=1, max=256)
@ -495,7 +495,7 @@ class ComboKernelDynamicShapesTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4)
@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
@torch._dynamo.config.patch("assume_static_by_default", True)
def test_dynamic_shapes_2d_blocking_round_robin(self):
@ -506,12 +506,12 @@ class ComboKernelDynamicShapesTests(TestCase):
return c0, c1, c2
inps = (
torch.rand(20, 30, device="cuda"),
torch.rand(30, 30, device="cuda"),
torch.rand(40, 32, device="cuda"),
torch.rand(30, 20, device="cuda").t(),
torch.rand(30, 30, device="cuda").t(),
torch.rand(32, 40, device="cuda").t(),
torch.rand(20, 30, device=GPU_TYPE),
torch.rand(30, 30, device=GPU_TYPE),
torch.rand(40, 32, device=GPU_TYPE),
torch.rand(30, 20, device=GPU_TYPE).t(),
torch.rand(30, 30, device=GPU_TYPE).t(),
torch.rand(32, 40, device=GPU_TYPE).t(),
)
out_eager = fn(*inps)
@ -522,19 +522,19 @@ class ComboKernelDynamicShapesTests(TestCase):
torch._inductor.metrics.reset()
inps = (
torch.rand(24, 30, device="cuda"),
torch.rand(32, 30, device="cuda"),
torch.rand(48, 32, device="cuda"),
torch.rand(30, 24, device="cuda").t(),
torch.rand(30, 32, device="cuda").t(),
torch.rand(32, 48, device="cuda").t(),
torch.rand(24, 30, device=GPU_TYPE),
torch.rand(32, 30, device=GPU_TYPE),
torch.rand(48, 32, device=GPU_TYPE),
torch.rand(30, 24, device=GPU_TYPE).t(),
torch.rand(30, 32, device=GPU_TYPE).t(),
torch.rand(32, 48, device=GPU_TYPE).t(),
)
out_compiled = compiled(*inps)
out_eager = fn(*inps)
self.assertEqual(out_eager, out_compiled)
self.assertTrue(5 <= torch._inductor.metrics.generated_kernel_count <= 6)
@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
@torch._dynamo.config.patch("assume_static_by_default", True)
@torch._inductor.config.patch("triton.autotune_at_compile_time", True)
@ -543,9 +543,9 @@ class ComboKernelDynamicShapesTests(TestCase):
return x.sum(1), y.mean(1), z.max(1)
inps = (
torch.rand(16, 128, device="cuda"),
torch.rand(32, 128, device="cuda"),
torch.rand(32, 256, device="cuda"),
torch.rand(16, 128, device=GPU_TYPE),
torch.rand(32, 128, device=GPU_TYPE),
torch.rand(32, 256, device=GPU_TYPE),
)
torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256)
torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256)
@ -555,15 +555,15 @@ class ComboKernelDynamicShapesTests(TestCase):
self.assertEqual(out_eager, out_compiled)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_helper_fn_defined(self):
def fn(x, y, z):
return x.sum(1), y.mean(1), z.cumsum(1)
inps = (
torch.rand(16, 128, device="cuda"),
torch.rand(32, 128, device="cuda"),
torch.rand(32, 256, device="cuda"),
torch.rand(16, 128, device=GPU_TYPE),
torch.rand(32, 128, device=GPU_TYPE),
torch.rand(32, 256, device=GPU_TYPE),
)
out_eager = fn(*inps)
@ -577,5 +577,5 @@ class ComboKernelDynamicShapesTests(TestCase):
if __name__ == "__main__":
from torch._dynamo.test_case import run_tests
if HAS_CPU or HAS_CUDA_AND_TRITON:
if HAS_CPU or HAS_GPU_AND_TRITON:
run_tests(needs="filelock")

View File

@ -45,6 +45,7 @@ from torch.testing._internal.common_utils import (
parametrize,
scoped_load_inline,
skipIfWindows,
skipIfXpu,
)
from torch.testing._internal.hop_db import hop_db
from torch.testing._internal.inductor_utils import (
@ -52,9 +53,13 @@ from torch.testing._internal.inductor_utils import (
HAS_CPU,
HAS_CUDA_AND_TRITON,
HAS_GPU,
HAS_XPU_AND_TRITON,
)
from torch.testing._internal.logging_utils import logs_to_string
from torch.testing._internal.triton_utils import requires_cuda_and_triton
from torch.testing._internal.triton_utils import (
requires_cuda_and_triton,
requires_gpu_and_triton,
)
from torch.utils._python_dispatch import TorchDispatchMode
@ -3049,13 +3054,14 @@ main()
self.assertEqual(counters["inductor"]["cudagraph_skips"], 1)
@requires_cuda_and_triton
@skipIfXpu(msg="cudagraphs not supported on xpu for now!")
@requires_gpu_and_triton
def test_cudagraphs_sdpa(self):
query = torch.rand(
32, 8, 128, 64, dtype=torch.float16, device="cuda", requires_grad=True
32, 8, 128, 64, dtype=torch.float16, device=GPU_TYPE, requires_grad=True
)
key = torch.rand(32, 8, 128, 64, dtype=torch.float16, device="cuda")
value = torch.rand(32, 8, 128, 64, dtype=torch.float16, device="cuda")
key = torch.rand(32, 8, 128, 64, dtype=torch.float16, device=GPU_TYPE)
value = torch.rand(32, 8, 128, 64, dtype=torch.float16, device=GPU_TYPE)
out = torch.nn.functional.scaled_dot_product_attention(query, key, value)
with (
@ -3747,7 +3753,7 @@ class CompiledAutograd0(torch.nn.Module):
self.assertTrue(isinstance(view_nodes[0].args[1][0], torch.fx.Node))
self.assertTrue(isinstance(view_nodes[1].args[1][0], torch.fx.Node))
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_flex_attention(self):
def _squared(score, b, h, m, n):
"""Joint graph needed for correctness"""
@ -3765,7 +3771,7 @@ class CompiledAutograd0(torch.nn.Module):
a * b,
b,
dtype=torch.bfloat16,
device="cuda",
device=GPU_TYPE,
requires_grad=True,
)
fwd_bwd(v)
@ -5333,12 +5339,13 @@ if IS_S390X:
test_autograd = load_test_module("test_autograd")
test_custom_ops = load_test_module("test_custom_ops")
test_higher_order_ops = load_test_module("dynamo/test_higher_order_ops")
TestAutogradWithCompiledAutograd = wrap_test_class(test_autograd.TestAutograd)
if not HAS_XPU_AND_TRITON:
TestAutogradWithCompiledAutograd = wrap_test_class(test_autograd.TestAutograd)
TestNestedCheckpointWithCompiledAutograd = wrap_test_class(
test_autograd.TestNestedCheckpoint
)
TestCustomOpWithCompiledAutograd = wrap_test_class(test_custom_ops.TestCustomOp)
if not HAS_XPU_AND_TRITON:
TestCustomOpWithCompiledAutograd = wrap_test_class(test_custom_ops.TestCustomOp)
HigherOrderOpTestsWithCompiledAutograd = wrap_test_class(
test_higher_order_ops.HigherOrderOpTests
)
@ -5367,6 +5374,7 @@ class TestCompiledAutogradOpInfo(TestCase):
super(TestCase, self).tearDown()
reset()
@skipIfXpu(msg="NotImplementedError: The operator 'testlib::mutating_custom_op'")
@ops(
list(filter(lambda op: op.name not in xfail_hops, hop_db)),
allowed_dtypes=(torch.float,),
@ -5419,7 +5427,7 @@ class TestCompiledAutogradOpInfo(TestCase):
self.assertEqual(expected, actual)
instantiate_device_type_tests(TestCompiledAutogradOpInfo, globals())
instantiate_device_type_tests(TestCompiledAutogradOpInfo, globals(), allow_xpu=True)
instantiate_parametrized_tests(TestCompiledAutograd)
if __name__ == "__main__":

View File

@ -65,7 +65,11 @@ from torch.testing._internal.inductor_utils import (
HAS_GPU,
has_triton,
)
from torch.testing._internal.triton_utils import requires_cuda_and_triton, requires_gpu
from torch.testing._internal.triton_utils import (
requires_cuda_and_triton,
requires_gpu,
requires_gpu_and_triton,
)
def get_inputs(optim):
@ -946,7 +950,7 @@ class CompiledOptimizerTests(TestCase):
kwargs = aot_graph_input_parser(forward)
torch.compile(forward)(**kwargs)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_foreach_map_adam(self):
params = [
torch.rand(

Some files were not shown because too many files have changed in this diff