Compare commits


165 Commits

Author SHA1 Message Date
a6adb3c9b6 Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-17 21:22:45 -08:00
2eaa1b9684 Update base for Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-17 21:22:45 -08:00
71f28f4d42 [export] Support module type with only __call__ override. (#167874)
Summary:
as title.

Test Plan:

CI

Reviewers:

Subscribers:

Tasks:

Tags:

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167874
Approved by: https://github.com/tugsbayasgalan
2025-11-18 00:17:45 +00:00
9b39276255 Revert "[CD] [aarch64] unify the build.sh to build for aarch64 wheel (#166044)"
This reverts commit f79cdc89db5ec26cba8a2e12140c42e76f79bc44.

Reverted https://github.com/pytorch/pytorch/pull/166044 on behalf of https://github.com/atalman due to Causing https://github.com/pytorch/pytorch/issues/168003 also failing nightly aarch64 cuda validations [pytorch/test-infra/actions/runs/19435158072/job/55604045681](https://github.com/pytorch/test-infra/actions/runs/19435158072/job/55604045681) ([comment](https://github.com/pytorch/pytorch/pull/166044#issuecomment-3544309072))
2025-11-17 23:44:18 +00:00
86f9a9ae76 Revert "[CD] Add libopenblas to dep list for AArch64+CPU whl (#167841)"
This reverts commit 2b69673bbfdadad6a963d37a6d4f1339c1b14048.

Reverted https://github.com/pytorch/pytorch/pull/167841 on behalf of https://github.com/atalman due to Will be reverting https://github.com/pytorch/pytorch/pull/166044 ([comment](https://github.com/pytorch/pytorch/pull/167841#issuecomment-3544301008))
2025-11-17 23:38:39 +00:00
c4f3d7d410 [MPS] remove expected failure for a test (#167922)
remove expected failure for a test for MPS backend, but lower the precision to `1e-4`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167922
Approved by: https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-11-17 22:58:13 +00:00
b7208877c8 Revert "deprecate check_is_size and guard_size_oblivious (#167198)"
This reverts commit f2e6f94081c02704adf027fe0c81bf25726828f5.

Reverted https://github.com/pytorch/pytorch/pull/167198 on behalf of https://github.com/yangw-dev due to synced with author, this breaks internal builds ([comment](https://github.com/pytorch/pytorch/pull/167198#issuecomment-3544065659))
2025-11-17 22:16:37 +00:00
f69815d77f [pallas backend] remove unnecessary mypy comment (#167954)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167954
Approved by: https://github.com/Skylion007
2025-11-17 22:14:08 +00:00
1c04a43959 Revert "Tiling bug fix (#167771)"
This reverts commit 7ede33b8e3cd5f068c6e88d678ed3f67f5249c64.

Reverted https://github.com/pytorch/pytorch/pull/167771 on behalf of https://github.com/eellison due to needs one fix ([comment](https://github.com/pytorch/pytorch/pull/167771#issuecomment-3543999822))
2025-11-17 21:54:56 +00:00
661fb53449 Revert "Remove old NVTX interface (#167637)"
This reverts commit 99117c1238c9adcd3fb2621e36c91f9d20ed2ff7.

Reverted https://github.com/pytorch/pytorch/pull/167637 on behalf of https://github.com/yangw-dev due to breaks internal build with torch/csrc/profiler/stubs/cuda.cpp:4:10: fatal error: 'nvtx3/nvtx3.hpp' file not found 4 | #include <nvtx3/nvtx3.hpp>, please find a meta fella to resolve this issue and try again, diff:[D87229660] ([comment](https://github.com/pytorch/pytorch/pull/167637#issuecomment-3543984021))
2025-11-17 21:51:04 +00:00
4e1b772103 Fix: Improve fallback behavior in deserialize_torch_artifact and relocate test into TestSaveLoad (#158247)
This is a follow-up to [#154333](https://github.com/pytorch/pytorch/pull/154333), where I initially introduced a fallback mechanism in deserialize_torch_artifact.

In this revised PR:

Cleaned up commit history for clarity and reproducibility.

Relocated the test into the TestSaveLoad class in test_serialize.py.

The previous PR had inconsistencies due to local branch issues and was closed in favor of this cleaner submission.

Feedback is very welcome
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158247
Approved by: https://github.com/angelayi
2025-11-17 21:14:37 +00:00
bdd3c3a29c Support SymInt placeholder in wrapper fxir (#167757)
Summary:
add support for symint placeholders

added two test cases with dynamic reshape
- dynamic info coming from tmd on placeholders
- dynamic info coming from placeholders (symints)

Test Plan:
test_reshape_dynamic_ph
test_reshape_dynamic_tmd

Differential Revision: D86984100

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167757
Approved by: https://github.com/blaine-rister
2025-11-17 21:10:55 +00:00
9d8ceaa36f Revert "[ARM] Improve LLM performance & mem usage using int4-bf16 KleidiAI kernels (#158250)"
This reverts commit 53809f964083a9e89182c2db7638fd44f3a6e304.

Reverted https://github.com/pytorch/pytorch/pull/158250 on behalf of https://github.com/zou3519 due to reverting to see if it fixes inductor halide test failure ([comment](https://github.com/pytorch/pytorch/pull/158250#issuecomment-3543840277))
2025-11-17 21:06:26 +00:00
927899dc05 fixes a few issues with out_dtype overload for addmm/baddbmm (#167931)
Per title:
1) allows the `self` argument to have the same precision as the output
2) fixes broadcasting of the `self` argument - it used to allocate an incorrectly sized output and resize it later, causing a warning in addmm and an error in baddbmm
3) fixes `out` handling for the `out` overload of baddbmm, where the implementation used uninitialized memory in `out` instead of copying `self` into `out`
4) removes a couple of unneeded IIFE patterns

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167931
Approved by: https://github.com/PaulZhang12, https://github.com/drisspg, https://github.com/malfet
2025-11-17 20:50:30 +00:00
a892f76d06 [MPS] mm out sparse (#167908)
Enables mm out for sparse tensors
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167908
Approved by: https://github.com/malfet
2025-11-17 20:44:58 +00:00
2ddcf53e1a Logaddexp complex inconsistent bw cpu and cuda (#163509)
Fixes #158429

Updated LogAddExpKernel.cu to allow for complex numbers. Also updated the unit test to run test_logaddexp on CUDA with complex data types, and added a unit test in test_linalg.py to compare results between CUDA and CPU.
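A minimal sketch of the kind of CPU/CUDA consistency check this enables (not the actual test code; shapes and tolerances are illustrative):

```python
import torch

# logaddexp(a, b) = log(exp(a) + exp(b)), here with complex inputs
a = torch.randn(64, dtype=torch.complex64)
b = torch.randn(64, dtype=torch.complex64)

cpu_out = torch.logaddexp(a, b)
if torch.cuda.is_available():
    cuda_out = torch.logaddexp(a.cuda(), b.cuda()).cpu()
    # With this change, the CUDA result should agree with the CPU result within tolerance
    torch.testing.assert_close(cpu_out, cuda_out, rtol=1e-5, atol=1e-5)
```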

@drisspg
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163509
Approved by: https://github.com/isuruf
2025-11-17 20:30:51 +00:00
689d731ece [inductor] fix the decision of inner reduction (#167697)
Inductor may treat an outer reduction as an inner reduction when the reduction ranges contain a 1. This causes a weird issue where we skip fusing with mixed-order reduction. While I'm still debugging why that happens, I think we should fix the decision here anyway.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167697
Approved by: https://github.com/jansel, https://github.com/v0i0
2025-11-17 20:17:20 +00:00
b288d0020b [inductor] unittest for run2run determinism (#167482)
Not sure if the paths are already set up properly so I can call 'benchmarks/dynamo/huggingface.py' directly in a unit test. Let's see what CI says.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167482
Approved by: https://github.com/v0i0, https://github.com/mlazos
2025-11-17 20:12:15 +00:00
ba56004bb0 Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-17 11:12:37 -08:00
045067c30c Update base for Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-17 11:12:37 -08:00
4414e1bff0 Cleanup in inductor usage of nccl estimator after its fix (#167633)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167633
Approved by: https://github.com/eellison
ghstack dependencies: #167827
2025-11-17 19:02:56 +00:00
694f9b943c Revert "[ROCm][CI] Upgrade ROCm CI to 7.1 (#166743)"
This reverts commit 77acc66df917a2b9f6305d089ac88b8975786552.

Reverted https://github.com/pytorch/pytorch/pull/166743 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/166743#issuecomment-3543307333))
2025-11-17 18:25:59 +00:00
01deee228a Fix dataloader tests failing on python 3.14 (#167429)
The following tests are failing with Python 3.14 on Linux machines:

* TestSetAffinity::test_set_affinity_in_worker_init
    * Why? 3.14 makes `forkserver` the default start method for multiprocessing. With it, local functions are not picklable and the unit test fails (see the sketch after this list).
* TestIndividualWorkerQueue::test_ind_worker_queue
    * Why? The test was hitting the timeout. This is also related to the start method. I am increasing the timeout and reducing the batch-size iterations to reduce the total unit test time.
    * Fixes https://github.com/pytorch/pytorch/issues/68643
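A minimal sketch of the pickling constraint described above, assuming `forkserver` (or `spawn`) is the start method; the worker function name is illustrative:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

def worker_init_fn(worker_id):
    # Module-level functions are picklable, so they work with the
    # forkserver/spawn start methods (the Python 3.14 default).
    print(f"worker {worker_id} started")

def main():
    ds = TensorDataset(torch.arange(8))
    # A locally defined closure passed as worker_init_fn would fail to
    # pickle under forkserver; a module-level function is fine.
    dl = DataLoader(ds, batch_size=2, num_workers=2, worker_init_fn=worker_init_fn)
    for _batch in dl:
        pass

if __name__ == "__main__":
    main()
```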

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167429
Approved by: https://github.com/aelavender, https://github.com/ramanishsingh
2025-11-17 18:10:26 +00:00
1233be0923 [STABLE ABI] Add mutable_data_ptr() and const_data_ptr() methods to torch::stable::Tensor. (#161891)
This ghstack is a prerequisite for porting torchaudio C++ extensions to use torch stable ABI, see https://github.com/pytorch/audio/issues/4074, https://github.com/pytorch/audio/issues/4075, https://github.com/pytorch/audio/issues/4076, https://github.com/pytorch/audio/issues/4077, https://github.com/pytorch/audio/issues/4078

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161891
Approved by: https://github.com/mikaylagawarecki
ghstack dependencies: #167772
2025-11-17 18:05:36 +00:00
02b55c3f4a Move isQIntType to headeronly (#167772)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167772
Approved by: https://github.com/janeyx99
2025-11-17 18:05:36 +00:00
ae3ce54f27 Revert "[ROCm] Enable StaticCudaLauncher for ROCm (#166492)"
This reverts commit 99fdca8f4d856cc52eb39d5e70be73dbd48228f8.

Reverted https://github.com/pytorch/pytorch/pull/166492 on behalf of https://github.com/jeanschmidt due to Internally we still depends on the old logic, so we need to find a way to maintain backwards compatibility, for now ([comment](https://github.com/pytorch/pytorch/pull/166492#issuecomment-3543198811))
2025-11-17 17:59:42 +00:00
2f3bb7482c Improve benchmarks/dynamo:check_perf_csv output and failure summary (#161728)
Resolves https://github.com/pytorch/pytorch/issues/161290

## Summary

Expands `dynamo/check_perf_csv.py` output capabilities with latency, compile time and memory information:

- Displays the measured speedup and the % deviation from the target
- Added clear messaging for all passing model tests when no regression is found
- Added error handling if the CSV file is missing

### Example (Failing Check)

```bash
python benchmarks/dynamo/check_perf_csv.py -f reports-dir/inductor_training_smoketest.csv -t 1.40
```

**Example Output:**
```
Checking inductor_training_smoketest.csv (speedup threshold >= 1.40x)
hf_Bert                            speedup=1.005x, latency=390.8 ms/iter, compile=1.526s, mem_ratio=1.02x (eager=360.6 GB, dynamo=369.3 GB)
Error 1 model(s) performance regressed
    hf_Bert
  - hf_Bert: 1.005x (< 1.40x; -28.2% from target)
```

### Example (Passing Check)

```bash
python benchmarks/dynamo/check_perf_csv.py -f reports-dir/inductor_training_smoketest.csv -t 1.00
```

**Example Output:**
```
Checking inductor_training_smoketest.csv (speedup threshold >= 1.00x)
hf_Bert                            speedup=1.005x, latency=390.8 ms/iter, compile=1.526s, mem_ratio=1.02x (eager=360.6 GB, dynamo=369.3 GB)
All 1 model(s) passed threshold check (>= 1.00x)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161728
Approved by: https://github.com/isuruf
2025-11-17 17:54:29 +00:00
567dcdba75 Fix longstanding race condition around getAllOperatorsFor (#167860)
getAllOperatorsFor returns a const reference to internal state that is protected by a lock. Presuming that the lock is necessary in the first place (about which I offer no opinion because it's unclear to what extent the GIL should help here), this is a straightforward way to cause callers to create race conditions.

This should fix those race conditions by copying the state instead. I modified calling code to stop binding a const reference to the result for clarity.

Differential Revision: [D87088731](https://our.internmc.facebook.com/intern/diff/D87088731/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D87088731/)!

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167860
Approved by: https://github.com/zou3519
2025-11-17 17:37:02 +00:00
77acc66df9 [ROCm][CI] Upgrade ROCm CI to 7.1 (#166743)
Upgrade all the ROCm docker images to ROCm 7.1 release version.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166743
Approved by: https://github.com/atalman, https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
Co-authored-by: Prachi Gupta <prachi.gupta@amd.com>
2025-11-17 17:17:25 +00:00
95d1df7d4e Disable CUDA MXFP4 on non-B200 GPUs (#167857)
Summary:

MXFP4 unit tests pass on B200, fail on RTX 5090 - disable non-B200
cases.

Also fail with a NotImplementedError on non-B200 GPUs to avoid
unhelpful failure messages.

Test Plan:

```
pytest -sv -k "mxfp4" test/test_scaled_matmul_cuda.py
```

Reviewers:

@nWEIdia

Subscribers:

Tasks:

Fixes https://github.com/pytorch/pytorch/issues/167850

Tags:
Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167857
Approved by: https://github.com/nWEIdia, https://github.com/malfet
2025-11-17 17:14:53 +00:00
094e529c64 [MPS] Fix repeat_interleave with slices (#167961)
Alas, one cannot use `repeat_interleave_common` for MPS tensors, as `data_offset` is not a valid pointer to `id<MTLTensor>`.
On the other hand, one does not need to use `AT_DISPATCH_INDEX_TYPES`, as dispatching happens on the shader side.
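A minimal sketch of the kind of sliced-input case this covers (shapes are illustrative; the actual repro is in the linked issue):

```python
import torch

if torch.backends.mps.is_available():
    base = torch.arange(12, device="mps").reshape(3, 4)
    repeats = torch.tensor([1, 2, 3], device="mps")
    # A column slice is a view with a storage offset; the MPS path should
    # now produce the same result as CPU for it.
    out_mps = torch.repeat_interleave(base[:, 1], repeats)
    out_cpu = torch.repeat_interleave(base[:, 1].cpu(), repeats.cpu())
    torch.testing.assert_close(out_mps.cpu(), out_cpu)
```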

Fixes https://github.com/pytorch/pytorch/issues/167924
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167961
Approved by: https://github.com/manuelcandales
2025-11-17 17:10:59 +00:00
a4c7bf7e8d Revert "Use c10::filesystem (#167821)"
This reverts commit deabb3e36de207aa497b035a8bdf6ec1b37d17fe.

Reverted https://github.com/pytorch/pytorch/pull/167821 on behalf of https://github.com/jeanschmidt due to Breaks internal tests, see D87148810. @Skylion007 may you help the author to get this PR merged? ([comment](https://github.com/pytorch/pytorch/pull/167821#issuecomment-3542877623))
2025-11-17 16:48:57 +00:00
22ccd44d73 Revert "Improve char printing (#167899)"
This reverts commit 2245d7d3b90162ae2958929a22c140537cfc4b42.

Reverted https://github.com/pytorch/pytorch/pull/167899 on behalf of https://github.com/jeanschmidt due to need to revert in order to revert https://github.com/pytorch/pytorch/pull/167899 ([comment](https://github.com/pytorch/pytorch/pull/167899#issuecomment-3542869096))
2025-11-17 16:46:44 +00:00
39ebab1dd9 Revert "Remove python workaround for ContextDecorator (#167049)"
This reverts commit e20ca3bc2e6ef9935c782fe548348f81fabc5bd7.

Reverted https://github.com/pytorch/pytorch/pull/167049 on behalf of https://github.com/jeanschmidt due to breaks internal tests see D87120562, @Skylion007 please thelp the author get this PR merged ([comment](https://github.com/pytorch/pytorch/pull/167049#issuecomment-3542847796))
2025-11-17 16:41:26 +00:00
4c152a71ad Revert "add device generalization support for distributed tests (#165067)"
This reverts commit 96a4c4b3d1c533b36cfa7259524b91a0eaf4254f.

Reverted https://github.com/pytorch/pytorch/pull/165067 on behalf of https://github.com/jeanschmidt due to breaks internal tests see D87036515, @albanD please help the author get this PR merged ([comment](https://github.com/pytorch/pytorch/pull/165067#issuecomment-3542820651))
2025-11-17 16:37:07 +00:00
1b43d6cd4e [ROCm] enable fastSpecializedAtomicAdd for gfx950 (#167661)
Use standard HIP headers for unsafeAtomicAdd. Removes copy/paste of unsafeAtomicAdd as "preview" implementation for gfx942.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167661
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-11-17 16:18:49 +00:00
2b69673bbf [CD] Add libopenblas to dep list for AArch64+CPU whl (#167841)
#166044 removed openblas from the wheel dependency list for the AArch64+CPU build, so this PR adds it back. This only affects the CPU build, since AArch64+CUDA uses NVPL.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167841
Approved by: https://github.com/tinglvv, https://github.com/malfet
2025-11-17 16:11:39 +00:00
2f74916e36 Do not hardfail on use nccl estimations for non-nccl (#167827)
Previously we hard-failed if the pg was "gloo".
Now we fall back to the hardcoded formulas.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167827
Approved by: https://github.com/eellison
2025-11-17 16:06:26 +00:00
2b5eabc74b Rework PyObject preservation (v2) (#167564)
Make the PyObject preservation scheme thread-safe with free threaded (nogil) Python. The general idea is:

* Python Tensor and Storage objects always hold a strong reference to their underlying c10 object
* c10 objects hold a strong reference to their Python objects if there's at least one other reference to the c10 object

This is implemented in `intrusive_ptr`:

* The topmost bit (`kHasPyObject`) of the weakref count is now used to indicate whether the `intrusive_ptr_target` has an associated PyObject. So `kHasPyObject` is one bit, the weakref count is now 31 bits, and the strong refcount remains 32 bits.
* When the reference count increases from one to two and `kHasPyObject` is set, we incref the associated Python object to ensure that it is kept alive.
* When the reference count decreases from two to one (i.e., there are no C++ references to the `intrusive_ptr_target` other than from the Python object), we decref the associated Python object to break the cycle.

Other benefits:

* We can delete a lot of the copypasta from Python internal `subtype_dealloc`
* This fixes the weakref and GC bugs we had in the previous scheme. Python weakrefs on Tensors and Storages should just work as expected now (a quick check is sketched below).
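A quick illustration of the expected weakref behavior (a sketch, not a test from this PR):

```python
import gc
import weakref
import torch

t = torch.randn(3)
r = weakref.ref(t)
assert r() is t          # the weakref resolves while the tensor is alive

del t
gc.collect()
assert r() is None       # and clears once the last reference goes away
```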

Risks:

* Extra branch for reference count operations on `intrusive_ptr<TensorImpl>`, `intrusive_ptr<StorageImpl>`, and the generic `intrusive_ptr<intrusive_ptr_target>` even when we're not using Python.
* It's a big change

(Second attempt at https://github.com/pytorch/pytorch/pull/166342)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167564
Approved by: https://github.com/albanD, https://github.com/Skylion007
2025-11-17 14:52:02 +00:00
9ff95f6835 [inductor] Expose config for fx bucket all_reduces (#167634)
Exposing `_inductor.config.bucket_all_reduces_fx` similar to all_gathers, reduce_scatters with only option "all".

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167634
Approved by: https://github.com/eellison
2025-11-17 13:10:36 +00:00
6fdb974f4a Update torch-xpu-ops commit pin (#167698)
Update the torch-xpu-ops commit to [intel/torch-xpu-ops@1e69f4](1e69f40b3c), includes:

- Add PTL in the default AOT target list for both Win and Lin
- Use PyTorch p2p API in Copy kernel
- Add event cache and event timing to XCCL
- Add Float8_e8m0fnu support for copy
- Add CMAKE_SYCL_COMPILER_LAUNCHER for sccache
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167698
Approved by: https://github.com/EikanWang
2025-11-17 12:58:42 +00:00
661d1653aa [xla hash update] update the pinned xla hash (#167968)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167968
Approved by: https://github.com/pytorchbot
2025-11-17 12:20:32 +00:00
53809f9640 [ARM] Improve LLM performance & mem usage using int4-bf16 KleidiAI kernels (#158250)
Co-authored-by: Nikhil Gupta [nikhil.gupta2@arm.com](mailto:nikhil.gupta2@arm.com)

This PR enables the use of KleidiAI INT4 kernels that directly produce BF16 outputs within PyTorch to boost LLM prefill & decode performance.

**This change improves decode throughput by ~15% & reduces the memory required to run inference on the model by 50%.**

### Benchmark Setup
```
Model: meta-llama/Llama-3.1-8B
Test Platform: Neoverse V2
```
### Detailed Results

| Metric                           | With `--compile`         | Without `--compile`      |
|----------------------------------|---------------------------|---------------------------|
| Quantization Scheme              | INT4 symmetric channelwise | INT4 symmetric channelwise |
| Input Precision                  | BF16                      | BF16                      |
| Number of Layers Quantized       | 32                        | 32                        |
| Average Compression Ratio        | 87.49%                    | 87.49%                    |
| Total Quantization Time (s)      | 9.62                      | 10.32                     |
| Compile Time (First) (s)         | 134.48                    | 1.69                      |
| Compile Time (Second) (s)        | 80.44                     | 1.60                      |
| Compile Time (Subsequent) (s)    | 0.19                      | 0.22                      |
| Prefill Tokens                   | 54                        | 54                        |
| Decoded Tokens                   | 33                        | 33                        |
| Prefill Time (s)                 | 0.19                      | 0.22                      |
| Decode Time (s)                  | 0.76                      | 1.38                      |
| E2E Generation Time (s)          | 0.95                      | 1.60                      |
| Prefill Throughput (tokens/s)    | 288.13                    | 249.91                    |
| Decode Throughput (tokens/s)     | 43.42                     | 23.83                     |
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158250
Approved by: https://github.com/malfet, https://github.com/aditew01, https://github.com/fadara01

Co-authored-by: Nikhil Gupta <nikhil.gupta2@arm.com>
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-11-17 12:06:33 +00:00
93ddd38ecd Re-land#2 "Fix thread safety in getCurrentCUDABlasHandle and getCUDABlasLtWorkspace" (#167928)
Summary:
getCurrentCUDABlasHandle() and getCUDABlasLtWorkspace() use static mutable maps that are not protected from concurrent read-and-write. This leads to crashes.

This diff adds mutexes to synchronize access to the static maps.

Re-land context:

This is a re-land of https://github.com/pytorch/pytorch/pull/167248.

A few issues were addressed:
- fix for a bug in the fast path: premature return in getCurrentCUDABlasHandle
- fix for test flakiness (https://github.com/pytorch/pytorch/pull/167884)

Test Plan:
1. regression tests:
buck2 test \mode/opt //caffe2/test\:test_transformers_cuda
https://www.internalfb.com/intern/testinfra/testrun/6192449759713581

2. Use a GPU OD, run multi-threaded tests with TSAN:

buck test fbcode//mode/dev-tsan fbcode//caffe2:cuda_cublas_handle_pool_test  -- --stress-runs 100
https://www.internalfb.com/intern/testinfra/testrun/14355223937501118

Differential Revision: D87111985

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167928
Approved by: https://github.com/Skylion007
2025-11-17 12:05:08 +00:00
5804408f1b [1/3][XPU][feature] The implementation of memory private pool in XPU device allocator (#166831)
The implementation plan of MemPool for XPU, which is a dependency of [XPUGraph](https://github.com/pytorch/pytorch/pull/166285), following the [RFC](https://github.com/pytorch/pytorch/issues/162143).

- [ ] ->#166831
- [ ] #166833
- [ ] #166843

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166831
Approved by: https://github.com/EikanWang, https://github.com/gujinghui

Co-authored-by: Eikan Wang <eikan.wang@intel.com>
2025-11-17 11:11:23 +00:00
99117c1238 Remove old NVTX interface (#167637)
PR #167401 reminded me that the removal of the old NVTX interface is long overdue, as the header-only NVTX3 has been around for more than 5 years and ships with all CUDA Toolkit 12+ versions. In addition, `libnvToolsExt.so` was removed in CUDA Toolkit 13 and onward.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167637
Approved by: https://github.com/eqy
2025-11-17 08:07:20 +00:00
b9bccec3bc Revert "[ATen][CUDA] Add sm_121a flag for RowwiseScaledMM (#167734)"
This reverts commit 226850cc66217e591c706397dd212b457ed61e22.

Reverted https://github.com/pytorch/pytorch/pull/167734 on behalf of https://github.com/Aidyn-A due to fails on CUDA 12.8 ([comment](https://github.com/pytorch/pytorch/pull/167734#issuecomment-3540410067))
2025-11-17 07:56:28 +00:00
ca3aaef66e Fix clamp broadcasting on MPS (Fixes #160734) (#165058)
This PR fixes a bug where `torch.clamp` on MPS fails when min/max tensors have more dimensions than the input tensor.
CPU already supports this broadcasting, but MPS raised a RuntimeError.

Example of failing case before the fix:
```python
x = torch.randn(2, 3, device="mps")
min_t = torch.randn(1, 2, 3, device="mps")
max_t = torch.randn(1, 2, 3, device="mps")
torch.clamp(x, min=min_t, max=max_t)  # RuntimeError
```
After this fix, MPS matches CPU behavior.

Fixes #160734

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165058
Approved by: https://github.com/malfet
2025-11-17 07:40:39 +00:00
f2e6f94081 deprecate check_is_size and guard_size_oblivious (#167198)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167198
Approved by: https://github.com/bobrenjc93
2025-11-17 05:47:40 +00:00
832624d323 Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-16 21:41:22 -08:00
41eb8d06cb Update base for Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-16 21:41:22 -08:00
aa504d4d2a [audio hash update] update the pinned audio hash (#167914)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167914
Approved by: https://github.com/pytorchbot
2025-11-17 05:21:29 +00:00
d8ce6f8df9 Enable PyTorch OSS numerics changes, inductor heuristics (#167799)
Test Plan: CI

Differential Revision: D86211542

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167799
Approved by: https://github.com/njriasan, https://github.com/eellison
2025-11-17 04:31:44 +00:00
4322354770 [Inductor] optimize scalar welford_reduce (#162709)
**Summary:**
Optimize the scalar welford_reduce implementation by combining the Welford algorithm with cascade summation to improve numerical stability. Specifically:

1. Use the Welford algorithm to compute mean and variance.
2. Use cascade summation when computing the sum over the input for both mean and variance.

**Example:**
Take https://github.com/pytorch/pytorch/issues/141541 as an example:
```
import torch
import torch.nn as nn
torch.manual_seed(0)

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.gn = nn.GroupNorm(num_groups=32, num_channels=32)

    def forward(self, x):
        return self.gn(x)

model = Model().eval()
x = torch.randn(1, 32, 128, 128, 128)

with torch.no_grad():
    output = model(x)
    with torch._inductor.config.patch({"cpp.simdlen": 0}):
        c_model = torch.compile(model)
        c_output = c_model(x)

print(torch.max(torch.abs(output - c_output)))
print(torch.allclose(output, c_output, 1.3e-6, 1e-5))
```
**logs**

- before
```
tensor(0.0005)
False
```
- After
```
tensor(1.4305e-06)
True
```

**Generated code:**
- before
```
cpp_fused_native_group_norm_0 = async_compile.cpp_pybinding(['float*', 'float*', 'const float*', 'const float*', 'const float*', 'float*'], '''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C"  void  kernel(float* in_out_ptr0,
                       float* in_out_ptr1,
                       const float* in_ptr0,
                       const float* in_ptr1,
                       const float* in_ptr2,
                       float* out_ptr2)
{
    auto out_ptr1 = in_out_ptr0;
    auto out_ptr0 = in_out_ptr1;
    {
        #pragma GCC ivdep
        for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L))
        {
            {
                Welford<float> tmp_acc0 = Welford<float>();
                Welford<float> tmp_acc0_arr[4];
                for (int i = 0; i < 4; i++)
                {
                    tmp_acc0_arr[i] = Welford<float>();
                }
                #pragma omp parallel num_threads(4)
                {
                    int tid = omp_get_thread_num();
                    Welford<float> tmp_acc0_local = Welford<float>();
                    #pragma omp for
                    for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2097152L); x1+=static_cast<int64_t>(1L))
                    {
                        {
                            {
                                auto tmp0 = in_ptr0[static_cast<int64_t>(x1 + 2097152L*x0)];
                                tmp_acc0_local = welford_combine(tmp_acc0_local, tmp0);
                            }
                        }
                    }
                    tmp_acc0_arr[tid] = tmp_acc0_local;
                }
                for (int tid = 0; tid < 4; tid++)
                {
                    tmp_acc0 = welford_combine(tmp_acc0, tmp_acc0_arr[tid]);
                }
                in_out_ptr1[static_cast<int64_t>(x0)] = tmp_acc0.mean;
                in_out_ptr0[static_cast<int64_t>(x0)] = tmp_acc0.m2;
            }
        }
    }
    {
        #pragma GCC ivdep
        for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L))
        {
            {
                {
                    auto tmp0 = out_ptr1[static_cast<int64_t>(x0)];
                    auto tmp6 = in_ptr1[static_cast<int64_t>(x0)];
                    auto tmp8 = out_ptr0[static_cast<int64_t>(x0)];
                    auto tmp11 = in_ptr2[static_cast<int64_t>(x0)];
                    auto tmp1 = static_cast<float>(2097152.0);
                    auto tmp2 = tmp0 / tmp1;
                    auto tmp3 = static_cast<float>(1e-05);
                    auto tmp4 = float(tmp2 + tmp3);
                    auto tmp5 = 1 / std::sqrt(tmp4);
                    auto tmp7 = float(tmp5 * tmp6);
                    auto tmp9 = decltype(tmp8)(-tmp8);
                    auto tmp10 = float(tmp9 * tmp7);
                    auto tmp12 = float(tmp10 + tmp11);
                    in_out_ptr0[static_cast<int64_t>(x0)] = tmp7;
                    in_out_ptr1[static_cast<int64_t>(x0)] = tmp12;
                }
            }
        }
    }
    #pragma omp parallel num_threads(4)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L))
            {
                #pragma GCC ivdep
                for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2097152L); x1+=static_cast<int64_t>(1L))
                {
                    {
                        {
                            auto tmp0 = in_ptr0[static_cast<int64_t>(x1 + 2097152L*x0)];
                            auto tmp1 = in_out_ptr0[static_cast<int64_t>(x0)];
                            auto tmp3 = in_out_ptr1[static_cast<int64_t>(x0)];
                            auto tmp2 = float(tmp0 * tmp1);
                            auto tmp4 = float(tmp2 + tmp3);
                            out_ptr2[static_cast<int64_t>(x1 + 2097152L*x0)] = tmp4;
                        }
                    }
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

class Runner:
    def __init__(self, partitions):
        self.partitions = partitions

    def recursively_apply_fns(self, fns):
        new_callables = []
        for fn, c in zip(fns, self.partitions):
            new_callables.append(fn(c))
        self.partitions = new_callables

    def call(self, args):
        arg0_1, arg1_1, arg2_1 = args
        args.clear()
        assert_size_stride(arg0_1, (32, ), (1, ))
        assert_size_stride(arg1_1, (32, ), (1, ))
        assert_size_stride(arg2_1, (1, 32, 128, 128, 128), (67108864, 2097152, 16384, 128, 1))
        buf0 = empty_strided_cpu((1, 32, 1, 1), (32, 1, 32, 32), torch.float32)
        buf1 = empty_strided_cpu((1, 32, 1, 1), (32, 1, 32, 32), torch.float32)
        buf3 = reinterpret_tensor(buf1, (1, 32, 1, 1), (32, 1, 1, 1), 0); del buf1  # reuse
        buf4 = reinterpret_tensor(buf0, (1, 32, 1, 1), (32, 1, 1, 1), 0); del buf0  # reuse
        buf5 = empty_strided_cpu((1, 32, 128, 128, 128), (67108864, 2097152, 16384, 128, 1), torch.float32)
        # [Provenance debug handles] cpp_fused_native_group_norm_0:1
        cpp_fused_native_group_norm_0(buf3, buf4, arg2_1, arg0_1, arg1_1, buf5)
        del arg0_1
        del arg1_1
        del arg2_1
        return (buf5, )
```

- After
```
cpp_fused_native_group_norm_0 = async_compile.cpp_pybinding(['float*', 'float*', 'const float*', 'const float*', 'const float*', 'float*'], '''
#include <torch/csrc/inductor/cpp_prefix.h>
extern "C"  void  kernel(float* in_out_ptr0,
                       float* in_out_ptr1,
                       const float* in_ptr0,
                       const float* in_ptr1,
                       const float* in_ptr2,
                       float* out_ptr2)
{
    auto out_ptr1 = in_out_ptr0;
    auto out_ptr0 = in_out_ptr1;
    {
        #pragma GCC ivdep
        for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L))
        {
            {
                Welford<float> tmp_acc0 = Welford<float>();
                Welford<float> tmp_acc0_arr[4];
                for (int i = 0; i < 4; i++)
                {
                    tmp_acc0_arr[i] = Welford<float>();
                }
                #pragma omp parallel num_threads(4)
                {
                    int tid = omp_get_thread_num();
                    WelfordHelper<float, float, 4096> scalar_welford_helper0(static_cast<int64_t>(524288L));
                    Welford<float> tmp_acc0_local = Welford<float>();
                    #pragma omp for
                    for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2097152L); x1+=static_cast<int64_t>(1L))
                    {
                        {
                            {
                                auto tmp0 = in_ptr0[static_cast<int64_t>(x1 + 2097152L*x0)];
                                tmp_acc0_local = welford_combine(tmp_acc0_local, tmp0, &scalar_welford_helper0);
                            }
                        }
                    }
                    tmp_acc0_local = welford_combine(tmp_acc0_local, &scalar_welford_helper0);
                    tmp_acc0_arr[tid] = tmp_acc0_local;
                }
                for (int tid = 0; tid < 4; tid++)
                {
                    tmp_acc0 = welford_combine(tmp_acc0, tmp_acc0_arr[tid]);
                }
                in_out_ptr1[static_cast<int64_t>(x0)] = tmp_acc0.mean;
                in_out_ptr0[static_cast<int64_t>(x0)] = tmp_acc0.m2;
            }
        }
    }
    {
        #pragma GCC ivdep
        for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L))
        {
            {
                {
                    auto tmp0 = out_ptr1[static_cast<int64_t>(x0)];
                    auto tmp6 = in_ptr1[static_cast<int64_t>(x0)];
                    auto tmp8 = out_ptr0[static_cast<int64_t>(x0)];
                    auto tmp11 = in_ptr2[static_cast<int64_t>(x0)];
                    auto tmp1 = static_cast<float>(2097152.0);
                    auto tmp2 = tmp0 / tmp1;
                    auto tmp3 = static_cast<float>(1e-05);
                    auto tmp4 = float(tmp2 + tmp3);
                    auto tmp5 = 1 / std::sqrt(tmp4);
                    auto tmp7 = float(tmp5 * tmp6);
                    auto tmp9 = decltype(tmp8)(-tmp8);
                    auto tmp10 = float(tmp9 * tmp7);
                    auto tmp12 = float(tmp10 + tmp11);
                    in_out_ptr0[static_cast<int64_t>(x0)] = tmp7;
                    in_out_ptr1[static_cast<int64_t>(x0)] = tmp12;
                }
            }
        }
    }
    #pragma omp parallel num_threads(4)
    {
        int tid = omp_get_thread_num();
        {
            #pragma omp for
            for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(32L); x0+=static_cast<int64_t>(1L))
            {
                #pragma GCC ivdep
                for(int64_t x1=static_cast<int64_t>(0L); x1<static_cast<int64_t>(2097152L); x1+=static_cast<int64_t>(1L))
                {
                    {
                        {
                            auto tmp0 = in_ptr0[static_cast<int64_t>(x1 + 2097152L*x0)];
                            auto tmp1 = in_out_ptr0[static_cast<int64_t>(x0)];
                            auto tmp3 = in_out_ptr1[static_cast<int64_t>(x0)];
                            auto tmp2 = float(tmp0 * tmp1);
                            auto tmp4 = float(tmp2 + tmp3);
                            out_ptr2[static_cast<int64_t>(x1 + 2097152L*x0)] = tmp4;
                        }
                    }
                }
            }
        }
    }
}
''')

async_compile.wait(globals())
del async_compile

class Runner:
    def __init__(self, partitions):
        self.partitions = partitions

    def recursively_apply_fns(self, fns):
        new_callables = []
        for fn, c in zip(fns, self.partitions):
            new_callables.append(fn(c))
        self.partitions = new_callables

    def call(self, args):
        arg0_1, arg1_1, arg2_1 = args
        args.clear()
        assert_size_stride(arg0_1, (32, ), (1, ))
        assert_size_stride(arg1_1, (32, ), (1, ))
        assert_size_stride(arg2_1, (1, 32, 128, 128, 128), (67108864, 2097152, 16384, 128, 1))
        buf0 = empty_strided_cpu((1, 32, 1, 1), (32, 1, 32, 32), torch.float32)
        buf1 = empty_strided_cpu((1, 32, 1, 1), (32, 1, 32, 32), torch.float32)
        buf3 = reinterpret_tensor(buf1, (1, 32, 1, 1), (32, 1, 1, 1), 0); del buf1  # reuse
        buf4 = reinterpret_tensor(buf0, (1, 32, 1, 1), (32, 1, 1, 1), 0); del buf0  # reuse
        buf5 = empty_strided_cpu((1, 32, 128, 128, 128), (67108864, 2097152, 16384, 128, 1), torch.float32)
        # [Provenance debug handles] cpp_fused_native_group_norm_0:1
        cpp_fused_native_group_norm_0(buf3, buf4, arg2_1, arg0_1, arg1_1, buf5)
        del arg0_1
        del arg1_1
        del arg2_1
        return (buf5, )
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162709
Approved by: https://github.com/CaoE, https://github.com/jansel
2025-11-17 02:52:33 +00:00
363385ad3e s/Stragety/Strategy/ (#167916)
Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167916
Approved by: https://github.com/Skylion007
2025-11-16 19:47:23 +00:00
e2e10753d7 Allow same triton kernels in export (#167862)
Summary: This diff is a follow-up to D85883723.

Test Plan:
See D86719598. We are now able to publish the model.

Unit test:
```
buck run fbcode//mode/opt -c remoteexecution.local=enabled fbcode//sigmoid/inference/test:test_passes -m ovr_config//triton:experimental -- -r test_triton_hop_cpu
```

Differential Revision: D87091238

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167862
Approved by: https://github.com/XueningXu
2025-11-16 17:51:23 +00:00
5d99a795f5 [xpu][test] Migrated two test files to XPU (#166684)
# Description
Fixes #114850. We port the test utils and schema check to Intel GPU.
We enable Intel GPU with the following methods while keeping the original code style as much as possible:

# Changes
1. Get the device type with the accelerator API and the get_devtype helper method.
2. Replace the requires-CUDA statements with device_type.
3. Add HAS_XPU and HAS_GPU checks to replace some of the existing HAS_CUDA-style checks (a device-generic sketch follows this list).
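A sketch of the device-generic pattern, assuming `torch.accelerator` is available; the PR itself relies on the shared test helpers (get_devtype etc.) rather than this exact code:

```python
import torch

# Resolve the available accelerator ("cuda", "xpu", ...) instead of
# hard-coding CUDA in the test body.
acc = torch.accelerator.current_accelerator()
device_type = acc.type if acc is not None else "cpu"

x = torch.randn(4, 4, device=device_type)
y = (x @ x).sum()
print(device_type, y.item())
```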


Pull Request resolved: https://github.com/pytorch/pytorch/pull/166684
Approved by: https://github.com/ezyang, https://github.com/guangyey

Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>
2025-11-16 14:15:28 +00:00
2245d7d3b9 Improve char printing (#167899)
This PR outputs chars to the stream without building temporary strings.
The files were modified with (in the fish shell)
```
sed  -i -e 's/<< "\([^\\\']\)"/<< \'\1\'/g' (grep '<< "."' -r torch c10 aten -l)
```
and then reverting some invalid changes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167899
Approved by: https://github.com/Skylion007
2025-11-16 07:19:16 +00:00
98b94b90dd [pallas backend] implement gpu tiles/mask for power of 2 (#167584)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167584
Approved by: https://github.com/jansel
2025-11-16 07:01:51 +00:00
5cdbda140c [vision hash update] update the pinned vision hash (#167890)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167890
Approved by: https://github.com/pytorchbot
2025-11-16 04:58:47 +00:00
0ec53beaeb Refactor TensorAccessor for headeronly. (#166855)
This PR moves the implementations of Tensor accessor classes to headeronly with the following modifications:
- Add ArrayRef and IndexBoundsCheck template parameters to refactor out the usages of `IntArrayRef` and `TORCH_CHECK_INDEX` from Tensor accessor implementations.
- Eliminate usage of `c10::irange` as it is not headeronly-compatible.
- Introduce `torch::headeronly::{TensorAccessorBase,TensorAccessor, GenericPackedTensorAccessorBase, GenericPackedTensorAccessor}` that are headeronly-equivalent to `at::{TensorAccessorBase,TensorAccessor, GenericPackedTensorAccessorBase, GenericPackedTensorAccessor}`. Both these sets of template classes use original implementations from `torch::headeronly::detail` that have new template parameters `ArrayRefCls` and `IndexBoundsCheck` to facilitate `at` and `torch::headeronly` implementations of ArrayRef and checking indices.

TODO:
- ~when https://github.com/pytorch/pytorch/pull/164991 lands, eliminate the placeholder class HeaderOnlyArrayRef~ UPDATE: done.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166855
Approved by: https://github.com/janeyx99
2025-11-15 22:37:24 +00:00
79fc0a9141 [xpu][fix]Fall back deterministic index_copy to index_put on XPU (#167830)
A minor update has been made to the deterministic behavior checks in the `index_copy_out` implementation. This change ensures that deterministic  `index_copy` is dispatched to `index_put` not only for CUDA tensors but also for XPU tensors.
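A minimal sketch of the behavior being covered, assuming an XPU build; with deterministic algorithms enabled, `index_copy_` now takes the `index_put`-based path on XPU as it already did on CUDA:

```python
import torch

torch.use_deterministic_algorithms(True)

device = "xpu" if torch.xpu.is_available() else "cpu"
dst = torch.zeros(4, 5, device=device)
src = torch.ones(4, 2, device=device)
idx = torch.tensor([0, 3], device=device)

# Deterministic index_copy_ is dispatched to the index_put-based kernel.
dst.index_copy_(1, idx, src)
print(dst)
```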

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167830
Approved by: https://github.com/guangyey, https://github.com/ezyang
2025-11-15 18:09:25 +00:00
d01a7b0241 Back out "MatMal - fix folding logic" (#167884)
Summary:
For specific hardware (A100), Autocast will generate a relatively large error on Transformer (torch.nn.TransformerEncoder) when using the no_grad decorator with dim=256 (and larger, presumably).

H100 seems fine, as does A100 with MIG (so fewer than the full SMs).

For now, backing out and revisiting next week.

Test Plan:
failed jobs:
https://fburl.com/scuba/remote_execution_action/jzcmujgk

 {F1983543613}

Reviewed By: t-ivan-gr

Differential Revision: D87111518

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167884
Approved by: https://github.com/malfet
2025-11-15 08:29:08 +00:00
deabb3e36d Use c10::filesystem (#167821)
This PR fixes code to use c10::filesystem functionality instead of manually implemented functions.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167821
Approved by: https://github.com/Skylion007
2025-11-15 06:01:01 +00:00
79d2397b6b Fix grammar issues in C++ frontend documentation (#167702)
Corrected minor grammatical errors in the documentation.

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167702
Approved by: https://github.com/jerryzh168
2025-11-15 05:55:08 +00:00
6ef3a62c36 Fix typo in FP16 accumulation section (#167703)
Fix typo error
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167703
Approved by: https://github.com/jerryzh168
2025-11-15 05:34:07 +00:00
530e782239 [codemod][lowrisk] Remove unused exception parameter from caffe2/torch/csrc/jit/backends/coreml/objc/PTMCoreMLBackend.mm (#167604)
Summary:
`-Wunused-exception-parameter` has identified an unused exception parameter. This diff removes it.

This:
```
try {
    ...
} catch (exception& e) {
    // no use of e
}
```
should instead be written as
```
} catch (exception&) {
```

If the code compiles, this is safe to land.

Test Plan: Sandcastle

Differential Revision: D85813836

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167604
Approved by: https://github.com/malfet, https://github.com/seemethere
2025-11-15 05:17:48 +00:00
c66a6c432e [HOP][print] Add functionalization (make sure ordering) for print (#167016)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167016
Approved by: https://github.com/angelayi
2025-11-15 05:06:05 +00:00
3d7a8b7e61 MPS: Fix clamp scalar cache key to store floats in hex representation (#167777)
Fixes #167767.

The original issue was that using std::to_string(value) does not work as intended here if the value is smaller than 1e-6. The caching keys ended up as `clamp_out_mps_min:0.000000_scalar::f32[1]` instead of `clamp_out_mps_min:0.0000001_scalar::f32[1]`. After the change the values are stored as the hex representation of the floating point number, so for min_value 1e-7 the key will be `impl_min:0x1.ad7f2ap-24_scalar::f32[1]` and for min_value 0.0 `clamp_out_mps_min:0x0p+0_scalar::f32[1]`.

Output of the repro code before the change:

```
tensor([0.], device='mps:0')
tensor([0.], device='mps:0')
tensor([0.], device='mps:0')
tensor([0.], device='mps:0')
tensor([0.], device='mps:0')
tensor([1.0000e-07], device='mps:0')
tensor([0.], device='mps:0')
tensor([1.0000e-07], device='mps:0')
```

Output for the repro code after the change:

```
tensor([0.], device='mps:0')
tensor([1.0000e-07], device='mps:0')
tensor([0.], device='mps:0')
tensor([1.0000e-07], device='mps:0')
tensor([0.], device='mps:0')
tensor([1.0000e-07], device='mps:0')
tensor([0.], device='mps:0')
tensor([1.0000e-07], device='mps:0')
```
which matches the expected CPU reference.

Snippet to test with:
```
import torch

device='mps'
dtype=torch.float32
a = torch.zeros(1, device=device, dtype=dtype)

# the following line triggers the incorrect behavior; when commented out, the remainder of the script appears to work as expected
a_clamped = a.clamp(min=0.0)

b = torch.zeros(1, device=device)
print(b)
c = b.clamp(min=1e-7)
print(c)

b = torch.zeros(1, device=device)
print(b)
c = b.clamp(min=1e-7, max=None)
print(c)

b = torch.zeros(1, device=device)
print(b)
c = b.clamp(min=1e-7, max=torch.inf)
print(c)

b = torch.zeros(1, device=device)
print(b)
c = b.clamp_min(1e-7)
print(c)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167777
Approved by: https://github.com/malfet
2025-11-15 03:26:38 +00:00
de0d69b2c4 Remove useless super() delegation (#167791)
This PR removes useless super() delegations detected by pylint.
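For reference, the pattern being removed looks like this (a generic sketch, not code from a specific file):

```python
class Base:
    def reset(self) -> None:
        print("reset")

class Child(Base):
    # Useless delegation: this override adds nothing and can be deleted,
    # since Python falls back to Base.reset automatically.
    def reset(self) -> None:
        super().reset()
```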

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167791
Approved by: https://github.com/albanD
2025-11-15 02:50:51 +00:00
bc60b86066 Skip stable diffusion models in torchbench, get tests and benchmarks green (#167896)
Test Plan:
- wait for CI

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167896
Approved by: https://github.com/aorenste, https://github.com/shunting314
ghstack dependencies: #167609
2025-11-15 02:44:36 +00:00
d7782ddde7 [ATEN][CUDA] Reduce register pressure introduced by CUDA_KERNEL_ASSERT to improve torch.EmbeddingBag performance (#167834)
# Summary

This PR optimizes the CUDA kernels for `torch.nn.EmbeddingBag` by reducing the GPU register pressure introduced by `CUDA_KERNEL_ASSERT`, which improves kernel occupancy and overall performance. The optimization separates input validation into a dedicated loop before the main processing loop, allowing the compiler to better optimize register allocation. Extensive testing on various GPUs and CUDA versions shows that `torch.nn.EmbeddingBag` performance improves by 29% to 111% with this PR.

# Performance Results

The following table shows the performance improvements on various input distributions and GPUs. All benchmarks use PyTorch 2.9.0 compiled with CUDA 12.8.

**Input Distribution Types (simulating recommendation system ID patterns):**
- **random id**: Randomly sampled embedding indices from the full vocabulary (uniform distribution)
- **one-hot**: One ID appears with very high frequency across all bags, simulating a popular item in recommendation systems
- **multi-hot**: Multiple IDs appear with high frequency across all bags, simulating multiple popular items in recommendation systems

**Test Configuration:**
- Embedding shape: `(5000000, 128)` (5M vocabulary size, 128-dimensional embeddings)
- Batch size: 2048 bags
- Average bag size: 150 indices per bag

| GPU  | Input Distribution | Before (µs) | After (µs) | Speedup |
| ---- | ------------------ | ----------- | ---------- | ------- |
| H100 | random id          | 162.4       | 105.9      | 1.53×   |
| H100 | one-hot            | 120.4       | 88.6       | 1.36×   |
| H100 | multi-hot          | 113.1       | 87.8       | 1.29×   |
| H20  | random id          | 278.6       | 132.2      | 2.11×   |
| H20  | one-hot            | 189.7       | 110.3      | 1.72×   |
| H20  | multi-hot          | 172.4       | 107.4      | 1.61×   |

# Motivation

The original implementation performed bounds checking using `CUDA_KERNEL_ASSERT` inline within the main processing loop, which increased register pressure and limited GPU occupancy. From NSight Compute analysis on H20, using PyTorch 2.9 compiled with CUDA 12.8, removing the `CUDA_KERNEL_ASSERT` from the main loop with this PR increases the overall occupancy from 50% to 75%(registers per thread 52->40).

By separating validation into a dedicated loop, we:

1. **Reduce register pressure in the main loop**: The validation loop uses minimal registers, allowing the compiler to optimize the main processing loop independently with better register allocation.
2. **Maintain correctness**: All input validation is still performed, but in a more register-efficient manner.

# Changes

## Modified Kernels

1. **`EmbeddingBag_updateOutputKernel_max`**: Added separate validation loop before main processing
2. **`EmbeddingBag_updateOutputKernel_sum_mean`**: Added separate validation loop before main processing

## Key Implementation Details

- **Separate validation loop**: Input indices are validated in a dedicated loop that checks all indices before processing begins
- **No early exit**: The validation loop intentionally avoids using `break` for early exit, as benchmarking showed that early exit degrades performance, possibly due to increased branch divergence and reduced instruction-level parallelism
- **Consistent error messages**: Improved error message clarity for invalid input indices
- **Design choice: validation loop vs. separate kernel**: We considered removing `CUDA_KERNEL_ASSERT` entirely and performing bounds checking in a separate GPU kernel, which would achieve even better performance (e.g., on H20 with random id distribution: 132.2 µs → 124.6 µs). However, this approach is harder to maintain as it requires coordinating two separate kernel launches and managing additional kernel launch overhead. Instead, we chose the current approach of using a separate validation loop within the same kernel, which provides a good balance between performance improvement and code maintainability.

## Code Changes

```cpp
// Separate validation loop reduces register pressure in the main loop below.
// No early exit (break) on invalid input as benchmarking shows it degrades performance.
bool has_invalid_index = false;
for (int64_t emb = begin; emb < end; emb++) {
  index_t input_idx = input[emb];
  has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows);
}
CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)");

// Main processing loop (now with reduced register pressure)
for (int64_t emb = begin; emb < end; emb++) {
  // ... processing logic ...
}
```

# Testing & Compatibility

## Performance Testing

I conducted extensive performance testing across multiple configurations. All tests show significant performance improvements:

**Tested CUDA Versions:**
- CUDA 12.6, 12.8, 13.0

**Tested GPU Architectures:**
- A100, H20, H100

**Tested Input Configurations:**
- **Embedding shapes**: Various sizes including `[5000000, 128]` and `[128000, 4096]`
- **Embedding dtypes**: `torch.float32`, `torch.float16`
- **Input distributions**: Random indices, one-hot (high-frequency single ID), and multi-hot (high-frequency multiple IDs) patterns, simulating recommendation system workloads
- **Input sizes**: Average bag sizes of 150, 20, and 10 indices per bag

## Correctness Testing

-  Correctness tests pass for various embedding types (bfloat16, float32), shapes, and input distributions
-  Register usage reduction verified with NSight Compute
-  Linter passes

## Compatibility

-  No API/ABI changes

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167834
Approved by: https://github.com/ngimel, https://github.com/eqy
2025-11-15 02:03:38 +00:00
fb04e9ad03 [CUDA][CUDA Graphs] Respect node-priority in cudaGraphInstantiate (#167346)
Needed for e.g., stream priority-based implementations of comm-compute overlap

CC @galv

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167346
Approved by: https://github.com/ngimel
2025-11-15 01:59:36 +00:00
cfe799b4aa Revert "Ops convolution_backward optional flag bug (#165008)"
This reverts commit c429b1fc5c60a6819b041f1a881ab09735689fbe.

Reverted https://github.com/pytorch/pytorch/pull/165008 on behalf of https://github.com/clee2000 due to I think this broke some tests in the slow workflow? test/test_ops.py::TestCommonCUDA::test_compare_cpu_convolution_backward_cuda_float32 [GH job link](https://github.com/pytorch/pytorch/actions/runs/19375318020/job/55443680773) [HUD commit link](c429b1fc5c) ([comment](https://github.com/pytorch/pytorch/pull/165008#issuecomment-3535354672))
2025-11-15 01:50:09 +00:00
b7f52773e6 Add meta registration for scaled_mm_v2 and test (#167653)
Summary:

`torch._scaled_mm_v2` didn't have a valid meta registration, or
`FakeTensor` tests, so anything expecting inductor to work (like
torch.ao tests) would fail horribly.

Test Plan:

```
pytest -sv -k "scaled_mm_v2" test/test_ops.py
```

Reviewers:

Subscribers:

Tasks:

Tags:
Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167653
Approved by: https://github.com/drisspg
2025-11-15 01:21:04 +00:00
f6b54d8899 flight_recorder: move to torch.distributed (#167782)
Summary: This moves torchfrtrace to be under `torch.distributed.flight_recorder` instead of `tools.flight_recorder`, as the `tools` package is not included in the torch wheels. This makes it possible to run fr trace analysis without a source checkout.

Test Plan:
```
buck run //caffe2/fb/flight_recorder:fr_trace
```

CI

Differential Revision: D87022129

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167782
Approved by: https://github.com/fduwjj
2025-11-15 01:16:59 +00:00
da91bf5262 Fix incorrect attention example in ONNX exporter docstring (#167646)
Fixes #167627

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167646
Approved by: https://github.com/malfet, https://github.com/titaiwangms

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-11-15 00:31:48 +00:00
1c1638297e Revert "distributed/debug: add an HTTP server for debugging running jobs (#167395)"
This reverts commit 4ed26f7382bc3e5217121f5085af070e57f2ef40.

Reverted https://github.com/pytorch/pytorch/pull/167395 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/167395#issuecomment-3535150292))
2025-11-15 00:25:51 +00:00
ee0b5b4b1c Add new CI jobs to run dynamo tests on all python versions supported (#166978)
This PR adds 2 new CI jobs to run dynamo core (`test/dynamo/*`) and
`dynamo_wrapped` tests on Python 3.11/3.12.

**Selected Machine**
Tests are executed on `linux.c7i.2xlarge` without a GPU, which means all CUDA tests (if any) are skipped.

**Runtime**
- The core tests take about 30 minutes to run
- The `dynamo_wrapped` tests are divided into three shards, and each shard
  takes around 1.5 hours to execute

**Schedule**
Tests are executed every day at 1:29 PDT, or whenever the `ciflow/dynamo` label is present.

Co-authored-by: Rob Timpe <rtimpe@openteams.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166978
Approved by: https://github.com/atalman, https://github.com/malfet
ghstack dependencies: #167092
2025-11-15 00:15:36 +00:00
fcfb213c5a [inductor] layout constraint for weight-norm-bwd (#167667)
fix https://github.com/pytorch/pytorch/issues/165749

The weight_norm backward kernel requires its inputs to be contiguous, so add that constraint to the lowering/fallback rule.

A better fix might be to add a decomposition rule for the op, but since we already fall back, this change does no harm and fixes the attached issue.
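A hedged sketch of how such a constraint is typically registered, assuming the internal helpers `add_layout_constraint` and `require_contiguous` in `torch._inductor.lowering` keep their current form (the op name below is for illustration only, not necessarily the exact one touched by this PR):

```python
import torch
# Internal, unstable inductor helpers (assumed to exist in this form).
from torch._inductor.lowering import add_layout_constraint, require_contiguous

aten = torch.ops.aten

# Require contiguous inputs for the weight-norm backward fallback so the
# underlying kernel's contiguity assumption is always satisfied.
add_layout_constraint(aten._weight_norm_interface_backward, require_contiguous)
```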

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167667
Approved by: https://github.com/eellison
2025-11-14 23:59:24 +00:00
08042bbb9c [6/N] Use Python 3.10 typing (#167649)
This PR applies the new union typing syntax (PEP 604) to some Python files.
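A minimal before/after illustration of the syntax change (hypothetical function, not taken from this PR):

```python
from typing import Optional, Union

# Before: typing.Union / typing.Optional
def clamp_old(x: Union[int, float], limit: Optional[float] = None) -> Union[int, float]:
    return x if limit is None else min(x, limit)

# After (Python 3.10+): PEP 604 union syntax
def clamp_new(x: int | float, limit: float | None = None) -> int | float:
    return x if limit is None else min(x, limit)
```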

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167649
Approved by: https://github.com/albanD
2025-11-14 23:55:08 +00:00
e20ca3bc2e Remove python workaround for ContextDecorator (#167049)
This PR removes the import workaround for ContextDecorator because the import always succeeds in Py 3.10+.
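An illustrative sketch of the kind of guarded import being removed; the fallback branch is hypothetical, not the exact code in the PR:

```python
# Before: guarded import, with a fallback for environments where the name
# might be missing (fallback shown here only for illustration).
try:
    from contextlib import ContextDecorator
except ImportError:
    class ContextDecorator:  # minimal stand-in
        def __call__(self, func):
            return func

# After: with Python 3.10+ as the minimum supported version, the plain import
# always succeeds.
from contextlib import ContextDecorator
```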

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167049
Approved by: https://github.com/Skylion007
2025-11-14 23:54:52 +00:00
4ed26f7382 distributed/debug: add an HTTP server for debugging running jobs (#167395)
This adds a debug HTTP server for debugging stuck or slow jobs. It runs the WorkerServer on every worker and then launches a separate Flask process on rank 0 that users can connect to for debugging.

This can easily be extended to trigger profilers as well as to visualize the data much better.

Initial handlers:
* pytorch profiler
* FlightRecorder data
* Python stacks

```
os.environ["TORCH_NCCL_TRACE_BUFFER_SIZE"] = "2000"

from torch.distributed.debug import enable_debug_server

enable_debug_server()
```

Test plan:

```
torchrun --nnodes 1 --nproc_per_node=gpu ~/scripts/debug_test.py
```

<img width="1499" height="1629" alt="20251107_17h10m47s_grim" src="https://github.com/user-attachments/assets/a8b9a0cb-3bbf-4558-be12-5253e418214e" />
<img width="1192" height="1337" alt="20251107_17h10m39s_grim" src="https://github.com/user-attachments/assets/ac5d7011-4acb-4401-bf2c-f9b22c1466bd" />

<img width="984" height="851" alt="20251107_18h35m38s_grim" src="https://github.com/user-attachments/assets/98b3eb31-ed01-4345-90dd-c79345cf82ce" />
<img width="2880" height="777" alt="20251107_18h35m31s_grim" src="https://github.com/user-attachments/assets/8de84b8b-9d06-4bc8-a1bf-280a2958315b" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167395
Approved by: https://github.com/fduwjj
2025-11-14 23:14:38 +00:00
4c79305b87 [targets2buck] Clean up get_pt_ops_deps (#167690)
Summary: I didn't understand what this macro was doing so I created a bit of a mess, mess be gone!

Test Plan: `buck2 ctargets fbcode//caffe2/... fbsource//xplat/caffe2/...`

Reviewed By: mzlee

Differential Revision: D86460608

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167690
Approved by: https://github.com/seemethere
2025-11-14 23:05:24 +00:00
f4b8c4f907 backed size oblivious checks for expand() (#167689)
Summary:
Support semantics when using backed_size_oblivious, similar to https://github.com/pytorch/pytorch/pull/167232

We see errors in a model exported with dynamic shapes, like
```
RuntimeError: non-broadcasting semantics require s67 == 41

While executing %expand : [num_users=1] = call_method[target=expand](args = (%reshape_5, -1, -1, %getitem_9), kwargs = {})
```
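For context, a small eager-mode illustration of `expand` broadcasting semantics (unrelated to the exact exported model):

```python
import torch

x = torch.randn(4, 1, 8)
y = x.expand(-1, 41, -1)   # ok: -1 keeps a size, and the size-1 dim broadcasts to 41

z = torch.randn(4, 3, 8)
# z.expand(-1, 41, -1) raises, since dim 1 has size 3 and cannot broadcast;
# with symbolic sizes this surfaces as "non-broadcasting semantics require s67 == 41".
```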

Test Plan:
test_dynamic_shapes:
```
test_backed_size_oblivious_expand (test_dynamic_shapes.TestUbackedOps) ... I1112 14:07:54.724596 1386932 Logger.cpp:995] Dropping logs in unit tests.
ok
```

Differential Revision: D86902546

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167689
Approved by: https://github.com/laithsakka
2025-11-14 22:31:28 +00:00
d629b7a459 Move CppTypeToScalarType to torch/headeronly (#167610)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167610
Approved by: https://github.com/pearu, https://github.com/janeyx99
2025-11-14 22:21:45 +00:00
0922ba5f42 [BE] No need to pass const enum values by reference (#167868)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167868
Approved by: https://github.com/slayton58
2025-11-14 21:56:19 +00:00
c87295c044 [precompile] Support captured global tensors. (#167846)
Summary:
In vLLM we saw cases where users initialize a tensor in the global scope and reference it in the forward body. This should be supported by pruning the used globals in the scope and serializing them alongside the artifacts, similar to how we handle closures.

Use case example: https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/gemma3n.py#L65
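A hypothetical example of the pattern (not the linked vLLM code):

```python
import torch

# A tensor created at module (global) scope and referenced inside forward();
# precompile serializes this captured global alongside the compiled artifact.
_GLOBAL_SCALE = torch.ones(8)

class Scaled(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * _GLOBAL_SCALE
```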

Test Plan:
test_aot_compile.py

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167846
Approved by: https://github.com/jamesjwu
2025-11-14 21:40:07 +00:00
7aa210d215 Revert "[CodeClean] Remove the Unused MACRO for AOT Inductor Runtime (#165139)"
This reverts commit fcd5f8c352b5b75bd32e57fa044ec5df095032da.

Reverted https://github.com/pytorch/pytorch/pull/165139 on behalf of https://github.com/jeanschmidt due to trying to revert in the hopes it fixes internal errors, will land it back ([comment](https://github.com/pytorch/pytorch/pull/165139#issuecomment-3534662138))
2025-11-14 21:35:37 +00:00
5a368b8010 Revert "[CodeClean] Replace std::runtime_error with TORCH_CHECK (#165119)"
This reverts commit 398775a43e9808205f75c81d36f5087117d3f3f4.

Reverted https://github.com/pytorch/pytorch/pull/165119 on behalf of https://github.com/jeanschmidt due to trying to revert in the hopes it fixes internal errors, will land it back ([comment](https://github.com/pytorch/pytorch/pull/165139#issuecomment-3534662138))
2025-11-14 21:35:37 +00:00
602102be50 Revert "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined (#167496)"
This reverts commit bc09a84150eaadaadab8a8ecd76cd9afc60d8a19.

Reverted https://github.com/pytorch/pytorch/pull/167496 on behalf of https://github.com/jeanschmidt due to trying to revert 165139, my intention is to land it again, so, will land this once both are reverted ([comment](https://github.com/pytorch/pytorch/pull/167496#issuecomment-3534641209))
2025-11-14 21:33:02 +00:00
200156e385 DTensor: avoid unnecessary DTensorSpec creation in _ToTorchTensor.backward (#167588)
Looks like the check here is cheap and has a potentially large payoff.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167588
Approved by: https://github.com/ezyang
2025-11-14 21:08:12 +00:00
12c6afc277 Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-11 21:09:08 -08:00
fbd00b3e65 Update base for Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-11 21:09:08 -08:00
90ce0be3a7 Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-11 16:29:20 -08:00
ddad4d9b18 Update base for Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-11 16:29:20 -08:00
f548e45157 Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-11 15:10:20 -08:00
85a00f50dc Update base for Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-11 15:10:20 -08:00
23b50e1065 Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-11 15:09:11 -08:00
4addbaba17 Update base for Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-11 15:09:11 -08:00
54510466a0 Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
Fixes https://github.com/pytorch/pytorch/issues/161660

This extends the `TORCH_STABLE_ONLY` stopgap added in https://github.com/pytorch/pytorch/pull/161658




[ghstack-poisoned]
2025-11-11 10:01:21 -08:00
c500c5cc93 Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
[ghstack-poisoned]
2025-11-11 09:15:52 -08:00
4d85071f1d Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
[ghstack-poisoned]
2025-11-11 09:12:05 -08:00
2b1708609d Update on "Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined"
[ghstack-poisoned]
2025-11-11 09:04:29 -08:00
a2b5f4320f Hide all symbols (except stable/headeronly/shim) if TORCH_STABLE_ONLY is defined
[ghstack-poisoned]
2025-11-10 16:58:39 -08:00
7103d58279 Fix use of TORCH_CHECK in torch/csrc/stable
[ghstack-poisoned]
2025-11-10 16:58:33 -08:00
51c15af7bf Update on "Add shim for at::get_num_threads"
[ghstack-poisoned]
2025-11-10 10:22:05 -08:00
0d2c65457a Update base for Update on "Add shim for at::get_num_threads"
[ghstack-poisoned]
2025-11-10 10:22:05 -08:00
1189960f9c Add shim for at::get_num_threads
[ghstack-poisoned]
2025-11-07 13:17:50 -08:00
436bb0c6e2 Update on "Redo add parallel_for to torch/csrc/stable"
[ghstack-poisoned]
2025-11-07 11:56:22 -08:00
e26e312f0d Update base for Update on "Redo add parallel_for to torch/csrc/stable"
[ghstack-poisoned]
2025-11-07 11:56:22 -08:00
42eb3a9ba0 Update on "Redo add parallel_for to torch/csrc/stable"
[ghstack-poisoned]
2025-11-07 11:44:47 -08:00
8beba739c9 Update base for Update on "Redo add parallel_for to torch/csrc/stable"
[ghstack-poisoned]
2025-11-07 11:44:47 -08:00
b8cb398bd2 Update on "Redo add parallel_for to torch/csrc/stable"
[ghstack-poisoned]
2025-11-03 09:03:45 -08:00
3ee2e775fb Update base for Update on "Redo add parallel_for to torch/csrc/stable"
[ghstack-poisoned]
2025-11-03 09:03:45 -08:00
7cdbecbc73 Update on "Redo add parallel_for to torch/csrc/stable"
[ghstack-poisoned]
2025-10-31 16:04:35 -07:00
892b0ac820 Update base for Update on "Redo add parallel_for to torch/csrc/stable"
[ghstack-poisoned]
2025-10-31 16:04:35 -07:00
ff2d70b5a7 Update on "Redo add parallel_for to torch/csrc/stable"
[ghstack-poisoned]
2025-10-31 16:03:40 -07:00
bfb08d5e47 Update base for Update on "Redo add parallel_for to torch/csrc/stable"
[ghstack-poisoned]
2025-10-31 16:03:40 -07:00
88e0dcc0e4 Update on "Redo add parallel_for to torch/csrc/stable"
[ghstack-poisoned]
2025-10-31 15:46:50 -07:00
f2b96b4d53 Update base for Update on "Redo add parallel_for to torch/csrc/stable"
[ghstack-poisoned]
2025-10-31 15:46:50 -07:00
a5f95341a1 Redo add parallel_for to torch/csrc/stable
[ghstack-poisoned]
2025-10-30 21:48:22 -07:00
fcad503641 Add stable::Tensor.device()
[ghstack-poisoned]
2025-10-30 21:48:16 -07:00
21a4e3852e Update on "Add torch::stable::Device"
BC breaking note:

Prior to this PR, the IValue <-> StableIValue conversion for `DeviceObjType` (aka c10::Device) was to pack it into the leading bits of the StableIValue (which is a uint64_t).

After this PR, the IValue <-> StableIValue conversion for `DeviceObjType` expects a pointer to a DeviceHandle to be the StableIValue representation. c10::Device is represented as a torch::stable::Device in user extensions that use `STABLE_TORCH_LIBRARY`.





cc jbschlosser

[ghstack-poisoned]
2025-10-30 21:48:16 -07:00
40644633e5 Update on "Add torch::stable::Device"
BC breaking note:

Prior to this PR, the IValue <-> StableIValue conversion for `DeviceObjType` (aka c10::Device) was to pack it into the leading bits of the StableIValue (which is a uint64_t).

After this PR, the IValue <-> StableIValue conversion for `DeviceObjType` expects a pointer to a DeviceHandle to be the StableIValue representation. c10::Device is represented as a torch::stable::Device in user extensions.





cc jbschlosser

[ghstack-poisoned]
2025-10-30 10:49:10 -07:00
6d47b8897a Update on "Add torch::stable::Device"
BC breaking note:

Prior to this PR, the IValue <-> StableIValue conversion for `DeviceObjType` (aka c10::Device) was to pack it into the leading bits of the StableIValue (which is a uint64_t).

After this PR, the IValue <-> StableIValue conversion for `DeviceObjType` expects a pointer to a DeviceHandle to be the StableIValue representation. c10::Device is represented as a torch::stable::Device in user extensions.





cc jbschlosser

[ghstack-poisoned]
2025-10-30 10:25:21 -07:00
ef6b286307 Add torch::stable::Device
[ghstack-poisoned]
2025-10-29 15:23:59 -07:00
bc8ab7cca8 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-28 13:12:02 -07:00
070f5b6640 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-28 13:12:02 -07:00
f8eeb1bd26 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-28 10:54:52 -07:00
a7ae54752d Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-28 10:54:52 -07:00
102faffb43 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-28 10:34:28 -07:00
9f174af10c Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-28 10:34:28 -07:00
7d7025c2a4 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-27 21:58:27 -07:00
fe8d404744 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-27 21:58:27 -07:00
88d425d637 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-27 21:52:50 -07:00
ce0965f230 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-27 21:52:50 -07:00
fa7898d389 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-12 21:38:16 -07:00
392ad268f3 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-12 21:38:16 -07:00
2f24de182c Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 13:39:40 -07:00
0fa1bb03d5 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 13:39:40 -07:00
acfcdd4630 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 12:41:31 -07:00
078e124a61 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 12:41:31 -07:00
c4b608a7a6 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 11:57:05 -07:00
4cde65db1e Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 11:57:05 -07:00
7b23c1f143 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 11:55:39 -07:00
c4e78db566 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
1. Add `extension_build_version` and `is_internal` to `FromImpl`/`ToImpl` (this will be useful for future if we need to break the BC of any type) #163832 has the PoC of how we would actually use this system
2. Add `aoti_torch_library_impl_v2` that takes in an additional `extension_build_version` argument, updates callsite in `torch/csrc/stable/library.h` to always pass `TORCH_ABI_VERSION` for this argument
3. Add `extension_build_version` to `from_ivalue` and `to_ivalue` and update all callsites
4. Add a private `_from` and `_to` that pass `is_internal=True` to `FromImpl`/`ToImpl`, making it easier to reason about what is being called from libtorch-land / extension-land




**Note: This PR does not include a linter that tells the user to update from/to if changing the ABI of a type in headeronly, which I intend to do in https://github.com/pytorch/pytorch/pull/163998**




[ghstack-poisoned]
2025-10-01 11:55:39 -07:00
5f64a36653 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC of how a type would actually use this system that we won't land




[ghstack-poisoned]
2025-10-01 07:54:41 -07:00
861d1b5cc5 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC of how a type would actually use this system that we won't land




[ghstack-poisoned]
2025-10-01 07:54:41 -07:00
0f6fa5e93e Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC of how a type would actually use this system that we won't land




[ghstack-poisoned]
2025-10-01 07:31:27 -07:00
b1ff91c021 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC of how a type would actually use this system that we won't land




[ghstack-poisoned]
2025-10-01 07:31:27 -07:00
c2faf46755 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC of how a type would actually use this system that we won't land




[ghstack-poisoned]
2025-10-01 07:28:36 -07:00
253bfd133f Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC of how a type would actually use this system that we won't land




[ghstack-poisoned]
2025-10-01 07:28:36 -07:00
fbb539612e Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC of how a type would actually use this system that we won't land




[ghstack-poisoned]
2025-10-01 07:21:48 -07:00
f37eb45d37 Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC of how a type would actually use this system that we won't land




[ghstack-poisoned]
2025-10-01 07:21:47 -07:00
4773945fc2 Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC of how a type would actually use this system that we won't land




[ghstack-poisoned]
2025-10-01 07:11:40 -07:00
f3c154dd5a Update base for Update on "Add scaffolding for StableIValue FC/BC (no PoC)"
#163832 has the PoC of how a type would actually use this system that we won't land




[ghstack-poisoned]
2025-10-01 07:11:40 -07:00
2ce847e94a Add scaffolding for StableIValue FC/BC (no PoC)
[ghstack-poisoned]
2025-09-30 21:09:40 -07:00
a6ff7071d2 Update on "Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops"
Part 1 of plan in https://docs.google.com/document/d/1MaX51H5aEQE5XnOlnZIpf9oCYwzGrTWkgBACxNzsmWE/edit?usp=sharing

- Upgrade `aoti_torch_call_dispatcher` to v2 with an `extension_abi_version`
- Allow registration of StableIValue stack  --> IValue stack adapters for schema changes




[ghstack-poisoned]
2025-09-30 20:47:57 -07:00
ea12786bd8 Update base for Update on "Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops"
Part 1 of plan in https://docs.google.com/document/d/1MaX51H5aEQE5XnOlnZIpf9oCYwzGrTWkgBACxNzsmWE/edit?usp=sharing

- Upgrade `aoti_torch_call_dispatcher` to v2 with an `extension_abi_version`
- Allow registration of StableIValue stack  --> IValue stack adapters for schema changes




[ghstack-poisoned]
2025-09-30 20:47:56 -07:00
1d18e6ba52 Update on "Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops"
Part 1 of plan in https://docs.google.com/document/d/1MaX51H5aEQE5XnOlnZIpf9oCYwzGrTWkgBACxNzsmWE/edit?usp=sharing

- Upgrade `aoti_torch_call_dispatcher` to v2 with an `extension_abi_version`
- Allow registration of StableIValue stack  --> IValue stack adapters for schema changes




[ghstack-poisoned]
2025-09-30 09:19:04 -07:00
13c89116d0 Update on "Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops"
Part 1 of plan in https://docs.google.com/document/d/1MaX51H5aEQE5XnOlnZIpf9oCYwzGrTWkgBACxNzsmWE/edit?usp=sharing

- Upgrade `aoti_torch_call_dispatcher` to v2 with an `extension_abi_version`
- Allow registration of StableIValue stack  --> IValue stack adapters for schema changes




[ghstack-poisoned]
2025-09-29 20:49:08 -07:00
9c88079132 Update on "Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops"
Part 1 of plan in https://docs.google.com/document/d/1MaX51H5aEQE5XnOlnZIpf9oCYwzGrTWkgBACxNzsmWE/edit?usp=sharing

- Upgrade `aoti_torch_call_dispatcher` to v2 with an `extension_abi_version`
- Allow registration of StableIValue stack  --> IValue stack adapters for schema changes




[ghstack-poisoned]
2025-09-29 20:22:49 -07:00
eb7d78f707 Update on "Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops"
[ghstack-poisoned]
2025-09-23 13:29:19 -07:00
f2ce819026 Add scaffolding for aoti_torch_call_dispatcher BC/FC with native ops
[ghstack-poisoned]
2025-09-23 13:28:07 -07:00
232 changed files with 6533 additions and 3349 deletions


@@ -0,0 +1,19 @@
# Aarch64 (ARM/Graviton) Support Scripts
Scripts for building aarch64 PyTorch PIP Wheels. These scripts build the following wheels:
* torch
* torchvision
* torchaudio
* torchtext
* torchdata
## Aarch64_ci_build.sh
This script is designed to support CD operations within the PyPI manylinux aarch64 container and is executed in the container. It prepares the container and then executes __aarch64_wheel_ci_build.py__ to build the wheels. The script assumes the PyTorch repo is located at ```/pytorch``` and will put the wheels into ```/artifacts```.
### Usage
```DESIRED_PYTHON=<PythonVersion> aarch64_ci_build.sh```
__NOTE:__ CI build is currently __EXPERIMENTAL__
## Build_aarch64_wheel.py
This app allows a person to build using AWS EC2 resources and requires the AWS CLI and Boto3 with AWS credentials to support building EC2 instances for the wheel builds. It can be used in a CodeBuild CD pipeline or from a local system.
### Usage
```build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>```


@@ -0,0 +1,53 @@
#!/bin/bash
set -eux -o pipefail
GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
# Set CUDA architecture lists to match x86 build_cuda.sh
if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
fi
# Compress the fatbin with -compress-mode=size for CUDA 13
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
export TORCH_NVCC_FLAGS="-compress-mode=size"
# Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
export BUILD_BUNDLE_PTXAS=1
fi
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source $SCRIPTPATH/aarch64_ci_setup.sh
###############################################################################
# Run aarch64 builder python
###############################################################################
cd /
# adding safe directory for git as the permissions will be
# on the mounted pytorch repo
git config --global --add safe.directory /pytorch
pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
# Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic)
if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
echo "Bundling CUDA libraries with wheel for aarch64."
else
echo "Using nvidia libs from pypi for aarch64."
echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
export USE_NVIDIA_PYPI_LIBS=1
fi
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi


@@ -0,0 +1,21 @@
#!/bin/bash
set -eux -o pipefail
# This script is used to prepare the Docker container for aarch64_ci_wheel_build.py python script
# By creating symlinks from desired /opt/python to /usr/local/bin/
NUMPY_VERSION=2.0.2
if [[ "$DESIRED_PYTHON" == "3.13" || "$DESIRED_PYTHON" == "3.13t" ]]; then
NUMPY_VERSION=2.1.2
fi
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
source $SCRIPTPATH/../manywheel/set_desired_python.sh
pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2
for tool in python python3 pip pip3 ninja scons patchelf; do
ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
done
python --version


@@ -0,0 +1,333 @@
#!/usr/bin/env python3
# encoding: UTF-8
import os
import shutil
from subprocess import check_call, check_output
def list_dir(path: str) -> list[str]:
"""'
Helper for getting paths for Python
"""
return check_output(["ls", "-1", path]).decode().split("\n")
def replace_tag(filename) -> None:
with open(filename) as f:
lines = f.readlines()
for i, line in enumerate(lines):
if line.startswith("Tag:"):
lines[i] = line.replace("-linux_", "-manylinux_2_28_")
print(f"Updated tag from {line} to {lines[i]}")
break
with open(filename, "w") as f:
f.writelines(lines)
def patch_library_rpath(
folder: str,
lib_name: str,
use_nvidia_pypi_libs: bool = False,
desired_cuda: str = "",
) -> None:
"""Apply patchelf to set RPATH for a library in torch/lib"""
lib_path = f"{folder}/tmp/torch/lib/{lib_name}"
if use_nvidia_pypi_libs:
# For PyPI NVIDIA libraries, construct CUDA RPATH
cuda_rpaths = [
"$ORIGIN/../../nvidia/cudnn/lib",
"$ORIGIN/../../nvidia/nvshmem/lib",
"$ORIGIN/../../nvidia/nccl/lib",
"$ORIGIN/../../nvidia/cusparselt/lib",
]
if "130" in desired_cuda:
cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib")
else:
cuda_rpaths.extend(
[
"$ORIGIN/../../nvidia/cublas/lib",
"$ORIGIN/../../nvidia/cuda_cupti/lib",
"$ORIGIN/../../nvidia/cuda_nvrtc/lib",
"$ORIGIN/../../nvidia/cuda_runtime/lib",
"$ORIGIN/../../nvidia/cufft/lib",
"$ORIGIN/../../nvidia/curand/lib",
"$ORIGIN/../../nvidia/cusolver/lib",
"$ORIGIN/../../nvidia/cusparse/lib",
"$ORIGIN/../../nvidia/nvtx/lib",
"$ORIGIN/../../nvidia/cufile/lib",
]
)
# Add $ORIGIN for local torch libs
rpath = ":".join(cuda_rpaths) + ":$ORIGIN"
else:
# For bundled libraries, just use $ORIGIN
rpath = "$ORIGIN"
if os.path.exists(lib_path):
os.system(
f"cd {folder}/tmp/torch/lib/; "
f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}"
)
def copy_and_patch_library(
src_path: str,
folder: str,
use_nvidia_pypi_libs: bool = False,
desired_cuda: str = "",
) -> None:
"""Copy a library to torch/lib and patch its RPATH"""
if os.path.exists(src_path):
lib_name = os.path.basename(src_path)
shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}")
patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"""
Package the cuda wheel libraries
"""
folder = os.path.dirname(wheel_path)
os.mkdir(f"{folder}/tmp")
os.system(f"unzip {wheel_path} -d {folder}/tmp")
# Delete original wheel since it will be repackaged
os.system(f"rm {wheel_path}")
# Check if we should use PyPI NVIDIA libraries or bundle system libraries
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
if use_nvidia_pypi_libs:
print("Using nvidia libs from pypi - skipping CUDA library bundling")
# For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages
# We only need to bundle non-NVIDIA libraries
minimal_libs_to_copy = [
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
]
# Copy minimal libraries to unzipped_folder/torch/lib
for lib_path in minimal_libs_to_copy:
copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
# Patch torch libraries used for searching libraries
torch_libs_to_patch = [
"libtorch.so",
"libtorch_cpu.so",
"libtorch_cuda.so",
"libtorch_cuda_linalg.so",
"libtorch_global_deps.so",
"libtorch_python.so",
"libtorch_nvshmem.so",
"libc10.so",
"libc10_cuda.so",
"libcaffe2_nvrtc.so",
"libshm.so",
]
for lib_name in torch_libs_to_patch:
patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
else:
print("Bundling CUDA libraries with wheel")
# Original logic for bundling system CUDA libraries
# Common libraries for all CUDA versions
common_libs = [
# Non-NVIDIA system libraries
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
# Common CUDA libraries (same for all versions)
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
"/usr/local/cuda/lib64/libcudnn.so.9",
"/usr/local/cuda/lib64/libcusparseLt.so.0",
"/usr/local/cuda/lib64/libcurand.so.10",
"/usr/local/cuda/lib64/libnccl.so.2",
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
"/usr/local/cuda/lib64/libcudnn_graph.so.9",
"/usr/local/cuda/lib64/libcudnn_ops.so.9",
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
"/usr/local/cuda/lib64/libcusparse.so.12",
]
# CUDA version-specific libraries
if "13" in desired_cuda:
minor_version = desired_cuda[-1]
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
"/usr/local/cuda/lib64/libcublas.so.13",
"/usr/local/cuda/lib64/libcublasLt.so.13",
"/usr/local/cuda/lib64/libcudart.so.13",
"/usr/local/cuda/lib64/libcufft.so.12",
"/usr/local/cuda/lib64/libcusolver.so.12",
"/usr/local/cuda/lib64/libnvJitLink.so.13",
"/usr/local/cuda/lib64/libnvrtc.so.13",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
]
elif "12" in desired_cuda:
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
minor_version = desired_cuda[-1]
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
"/usr/local/cuda/lib64/libcublas.so.12",
"/usr/local/cuda/lib64/libcublasLt.so.12",
"/usr/local/cuda/lib64/libcudart.so.12",
"/usr/local/cuda/lib64/libcufft.so.11",
"/usr/local/cuda/lib64/libcusolver.so.11",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
]
else:
raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")
# Combine all libraries
libs_to_copy = common_libs + version_specific_libs
# Copy libraries to unzipped_folder/torch/lib
for lib_path in libs_to_copy:
copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
# Make sure the wheel is tagged with manylinux_2_28
for f in os.scandir(f"{folder}/tmp/"):
if f.is_dir() and f.name.endswith(".dist-info"):
replace_tag(f"{f.path}/WHEEL")
break
os.system(f"wheel pack {folder}/tmp/ -d {folder}")
os.system(f"rm -rf {folder}/tmp/")
def complete_wheel(folder: str) -> str:
"""
Complete wheel build and put in artifact location
"""
wheel_name = list_dir(f"/{folder}/dist")[0]
# Please note for cuda we don't run auditwheel since we use custom script to package
# the cuda dependencies to the wheel file using update_wheel() method.
# However we need to make sure filename reflects the correct Manylinux platform.
if "pytorch" in folder and not enable_cuda:
print("Repairing Wheel with AuditWheel")
check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0]
print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
os.rename(
f"/{folder}/wheelhouse/{repaired_wheel_name}",
f"/{folder}/dist/{repaired_wheel_name}",
)
else:
repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
print(f"Copying {repaired_wheel_name} to artifacts")
shutil.copy2(
f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}"
)
return repaired_wheel_name
def parse_arguments():
"""
Parse inline arguments
"""
from argparse import ArgumentParser
parser = ArgumentParser("AARCH64 wheels python CD")
parser.add_argument("--debug", action="store_true")
parser.add_argument("--build-only", action="store_true")
parser.add_argument("--test-only", type=str)
parser.add_argument("--enable-mkldnn", action="store_true")
parser.add_argument("--enable-cuda", action="store_true")
return parser.parse_args()
if __name__ == "__main__":
"""
Entry Point
"""
args = parse_arguments()
enable_mkldnn = args.enable_mkldnn
enable_cuda = args.enable_cuda
branch = check_output(
["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd="/pytorch"
).decode()
print("Building PyTorch wheel")
build_vars = ""
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
# Handle PyPI NVIDIA libraries vs bundled libraries
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
if use_nvidia_pypi_libs:
print("Configuring build for PyPI NVIDIA libraries")
# Configure for dynamic linking (matching x86 logic)
build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 "
else:
print("Configuring build for bundled NVIDIA libraries")
# Keep existing static linking approach - already configured above
override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
desired_cuda = os.getenv("DESIRED_CUDA")
if override_package_version is not None:
version = override_package_version
build_vars += (
f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
)
elif branch in ["nightly", "main"]:
build_date = (
check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
.decode()
.replace("-", "")
)
version = (
check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
)
if enable_cuda:
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
else:
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
elif branch.startswith(("v1.", "v2.")):
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
if enable_mkldnn:
print("build pytorch with mkldnn+acl backend")
build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
build_vars += "ACL_ROOT_DIR=/acl "
if enable_cuda:
build_vars += "BLAS=NVPL "
else:
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
else:
print("build pytorch without mkldnn backend")
os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
if enable_cuda:
print("Updating Cuda Dependency")
filename = os.listdir("/pytorch/dist/")
wheel_path = f"/pytorch/dist/{filename[0]}"
package_cuda_wheel(wheel_path, desired_cuda)
pytorch_wheel_name = complete_wheel("/pytorch/")
print(f"Build Complete. Created {pytorch_wheel_name}..")


@@ -0,0 +1,999 @@
#!/usr/bin/env python3
# This script is for building AARCH64 wheels using AWS EC2 instances.
# To generate binaries for the release follow these steps:
# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this:
# "v1.11.0": ("0.11.0", "rc1"),
# 2. Run script with following arguments for each of the supported python versions and required tag, for example:
# build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch v1.11.0-rc3
import os
import subprocess
import sys
import time
from typing import Optional, Union
import boto3
# AMI images for us-east-1, change the following based on your ~/.aws/config
os_amis = {
"ubuntu20_04": "ami-052eac90edaa9d08f", # login_name: ubuntu
"ubuntu22_04": "ami-0c6c29c5125214c77", # login_name: ubuntu
"redhat8": "ami-0698b90665a2ddcf1", # login_name: ec2-user
}
ubuntu20_04_ami = os_amis["ubuntu20_04"]
def compute_keyfile_path(key_name: Optional[str] = None) -> tuple[str, str]:
if key_name is None:
key_name = os.getenv("AWS_KEY_NAME")
if key_name is None:
return os.getenv("SSH_KEY_PATH", ""), ""
homedir_path = os.path.expanduser("~")
default_path = os.path.join(homedir_path, ".ssh", f"{key_name}.pem")
return os.getenv("SSH_KEY_PATH", default_path), key_name
ec2 = boto3.resource("ec2")
def ec2_get_instances(filter_name, filter_value):
return ec2.instances.filter(
Filters=[{"Name": filter_name, "Values": [filter_value]}]
)
def ec2_instances_of_type(instance_type="t4g.2xlarge"):
return ec2_get_instances("instance-type", instance_type)
def ec2_instances_by_id(instance_id):
rc = list(ec2_get_instances("instance-id", instance_id))
return rc[0] if len(rc) > 0 else None
def start_instance(
key_name, ami=ubuntu20_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50
):
inst = ec2.create_instances(
ImageId=ami,
InstanceType=instance_type,
SecurityGroups=["ssh-allworld"],
KeyName=key_name,
MinCount=1,
MaxCount=1,
BlockDeviceMappings=[
{
"DeviceName": "/dev/sda1",
"Ebs": {
"DeleteOnTermination": True,
"VolumeSize": ebs_size,
"VolumeType": "standard",
},
}
],
)[0]
print(f"Create instance {inst.id}")
inst.wait_until_running()
running_inst = ec2_instances_by_id(inst.id)
print(f"Instance started at {running_inst.public_dns_name}")
return running_inst
class RemoteHost:
addr: str
keyfile_path: str
login_name: str
container_id: Optional[str] = None
ami: Optional[str] = None
def __init__(self, addr: str, keyfile_path: str, login_name: str = "ubuntu"):
self.addr = addr
self.keyfile_path = keyfile_path
self.login_name = login_name
def _gen_ssh_prefix(self) -> list[str]:
return [
"ssh",
"-o",
"StrictHostKeyChecking=no",
"-i",
self.keyfile_path,
f"{self.login_name}@{self.addr}",
"--",
]
@staticmethod
def _split_cmd(args: Union[str, list[str]]) -> list[str]:
return args.split() if isinstance(args, str) else args
def run_ssh_cmd(self, args: Union[str, list[str]]) -> None:
subprocess.check_call(self._gen_ssh_prefix() + self._split_cmd(args))
def check_ssh_output(self, args: Union[str, list[str]]) -> str:
return subprocess.check_output(
self._gen_ssh_prefix() + self._split_cmd(args)
).decode("utf-8")
def scp_upload_file(self, local_file: str, remote_file: str) -> None:
subprocess.check_call(
[
"scp",
"-i",
self.keyfile_path,
local_file,
f"{self.login_name}@{self.addr}:{remote_file}",
]
)
def scp_download_file(
self, remote_file: str, local_file: Optional[str] = None
) -> None:
if local_file is None:
local_file = "."
subprocess.check_call(
[
"scp",
"-i",
self.keyfile_path,
f"{self.login_name}@{self.addr}:{remote_file}",
local_file,
]
)
def start_docker(self, image="quay.io/pypa/manylinux2014_aarch64:latest") -> None:
self.run_ssh_cmd("sudo apt-get install -y docker.io")
self.run_ssh_cmd(f"sudo usermod -a -G docker {self.login_name}")
self.run_ssh_cmd("sudo service docker start")
self.run_ssh_cmd(f"docker pull {image}")
self.container_id = self.check_ssh_output(
f"docker run -t -d -w /root {image}"
).strip()
def using_docker(self) -> bool:
return self.container_id is not None
def run_cmd(self, args: Union[str, list[str]]) -> None:
if not self.using_docker():
return self.run_ssh_cmd(args)
assert self.container_id is not None
docker_cmd = self._gen_ssh_prefix() + [
"docker",
"exec",
"-i",
self.container_id,
"bash",
]
p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE)
p.communicate(
input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
"utf-8"
)
)
rc = p.wait()
if rc != 0:
raise subprocess.CalledProcessError(rc, docker_cmd)
def check_output(self, args: Union[str, list[str]]) -> str:
if not self.using_docker():
return self.check_ssh_output(args)
assert self.container_id is not None
docker_cmd = self._gen_ssh_prefix() + [
"docker",
"exec",
"-i",
self.container_id,
"bash",
]
p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
(out, err) = p.communicate(
input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
"utf-8"
)
)
rc = p.wait()
if rc != 0:
raise subprocess.CalledProcessError(rc, docker_cmd, output=out, stderr=err)
return out.decode("utf-8")
def upload_file(self, local_file: str, remote_file: str) -> None:
if not self.using_docker():
return self.scp_upload_file(local_file, remote_file)
tmp_file = os.path.join("/tmp", os.path.basename(local_file))
self.scp_upload_file(local_file, tmp_file)
self.run_ssh_cmd(
["docker", "cp", tmp_file, f"{self.container_id}:/root/{remote_file}"]
)
self.run_ssh_cmd(["rm", tmp_file])
def download_file(self, remote_file: str, local_file: Optional[str] = None) -> None:
if not self.using_docker():
return self.scp_download_file(remote_file, local_file)
tmp_file = os.path.join("/tmp", os.path.basename(remote_file))
self.run_ssh_cmd(
["docker", "cp", f"{self.container_id}:/root/{remote_file}", tmp_file]
)
self.scp_download_file(tmp_file, local_file)
self.run_ssh_cmd(["rm", tmp_file])
def download_wheel(
self, remote_file: str, local_file: Optional[str] = None
) -> None:
if self.using_docker() and local_file is None:
basename = os.path.basename(remote_file)
local_file = basename.replace(
"-linux_aarch64.whl", "-manylinux2014_aarch64.whl"
)
self.download_file(remote_file, local_file)
def list_dir(self, path: str) -> list[str]:
return self.check_output(["ls", "-1", path]).split("\n")
def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
import socket
for i in range(attempt_cnt):
try:
with socket.create_connection((addr, port), timeout=timeout):
return
except (ConnectionRefusedError, TimeoutError): # noqa: PERF203
if i == attempt_cnt - 1:
raise
time.sleep(timeout)
def update_apt_repo(host: RemoteHost) -> None:
time.sleep(5)
host.run_cmd("sudo systemctl stop apt-daily.service || true")
host.run_cmd("sudo systemctl stop unattended-upgrades.service || true")
host.run_cmd(
"while systemctl is-active --quiet apt-daily.service; do sleep 1; done"
)
host.run_cmd(
"while systemctl is-active --quiet unattended-upgrades.service; do sleep 1; done"
)
host.run_cmd("sudo apt-get update")
time.sleep(3)
host.run_cmd("sudo apt-get update")
def install_condaforge(
host: RemoteHost, suffix: str = "latest/download/Miniforge3-Linux-aarch64.sh"
) -> None:
print("Install conda-forge")
host.run_cmd(f"curl -OL https://github.com/conda-forge/miniforge/releases/{suffix}")
host.run_cmd(f"sh -f {os.path.basename(suffix)} -b")
host.run_cmd(f"rm -f {os.path.basename(suffix)}")
if host.using_docker():
host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc")
else:
host.run_cmd(
[
"sed",
"-i",
"'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH'",
".bashrc",
]
)
def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
if python_version == "3.6":
# Python 3.6 is EOL and not compatible with conda 4.11
install_condaforge(
host, suffix="download/4.10.3-10/Miniforge3-4.10.3-10-Linux-aarch64.sh"
)
host.run_cmd(f"conda install -y python={python_version} numpy pyyaml")
else:
install_condaforge(
host, suffix="download/4.11.0-4/Miniforge3-4.11.0-4-Linux-aarch64.sh"
)
# PyTorch 1.10 or older is not compatible with setuptools 59.6 or newer
host.run_cmd(
f"conda install -y python={python_version} numpy pyyaml setuptools>=59.5.0"
)
def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
host.run_cmd("pip3 install auditwheel")
host.run_cmd(
"conda install -y patchelf" if use_conda else "sudo apt-get install -y patchelf"
)
from tempfile import NamedTemporaryFile
with NamedTemporaryFile() as tmp:
tmp.write(embed_library_script.encode("utf-8"))
tmp.flush()
host.upload_file(tmp.name, "embed_library.py")
print("Embedding libgomp into wheel")
if host.using_docker():
host.run_cmd(f"python3 embed_library.py {wheel_name} --update-tag")
else:
host.run_cmd(f"python3 embed_library.py {wheel_name}")
def checkout_repo(
host: RemoteHost,
*,
branch: str = "main",
url: str,
git_clone_flags: str,
mapping: dict[str, tuple[str, str]],
) -> Optional[str]:
for prefix in mapping:
if not branch.startswith(prefix):
continue
tag = f"v{mapping[prefix][0]}-{mapping[prefix][1]}"
host.run_cmd(f"git clone {url} -b {tag} {git_clone_flags}")
return mapping[prefix][0]
host.run_cmd(f"git clone {url} -b {branch} {git_clone_flags}")
return None
def build_torchvision(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str,
run_smoke_tests: bool = True,
) -> str:
print("Checking out TorchVision repo")
build_version = checkout_repo(
host,
branch=branch,
url="https://github.com/pytorch/vision",
git_clone_flags=git_clone_flags,
mapping={
"v1.7.1": ("0.8.2", "rc2"),
"v1.8.0": ("0.9.0", "rc3"),
"v1.8.1": ("0.9.1", "rc1"),
"v1.9.0": ("0.10.0", "rc1"),
"v1.10.0": ("0.11.1", "rc1"),
"v1.10.1": ("0.11.2", "rc1"),
"v1.10.2": ("0.11.3", "rc1"),
"v1.11.0": ("0.12.0", "rc1"),
"v1.12.0": ("0.13.0", "rc4"),
"v1.12.1": ("0.13.1", "rc6"),
"v1.13.0": ("0.14.0", "rc4"),
"v1.13.1": ("0.14.1", "rc2"),
"v2.0.0": ("0.15.1", "rc2"),
"v2.0.1": ("0.15.2", "rc2"),
},
)
print("Building TorchVision wheel")
# Please note libpng and libjpeg are required to build the image.so extension
if use_conda:
host.run_cmd("conda install -y libpng jpeg")
# Remove .so files to force static linking
host.run_cmd(
"rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so"
)
# And patch setup.py to include libz dependency for libpng
host.run_cmd(
[
'sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py'
]
)
build_vars = ""
if branch == "nightly":
version = host.check_output(
["if [ -f vision/version.txt ]; then cat vision/version.txt; fi"]
).strip()
if len(version) == 0:
# In older revisions, version was embedded in setup.py
version = (
host.check_output(["grep", '"version = \'"', "vision/setup.py"])
.strip()
.split("'")[1][:-2]
)
build_date = (
host.check_output("cd vision && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
vision_wheel_name = host.list_dir("vision/dist")[0]
embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))
print("Copying TorchVision wheel")
host.download_wheel(os.path.join("vision", "dist", vision_wheel_name))
if run_smoke_tests:
host.run_cmd(
f"pip3 install {os.path.join('vision', 'dist', vision_wheel_name)}"
)
host.run_cmd("python3 vision/test/smoke_test.py")
print("Delete vision checkout")
host.run_cmd("rm -rf vision")
return vision_wheel_name
def build_torchdata(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str = "",
) -> str:
print("Checking out TorchData repo")
git_clone_flags += " --recurse-submodules"
build_version = checkout_repo(
host,
branch=branch,
url="https://github.com/pytorch/data",
git_clone_flags=git_clone_flags,
mapping={
"v1.13.1": ("0.5.1", ""),
"v2.0.0": ("0.6.0", "rc5"),
"v2.0.1": ("0.6.1", "rc1"),
},
)
print("Building TorchData wheel")
build_vars = ""
if branch == "nightly":
version = host.check_output(
["if [ -f data/version.txt ]; then cat data/version.txt; fi"]
).strip()
build_date = (
host.check_output("cd data && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")
wheel_name = host.list_dir("data/dist")[0]
embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))
print("Copying TorchData wheel")
host.download_wheel(os.path.join("data", "dist", wheel_name))
return wheel_name
def build_torchtext(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str = "",
) -> str:
print("Checking out TorchText repo")
git_clone_flags += " --recurse-submodules"
build_version = checkout_repo(
host,
branch=branch,
url="https://github.com/pytorch/text",
git_clone_flags=git_clone_flags,
mapping={
"v1.9.0": ("0.10.0", "rc1"),
"v1.10.0": ("0.11.0", "rc2"),
"v1.10.1": ("0.11.1", "rc1"),
"v1.10.2": ("0.11.2", "rc1"),
"v1.11.0": ("0.12.0", "rc1"),
"v1.12.0": ("0.13.0", "rc2"),
"v1.12.1": ("0.13.1", "rc5"),
"v1.13.0": ("0.14.0", "rc3"),
"v1.13.1": ("0.14.1", "rc1"),
"v2.0.0": ("0.15.1", "rc2"),
"v2.0.1": ("0.15.2", "rc2"),
},
)
print("Building TorchText wheel")
build_vars = ""
if branch == "nightly":
version = host.check_output(
["if [ -f text/version.txt ]; then cat text/version.txt; fi"]
).strip()
build_date = (
host.check_output("cd text && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")
wheel_name = host.list_dir("text/dist")[0]
embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))
print("Copying TorchText wheel")
host.download_wheel(os.path.join("text", "dist", wheel_name))
return wheel_name
def build_torchaudio(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str = "",
) -> str:
print("Checking out TorchAudio repo")
git_clone_flags += " --recurse-submodules"
build_version = checkout_repo(
host,
branch=branch,
url="https://github.com/pytorch/audio",
git_clone_flags=git_clone_flags,
mapping={
"v1.9.0": ("0.9.0", "rc2"),
"v1.10.0": ("0.10.0", "rc5"),
"v1.10.1": ("0.10.1", "rc1"),
"v1.10.2": ("0.10.2", "rc1"),
"v1.11.0": ("0.11.0", "rc1"),
"v1.12.0": ("0.12.0", "rc3"),
"v1.12.1": ("0.12.1", "rc5"),
"v1.13.0": ("0.13.0", "rc4"),
"v1.13.1": ("0.13.1", "rc2"),
"v2.0.0": ("2.0.1", "rc3"),
"v2.0.1": ("2.0.2", "rc2"),
},
)
print("Building TorchAudio wheel")
build_vars = ""
if branch == "nightly":
version = (
host.check_output(["grep", '"version = \'"', "audio/setup.py"])
.strip()
.split("'")[1][:-2]
)
build_date = (
host.check_output("cd audio && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
host.run_cmd(
f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
&& ./packaging/ffmpeg/build.sh \
&& {build_vars} python3 -m build --wheel --no-isolation"
)
wheel_name = host.list_dir("audio/dist")[0]
embed_libgomp(host, use_conda, os.path.join("audio", "dist", wheel_name))
print("Copying TorchAudio wheel")
host.download_wheel(os.path.join("audio", "dist", wheel_name))
return wheel_name
def configure_system(
host: RemoteHost,
*,
compiler: str = "gcc-8",
use_conda: bool = True,
python_version: str = "3.8",
) -> None:
if use_conda:
install_condaforge_python(host, python_version)
print("Configuring the system")
if not host.using_docker():
update_apt_repo(host)
host.run_cmd("sudo apt-get install -y ninja-build g++ git cmake gfortran unzip")
else:
host.run_cmd("yum install -y sudo")
host.run_cmd("conda install -y ninja scons")
if not use_conda:
host.run_cmd(
"sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
)
host.run_cmd("pip3 install dataclasses typing-extensions")
if not use_conda:
print("Installing Cython + numpy from PyPy")
host.run_cmd("sudo pip3 install Cython")
host.run_cmd("sudo pip3 install numpy")
def build_domains(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str = "",
) -> tuple[str, str, str, str]:
vision_wheel_name = build_torchvision(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
audio_wheel_name = build_torchaudio(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
data_wheel_name = build_torchdata(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
text_wheel_name = build_torchtext(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
return (vision_wheel_name, audio_wheel_name, data_wheel_name, text_wheel_name)
def start_build(
host: RemoteHost,
*,
branch: str = "main",
compiler: str = "gcc-8",
use_conda: bool = True,
python_version: str = "3.8",
pytorch_only: bool = False,
pytorch_build_number: Optional[str] = None,
shallow_clone: bool = True,
enable_mkldnn: bool = False,
) -> tuple[str, str, str, str, str]:
git_clone_flags = " --depth 1 --shallow-submodules" if shallow_clone else ""
if host.using_docker() and not use_conda:
print("Auto-selecting conda option for docker images")
use_conda = True
if not host.using_docker():
print("Disable mkldnn for host builds")
enable_mkldnn = False
configure_system(
host, compiler=compiler, use_conda=use_conda, python_version=python_version
)
if host.using_docker():
print("Move libgfortant.a into a standard location")
# HACK: pypa gforntran.a is compiled without PIC, which leads to the following error
# libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17' # noqa: E501, B950
# Workaround by copying gfortran library from the host
host.run_ssh_cmd("sudo apt-get install -y gfortran-8")
host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8")
host.run_ssh_cmd(
[
"docker",
"cp",
"/usr/lib/gcc/aarch64-linux-gnu/8/libgfortran.a",
f"{host.container_id}:/opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/",
]
)
print("Checking out PyTorch repo")
host.run_cmd(
f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
)
host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")
print("Building PyTorch wheel")
build_opts = ""
if pytorch_build_number is not None:
build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
# Breakpad build fails on aarch64
build_vars = "USE_BREAKPAD=0 "
if branch == "nightly":
build_date = (
host.check_output("cd pytorch && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
version = host.check_output("cat pytorch/version.txt").strip()[:-2]
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
if branch.startswith(("v1.", "v2.")):
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
if enable_mkldnn:
host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
print("build pytorch with mkldnn+acl backend")
build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
build_vars += " BLAS=OpenBLAS"
build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
build_vars += " ACL_ROOT_DIR=/acl"
host.run_cmd(
f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
)
print("Repair the wheel")
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
host.run_cmd(
f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
)
print("replace the original wheel with the repaired one")
pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0]
host.run_cmd(
f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}"
)
else:
print("build pytorch without mkldnn backend")
host.run_cmd(
f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
)
print("Deleting build folder")
host.run_cmd("cd pytorch && rm -rf build")
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
embed_libgomp(host, use_conda, os.path.join("pytorch", "dist", pytorch_wheel_name))
print("Copying the wheel")
host.download_wheel(os.path.join("pytorch", "dist", pytorch_wheel_name))
print("Installing PyTorch wheel")
host.run_cmd(f"pip3 install pytorch/dist/{pytorch_wheel_name}")
if pytorch_only:
return (pytorch_wheel_name, None, None, None, None)
domain_wheels = build_domains(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
return (pytorch_wheel_name, *domain_wheels)
embed_library_script = """
#!/usr/bin/env python3
from auditwheel.patcher import Patchelf
from auditwheel.wheeltools import InWheelCtx
from auditwheel.elfutils import elf_file_filter
from auditwheel.repair import copylib
from auditwheel.lddtree import lddtree
from subprocess import check_call
import os
import shutil
import sys
from tempfile import TemporaryDirectory
def replace_tag(filename):
with open(filename, 'r') as f:
lines = f.read().split("\\n")
for i,line in enumerate(lines):
if not line.startswith("Tag: "):
continue
lines[i] = line.replace("-linux_", "-manylinux2014_")
print(f'Updated tag from {line} to {lines[i]}')
with open(filename, 'w') as f:
f.write("\\n".join(lines))
class AlignedPatchelf(Patchelf):
def set_soname(self, file_name: str, new_soname: str) -> None:
check_call(['patchelf', '--page-size', '65536', '--set-soname', new_soname, file_name])
def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
check_call(['patchelf', '--page-size', '65536', '--replace-needed', soname, new_soname, file_name])
def embed_library(whl_path, lib_soname, update_tag=False):
patcher = AlignedPatchelf()
out_dir = TemporaryDirectory()
whl_name = os.path.basename(whl_path)
tmp_whl_name = os.path.join(out_dir.name, whl_name)
with InWheelCtx(whl_path) as ctx:
torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib')
ctx.out_wheel=tmp_whl_name
new_lib_path, new_lib_soname = None, None
for filename, elf in elf_file_filter(ctx.iter_files()):
if not filename.startswith('torch/lib'):
continue
libtree = lddtree(filename)
if lib_soname not in libtree['needed']:
continue
lib_path = libtree['libs'][lib_soname]['path']
if lib_path is None:
print(f"Can't embed {lib_soname} as it could not be found")
break
if lib_path.startswith(torchlib_path):
continue
if new_lib_path is None:
new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
patcher.replace_needed(filename, lib_soname, new_lib_soname)
print(f'Replacing {lib_soname} with {new_lib_soname} for {filename}')
if update_tag:
# Add manylinux2014 tag
for filename in ctx.iter_files():
if os.path.basename(filename) != 'WHEEL':
continue
replace_tag(filename)
shutil.move(tmp_whl_name, whl_path)
if __name__ == '__main__':
embed_library(sys.argv[1], 'libgomp.so.1', len(sys.argv) > 2 and sys.argv[2] == '--update-tag')
"""
def run_tests(host: RemoteHost, whl: str, branch="main") -> None:
print("Configuring the system")
update_apt_repo(host)
host.run_cmd("sudo apt-get install -y python3-pip git")
host.run_cmd("sudo pip3 install Cython")
host.run_cmd("sudo pip3 install numpy")
host.upload_file(whl, ".")
host.run_cmd(f"sudo pip3 install {whl}")
host.run_cmd("python3 -c 'import torch;print(torch.rand((3,3))'")
host.run_cmd(f"git clone -b {branch} https://github.com/pytorch/pytorch")
host.run_cmd("cd pytorch/test; python3 test_torch.py -v")
def get_instance_name(instance) -> Optional[str]:
if instance.tags is None:
return None
for tag in instance.tags:
if tag["Key"] == "Name":
return tag["Value"]
return None
def list_instances(instance_type: str) -> None:
print(f"All instances of type {instance_type}")
for instance in ec2_instances_of_type(instance_type):
ifaces = instance.network_interfaces
az = ifaces[0].subnet.availability_zone if len(ifaces) > 0 else None
print(
f"{instance.id} {get_instance_name(instance)} {instance.public_dns_name} {instance.state['Name']} {az}"
)
def terminate_instances(instance_type: str) -> None:
print(f"Terminating all instances of type {instance_type}")
instances = list(ec2_instances_of_type(instance_type))
for instance in instances:
print(f"Terminating {instance.id}")
instance.terminate()
print("Waiting for termination to complete")
for instance in instances:
instance.wait_until_terminated()
def parse_arguments():
from argparse import ArgumentParser
parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
parser.add_argument("--key-name", type=str)
parser.add_argument("--debug", action="store_true")
parser.add_argument("--build-only", action="store_true")
parser.add_argument("--test-only", type=str)
group = parser.add_mutually_exclusive_group()
group.add_argument("--os", type=str, choices=list(os_amis.keys()))
group.add_argument("--ami", type=str)
parser.add_argument(
"--python-version",
type=str,
choices=[f"3.{d}" for d in range(6, 12)],
default=None,
)
parser.add_argument("--alloc-instance", action="store_true")
parser.add_argument("--list-instances", action="store_true")
parser.add_argument("--pytorch-only", action="store_true")
parser.add_argument("--keep-running", action="store_true")
parser.add_argument("--terminate-instances", action="store_true")
parser.add_argument("--instance-type", type=str, default="t4g.2xlarge")
parser.add_argument("--ebs-size", type=int, default=50)
parser.add_argument("--branch", type=str, default="main")
parser.add_argument("--use-docker", action="store_true")
parser.add_argument(
"--compiler",
type=str,
choices=["gcc-7", "gcc-8", "gcc-9", "clang"],
default="gcc-8",
)
parser.add_argument("--use-torch-from-pypi", action="store_true")
parser.add_argument("--pytorch-build-number", type=str, default=None)
parser.add_argument("--disable-mkldnn", action="store_true")
return parser.parse_args()
if __name__ == "__main__":
args = parse_arguments()
ami = (
args.ami
if args.ami is not None
else os_amis[args.os]
if args.os is not None
else ubuntu20_04_ami
)
keyfile_path, key_name = compute_keyfile_path(args.key_name)
if args.list_instances:
list_instances(args.instance_type)
sys.exit(0)
if args.terminate_instances:
terminate_instances(args.instance_type)
sys.exit(0)
if len(key_name) == 0:
raise RuntimeError("""
Cannot start build without key_name, please specify
--key-name argument or AWS_KEY_NAME environment variable.""")
if len(keyfile_path) == 0 or not os.path.exists(keyfile_path):
raise RuntimeError(f"""
Cannot find keyfile with name: [{key_name}] in path: [{keyfile_path}], please
check `~/.ssh/` folder or manually set SSH_KEY_PATH environment variable.""")
# Starting the instance
inst = start_instance(
key_name, ami=ami, instance_type=args.instance_type, ebs_size=args.ebs_size
)
instance_name = f"{args.key_name}-{args.os}"
if args.python_version is not None:
instance_name += f"-py{args.python_version}"
inst.create_tags(
DryRun=False,
Tags=[
{
"Key": "Name",
"Value": instance_name,
}
],
)
addr = inst.public_dns_name
wait_for_connection(addr, 22)
host = RemoteHost(addr, keyfile_path)
host.ami = ami
if args.use_docker:
update_apt_repo(host)
host.start_docker()
if args.test_only:
run_tests(host, args.test_only)
sys.exit(0)
if args.alloc_instance:
if args.python_version is None:
sys.exit(0)
install_condaforge_python(host, args.python_version)
sys.exit(0)
python_version = args.python_version if args.python_version is not None else "3.10"
if args.use_torch_from_pypi:
configure_system(host, compiler=args.compiler, python_version=python_version)
print("Installing PyTorch wheel")
host.run_cmd("pip3 install torch")
build_domains(
host, branch=args.branch, git_clone_flags=" --depth 1 --shallow-submodules"
)
else:
start_build(
host,
branch=args.branch,
compiler=args.compiler,
python_version=python_version,
pytorch_only=args.pytorch_only,
pytorch_build_number=args.pytorch_build_number,
enable_mkldnn=not args.disable_mkldnn,
)
if not args.keep_running:
print(f"Waiting for instance {inst.id} to terminate")
inst.terminate()
inst.wait_until_terminated()

View File

@ -0,0 +1,87 @@
#!/usr/bin/env python3
import os
import shutil
import sys
from subprocess import check_call
from tempfile import TemporaryDirectory
from auditwheel.elfutils import elf_file_filter
from auditwheel.lddtree import lddtree
from auditwheel.patcher import Patchelf
from auditwheel.repair import copylib
from auditwheel.wheeltools import InWheelCtx
def replace_tag(filename):
with open(filename) as f:
lines = f.read().split("\\n")
for i, line in enumerate(lines):
if not line.startswith("Tag: "):
continue
lines[i] = line.replace("-linux_", "-manylinux2014_")
print(f"Updated tag from {line} to {lines[i]}")
with open(filename, "w") as f:
f.write("\\n".join(lines))
class AlignedPatchelf(Patchelf):
def set_soname(self, file_name: str, new_soname: str) -> None:
check_call(
["patchelf", "--page-size", "65536", "--set-soname", new_soname, file_name]
)
def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
check_call(
[
"patchelf",
"--page-size",
"65536",
"--replace-needed",
soname,
new_soname,
file_name,
]
)
def embed_library(whl_path, lib_soname, update_tag=False):
patcher = AlignedPatchelf()
out_dir = TemporaryDirectory()
whl_name = os.path.basename(whl_path)
tmp_whl_name = os.path.join(out_dir.name, whl_name)
with InWheelCtx(whl_path) as ctx:
torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib")
ctx.out_wheel = tmp_whl_name
new_lib_path, new_lib_soname = None, None
for filename, _ in elf_file_filter(ctx.iter_files()):
if not filename.startswith("torch/lib"):
continue
libtree = lddtree(filename)
if lib_soname not in libtree["needed"]:
continue
lib_path = libtree["libs"][lib_soname]["path"]
if lib_path is None:
print(f"Can't embed {lib_soname} as it could not be found")
break
if lib_path.startswith(torchlib_path):
continue
if new_lib_path is None:
new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
patcher.replace_needed(filename, lib_soname, new_lib_soname)
print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}")
if update_tag:
# Add manylinux2014 tag
for filename in ctx.iter_files():
if os.path.basename(filename) != "WHEEL":
continue
replace_tag(filename)
shutil.move(tmp_whl_name, whl_path)
if __name__ == "__main__":
embed_library(
sys.argv[1], "libgomp.so.1", len(sys.argv) > 2 and sys.argv[2] == "--update-tag"
)
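# Illustrative usage (the wheel name is hypothetical; this assumes the file is saved
# as embed_library.py, as the EC2 build script above uploads it): embed libgomp.so.1
# and rewrite the platform tag to manylinux2014:
#   python3 embed_library.py torch-2.1.0-cp310-cp310-linux_aarch64.whl --update-tag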

View File

@ -4,17 +4,14 @@ set -ex
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
# Source the common build script for architecture-specific configurations (MKLDNN, ACL, etc.)
source "${SCRIPTPATH}/../pytorch/build.sh" || true
case "${GPU_ARCH_TYPE:-BLANK}" in
cuda | cuda-aarch64)
cuda)
bash "${SCRIPTPATH}/build_cuda.sh"
;;
rocm)
bash "${SCRIPTPATH}/build_rocm.sh"
;;
cpu | cpu-cxx11-abi | cpu-aarch64 | cpu-s390x)
cpu | cpu-cxx11-abi | cpu-s390x)
bash "${SCRIPTPATH}/build_cpu.sh"
;;
xpu)

View File

@ -18,31 +18,12 @@ retry () {
$* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}
# Detect architecture first
ARCH=$(uname -m)
echo "Detected architecture: $ARCH"
PLATFORM=""
# TODO move this into the Docker images
OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
retry yum install -q -y zip openssl
# Set platform based on architecture
case $ARCH in
x86_64)
PLATFORM="manylinux_2_28_x86_64"
;;
aarch64)
PLATFORM="manylinux_2_28_aarch64"
;;
s390x)
PLATFORM="manylinux_2_28_s390x"
;;
*)
echo "Unsupported architecture: $ARCH"
exit 1
;;
esac
PLATFORM="manylinux_2_28_x86_64"
elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
retry dnf install -q -y zip openssl
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
@ -57,8 +38,6 @@ else
exit 1
fi
echo "Platform set to: $PLATFORM"
# We use the package name to test the package by passing this to 'pip install'
# This is the env variable that setup.py uses to name the package. Note that
# pip 'normalizes' the name first by changing all - to _
@ -320,8 +299,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
# ROCm workaround for roctracer dlopens
if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
patchedpath=$(fname_without_so_number $destpath)
# Keep the so number for XPU dependencies, libgomp.so.1, ACL libraries, and NVPL libraries to avoid loading them twice
elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" || "$filename" == libarm_compute* || "$filename" == libnvpl* || "$filename" == "libgfortran.so.5" ]]; then
# Keep the so number for XPU dependencies and libgomp.so.1 to avoid loading them twice
elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
patchedpath=$destpath
else
patchedpath=$(fname_with_sha256 $destpath)
@ -367,22 +346,9 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
done
# Create the manylinux_2_28 tag; this needs to happen before regenerating the RECORD file
# Support all architectures (x86_64, aarch64, s390x)
if [[ "$IS_MANYLINUX2_28" == "1" && $GPU_ARCH_TYPE != "xpu" ]]; then
if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
wheel_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/WHEEL/g')
echo "Updating wheel tag for $ARCH architecture"
# Replace linux_* with manylinux_2_28_* based on architecture
case $ARCH in
x86_64)
sed -i -e 's#linux_x86_64#manylinux_2_28_x86_64#g' $wheel_file
;;
aarch64)
sed -i -e 's#linux_aarch64#manylinux_2_28_aarch64#g' $wheel_file
;;
s390x)
sed -i -e 's#linux_s390x#manylinux_2_28_s390x#g' $wheel_file
;;
esac
sed -i -e s#linux_x86_64#"${PLATFORM}"# $wheel_file;
fi
# regenerate the RECORD file with new hashes

View File

@ -15,10 +15,6 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
EXTRA_CAFFE2_CMAKE_FLAGS=()
fi
# Detect architecture
ARCH=$(uname -m)
echo "Building CPU wheel for architecture: $ARCH"
WHEELHOUSE_DIR="wheelhousecpu"
LIBTORCH_HOUSE_DIR="libtorch_housecpu"
if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
@ -38,10 +34,8 @@ elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
if [[ "$ARCH" == "s390x" ]]; then
if [[ "$(uname -m)" == "s390x" ]]; then
LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
elif [[ "$ARCH" == "aarch64" ]]; then
LIBGOMP_PATH="/usr/lib/aarch64-linux-gnu/libgomp.so.1"
else
LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
fi
@ -55,32 +49,6 @@ DEPS_SONAME=(
"libgomp.so.1"
)
# Add ARM-specific library dependencies for CPU builds
if [[ "$ARCH" == "aarch64" ]]; then
echo "Adding ARM-specific CPU library dependencies"
# ARM Compute Library (if available)
if [[ -d "/acl/build" ]]; then
echo "Adding ARM Compute Library for CPU"
DEPS_LIST+=(
"/acl/build/libarm_compute.so"
"/acl/build/libarm_compute_graph.so"
)
DEPS_SONAME+=(
"libarm_compute.so"
"libarm_compute_graph.so"
)
fi
# ARM system libraries
DEPS_LIST+=(
"/usr/lib64/libgfortran.so.5"
)
DEPS_SONAME+=(
"libgfortran.so.5"
)
fi
rm -rf /usr/local/cuda*
SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"

View File

@ -29,10 +29,6 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
EXTRA_CAFFE2_CMAKE_FLAGS=()
fi
# Detect architecture
ARCH=$(uname -m)
echo "Building for architecture: $ARCH"
# Determine CUDA version and architectures to build for
#
# NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
@ -57,60 +53,34 @@ fi
cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
# Function to remove architectures from a list
remove_archs() {
local result="$1"
shift
for arch in "$@"; do
result="${result//${arch};/}"
done
echo "$result"
}
# Function to filter CUDA architectures for aarch64
# aarch64 ARM GPUs only support certain compute capabilities
# Keep: 8.0 (A100), 9.0+ (Hopper, Grace Hopper, newer)
# Remove: < 8.0 (no ARM GPUs), 8.6 (x86_64 RTX 3090/A6000 only)
filter_aarch64_archs() {
local arch_list="$1"
# Explicitly remove architectures not needed on aarch64
arch_list=$(remove_archs "$arch_list" "5.0" "6.0" "7.0" "7.5" "8.6")
echo "$arch_list"
}
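# Illustrative example (not part of the original script): on aarch64 with CUDA 12.6,
# filter_aarch64_archs "5.0;6.0;7.0;7.5;8.0;8.6;9.0" yields "8.0;9.0".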
# Base: Common architectures across all modern CUDA versions
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0"
case ${CUDA_VERSION} in
12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;${TORCH_CUDA_ARCH_LIST}" ;; # Only 12.6 includes Legacy Maxwell/Pascal that will be removed in future releases
12.8) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0" ;; # +Hopper/Blackwell support
12.9) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0+PTX" # +Hopper/Blackwell support + PTX for forward compatibility
# Removing sm_50-sm_60 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases.
# However, we would like to keep the sm_70 architecture; see: https://github.com/pytorch/pytorch/issues/157517
12.8)
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0"
;;
12.9)
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX"
# WAR to resolve the ld error in libtorch build with CUDA 12.9
if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//7.0;/}" # Remove 7.0 to resolve the ld error
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//8.6;/}" # Remove 8.6 for libtorch
TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
fi
;;
13.0)
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0+PTX"
export TORCH_NVCC_FLAGS="-compress-mode=size"
export BUILD_BUNDLE_PTXAS=1
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
;;
12.6)
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
;;
*)
echo "unknown cuda version $CUDA_VERSION"
exit 1
;;
*) echo "unknown cuda version $CUDA_VERSION"; exit 1 ;;
esac
# Filter for aarch64: Remove < 8.0 and 8.6
[[ "$ARCH" == "aarch64" ]] && TORCH_CUDA_ARCH_LIST=$(filter_aarch64_archs "$TORCH_CUDA_ARCH_LIST")
echo "TORCH_CUDA_ARCH_LIST set to: $TORCH_CUDA_ARCH_LIST"
export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
echo "${TORCH_CUDA_ARCH_LIST}"
# Disable MAGMA for aarch64 as pre-built libraries are x86-64 only
if [[ "$ARCH" == "aarch64" ]]; then
echo "Disabling MAGMA for aarch64 architecture"
export USE_MAGMA=0
fi
# Package directories
WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot"
LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot"
@ -274,51 +244,6 @@ else
exit 1
fi
# Add ARM-specific library dependencies
if [[ "$ARCH" == "aarch64" ]]; then
echo "Adding ARM-specific library dependencies"
# ARM Compute Library (if available)
if [[ -d "/acl/build" ]]; then
echo "Adding ARM Compute Library"
DEPS_LIST+=(
"/acl/build/libarm_compute.so"
"/acl/build/libarm_compute_graph.so"
)
DEPS_SONAME+=(
"libarm_compute.so"
"libarm_compute_graph.so"
)
fi
# ARM system libraries
DEPS_LIST+=(
"/lib64/libgomp.so.1"
"/usr/lib64/libgfortran.so.5"
)
DEPS_SONAME+=(
"libgomp.so.1"
"libgfortran.so.5"
)
# NVPL libraries (ARM optimized BLAS/LAPACK)
if [[ -d "/usr/local/lib" && -f "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0" ]]; then
echo "Adding NVPL libraries for ARM"
DEPS_LIST+=(
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0"
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0"
"/usr/local/lib/libnvpl_lapack_core.so.0"
"/usr/local/lib/libnvpl_blas_core.so.0"
)
DEPS_SONAME+=(
"libnvpl_lapack_lp64_gomp.so.0"
"libnvpl_blas_lp64_gomp.so.0"
"libnvpl_lapack_core.so.0"
"libnvpl_blas_core.so.0"
)
fi
fi
# run_tests.sh requires DESIRED_CUDA to know what tests to exclude
export DESIRED_CUDA="$cuda_version_nodot"
@ -326,11 +251,9 @@ export DESIRED_CUDA="$cuda_version_nodot"
rm -rf /usr/local/cuda || true
ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda
# Switch `/usr/local/magma` to the desired CUDA version (skip for aarch64)
if [[ "$ARCH" != "aarch64" ]]; then
rm -rf /usr/local/magma || true
ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
fi
# Switch `/usr/local/magma` to the desired CUDA version
rm -rf /usr/local/magma || true
ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0

View File

@ -86,20 +86,10 @@ else
fi
fi
# Enable MKLDNN with ARM Compute Library for ARM builds
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
export USE_MKLDNN=1
# ACL is required for aarch64 builds
if [[ ! -d "/acl" ]]; then
echo "ERROR: ARM Compute Library not found at /acl"
echo "ACL is required for aarch64 builds. Check Docker image setup."
exit 1
fi
export USE_MKLDNN_ACL=1
export ACL_ROOT_DIR=/acl
echo "ARM Compute Library enabled for MKLDNN: ACL_ROOT_DIR=/acl"
fi
if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then

View File

@ -389,6 +389,13 @@ test_lazy_tensor_meta_reference_disabled() {
export -n TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE
}
test_dynamo_core() {
time python test/run_test.py \
--include-dynamo-core-tests \
--verbose \
--upload-artifacts-while-running
assert_git_not_dirty
}
test_dynamo_wrapped_shard() {
if [[ -z "$NUM_TEST_SHARDS" ]]; then
@ -1814,6 +1821,8 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
test_inductor_shard "${SHARD_NUMBER}"
elif [[ "${TEST_CONFIG}" == *einops* ]]; then
test_einops
elif [[ "${TEST_CONFIG}" == *dynamo_core* ]]; then
test_dynamo_core
elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
install_torchvision
test_dynamo_wrapped_shard "${SHARD_NUMBER}"

View File

@ -1 +1 @@
07b6cbde121417a70e4dc871adb6d27030e0ce3f
ee1a1350eb37804b94334768f328144f058f14e9

View File

@ -1 +1 @@
acccf86477759b2d3500f1ae1be065f7b1e409ec
2d82dc5caa336d179d9b46ac4a0fb8c43d84c5cc

View File

@ -1 +1 @@
e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
94631807d22c09723dd006f7be5beb649d5f88d0

View File

@ -7,6 +7,7 @@ ciflow_push_tags:
- ciflow/binaries
- ciflow/binaries_libtorch
- ciflow/binaries_wheel
- ciflow/dynamo
- ciflow/h100
- ciflow/h100-cutlass-backend
- ciflow/h100-distributed

View File

@ -260,8 +260,11 @@ jobs:
"${DOCKER_IMAGE}"
)
docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
# Unified build script for all architectures (x86_64, aarch64, s390x)
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
else
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
fi
- name: Chown artifacts
if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}

View File

@ -326,7 +326,7 @@ jobs:
SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}

.github/workflows/dynamo-unittest.yml (new file, +70 lines)
View File

@ -0,0 +1,70 @@
# Workflow: Dynamo Unit Test
# runs unit tests for dynamo.
name: dynamo-unittest
on:
push:
tags:
- ciflow/dynamo/*
workflow_call:
schedule:
- cron: 29 8 * * * # about 1:29am PDT
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
dynamo-build:
name: dynamo-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
strategy:
matrix:
python-version: ['3.11', '3.12']
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py${{ matrix.python-version }}-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12
test-matrix: |
{ include: [
{ config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
]}
secrets: inherit
dynamo-test:
name: dynamo-test
uses: ./.github/workflows/_linux-test.yml
needs: [get-label-type, dynamo-build]
strategy:
matrix:
python-version: ['3.11', '3.12']
with:
build-environment: linux-jammy-py${{ matrix.python-version }}-clang12
docker-image: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12
test-matrix: |
{ include: [
{ config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
]}
secrets: inherit
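# Illustrative note (the PR number below is hypothetical): in addition to the cron
# schedule above, pushing a tag such as ciflow/dynamo/167954 triggers this workflow
# for the corresponding pull request.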

View File

@ -1,5 +1,6 @@
#pragma once
#include <torch/headeronly/core/TensorAccessor.h>
#include <c10/macros/Macros.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/Deprecated.h>
@ -11,252 +12,37 @@
namespace at {
// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor
// is used to enable the __restrict__ keyword/modifier for the data
// passed to cuda.
template <typename T>
struct DefaultPtrTraits {
typedef T* PtrType;
};
using torch::headeronly::DefaultPtrTraits;
#if defined(__CUDACC__) || defined(__HIPCC__)
template <typename T>
struct RestrictPtrTraits {
typedef T* __restrict__ PtrType;
};
using torch::headeronly::RestrictPtrTraits;
#endif
// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors.
// For CUDA tensors it is used in device code (only). This means that we restrict ourselves
// to functions and types available there (e.g. IntArrayRef isn't).
// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers.
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class TensorAccessorBase {
public:
typedef typename PtrTraits<T>::PtrType PtrType;
using TensorAccessorBase = torch::headeronly::detail::TensorAccessorBase<c10::IntArrayRef, T, N, PtrTraits, index_t>;
C10_HOST_DEVICE TensorAccessorBase(
PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: data_(data_), sizes_(sizes_), strides_(strides_) {}
C10_HOST IntArrayRef sizes() const {
return IntArrayRef(sizes_,N);
}
C10_HOST IntArrayRef strides() const {
return IntArrayRef(strides_,N);
}
C10_HOST_DEVICE index_t stride(index_t i) const {
return strides_[i];
}
C10_HOST_DEVICE index_t size(index_t i) const {
return sizes_[i];
}
C10_HOST_DEVICE PtrType data() {
return data_;
}
C10_HOST_DEVICE const PtrType data() const {
return data_;
}
protected:
PtrType data_;
const index_t* sizes_;
const index_t* strides_;
};
// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using
// `Tensor.accessor<T, N>()`.
// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and only
// indexing on the device uses `TensorAccessor`s.
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class TensorAccessor : public TensorAccessorBase<T,N,PtrTraits,index_t> {
public:
typedef typename PtrTraits<T>::PtrType PtrType;
using TensorAccessor = torch::headeronly::detail::TensorAccessor<c10::IntArrayRef, T, N, PtrTraits, index_t>;
C10_HOST_DEVICE TensorAccessor(
PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: TensorAccessorBase<T, N, PtrTraits, index_t>(data_,sizes_,strides_) {}
namespace detail {
C10_HOST_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
}
C10_HOST_DEVICE const TensorAccessor<T, N-1, PtrTraits, index_t> operator[](index_t i) const {
return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
}
};
template<typename T, template <typename U> class PtrTraits, typename index_t>
class TensorAccessor<T,1,PtrTraits,index_t> : public TensorAccessorBase<T,1,PtrTraits,index_t> {
public:
typedef typename PtrTraits<T>::PtrType PtrType;
C10_HOST_DEVICE TensorAccessor(
PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: TensorAccessorBase<T, 1, PtrTraits, index_t>(data_,sizes_,strides_) {}
C10_HOST_DEVICE T & operator[](index_t i) {
// NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
return this->data_[this->strides_[0]*i];
}
C10_HOST_DEVICE const T & operator[](index_t i) const {
return this->data_[this->strides_[0]*i];
}
};
// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used on for CUDA `Tensor`s on the host
// and as
// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host)
// in order to transfer them on the device when calling kernels.
// On the device, indexing of multidimensional tensors gives to `TensorAccessor`s.
// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__.
// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available
// on the device, so those functions are host only.
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class GenericPackedTensorAccessorBase {
public:
typedef typename PtrTraits<T>::PtrType PtrType;
C10_HOST GenericPackedTensorAccessorBase(
PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: data_(data_) {
std::copy(sizes_, sizes_ + N, std::begin(this->sizes_));
std::copy(strides_, strides_ + N, std::begin(this->strides_));
}
// if index_t is not int64_t, we want to have an int64_t constructor
template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
C10_HOST GenericPackedTensorAccessorBase(
PtrType data_,
const source_index_t* sizes_,
const source_index_t* strides_)
: data_(data_) {
for (const auto i : c10::irange(N)) {
this->sizes_[i] = sizes_[i];
this->strides_[i] = strides_[i];
}
}
C10_HOST_DEVICE index_t stride(index_t i) const {
return strides_[i];
}
C10_HOST_DEVICE index_t size(index_t i) const {
return sizes_[i];
}
C10_HOST_DEVICE PtrType data() {
return data_;
}
C10_HOST_DEVICE const PtrType data() const {
return data_;
}
protected:
PtrType data_;
// NOLINTNEXTLINE(*c-arrays*)
index_t sizes_[N];
// NOLINTNEXTLINE(*c-arrays*)
index_t strides_[N];
C10_HOST void bounds_check_(index_t i) const {
TORCH_CHECK_INDEX(
template <size_t N, typename index_t>
struct IndexBoundsCheck {
IndexBoundsCheck(index_t i) {
TORCH_CHECK_INDEX(
0 <= i && i < index_t{N},
"Index ",
i,
" is not within bounds of a tensor of dimension ",
N);
}
}
};
} // namespace detail
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase<T,N,PtrTraits,index_t> {
public:
typedef typename PtrTraits<T>::PtrType PtrType;
C10_HOST GenericPackedTensorAccessor(
PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}
// if index_t is not int64_t, we want to have an int64_t constructor
template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
C10_HOST GenericPackedTensorAccessor(
PtrType data_,
const source_index_t* sizes_,
const source_index_t* strides_)
: GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}
C10_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
index_t* new_sizes = this->sizes_ + 1;
index_t* new_strides = this->strides_ + 1;
return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
}
C10_DEVICE const TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) const {
const index_t* new_sizes = this->sizes_ + 1;
const index_t* new_strides = this->strides_ + 1;
return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
}
/// Returns a PackedTensorAccessor of the same dimension after transposing the
/// two dimensions given. Does not actually move elements; transposition is
/// made by permuting the size/stride arrays. If the dimensions are not valid,
/// asserts.
C10_HOST GenericPackedTensorAccessor<T, N, PtrTraits, index_t> transpose(
index_t dim1,
index_t dim2) const {
this->bounds_check_(dim1);
this->bounds_check_(dim2);
GenericPackedTensorAccessor<T, N, PtrTraits, index_t> result(
this->data_, this->sizes_, this->strides_);
std::swap(result.strides_[dim1], result.strides_[dim2]);
std::swap(result.sizes_[dim1], result.sizes_[dim2]);
return result;
}
};
template<typename T, template <typename U> class PtrTraits, typename index_t>
class GenericPackedTensorAccessor<T,1,PtrTraits,index_t> : public GenericPackedTensorAccessorBase<T,1,PtrTraits,index_t> {
public:
typedef typename PtrTraits<T>::PtrType PtrType;
C10_HOST GenericPackedTensorAccessor(
PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
// if index_t is not int64_t, we want to have an int64_t constructor
template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
C10_HOST GenericPackedTensorAccessor(
PtrType data_,
const source_index_t* sizes_,
const source_index_t* strides_)
: GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
C10_DEVICE T & operator[](index_t i) {
return this->data_[this->strides_[0] * i];
}
C10_DEVICE const T& operator[](index_t i) const {
return this->data_[this->strides_[0]*i];
}
// Same as in the general N-dimensional case, but note that in the
// 1-dimensional case the returned PackedTensorAccessor will always be an
// identical copy of the original
C10_HOST GenericPackedTensorAccessor<T, 1, PtrTraits, index_t> transpose(
index_t dim1,
index_t dim2) const {
this->bounds_check_(dim1);
this->bounds_check_(dim2);
return GenericPackedTensorAccessor<T, 1, PtrTraits, index_t>(
this->data_, this->sizes_, this->strides_);
}
};
using GenericPackedTensorAccessorBase = torch::headeronly::detail::GenericPackedTensorAccessorBase<detail::IndexBoundsCheck<N, index_t>, T, N, PtrTraits, index_t>;
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
using GenericPackedTensorAccessor = torch::headeronly::detail::GenericPackedTensorAccessor<TensorAccessor<T, N-1, PtrTraits, index_t>, detail::IndexBoundsCheck<N, index_t>, T, N, PtrTraits, index_t>;
// Can't put this directly into the macro function args because of commas
#define AT_X GenericPackedTensorAccessor<T, N, PtrTraits, index_t>

View File

@ -245,6 +245,9 @@ class TORCH_API TensorBase {
size_t weak_use_count() const noexcept {
return impl_.weak_use_count();
}
bool is_uniquely_owned() const noexcept {
return impl_.is_uniquely_owned();
}
std::string toString() const;

View File

@ -3,6 +3,7 @@
#include <cstdint>
#include <map>
#include <shared_mutex>
#include <cuda_runtime_api.h>
#include <cusparse.h>
@ -88,8 +89,13 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();
TORCH_CUDA_CPP_API void clearCublasWorkspaces();
TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace();
TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace();
struct WorkspaceMapWithMutex {
std::map<std::tuple<void*, void*>, at::DataPtr> map;
std::shared_mutex mutex;
};
TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize();
TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize();
TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace();

View File

@ -175,17 +175,24 @@ void CUDAGraph::instantiate() {
// Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people,
// who prefer not to report error message through these arguments moving forward
// (they prefer return value, or errors on api calls internal to the capture)
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000)
AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, 0));
// ROCM appears to fail with HIP error: invalid argument
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && !defined(USE_ROCM)
AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, cudaGraphInstantiateFlagUseNodePriority));
#else
AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
#endif
//Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory.
//It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch.
} else {
#if !defined(USE_ROCM)
AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
graph_,
cudaGraphInstantiateFlagAutoFreeOnLaunch | cudaGraphInstantiateFlagUseNodePriority));
#else
AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
graph_,
cudaGraphInstantiateFlagAutoFreeOnLaunch));
#endif
}
has_graph_exec_ = true;
}
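As a reference for the flag handling above, here is a minimal host-side sketch of stream capture followed by instantiation with flags using the public CUDA runtime API. It is illustrative only (error checking omitted, not part of the diff); the version guard mirrors the CUDA_VERSION >= 12000 condition in the change:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  int* buf = nullptr;
  cudaMalloc(&buf, sizeof(int) * 1024);

  // Capture some work on the stream into a graph.
  cudaGraph_t graph;
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  cudaMemsetAsync(buf, 0, sizeof(int) * 1024, stream);  // captured work
  cudaStreamEndCapture(stream, &graph);

  // Instantiate with flags; node priorities are only requested on CUDA 12+.
  cudaGraphExec_t graph_exec;
#if CUDART_VERSION >= 12000
  cudaGraphInstantiateWithFlags(
      &graph_exec, graph,
      cudaGraphInstantiateFlagAutoFreeOnLaunch | cudaGraphInstantiateFlagUseNodePriority);
#else
  cudaGraphInstantiateWithFlags(&graph_exec, graph, cudaGraphInstantiateFlagAutoFreeOnLaunch);
#endif
  cudaGraphLaunch(graph_exec, stream);
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(graph_exec);
  cudaGraphDestroy(graph);
  cudaFree(buf);
  cudaStreamDestroy(stream);
  std::printf("graph launched\n");
}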

View File

@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) {
// - Comments of @soumith copied from cuDNN handle pool implementation
#ifdef NO_CUDNN_DESTROY_HANDLE
#else
cublasDestroy(handle);
cublasDestroy(handle);
#endif
}
@ -107,19 +107,27 @@ using CuBlasPoolType = DeviceThreadHandlePool<cublasHandle_t, createCublasHandle
} // namespace
std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace() {
static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() {
static auto& instance = *new WorkspaceMapWithMutex;
return instance;
}
std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace() {
static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() {
static auto& instance = *new WorkspaceMapWithMutex;
return instance;
}
void clearCublasWorkspaces() {
cublas_handle_stream_to_workspace().clear();
cublaslt_handle_stream_to_workspace().clear();
{
auto& workspace = cublas_handle_stream_to_workspace();
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
workspace.map.clear();
}
{
auto& workspace = cublaslt_handle_stream_to_workspace();
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
workspace.map.clear();
}
}
size_t parseChosenWorkspaceSize() {
@ -233,6 +241,38 @@ at::DataPtr getNewCUDABlasLtWorkspace() {
return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize());
}
void setWorkspaceForHandle(cublasHandle_t handle, c10::cuda::CUDAStream stream) {
cudaStream_t _stream = stream;
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
auto& workspace = cublas_handle_stream_to_workspace();
size_t workspace_size = getChosenWorkspaceSize();
// Fast path: check if workspace already exists
{
std::shared_lock<std::shared_mutex> lock(workspace.mutex);
auto workspace_it = workspace.map.find(key);
if (workspace_it != workspace.map.end()) {
TORCH_CUDABLAS_CHECK(cublasSetWorkspace(
handle, workspace_it->second.get(), workspace_size));
return;
}
}
// Slow path: allocate workspace outside the lock
auto new_workspace = getNewWorkspace();
// Insert with lock (double-check in case another thread inserted while we
// were allocating)
{
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
auto workspace_it = workspace.map.try_emplace(key, std::move(new_workspace)).first;
TORCH_CUDABLAS_CHECK(
cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size));
}
}
void* getCUDABlasLtWorkspace() {
#ifndef USE_ROCM
static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true;
@ -241,8 +281,10 @@ void* getCUDABlasLtWorkspace() {
auto stream = c10::cuda::getCurrentCUDAStream();
cudaStream_t _stream = stream;
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
auto& workspace = at::cuda::cublas_handle_stream_to_workspace();
std::shared_lock<std::shared_mutex> lock(workspace.mutex);
auto workspace_it = workspace.map.find(key);
TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end());
return workspace_it->second.mutable_get();
}
#endif
@ -250,11 +292,29 @@ void* getCUDABlasLtWorkspace() {
auto stream = c10::cuda::getCurrentCUDAStream();
cudaStream_t _stream = stream;
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
auto workspace_it = cublaslt_handle_stream_to_workspace().find(key);
if (workspace_it == cublaslt_handle_stream_to_workspace().end()) {
workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()});
auto& workspace = cublaslt_handle_stream_to_workspace();
// Fast path: check if workspace already exists
{
std::shared_lock<std::shared_mutex> lock(workspace.mutex);
auto workspace_it = workspace.map.find(key);
if (workspace_it != workspace.map.end()) {
return workspace_it->second.mutable_get();
}
}
// Slow path: allocate workspace outside the lock
auto new_workspace = getNewCUDABlasLtWorkspace();
// Insert with lock (double-check in case another thread inserted while we
// were allocating)
{
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
auto workspace_it =
workspace.map.try_emplace(key, std::move(new_workspace)).first;
return workspace_it->second.mutable_get();
}
return workspace_it->second.mutable_get();
}
cublasHandle_t getCurrentCUDABlasHandle() {
@ -298,13 +358,8 @@ cublasHandle_t getCurrentCUDABlasHandle() {
// will allocate memory dynamically (even if they're cheap) outside
// PyTorch's CUDA caching allocator. It's possible that CCA used up
// all the memory and cublas's cudaMallocAsync will return OOM
cudaStream_t _stream = stream;
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
auto workspace_it = cublas_handle_stream_to_workspace().find(key);
if (workspace_it == cublas_handle_stream_to_workspace().end()) {
workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()});
}
TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize()));
setWorkspaceForHandle(handle, stream);
#if !defined(USE_ROCM)
// On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup
// FP32 data type calculations based on the value of the allow_tf32 flag.
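All the new WorkspaceMapWithMutex callers above follow the same shape: a shared_lock fast path, allocation performed with no lock held, then try_emplace under a unique_lock so a racing insert of the same key is harmless (the first insert wins). A minimal sketch of that pattern, with illustrative names (WorkspaceCache, get_or_create) that are not from the diff:

#include <cstdio>
#include <map>
#include <mutex>
#include <shared_mutex>
#include <string>

struct WorkspaceCache {
  std::map<int, std::string> map;
  std::shared_mutex mutex;

  const std::string& get_or_create(int key) {
    {
      std::shared_lock<std::shared_mutex> lock(mutex);  // fast path: shared, read-only
      auto it = map.find(key);
      if (it != map.end()) {
        return it->second;
      }
    }
    std::string fresh = "workspace-" + std::to_string(key);  // allocate outside any lock
    std::unique_lock<std::shared_mutex> lock(mutex);          // slow path: exclusive
    // try_emplace does nothing if another thread already inserted this key.
    return map.try_emplace(key, std::move(fresh)).first->second;
  }
};

int main() {
  WorkspaceCache cache;
  std::printf("%s\n", cache.get_or_create(3).c_str());  // workspace-3 (created)
  std::printf("%s\n", cache.get_or_create(3).c_str());  // workspace-3 (cached)
}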

View File

@ -1936,7 +1936,7 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_o
// We order the tensors. t1 will be the larger tensor
// We can always transpose tensor2 as the dimensions are always >= 1 (precondition from matmul)
// and tensor1_larger iff tensor2.dim() > tensor1.dim()
const auto t1 = tensor1_larger ? MaybeOwned<Tensor>::borrowed(tensor1)
: MaybeOwned<Tensor>::owned(tensor2.mT());
const int64_t dim_t1 = t1->dim();
@ -1948,11 +1948,20 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_o
return false;
}
// If we require a gradient, we should fold to minimize backward memory usage - even if this
// leads to a copy in the forward because it is needed in the backward;
// the only time we avoid this is with strict pre-allocated memory usage (has_out = True)
bool requires_grad = tensor1.requires_grad() || tensor2.requires_grad();
if (requires_grad && !has_out) {
// In this case we *do* incur an extra copy to avoid creating an unnecessarily large tensor in the backward
// Suppose we don't fold here. Let t1.shape = [b, m, n] t2.shape = [n, k] like in a transformer
// t2 will be expanded to a tensor of shape [b, n, k] and then we do t1.bmm(t2_expanded)
// The issue appears in the backward.
// The output gradient g of this operation would have shape [b, m, k]
// The backward wrt. t2 of bmm would be given by t1.mH @ g, which has shape [b, n, k]
// Then, the backward of expand is simply `sum(0)`. As such, we are instantiating a tensor
// of shape [b, n, k] unnecessarily, which may cause a large memory footprint, and in the
// worst case, an OOM
bool t2_requires_grad = tensor1_larger ? tensor2.requires_grad() : tensor1.requires_grad();
if (t2_requires_grad && !has_out) {
// We should be checking !at::GradMode::is_enabled(), but apparently
// this regresses performance in some cases:
// https://github.com/pytorch/pytorch/issues/118548#issuecomment-1916022394
return true;
}

View File

@ -1087,7 +1087,8 @@ TORCH_IMPL_FUNC(index_copy_out)
result.copy_(self);
// See Note [Enabling Deterministic Operations]
if (result.is_cuda() && globalContext().deterministicAlgorithms()) {
if ((result.is_cuda() || result.is_xpu()) &&
globalContext().deterministicAlgorithms()) {
torch::List<std::optional<Tensor>> indices;
indices.resize(dim + 1);
indices.set(dim, index);

View File

@ -296,7 +296,7 @@ template <typename scalar_t, typename res_scalar_t = scalar_t>
bool launchGemmAndBiasCublasLt(
// args contains result which is modified
cublasCommonArgs& args,
const Tensor& self,
const std::optional<Tensor>& self,
const Scalar& alpha,
Activation activation = Activation::None
) {
@ -304,12 +304,8 @@ bool launchGemmAndBiasCublasLt(
// or when it can be squeezed to 1D.
// self_ptr == nullptr implies ignore bias epilogue
// and use standard gemm-like API.
const auto* self_ptr = [&]() -> auto {
if (self.dim() == 1 || self.squeeze().dim() == 1) {
return self.const_data_ptr<scalar_t>();
}
return static_cast<const scalar_t*>(nullptr);
}();
const auto* self_ptr = self.has_value() ? self.value().const_data_ptr<scalar_t>() : static_cast<const scalar_t*>(nullptr);
const auto tuning_ctx = at::cuda::tunable::getTuningContext();
if (tuning_ctx->IsTunableOpEnabled()) {
@ -392,35 +388,30 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
#ifdef USE_ROCM
// Conditioned on the device index, which is not persistent
disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
disable_addmm_cuda_lt = disable_addmm_cuda_lt || isGloballyDisabledAddmmCudaLt(self.device());
#endif
// Condition on the input
disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
// }
disable_addmm_cuda_lt = disable_addmm_cuda_lt || !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation);
at::ScalarType scalar_type = mat1.scalar_type();
bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;
#ifdef USE_ROCM
disable_addmm_cuda_lt = disable_addmm_cuda_lt || is_float_output_with_half_input;
#endif
bool use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
// for float output with half input cublasLT with bias produces wrong results
use_bias_ptr_lt &= !is_float_output_with_half_input;
// Handle result/self shapes
if (!result.is_same(self)) {
at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});
// We use bias ptr in the Lt path only when bias is 1D
const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
if (!use_bias_ptr_lt) {
// We do expand self even before
// check for beta != 0.0 to make sure that
// test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
// runs green.
return expand_size(self, result.sizes(), "addmm");
}
return c10::MaybeOwned<Tensor>::borrowed(self);
}();
// We do not copy bias only when we need the bias ptr
if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
// NOTE: self should broadcast over result
at::native::copy_(result, *self_maybe_expanded);
at::native::copy_(result, *expand_size(self, result.sizes(), "addmm"));
}
}
@ -468,7 +459,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
scalar_type,
"addmm_cuda_lt",
[&] {
lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, self, alpha, activation);
lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
}
);
#endif
@ -480,7 +471,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
scalar_type,
"addmm_cuda_lt",
[&] {
lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, self, alpha, activation);
lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
}
);
} // end is_float_output_with_half_input
@ -936,7 +927,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
return _int_mm_out_cuda(self, mat2, result);
}
static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
// ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm
TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor");
TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor");
@ -960,7 +951,7 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
(out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
if (!is_bmm && self_baddbmm.has_value()) {
if (self_baddbmm.has_value()) {
const auto& self = self_baddbmm.value();
TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor");
TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output");
@ -968,15 +959,12 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
}
Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
IntArrayRef batch1_sizes = batch1.sizes();
IntArrayRef batch2_sizes = batch2.sizes();
Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype));
Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out);
}
Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) {
baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true);
baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype);
Scalar beta(0.0);
Scalar alpha(1.0);
{
@ -988,14 +976,16 @@ Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at
}
Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
// We need to copy the tensor
Tensor out = self.clone().to(self.options().dtype(out_dtype));
return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out);
TORCH_CHECK(self.scalar_type() == out_dtype || self.scalar_type() == batch1.dtype(),
"self dtype must match either out_dtype or batch1 dtype");
Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
return _baddbmm_out_dtype_cuda(self, batch1, batch2, out_dtype, beta, alpha, out);
}
Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self);
baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, out);
// We need to copy the tensor
out.copy_(self);
{
NoNamesGuard guard;
baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
@ -1030,24 +1020,27 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca
}
Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
Tensor result = at::empty(self.sizes(), self.options().dtype(out_dtype));
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
Tensor result = at::empty({mat1.size(0), mat2.size(1)}, self.options().dtype(out_dtype));
return _addmm_dtype_out_cuda(self, mat1, mat2, out_dtype, beta, alpha, result);
}
Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type());
TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
// repeat dimensionality checks for direct calls to `out` overload
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
TORCH_CHECK(
mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
TORCH_CHECK(out_dtype == mat1.scalar_type() ||
(out_dtype == at::ScalarType::Float && (mat1.scalar_type() == at::ScalarType::Half || mat1.scalar_type() == at::ScalarType::BFloat16)),
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
TORCH_CHECK(out_dtype == self.scalar_type() ||
(out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
TORCH_CHECK(out_dtype == self.scalar_type() || self.scalar_type() == mat1.scalar_type(),
"self dtype must match either out_dtype or mat1 dtype");
addmm_out_cuda_impl(out, self, mat1, mat2, beta, alpha);

View File

@ -78,9 +78,18 @@ __global__ void EmbeddingBag_updateOutputKernel_max(
scalar_t weightFeatMax = 0;
int64_t bag_size_ = 0;
int64_t maxWord = -1;
// Separate validation loop reduces register pressure in the main loop below.
// No early exit (break) on invalid input as benchmarking shows it degrades performance.
bool has_invalid_index = false;
for (int64_t emb = begin; emb < end; emb++) {
index_t input_idx = input[emb];
has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows);
}
CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)");
for (int64_t emb = begin; emb < end; emb++) {
bool pad = (input[emb] == padding_idx);
CUDA_KERNEL_ASSERT(input[emb] < numRows);
const int64_t weightRow = input[emb] * weight_stride0;
scalar_t weightValue = weightFeat[weightRow];
if (bag_size_ == 0 || weightValue > weightFeatMax) {
@ -129,10 +138,19 @@ __global__ void EmbeddingBag_updateOutputKernel_sum_mean(
CUDA_KERNEL_ASSERT(end >= begin);
accscalar_t weightFeatSum = 0;
int64_t bag_size_ = 0;
// Separate validation loop reduces register pressure in the main loop below.
// No early exit (break) on invalid input as benchmarking shows it degrades performance.
bool has_invalid_index = false;
for (int64_t emb = begin; emb < end; emb++) {
index_t input_idx = input[emb];
has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows);
}
CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)");
for (int64_t emb = begin; emb < end; emb++) {
index_t input_idx = input[emb];
bool pad = (input_idx == padding_idx);
CUDA_KERNEL_ASSERT(0 <= input_idx && input_idx < numRows);
const int64_t weightRow = input_idx * weight_stride0;
scalar_t weightValue = weightFeat[weightRow];
weightValue = pad ? static_cast<scalar_t>(0) : weightValue;
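The validation change above splits each kernel into a check pass that only folds a boolean flag (no break, no other state) and a work pass that can assume every index is in range. A host-side sketch of the same two-pass shape (illustrative, not the kernel code):

#include <cstdint>
#include <cstdio>
#include <vector>

// Pass 1: accumulate a validity flag without early exit, mirroring the kernel.
bool all_indices_valid(const std::vector<int64_t>& input, int64_t num_rows) {
  bool has_invalid = false;
  for (int64_t idx : input) {
    has_invalid = has_invalid || (idx < 0 || idx >= num_rows);
  }
  return !has_invalid;
}

int main() {
  std::vector<int64_t> bag = {0, 2, 1};
  if (!all_indices_valid(bag, /*num_rows=*/4)) {
    std::printf("invalid index\n");
    return 1;
  }
  // Pass 2: the real accumulation loop would run here, free of bounds checks.
  std::printf("ok\n");
}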

View File

@ -78,9 +78,9 @@ _mx8_mx8_bf16_grouped_mm_fbgemm(
const Tensor& mat_a,
const Tensor& mat_b,
const Tensor& scale_a,
const SwizzleType& swizzle_a,
const SwizzleType swizzle_a,
const Tensor& scale_b,
const SwizzleType& swizzle_b,
const SwizzleType swizzle_b,
const std::optional<at::Tensor>& offs,
Tensor& out) {
const bool a_is_2d = mat_a.dim() == 2;

View File

@ -5,69 +5,11 @@
#include <cuda_bf16.h>
#endif
// ROCm 6.3 is planned to have these functions, but until then here they are.
#if defined(USE_ROCM)
#include <device_functions.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bf16.h>
__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
#if (defined(__gfx942__)) && \
__has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));
union {
__hip_bfloat162_raw bf162_raw;
vec_short2 vs2;
} u{static_cast<__hip_bfloat162_raw>(value)};
u.vs2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, u.vs2);
return static_cast<__hip_bfloat162>(u.bf162_raw);
#else
static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw));
union u_hold {
__hip_bfloat162_raw h2r;
unsigned int u32;
};
u_hold old_val, new_val;
old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
do {
new_val.h2r = __hadd2(old_val.h2r, value);
} while (!__hip_atomic_compare_exchange_strong(
(unsigned int*)address, &old_val.u32, new_val.u32,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
return old_val.h2r;
#endif
}
__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) {
#if (defined(__gfx942__)) && \
__has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16)
// The api expects an ext_vector_type of half
typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162;
static_assert(sizeof(vec_fp162) == sizeof(__half2_raw));
union {
__half2_raw h2r;
vec_fp162 fp16;
} u {static_cast<__half2_raw>(value)};
u.fp16 = __builtin_amdgcn_flat_atomic_fadd_v2f16((vec_fp162*)address, u.fp16);
return static_cast<__half2>(u.h2r);
#else
static_assert(sizeof(__half2_raw) == sizeof(unsigned int));
union u_hold {
__half2_raw h2r;
unsigned int u32;
};
u_hold old_val, new_val;
old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
do {
new_val.h2r = __hadd2(old_val.h2r, value);
} while (!__hip_atomic_compare_exchange_strong(
(unsigned int*)address, &old_val.u32, new_val.u32,
__ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT));
return old_val.h2r;
#endif
}
#define ATOMICADD preview_unsafeAtomicAdd
#define ATOMICADD unsafeAtomicAdd
#define NATIVE_ZERO_BF16 __float2bfloat16(0.0f)
#else
#define ATOMICADD atomicAdd

View File

@ -2,18 +2,250 @@
#include <ATen/Dispatch.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/cuda/JitLoops.cuh>
#include <ATen/native/cuda/jit_utils.h>
#include <ATen/native/cuda/ScanUtils.cuh>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/BinaryOps.h>
#include <ATen/OpMathType.h>
#include <c10/util/MathConstants.h>
#include <c10/util/complex.h>
#include <cmath>
#include <limits>
// NOTE: CUDA on Windows requires that the enclosing function
// of a __device__ lambda not have internal linkage.
namespace at::native {
// custom min and max to be used in logaddexp for complex arguments
template <typename scalar_t, bool min>
__host__ __device__ c10::complex<scalar_t> _logaddexp_minmax(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
scalar_t xr = std::real(x);
scalar_t yr = std::real(y);
if (::isnan(yr) || (::isnan(std::imag(y)))) {
return y;
} else if (::isnan(xr) || (::isnan(std::imag(x)))) {
return x;
} else if (min) { // min
return (xr < yr) ? x : y;
} else { // max
return (xr >= yr) ? x : y;
}
}
template <typename scalar_t>
__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) {
// Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
// Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM
const auto isnan_x = at::_isnan(x);
const auto isnan_y = at::_isnan(y);
scalar_t min = isnan_y ? y : (isnan_x ? x : std::min(x, y));
scalar_t max = isnan_y ? y : (isnan_x ? x : std::max(x, y));
if (min != max || ::isfinite(min)) {
// nan will be propagated here
return ::log1p(std::exp(min - max)) + max;
} else {
// special case to correctly handle infinite cases
return x;
}
}
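The helper above is the usual numerically stable form logaddexp(x, y) = max + log1p(exp(min - max)), which never exponentiates a large positive number. A stand-alone CPU sketch of the same identity (illustrative only, not the kernel code):

#include <algorithm>
#include <cmath>
#include <cstdio>

double logaddexp_ref(double x, double y) {
  const double mn = std::min(x, y);
  const double mx = std::max(x, y);
  if (mn != mx || std::isfinite(mn)) {
    return std::log1p(std::exp(mn - mx)) + mx;  // NaN still propagates through here
  }
  return x;  // both +inf or both -inf: return the input unchanged
}

int main() {
  // Naive log(exp(1000) + exp(1000)) overflows; the stable form gives ~1000.693.
  std::printf("%f\n", logaddexp_ref(1000.0, 1000.0));
}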
template <typename scalar_t>
__host__ __device__ c10::complex<scalar_t> _fast_build_exp(const c10::complex<scalar_t>& x) {
// complex exponential function, but implemented manually to get fast compilation time
// this function only handles the case where the x is finite (not inf nor nan)
const auto xreal = std::real(x);
const auto ximag = std::imag(x);
const auto exp_x_abs = std::exp(xreal);
auto exp_x_real = exp_x_abs * std::cos(ximag);
auto exp_x_imag = exp_x_abs * std::sin(ximag);
return {exp_x_real, exp_x_imag};
}
template <typename scalar_t>
__host__ __device__ c10::complex<scalar_t> _fast_build_exp_inf(const c10::complex<scalar_t>& x) {
// complex exponential function, but implemented manually to get fast compilation time
// this function only handles the case where the real part of x is infinite
const auto ximag = std::imag(x);
constexpr auto exp_x_abs = std::numeric_limits<scalar_t>::infinity();
if (!::isfinite(ximag)) { // add this to make consistent with std::exp(x+yi)
return {exp_x_abs, std::numeric_limits<scalar_t>::quiet_NaN()};
}
const auto sin = std::sin(ximag);
const auto cos = std::cos(ximag);
// special case if the angle is exactly the multiple of pi/2
auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos;
auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin;
return {exp_x_real, exp_x_imag};
}
template <typename scalar_t>
__host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
c10::complex<scalar_t> min = _logaddexp_minmax<scalar_t, /*min=*/true>(x, y);
c10::complex<scalar_t> max = _logaddexp_minmax<scalar_t, /*min=*/false>(x, y);
scalar_t min_real = std::real(min);
scalar_t max_real = std::real(max);
if (::isnan(min_real) || ::isnan(std::imag(min))) {
// handling the "infectious" NaNs
return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
}
else if ((!::isfinite(min_real)) && (min_real == max_real)) {
if (min_real < 0) {
// handle the -inf case, the imaginary part here does not really matter as the exp(value)
// will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
// It does not matter if we're taking the exp of this value
return min;
} else {
// handle the +inf case, we don't need the special precision for log1p for small values
// and to avoid producing nan in case of real(max) == real(min) == +inf
const auto exp_min = _fast_build_exp_inf(min);
const auto exp_max = _fast_build_exp_inf(max);
return ::log1p(exp_min + exp_max - 1); // log1p(x - 1) builds faster than log
}
} else {
const auto minmax = min - max;
c10::complex<scalar_t> exp_minmax;
if (!::isfinite(minmax.real())) {
exp_minmax = minmax.real() < 0 ? c10::complex<scalar_t>{0.0, 0.0} : _fast_build_exp_inf(minmax);
} else {
exp_minmax = _fast_build_exp(minmax);
}
return ::log1p(exp_minmax) + max;
}
}
// Complex logaddexp jiterator string
const auto logaddexp_complex_string = jiterator_stringify(
template<typename T>
std::complex<T> log1p(const std::complex<T>& z)
{
using complex_t = std::complex<T>;
T x = z.real();
T y = z.imag();
T zabs = abs(z);
T theta = atan2(y, x + T(1));
if (zabs < 0.5) {
T r = x * (T(2) + x) + y * y;
if (r == 0) { // handle underflow
return complex_t(x, theta);
}
return complex_t(T(0.5) * std::log1p(r), theta);
} else {
T z0 = std::hypot(x + 1, y);
return complex_t(log(z0), theta);
}
}
// separated _logaddexp_minmax into 2 different functions for jiterator_string
template <typename T>
std::complex<T> logaddexp_min(const std::complex<T>& x, const std::complex<T>& y) {
T xr = x.real();
T yr = y.real();
if (isnan(yr) || isnan(y.imag())) {
return y;
} else if (isnan(xr) || isnan(x.imag())) {
return x;
} else {
return (xr < yr) ? x : y;
}
}
template <typename T>
std::complex<T> logaddexp_max(const std::complex<T>& x, const std::complex<T>& y) {
T xr = x.real();
T yr = y.real();
if (isnan(yr) || isnan(y.imag())) {
return y;
} else if (isnan(xr) || isnan(x.imag())) {
return x;
} else {
return (xr >= yr) ? x : y;
}
}
template <typename T>
std::complex<T> fast_build_exp(const std::complex<T>& x) {
const auto xreal = x.real();
const auto ximag = x.imag();
const auto exp_x_abs = exp(xreal);
auto exp_x_real = exp_x_abs * cos(ximag);
auto exp_x_imag = exp_x_abs * sin(ximag);
return std::complex<T>(exp_x_real, exp_x_imag);
}
template <typename T>
std::complex<T> fast_build_exp_inf(const std::complex<T>& x) {
using complex_t = std::complex<T>;
const auto ximag = x.imag();
const T exp_x_abs = INFINITY;
if (!isfinite(ximag)) {
return complex_t(exp_x_abs, NAN);
}
const auto sin_val = sin(ximag);
const auto cos_val = cos(ximag);
auto exp_x_real = (cos_val == T(0)) ? T(0) : exp_x_abs * cos_val;
auto exp_x_imag = (sin_val == T(0)) ? T(0) : exp_x_abs * sin_val;
return complex_t(exp_x_real, exp_x_imag);
}
template <typename complex_t>
complex_t logaddexp_complex(complex_t x, complex_t y) {
using T = typename complex_t::value_type;
complex_t min_val = logaddexp_min(x, y);
complex_t max_val = logaddexp_max(x, y);
T min_real = min_val.real();
T max_real = max_val.real();
if (isnan(min_real) || isnan(min_val.imag())) {
return complex_t(NAN, NAN);
}
else if ((!isfinite(min_real)) && (min_real == max_real)) {
if (min_real < T(0)) {
return min_val;
} else {
const auto exp_min = fast_build_exp_inf<T>(min_val);
const auto exp_max = fast_build_exp_inf<T>(max_val);
return log1p(exp_min + exp_max - complex_t(1, 0));
}
} else {
const auto minmax = min_val - max_val;
complex_t exp_minmax;
if (!isfinite(minmax.real())) {
exp_minmax = (minmax.real() < T(0)) ? complex_t(0, 0) : fast_build_exp_inf<T>(minmax);
} else {
exp_minmax = fast_build_exp<T>(minmax);
}
return log1p(exp_minmax) + max_val;
}
}
);
constexpr char logaddexp_complex_name[] = "logaddexp_complex";
void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
AT_DISPATCH_FLOATING_TYPES_AND2(
if (at::isComplexType(iter.dtype())) {
#if AT_USE_JITERATOR()
AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
jitted_gpu_kernel<
/*name=*/logaddexp_complex_name,
/*return_dtype=*/scalar_t,
/*common_dtype=*/scalar_t,
/*arity=*/2>(iter, logaddexp_complex_string);
});
#else
AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
using opmath_t = at::opmath_type<scalar_t>;
gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t {
const auto a = static_cast<opmath_t>(a_);
const auto b = static_cast<opmath_t>(b_);
return static_cast<scalar_t>(_log_add_exp_helper(a, b));
});
});
#endif
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(
ScalarType::BFloat16, ScalarType::Half,
iter.dtype(), "logaddexp_cuda",
[&]() {
@ -29,6 +261,7 @@ void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
}
});
});
}
}
void logaddexp2_kernel_cuda(TensorIteratorBase& iter) {

View File

@ -740,7 +740,12 @@ _scaled_rowwise_rowwise(
TORCH_CHECK_VALUE(scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, "scale_a must have ", mat_a.size(0), " Float elements, got ", scale_a.numel())
TORCH_CHECK_VALUE(scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, "scale_b must have ", mat_b.size(1), " Float elements, got ", scale_b.numel())
TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1));
// if we have a scale of shape [256, 1] (say), then stride can be [1, 0] - handle this case
TORCH_CHECK_VALUE(
scale_a.stride(1) == 1 ||
scale_a.size(1) == 1,
"expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1)
);
TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));
auto scaling_choice_a = ScalingType::RowWise;
@ -1096,6 +1101,19 @@ _scaled_mxfp8_mxfp8(
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
}
void
_check_mxfp4_support() {
#ifndef USE_ROCM
auto dprops = at::cuda::getCurrentDeviceProperties();
// Only on B200 GPUs
TORCH_CHECK_NOT_IMPLEMENTED(
// B200 = 10.0, B300 = 10.3
dprops->major == 10,
"MXFP4 scaling only supported in CUDA for B200/B300"
);
#endif
}
Tensor&
_scaled_mxfp4_mxfp4(
@ -1108,6 +1126,7 @@ _scaled_mxfp4_mxfp4(
#if defined(_WIN32) || (!defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI))
TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only");
#else
_check_mxfp4_support();
// Restrictions:
// A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",

View File

@ -82,6 +82,7 @@ NSArray<NSNumber*>* getTensorAxes(const TensorBase& t);
NSArray<NSNumber*>* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim);
std::string getMPSShapeString(MPSShape* shape);
std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true, bool exclude_shape = false);
std::string to_hex_key(float);
std::string getArrayRefString(const IntArrayRef s);
// use has_storage() on the returned tensor to determine if src actually is a view
Tensor gatherViewTensor(const Tensor& src, Tensor& dst);

View File

@ -301,6 +301,10 @@ std::string getArrayRefString(const IntArrayRef s) {
return fmt::to_string(fmt::join(s, ","));
}
std::string to_hex_key(float f) {
return fmt::format("{:a}", f);
}
std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype, bool exclude_shape) {
fmt::basic_memory_buffer<char, 100> buffer;
auto buf_iterator = std::back_inserter(buffer);

View File

@ -91,25 +91,30 @@ static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#include <ATen/native/mps/Repeat_metallib.h>
#endif
template <typename index_t>
void computeRepeatIndices(const index_t* repeat_ptr,
const int64_t* cumsum_ptr,
index_t* result_ptr,
int64_t size,
int64_t result_size) {
id<MTLBuffer> repeatBuffer = reinterpret_cast<id<MTLBuffer>>(repeat_ptr);
id<MTLBuffer> cumsumBuffer = reinterpret_cast<id<MTLBuffer>>(cumsum_ptr);
id<MTLBuffer> resultBuffer = reinterpret_cast<id<MTLBuffer>>(result_ptr);
TORCH_CHECK(repeatBuffer && cumsumBuffer && resultBuffer);
Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
TORCH_CHECK(repeat.dim() == 1, "repeat_interleave only accept 1D vector as repeat");
std::string scalar_type;
if constexpr (std::is_same_v<index_t, int32_t>) {
if (repeat.scalar_type() == kInt) {
scalar_type = "int32_t";
} else if constexpr (std::is_same_v<index_t, int64_t>) {
} else if (repeat.scalar_type() == kLong) {
scalar_type = "int64_t";
} else {
TORCH_CHECK(false, "repeat_interleave: unsupported indexing data type");
TORCH_CHECK(false, "repeats has to be Long or Int tensor");
}
if (repeat.size(0) == 0) {
return at::empty_like(repeat, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
Tensor repeat_ = repeat.contiguous();
Tensor cumsum = repeat.cumsum(0);
int64_t total = 0;
if (output_size.has_value()) {
total = output_size.value();
} else {
total = cumsum[-1].item<int64_t>();
TORCH_CHECK((repeat >= 0).all().item<uint8_t>(), "repeats can not be negative");
}
auto result = at::empty({total}, repeat.options());
MPSStream* mpsStream = getCurrentMPSStream();
dispatch_sync(mpsStream->queue(), ^() {
@ -121,20 +126,13 @@ void computeRepeatIndices(const index_t* repeat_ptr,
getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false);
[computeEncoder setComputePipelineState:pipelineState];
mps::mtl_setArgs(computeEncoder, repeatBuffer, cumsumBuffer, resultBuffer, size);
mps::mtl_dispatch1DJob(computeEncoder, pipelineState, size);
mps::mtl_setArgs(computeEncoder, repeat_, cumsum, result, repeat.size(0));
mps::mtl_dispatch1DJob(computeEncoder, pipelineState, repeat.size(0));
getMPSProfiler().endProfileKernel(pipelineState);
}
});
}
Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
Tensor output;
AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() {
output = repeat_interleave_common<index_t, computeRepeatIndices<index_t>>(repeat, output_size);
});
return output;
return result;
}
} // namespace at::native
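The Metal kernel dispatched above fills result[cumsum[i-1] .. cumsum[i]) with the value i for each repeat entry, and the output length is the last cumsum element. A CPU sketch of that index computation (illustrative, not the MPS code):

#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int64_t> repeat_interleave_indices(const std::vector<int64_t>& repeats) {
  // Inclusive prefix sum of the repeat counts.
  std::vector<int64_t> cumsum(repeats.size());
  int64_t running = 0;
  for (size_t i = 0; i < repeats.size(); ++i) {
    running += repeats[i];
    cumsum[i] = running;
  }
  // Each index i owns the output slots [cumsum[i-1], cumsum[i]).
  std::vector<int64_t> result(running);
  for (size_t i = 0; i < repeats.size(); ++i) {
    int64_t begin = (i == 0) ? 0 : cumsum[i - 1];
    for (int64_t j = begin; j < cumsum[i]; ++j) {
      result[j] = static_cast<int64_t>(i);
    }
  }
  return result;
}

int main() {
  for (int64_t v : repeat_interleave_indices({1, 0, 2})) {
    std::printf("%lld ", static_cast<long long>(v));  // prints: 0 2 2
  }
  std::printf("\n");
}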

View File

@ -5,6 +5,7 @@
#include <ATen/native/Resize.h>
#include <ATen/native/TensorCompare.h>
#include <ATen/native/mps/OperationUtils.h>
#include <algorithm>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -89,13 +90,21 @@ static void check_min_max_dims(const OptionalTensorRef clamp_opt, const Tensor&
auto clamp_shape = clamp_opt->sizes();
auto input_shape = input_t.sizes();
TORCH_CHECK(num_clamp_dims <= num_input_dims,
op_name + ": clamp tensor number of dims must not be greater than that of input tensor")
if (num_clamp_dims > num_input_dims) {
auto leading_dims = num_clamp_dims - num_input_dims;
for (int64_t i = 0; i < leading_dims; ++i) {
TORCH_CHECK(clamp_shape[i] == 1,
op_name + ": clamp tensor leading shape must be 1 to broadcast with input tensor");
}
}
for (int i = 0; i < num_clamp_dims; i++)
auto clamp_idx = num_clamp_dims - 1;
auto input_idx = num_input_dims - 1;
auto common_dims = std::min(num_clamp_dims, num_input_dims);
for (int64_t i = 0; i < common_dims; ++i)
// One of the indices is allowed to be 1; will be handled by broadcast
TORCH_CHECK(clamp_shape[num_clamp_dims - 1 - i] == input_shape[num_input_dims - 1 - i] ||
clamp_shape[num_clamp_dims - 1 - i] == 1 || input_shape[num_input_dims - 1 - i] == 1,
TORCH_CHECK(clamp_shape[clamp_idx - i] == input_shape[input_idx - i] || clamp_shape[clamp_idx - i] == 1 ||
input_shape[input_idx - i] == 1,
op_name + ": clamp tensor trailing shape must match input tensor")
}
}
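The reworked check above implements the usual right-aligned broadcast rule, plus the extra requirement that any leading clamp dimensions beyond the input rank must be 1. A small stand-alone sketch of the same rule (the name clamp_broadcastable is illustrative, not from the diff):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

bool clamp_broadcastable(const std::vector<int64_t>& clamp,
                         const std::vector<int64_t>& input) {
  const int64_t nc = static_cast<int64_t>(clamp.size());
  const int64_t ni = static_cast<int64_t>(input.size());
  for (int64_t i = 0; i < nc - ni; ++i) {
    if (clamp[i] != 1) return false;  // extra leading clamp dims must be 1
  }
  const int64_t common = std::min(nc, ni);
  for (int64_t i = 0; i < common; ++i) {
    const int64_t c = clamp[nc - 1 - i];
    const int64_t in = input[ni - 1 - i];
    if (c != in && c != 1 && in != 1) return false;  // compare from the right
  }
  return true;
}

int main() {
  std::printf("%d\n", clamp_broadcastable({1, 4, 1}, {4, 3}));  // 1: extra leading 1 is fine
  std::printf("%d\n", clamp_broadcastable({2, 4, 3}, {4, 3}));  // 0: leading 2 cannot broadcast
}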
@ -136,9 +145,6 @@ static void clamp_tensor_out_mps(const Tensor& input_t,
auto result_type = output_t.scalar_type();
IntArrayRef new_min_shape;
IntArrayRef new_max_shape;
auto num_min_dims = min_opt->dim();
auto num_max_dims = max_opt->dim();
auto num_input_dims = input_t.dim();
@ -146,24 +152,32 @@ static void clamp_tensor_out_mps(const Tensor& input_t,
std::vector<int64_t> new_min_arr(num_input_dims);
std::vector<int64_t> new_max_arr(num_input_dims);
if (has_min && num_min_dims < num_input_dims) {
fill_new_shape(num_input_dims, num_min_dims, new_min_arr.data(), min_opt->sizes());
new_min_shape = IntArrayRef(new_min_arr);
}
if (has_max && num_max_dims < num_input_dims) {
fill_new_shape(num_input_dims, num_max_dims, new_max_arr.data(), max_opt->sizes());
new_max_shape = IntArrayRef(new_max_arr);
}
Tensor min_opt_tensor;
Tensor max_opt_tensor;
auto reshape_clamp_tensor = [&](const OptionalTensorRef clamp_tensor_ref,
int64_t num_clamp_dims,
std::vector<int64_t>& new_shape_storage) -> Tensor {
IntArrayRef clamp_shape = clamp_tensor_ref->sizes();
bool requires_view = false;
if (num_clamp_dims > num_input_dims) {
clamp_shape = clamp_shape.slice(num_clamp_dims - num_input_dims);
requires_view = true;
} else if (num_clamp_dims < num_input_dims) {
fill_new_shape(num_input_dims, num_clamp_dims, new_shape_storage.data(), clamp_shape);
clamp_shape = IntArrayRef(new_shape_storage);
requires_view = true;
}
return requires_view ? (*clamp_tensor_ref).view(clamp_shape) : *clamp_tensor_ref;
};
if (has_min) {
min_opt_tensor = (num_min_dims < num_input_dims) ? (*min_opt).view(new_min_shape) : *min_opt;
min_opt_tensor = reshape_clamp_tensor(min_opt, num_min_dims, new_min_arr);
}
if (has_max) {
max_opt_tensor = (num_max_dims < num_input_dims) ? (*max_opt).view(new_max_shape) : *max_opt;
max_opt_tensor = reshape_clamp_tensor(max_opt, num_max_dims, new_max_arr);
}
@autoreleasepool {
@ -244,8 +258,8 @@ static void clamp_scalar_out_mps(const Tensor& input_t,
@autoreleasepool {
// the optional min/max refs could affect how we build the cached graph
std::string key = op_name + (has_min ? ("_min:" + std::to_string(min_scalar)) : "") +
(has_max ? ("_max:" + std::to_string(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t});
std::string key = op_name + (has_min ? ("_min:" + to_hex_key(min_scalar)) : "") +
(has_max ? ("_max:" + to_hex_key(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t});
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
if (has_min)
newCachedGraph->minTensor = [mpsGraph constantWithScalar:min_scalar

View File

@ -4225,7 +4225,7 @@
MTIA: mm_out_mtia
MPS: mm_out_mps
XPU: mm_out_xpu
SparseCPU, SparseCUDA: _sparse_mm_out
SparseCPU, SparseCUDA, SparseMPS: _sparse_mm_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out
- func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor

View File

@ -61,6 +61,7 @@ list(APPEND ATen_CUDA_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_math_test.cu
${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_test.cu
${CMAKE_CURRENT_SOURCE_DIR}/cuda_cub_test.cu
${CMAKE_CURRENT_SOURCE_DIR}/cuda_cublas_handle_pool_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cuda_device_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu
${CMAKE_CURRENT_SOURCE_DIR}/cuda_dlconvertor_test.cpp

View File

@ -0,0 +1,77 @@
#include <gtest/gtest.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGuard.h>
#include <atomic>
#include <thread>
#include <vector>
// Test concurrent access to getCurrentCUDABlasHandle and getCUDABlasLtWorkspace
// to verify that the data race fix is working correctly
TEST(CUDABlasHandlePoolTest, ConcurrentGetAndClearWorkspaces) {
if (!at::cuda::is_available()) {
return;
}
constexpr int num_accessor_threads = 15;
constexpr int num_clear_threads = 5;
constexpr int iterations_per_thread = 50;
std::atomic<bool> stop{false};
std::atomic<int> error_count{0};
std::vector<std::thread> threads;
threads.reserve(num_accessor_threads + num_clear_threads);
// Launch accessor threads
for (int i = 0; i < num_accessor_threads; ++i) {
threads.emplace_back([&stop, &error_count]() {
try {
at::cuda::CUDAGuard device_guard(0);
while (!stop.load(std::memory_order_relaxed)) {
const auto handle = at::cuda::getCurrentCUDABlasHandle();
const auto workspace = at::cuda::getCUDABlasLtWorkspace();
if (handle == nullptr || workspace == nullptr) {
error_count++;
}
}
} catch (const std::exception& e) {
error_count++;
}
});
}
// Launch threads that clear workspaces
for (int i = 0; i < num_clear_threads; ++i) {
threads.emplace_back([&error_count]() {
try {
for (int j = 0; j < iterations_per_thread; ++j) {
at::cuda::clearCublasWorkspaces();
std::this_thread::yield();
}
} catch (const std::exception& e) {
error_count++;
}
});
}
// Let them run for a bit
std::this_thread::sleep_for(std::chrono::milliseconds(100));
stop.store(true, std::memory_order_relaxed);
for (auto& thread : threads) {
thread.join();
}
EXPECT_EQ(error_count.load(), 0);
}
int main(int argc, char* argv[]) {
::testing::InitGoogleTest(&argc, argv);
c10::cuda::CUDACachingAllocator::init(1);
return RUN_ALL_TESTS();
}

View File

@ -10,6 +10,13 @@
...
}
{
ignore_empty_generic_uninitialised_conditional_jump
Memcheck:Cond
fun:_ZN2at6detail13empty_genericEN3c108ArrayRefIlEEPNS1_9AllocatorENS1_14DispatchKeySetENS1_10ScalarTypeESt8optionalINS1_12MemoryFormatEE
...
}
{
Cond_cuda
Memcheck:Cond

View File

@ -9,28 +9,61 @@ def check_perf_csv(filename, threshold, threshold_scale):
"""
Basic performance checking.
"""
try:
df = pd.read_csv(filename)
except FileNotFoundError:
print(f"Error: File {filename} not found")
sys.exit(1)
df = pd.read_csv(filename)
effective_threshold = threshold * threshold_scale
print(f"Checking {filename} (speedup threshold >= {effective_threshold:.2f}x)\n")
failed = []
for _, row in df.iterrows():
model_name = row["name"]
speedup = row["speedup"]
if speedup < threshold * threshold_scale:
failed.append(model_name)
speedup = float(row["speedup"])
abs_latency = float(row["abs_latency"])
compilation_latency = float(row["compilation_latency"])
compression_ratio = float(row["compression_ratio"])
eager_peak_mem = float(row["eager_peak_mem"])
dynamo_peak_mem = float(row["dynamo_peak_mem"])
print(f"{model_name:34} {speedup}")
perf_summary = f"{model_name:34} speedup={speedup:.3f}x"
if pd.notna(abs_latency):
perf_summary += f", latency={abs_latency:.1f} ms/iter"
if pd.notna(compilation_latency):
perf_summary += f", compile={compilation_latency:.3f}s"
if pd.notna(compression_ratio):
perf_summary += f", mem_ratio={1 / compression_ratio:.2f}x"
if pd.notna(eager_peak_mem) and pd.notna(dynamo_peak_mem):
perf_summary += (
f" (eager={eager_peak_mem:.1f} GB, dynamo={dynamo_peak_mem:.1f} GB)"
)
if speedup < effective_threshold:
failed.append((model_name, speedup))
print(perf_summary)
if failed:
print(
textwrap.dedent(
f"""
Error {len(failed)} models performance regressed
{" ".join(failed)}
Error {len(failed)} model(s) performance regressed
{" ".join([name for name, _ in failed])}
"""
)
)
for name, sp in sorted(failed, key=lambda x: x[1]):
pct_from_target = (sp / effective_threshold - 1.0) * 100.0
print(
f" - {name}: {sp:.3f}x (< {effective_threshold:.2f}x; {pct_from_target:.1f}% from target)"
)
sys.exit(1)
else:
print(
f"\nAll {len(df)} model(s) passed threshold check (>= {effective_threshold:.2f}x)"
)
if __name__ == "__main__":
@ -44,7 +77,7 @@ if __name__ == "__main__":
"-s",
type=float,
default=1.0,
help="multiple threshold by this value to relax the check",
help="multiply threshold by this value to relax the check",
)
args = parser.parse_args()
check_perf_csv(args.file, args.threshold, args.threshold_scale)

View File

@ -2379,7 +2379,9 @@ class BenchmarkRunner:
print(
f"Load model outputs from {self.args.compare_model_outputs_with} to compare"
)
saved_result = torch.load(self.args.compare_model_outputs_with)
saved_result = torch.load(
self.args.compare_model_outputs_with, weights_only=False
)
is_bitwise_same = bitwise_same(saved_result, new_result)
if not is_bitwise_same:
print(

View File

@ -189,6 +189,10 @@ skip:
- hf_Whisper
- hf_distil_whisper
- timm_vision_transformer_large
# https://github.com/pytorch/pytorch/issues/167895
- stable_diffusion
- stable_diffusion_text_encoder
- stable_diffusion_unet
device:
cpu:

View File

@ -2,6 +2,7 @@
# These load paths point to different files in internal and OSS environment
load("@bazel_skylib//lib:paths.bzl", "paths")
load("//tools/build_defs:cell_defs.bzl", "get_fbsource_cell")
load("//tools/build_defs:fb_native_wrapper.bzl", "fb_native")
load("//tools/build_defs:fb_xplat_cxx_library.bzl", "fb_xplat_cxx_library")
load("//tools/build_defs:fb_xplat_genrule.bzl", "fb_xplat_genrule")
@ -590,6 +591,9 @@ def pt_operator_query_codegen(
pt_allow_forced_schema_registration = True,
compatible_with = [],
apple_sdks = None):
if get_fbsource_cell() == "fbcode":
return
oplist_dir_name = name + "_pt_oplist"
# @lint-ignore BUCKLINT
@ -865,6 +869,9 @@ def define_buck_targets(
pt_xplat_cxx_library = fb_xplat_cxx_library,
c2_fbandroid_xplat_compiler_flags = [],
labels = []):
if get_fbsource_cell() == "fbcode":
return
# @lint-ignore BUCKLINT
fb_native.filegroup(
name = "metal_build_srcs",

View File

@ -44,7 +44,7 @@ struct C10_API SafePyObject {
(*other.pyinterpreter_)->incref(other.data_);
}
if (data_ != nullptr) {
(*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
(*pyinterpreter_)->decref(data_);
}
data_ = other.data_;
pyinterpreter_ = other.pyinterpreter_;
@ -53,7 +53,7 @@ struct C10_API SafePyObject {
~SafePyObject() {
if (data_ != nullptr) {
(*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
(*pyinterpreter_)->decref(data_);
}
}

View File

@ -34,20 +34,6 @@ namespace c10 {
// See [dtype Macros note] in torch/headeronly/core/ScalarType.h
// regarding macros.
template <typename T>
struct CppTypeToScalarType;
#define SPECIALIZE_CppTypeToScalarType(cpp_type, scalar_type) \
template <> \
struct CppTypeToScalarType<cpp_type> \
: std:: \
integral_constant<c10::ScalarType, c10::ScalarType::scalar_type> { \
};
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(SPECIALIZE_CppTypeToScalarType)
#undef SPECIALIZE_CppTypeToScalarType
#define DEFINE_CONSTANT(_, name) \
constexpr ScalarType k##name = ScalarType::name;
@ -106,13 +92,6 @@ inline bool isComplexType(ScalarType t) {
t == ScalarType::ComplexDouble);
}
inline bool isQIntType(ScalarType t) {
// Don't forget to extend this when adding new QInt types
return t == ScalarType::QInt8 || t == ScalarType::QUInt8 ||
t == ScalarType::QInt32 || t == ScalarType::QUInt4x2 ||
t == ScalarType::QUInt2x4;
}
inline bool isBitsType(ScalarType t) {
return t == ScalarType::Bits1x8 || t == ScalarType::Bits2x4 ||
t == ScalarType::Bits4x2 || t == ScalarType::Bits8 ||

View File

@ -48,6 +48,30 @@ void warnDeprecatedDataPtr() {
TORCH_CHECK(false, "Cannot access data pointer of Storage that is invalid.");
}
void StorageImpl::incref_pyobject() const {
// Because intrusive_ptr incref uses relaxed memory order, we need to
// do an acquire fence to ensure that the kHasPyObject bit was
// observed before the load of the PyObject* below.
// NB: This is a no-op on x86/x86-64
std::atomic_thread_fence(std::memory_order_acquire);
PyObject* obj = pyobj_slot_.load_pyobj();
(*pyobj_slot_.pyobj_interpreter())->incref(obj);
}
void StorageImpl::decref_pyobject() const {
PyObject* obj = pyobj_slot_.load_pyobj();
(*pyobj_slot_.pyobj_interpreter())->decref(obj);
}
bool StorageImpl::try_incref_pyobject() const {
c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
if (C10_UNLIKELY(!interp)) {
return false;
}
return (*interp)->try_incref(pyobj_slot_);
}
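The acquire fence in incref_pyobject above pairs with a release on the writer side: a relaxed load that observes the published bit, followed by an acquire fence, is what makes the subsequent PyObject* load safe. A minimal sketch of that fence pairing, with illustrative names (Slot, publish) that are not from the diff:

#include <atomic>
#include <cstdint>
#include <cstdio>

struct Slot {
  std::atomic<uint32_t> flags{0};  // bit 0 plays the role of kHasPyObject
  void* payload = nullptr;         // written before the bit is set

  void publish(void* p) {
    payload = p;
    flags.fetch_or(1, std::memory_order_release);  // release: payload becomes visible
  }

  void* read_if_published() {
    if (flags.load(std::memory_order_relaxed) & 1) {
      // Pairs with the release above: upgrades the relaxed load to acquire semantics.
      std::atomic_thread_fence(std::memory_order_acquire);
      return payload;
    }
    return nullptr;
  }
};

int main() {
  static int value = 7;
  Slot s;
  s.publish(&value);
  void* p = s.read_if_published();
  std::printf("%d\n", p ? *static_cast<int*>(p) : -1);  // 7
}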
void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr) {
// Allowlist verification.
// Only if the devicetype is in the allowlist,

View File

@ -105,6 +105,12 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target {
data_ptr_.clear();
}
void incref_pyobject() const override final;
void decref_pyobject() const override final;
bool try_incref_pyobject() const override final;
size_t nbytes() const {
// OK to do this instead of maybe_as_int as nbytes is guaranteed positive
TORCH_CHECK(!size_bytes_is_heap_allocated_);
@ -370,4 +376,18 @@ C10_API c10::intrusive_ptr<c10::StorageImpl> make_storage_impl(
bool resizable,
std::optional<at::Device> device_opt);
namespace detail {
#ifndef C10_MOBILE
template <class T>
struct TargetTraits<
T,
std::enable_if_t<
std::is_base_of_v<c10::StorageImpl, std::remove_cv_t<T>>>> {
static constexpr bool can_have_pyobject = true;
};
#endif
} // namespace detail
} // namespace c10

View File

@ -277,7 +277,6 @@ void TensorImpl::release_resources() {
if (storage_) {
storage_ = {};
}
pyobj_slot_.maybe_destroy_pyobj();
}
#ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY
@ -989,6 +988,30 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) {
}
}
void TensorImpl::incref_pyobject() const {
// Because intrusive_ptr incref uses relaxed memory order, we need to
// do an acquire fence to ensure that the kHasPyObject bit was
// observed before the load of the PyObject* below.
// NB: This is a no-op on x86/x86-64
std::atomic_thread_fence(std::memory_order_acquire);
PyObject* obj = pyobj_slot_.load_pyobj();
(*pyobj_slot_.pyobj_interpreter())->incref(obj);
}
void TensorImpl::decref_pyobject() const {
PyObject* obj = pyobj_slot_.load_pyobj();
(*pyobj_slot_.pyobj_interpreter())->decref(obj);
}
bool TensorImpl::try_incref_pyobject() const {
c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
if (C10_UNLIKELY(!interp)) {
return false;
}
return (*interp)->try_incref(pyobj_slot_);
}
namespace impl {
namespace {

View File

@ -2178,6 +2178,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
return &pyobj_slot_;
}
void incref_pyobject() const override final;
void decref_pyobject() const override final;
bool try_incref_pyobject() const override final;
private:
// See NOTE [std::optional operator usage in CUDA]
// We probably don't want to expose this publicly until
@ -3079,6 +3085,19 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
friend class C10_TensorImpl_Size_Check_Dummy_Class;
};
namespace detail {
#ifndef C10_MOBILE
template <class T>
struct TargetTraits<
T,
std::enable_if_t<std::is_base_of_v<c10::TensorImpl, std::remove_cv_t<T>>>> {
static constexpr bool can_have_pyobject = true;
};
#endif
} // namespace detail
// Note [TensorImpl size constraints]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Changed the size of TensorImpl? If the size went down, good for

View File

@ -11,8 +11,11 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
void incref(PyObject* pyobj) const override {} // do nothing
void decref(PyObject* pyobj, bool has_pyobj_slot) const override {
} // do nothing
void decref(PyObject* pyobj) const override {} // do nothing
bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const override {
return false;
}
#define PANIC(m) \
TORCH_INTERNAL_ASSERT( \
@ -20,6 +23,10 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
"attempted to call " #m \
" on a Tensor with nontrivial PyObject after corresponding interpreter died")
size_t refcnt(PyObject* pyobj) const override {
PANIC(refcnt);
}
c10::intrusive_ptr<TensorImpl> detach(const TensorImpl* self) const override {
PANIC(detach);
}

View File

@ -18,6 +18,9 @@ namespace c10 {
struct IValue;
class OperatorHandle;
struct TensorImpl;
namespace impl {
struct PyObjectSlot;
} // namespace impl
} // namespace c10
namespace torch::jit {
@ -126,9 +129,12 @@ struct C10_API PyInterpreterVTable {
// Run Py_INCREF on a PyObject.
virtual void incref(PyObject* pyobj) const = 0;
// Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call
// See NOTE [PyInterpreter::decref takes a `has_pyobj_slot` arg]
virtual void decref(PyObject* pyobj, bool has_pyobj_slot) const = 0;
// Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call.
virtual void decref(PyObject* pyobj) const = 0;
// Run PyUnstable_TryIncRef on a PyObject if it's not NULL.
virtual bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const = 0;
// Run Py_REFCNT on a PyObject.
virtual size_t refcnt(PyObject* pyobj) const = 0;
// Perform a detach by deferring to the __torch_dispatch__ implementation of
// detach, which will also arrange for the PyObject to get copied in this

View File

@ -1,56 +0,0 @@
#include <c10/core/impl/PyObjectSlot.h>
namespace c10::impl {
PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
PyObjectSlot::~PyObjectSlot() {
maybe_destroy_pyobj();
}
void PyObjectSlot::maybe_destroy_pyobj() {
if (owns_pyobj()) {
TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
(*pyobj_interpreter_.load(std::memory_order_acquire))
->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
// NB: this destructor can only be entered when there are no
// references to this C++ object (obviously), NOR any references
// to the PyObject (if there are references to the PyObject,
// then the PyObject holds an owning reference to the tensor).
// So it is OK to clear pyobj_ here as it is impossible for it to
// be used again (modulo weak reference races)
pyobj_ = nullptr; // for safety
}
}
PyInterpreter* PyObjectSlot::pyobj_interpreter() {
return pyobj_interpreter_.load(std::memory_order_acquire);
}
PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
// NOLINTNEXTLINE(performance-no-int-to-ptr)
return reinterpret_cast<PyObject*>(
reinterpret_cast<uintptr_t>(pyobj_) & ~0x1ULL);
}
PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
if (interpreter) {
return *interpreter;
}
TORCH_CHECK(false, "cannot access PyObject for Tensor - no interpreter set");
}
bool PyObjectSlot::owns_pyobj() {
// NOLINTNEXTLINE(performance-no-int-to-ptr)
return reinterpret_cast<uintptr_t>(pyobj_) & 1;
}
void PyObjectSlot::set_owns_pyobj(bool b) {
// NOLINTNEXTLINE(performance-no-int-to-ptr)
pyobj_ = reinterpret_cast<PyObject*>(
reinterpret_cast<uintptr_t>(_unchecked_untagged_pyobj()) | b);
}
} // namespace c10::impl

View File

@ -8,117 +8,58 @@
#include <atomic>
namespace torch::utils {
class PyObjectPreservation;
}
namespace c10::impl {
struct C10_API PyObjectSlot {
public:
PyObjectSlot();
~PyObjectSlot();
void maybe_destroy_pyobj();
// Associate the TensorImpl with the specified PyObject, and, if necessary,
// also tag the interpreter.
//
// NB: This lives in a header so that we can inline away the switch on status
//
// NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
// PyObject if necessary!
void init_pyobj(PyObject* pyobj) {
pyobj_interpreter_.store(
getGlobalPyInterpreter(), std::memory_order_relaxed);
pyobj_ = pyobj;
}
PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
// Query the PyObject interpreter. This may return null if there is no
// interpreter. This is racy!
PyInterpreter* pyobj_interpreter();
PyObject* _unchecked_untagged_pyobj() const;
// Test the interpreter tag. If tagged for the current interpreter, return
// a non-nullopt (but possibly null) PyObject. If (possibly) untagged,
// returns a nullopt. If it is definitely invalid, raises an error.
//
// If `ignore_hermetic_tls` is false and this function is called from a
// hermetic context (ie, `HermeticPyObjectTLS::get_state()` is true), then
// nullopt is returned. If `ignore_hermetic_tls` is true, then the hermetic
// context is ignored, allowing you to check the interpreter tag of a
// nonhermetic PyObject from within a hermetic context. This is necessary
// because there are some cases where the deallocator function of a
// nonhermetic PyObject is called from within a hermetic context, so it must
// be properly treated as a nonhermetic PyObject.
//
// NB: this lives in header so that we can avoid actually creating the
// std::optional
// @todo alban: I'm not too sure what's going on here, we can probably delete
// it but it's worthwhile making sure
std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
impl::PyInterpreter* interpreter =
pyobj_interpreter_.load(std::memory_order_acquire);
if (interpreter == nullptr) {
return std::nullopt;
}
if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
return std::nullopt;
} else {
return _unchecked_untagged_pyobj();
}
// interpreter.
PyInterpreter* pyobj_interpreter() const {
return pyobj_interpreter_.load(std::memory_order_acquire);
}
PyInterpreter& load_pyobj_interpreter() const;
PyInterpreter& load_pyobj_interpreter() const {
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
TORCH_INTERNAL_ASSERT(
interpreter, "cannot access PyObject for Tensor - no interpreter set");
return *interpreter;
}
bool owns_pyobj();
PyObject* load_pyobj() const {
return pyobj_.load(std::memory_order_acquire);
}
void set_owns_pyobj(bool b);
void store_pyobj(PyObject* obj) {
pyobj_.store(obj, std::memory_order_release);
}
bool has_unique_reference() const {
PyObject* pyobj = load_pyobj();
return pyobj != nullptr && load_pyobj_interpreter()->refcnt(pyobj) == 1;
}
void clear() {
pyobj_.store(nullptr, std::memory_order_relaxed);
pyobj_interpreter_.store(nullptr, std::memory_order_relaxed);
}
private:
// This field contains the interpreter tag for this object. See
// Note [Python interpreter tag] for general context
//
// Note [Memory ordering on Python interpreter tag]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// What memory_order do we need when accessing this atomic? We don't
// need a single total modification order (as provided by
// memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
// transition from -1 to some positive integer and never changes afterwards.
// Because there is only one modification, it trivially already has a total
// modification order (e.g., we don't need fences or locked instructions on
// x86)
//
// In fact, one could make a reasonable argument that relaxed reads are OK,
// due to the presence of external locking (GIL) to ensure that interactions
// with other data structures are still correctly synchronized, so that
// we fall in the "Single-Location Data Structures" case as described in
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
// However, on x86, it doesn't matter if I use acquire or relaxed on the load
// as I get the same assembly in both cases. So I just use the more
// conservative acquire (which will impede compiler optimizations but I don't
// care)
// This is now always the global interpreter if the PyObject is set.
// Maybe we can remove this field some day...
std::atomic<PyInterpreter*> pyobj_interpreter_;
// This field contains a reference to a PyObject representing this Tensor.
// If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
// PyObject for it and set this field. This field does not have to be
// protected by an atomic as it is only allowed to be accessed when you hold
// the GIL, or during destruction of the tensor.
//
// When a PyObject dies, you are obligated to clear this field
// (otherwise, you will try to use-after-free the pyobj); this currently
// occurs in THPVariable_clear in torch/csrc/autograd/python_variable.cpp
//
// NB: Ordinarily, this should not be a strong reference, as if the
// PyObject owns the Tensor, this would create a reference cycle.
// However, sometimes this ownership flips. To track who owns
// who, this has a single pointer tag indicating whether or not the
// C++ object owns the PyObject (the common case, zero, means PyObject
// owns the C++ object); see _unchecked_untagged_pyobj for raw access
// or check_pyobj for checked access. See references to PyObject
// resurrection in torch/csrc/autograd/python_variable.cpp
PyObject* pyobj_;
// The PyObject representing this Tensor or nullptr. Ownership is managed
// by intrusive_ptr. By the time the PyObjectSlot is destroyed, this
// reference is already dead.
std::atomic<PyObject*> pyobj_;
friend class torch::utils::PyObjectPreservation;
};
} // namespace c10::impl
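
A brief usage sketch of the slimmed-down slot API above (illustration only; in practice these calls are made by TensorImpl/StorageImpl and torch::utils::PyObjectPreservation under the appropriate locking/GIL rules):

#include <c10/core/impl/PyObjectSlot.h>

void example_slot_roundtrip(c10::impl::PyObjectSlot& slot, PyObject* wrapper) {
  slot.store_pyobj(wrapper);           // publish the wrapper (release store)
  PyObject* seen = slot.load_pyobj();  // read it back (acquire load)
  // Assumes the global interpreter has already been recorded in the slot.
  if (seen != nullptr && slot.has_unique_reference()) {
    // The Python wrapper is the only remaining owner of this object.
  }
  slot.clear();                        // detach the slot from the wrapper
}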

View File

@ -50,7 +50,13 @@ namespace c10 {
/// However, you should prefer to use ArrayRef when possible, because its use
/// of TORCH_CHECK will lead to better user-facing error messages.
template <typename T>
class ArrayRef final : public HeaderOnlyArrayRef<T> {
// ArrayRef cannot be derived from. Normally, we would use `final`
// specifier to force this constraint at compile time. However, Intel
// compiler does not recognize ArrayRef as a class template (which is
// required in the definition of at::TensorAccessor, for instance)
// when `final` specifier is used. So, we cannot define ArrayRef as
// final because of the Intel compiler issue.
class ArrayRef : public HeaderOnlyArrayRef<T> {
public:
/// @name Constructors, all inherited from HeaderOnlyArrayRef except for
/// SmallVector. As inherited constructors won't work with class template

View File

@ -379,7 +379,11 @@ C10_API std::string GetExceptionString(const std::exception& e);
// ----------------------------------------------------------------------------
#ifdef STRIP_ERROR_MESSAGES
#define TORCH_RETHROW(e, ...) throw
#define TORCH_RETHROW(e, ...) \
do { \
(void)e; /* Suppress unused variable warning */ \
throw; \
} while (false)
#else
#define TORCH_RETHROW(e, ...) \
do { \

View File

@ -12,6 +12,10 @@ template <typename, typename...>
class class_;
}
namespace torch::utils {
class PyObjectPreservation;
}
namespace c10 {
class intrusive_ptr_target;
namespace raw {
@ -33,6 +37,8 @@ constexpr uint64_t kImpracticallyHugeWeakReferenceCount =
constexpr uint64_t kReferenceCountOne = 1;
constexpr uint64_t kWeakReferenceCountOne = (kReferenceCountOne << 32);
constexpr uint64_t kUniqueRef = (kReferenceCountOne | kWeakReferenceCountOne);
// Indicates whether the object has a PyObject wrapper.
constexpr uint64_t kHasPyObject = (uint64_t(1) << 63);
template <class TTarget>
struct intrusive_target_default_null_type final {
@ -55,7 +61,11 @@ inline uint32_t refcount(uint64_t combined_refcount) {
}
inline uint32_t weakcount(uint64_t combined_refcount) {
return static_cast<uint32_t>(combined_refcount >> 32);
return static_cast<uint32_t>((combined_refcount & ~kHasPyObject) >> 32);
}
inline bool has_pyobject(uint64_t combined_refcount) {
return (combined_refcount & kHasPyObject) != 0;
}
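
A small worked example of the combined_refcount_ packing above, a minimal sketch assuming the helpers are reachable via c10/util/intrusive_ptr.h: the strong refcount occupies the low 32 bits, the weak refcount bits 32-62, and kHasPyObject is bit 63, which weakcount() masks out.

#include <c10/util/intrusive_ptr.h>
#include <cassert>
#include <cstdint>

void example_combined_refcount_layout() {
  using namespace c10::detail;
  uint64_t combined =
      (3 * kReferenceCountOne + 2 * kWeakReferenceCountOne) | kHasPyObject;
  assert(refcount(combined) == 3);   // low 32 bits
  assert(weakcount(combined) == 2);  // bits 32-62 (bit 63 masked out)
  assert(has_pyobject(combined));    // bit 63
}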
// The only requirement for refcount increment is that it happens-before
@ -66,12 +76,6 @@ inline uint64_t atomic_combined_refcount_increment(
return combined_refcount.fetch_add(inc, std::memory_order_relaxed) + inc;
}
inline uint32_t atomic_refcount_increment(
std::atomic<uint64_t>& combined_refcount) {
return detail::refcount(atomic_combined_refcount_increment(
combined_refcount, kReferenceCountOne));
}
inline uint32_t atomic_weakcount_increment(
std::atomic<uint64_t>& combined_refcount) {
return detail::weakcount(atomic_combined_refcount_increment(
@ -99,6 +103,11 @@ inline uint32_t atomic_weakcount_decrement(
combined_refcount, kWeakReferenceCountOne));
}
template <class T, class = void>
struct TargetTraits {
static constexpr bool can_have_pyobject = false;
};
} // namespace detail
/**
@ -155,6 +164,23 @@ class C10_API intrusive_ptr_target {
// we can atomically operate on both at the same time for performance
// and defined behaviors.
//
// Note [PyObject preservation for Tensor and Storages]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// intrusive_ptr has special support for preserving PyObject wrappers
// for TensorImpl and StorageImpl. The most significant bit (kHasPyObject) of
// the combined_refcount_ is used to indicate whether the object has a
// PyObject wrapper.
//
// - The PyObject, if it exists, holds a strong reference to the
// intrusive_ptr_target.
//
// - When the refcount goes from 1 to 2, we incref the PyObject.
//
// - When the refcount goes from 2 to 1, we decref the PyObject.
//
// In other words, the intrusive_ptr keeps the PyObject alive as long as there
// are other C++ references to the intrusive_ptr_target.
mutable std::atomic<uint64_t> combined_refcount_;
static_assert(sizeof(std::atomic<uint64_t>) == 8);
static_assert(alignof(std::atomic<uint64_t>) == 8);
@ -172,6 +198,8 @@ class C10_API intrusive_ptr_target {
template <typename T>
friend struct ExclusivelyOwnedTensorTraits;
friend class torch::utils::PyObjectPreservation;
protected:
// protected destructor. We never want to destruct intrusive_ptr_target*
// directly.
@ -255,6 +283,16 @@ class C10_API intrusive_ptr_target {
*/
virtual void release_resources() {}
/**
* incref_pyobject() and decref_pyobject() are called when the refcount
* transitions between one and two and the object has a PyObject wrapper;
* try_incref_pyobject() is used when promoting a weak reference.
*/
virtual void incref_pyobject() const {}
virtual void decref_pyobject() const {}
virtual bool try_incref_pyobject() const {
return false;
}
uint32_t refcount(std::memory_order order = std::memory_order_relaxed) const {
return detail::refcount(combined_refcount_.load(order));
}
@ -265,6 +303,19 @@ class C10_API intrusive_ptr_target {
}
};
namespace detail {
#ifndef C10_MOBILE
template <>
struct TargetTraits<c10::intrusive_ptr_target> {
// A generic intrusive_ptr<intrusive_ptr_target> may actually be a TensorImpl
// or StorageImpl, so we have to allow for PyObject support.
static constexpr bool can_have_pyobject = true;
};
#endif
} // namespace detail
template <class TTarget, class NullType>
class weak_intrusive_ptr;
@ -314,18 +365,34 @@ class intrusive_ptr final {
void retain_() {
if (target_ != NullType::singleton()) {
uint32_t new_refcount =
detail::atomic_refcount_increment(target_->combined_refcount_);
uint64_t combined = detail::atomic_combined_refcount_increment(
target_->combined_refcount_, detail::kReferenceCountOne);
uint32_t new_refcount = detail::refcount(combined);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
new_refcount != 1,
"intrusive_ptr: Cannot increase refcount after it reached zero.");
if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
// If the refcount transitioned from 1 to 2, we need to incref the
// PyObject. In other words, we need to ensure that the PyObject stays
// alive now that we have a C++ reference to this object in addition to
// the PyObject itself.
if (C10_UNLIKELY(
detail::has_pyobject(combined) &&
detail::refcount(combined) == 2)) {
target_->incref_pyobject();
}
} else {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
!detail::has_pyobject(combined),
"TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
}
}
}
void reset_() noexcept {
if (target_ != NullType::singleton()) {
if (target_->combined_refcount_.load(std::memory_order_acquire) ==
detail::kUniqueRef) {
if (is_uniquely_owned()) {
// Both counts are 1, so there are no weak references and
// we are releasing the last strong reference. No other
// threads can observe the effects of this target_ deletion
@ -337,9 +404,10 @@ class intrusive_ptr final {
auto combined_refcount = detail::atomic_combined_refcount_decrement(
target_->combined_refcount_, detail::kReferenceCountOne);
if (detail::refcount(combined_refcount) == 0) {
bool should_delete =
(combined_refcount == detail::kWeakReferenceCountOne);
uint32_t new_refcount = detail::refcount(combined_refcount);
bool has_pyobject = detail::has_pyobject(combined_refcount);
if (new_refcount == 0) {
bool should_delete = detail::weakcount(combined_refcount) == 1;
// See comment above about weakcount. As long as refcount>0,
// weakcount is one larger than the actual number of weak references.
// So we need to decrement it here.
@ -356,6 +424,18 @@ class intrusive_ptr final {
if (should_delete) {
delete target_;
}
} else if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
// If the refcount transitioned from 2 to 1, we need to decref the
// PyObject. In other words, we don't want to keep the PyObject alive if
// there are no C++ references to this object other than the PyObject
// itself.
if (C10_UNLIKELY(has_pyobject && new_refcount == 1)) {
target_->decref_pyobject();
}
} else {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
!has_pyobject,
"TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
}
}
}
@ -522,6 +602,16 @@ class intrusive_ptr final {
return use_count() == 1;
}
/**
* Stronger than unique() in that it must not have any weakrefs as well.
*/
bool is_uniquely_owned() const noexcept {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(target_ != NullType::singleton());
uint64_t combined =
target_->combined_refcount_.load(std::memory_order_acquire);
return (combined & ~detail::kHasPyObject) == detail::kUniqueRef;
}
/**
* Returns an owning (!) pointer to the underlying object and makes the
* intrusive_ptr instance invalid. That means the refcount is not decreased.
@ -932,6 +1022,7 @@ class weak_intrusive_ptr final {
if (target_ == NullType::singleton()) {
return intrusive_ptr<TTarget, NullType>();
} else {
bool increfed = false;
auto combined_refcount =
target_->combined_refcount_.load(std::memory_order_relaxed);
do {
@ -940,12 +1031,31 @@ class weak_intrusive_ptr final {
// Return nullptr.
return intrusive_ptr<TTarget, NullType>();
}
if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
if (detail::has_pyobject(combined_refcount) &&
detail::refcount(combined_refcount) == 1 && !increfed) {
// Object has a python wrapper with no other C++ references.
// We need to incref the Python object before we acquire a
// strong reference to the C++ object to avoid a situation
// where the Python object is deallocated concurrently.
if (!target_->try_incref_pyobject()) {
return intrusive_ptr<TTarget, NullType>();
}
increfed = true;
}
}
} while (!target_->combined_refcount_.compare_exchange_weak(
combined_refcount,
combined_refcount + detail::kReferenceCountOne,
std::memory_order_acquire,
std::memory_order_relaxed));
if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
if (increfed && detail::refcount(combined_refcount) != 1) {
target_->decref_pyobject();
}
}
return intrusive_ptr<TTarget, NullType>(
target_, raw::DontIncreaseRefcount{});
}
@ -1060,7 +1170,18 @@ namespace intrusive_ptr {
// NullType::singleton to this function
inline void incref(intrusive_ptr_target* self) {
if (self) {
detail::atomic_refcount_increment(self->combined_refcount_);
uint64_t combined = detail::atomic_combined_refcount_increment(
self->combined_refcount_, detail::kReferenceCountOne);
#ifndef C10_MOBILE
if (C10_UNLIKELY(
detail::has_pyobject(combined) &&
detail::refcount(combined) == 2)) {
self->incref_pyobject();
}
#else
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!detail::has_pyobject(combined));
#endif
}
}
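
To make the 1<->2 transitions above concrete, a hedged end-to-end sketch (wrapper_owned is a hypothetical raw TensorImpl* whose only strong reference is currently held by its Python wrapper, i.e. refcount == 1 with kHasPyObject set):

#include <c10/core/TensorImpl.h>

void example_pyobject_preservation(c10::TensorImpl* wrapper_owned) {
  {
    // Taking a C++ reference moves the refcount 1 -> 2, so retain_() calls
    // incref_pyobject(): the wrapper cannot be collected while C++ uses it.
    auto t = c10::intrusive_ptr<c10::TensorImpl>::reclaim_copy(wrapper_owned);
    // ... use t ...
  }
  // Dropping it moves the refcount 2 -> 1, so reset_() calls
  // decref_pyobject(): the wrapper again lives or dies with Python alone.
}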

View File

@ -15,6 +15,8 @@ using namespace c10::CachingDeviceAllocator;
// newly allocated memory with 512-byte alignment.
constexpr size_t kDeviceAlignment = 512;
class XPUAllocator;
namespace {
using stream_set = ska::flat_hash_set<xpu::XPUStream>;
@ -23,14 +25,19 @@ typedef bool (*Comparison)(const Block*, const Block*);
bool BlockComparatorSize(const Block* a, const Block* b);
bool BlockComparatorAddress(const Block* a, const Block* b);
struct PrivatePool;
struct BlockPool {
BlockPool(bool small)
BlockPool(bool small, PrivatePool* private_pool = nullptr)
: blocks(BlockComparatorSize),
unmapped(BlockComparatorAddress),
is_small(small) {}
is_small(small),
owner_PrivatePool(private_pool) {}
std::set<Block*, Comparison> blocks;
std::set<Block*, Comparison> unmapped;
const bool is_small;
PrivatePool* owner_PrivatePool;
};
struct ExpandableSegment;
@ -349,6 +356,43 @@ struct AllocParams {
StatTypes stat_types = {};
};
// Internal implementation that manages actual memory blocks.
// The high-level MemPool interface wraps PrivatePool via MempoolId.
struct PrivatePool {
PrivatePool(MempoolId_t id, XPUAllocator* allocator = nullptr)
: id(std::move(id)),
allocator_(allocator),
large_blocks(/*small=*/false, this),
small_blocks(/*small=*/true, this) {}
PrivatePool(const PrivatePool&) = delete;
PrivatePool(PrivatePool&&) = delete;
PrivatePool& operator=(const PrivatePool&) = delete;
PrivatePool& operator=(PrivatePool&&) = delete;
~PrivatePool() = default;
// default Mempool when no Mempool is specified
MempoolId_t id{0, 0};
// Number of live graphs using this pool
int use_count{1};
// Number of unfreed allocations made for this pool. When use_count and
// allocation_count drop to zero, we can delete this PrivatePool from
// graph_pools.
int allocation_count{0};
XPUAllocator* allocator_;
BlockPool large_blocks;
BlockPool small_blocks;
public:
XPUAllocator* allocator() {
return allocator_;
}
};
struct MempoolIdHash {
std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
}
};
} // anonymous namespace
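
As a quick illustration of the wiring introduced above (not part of the diff): each PrivatePool constructs its two BlockPools with a back-pointer to itself, which is what later lets a freed block report which graph pool it came from. Sketch only; PrivatePool and BlockPool live in the anonymous namespace, so this would sit in the same translation unit.

#include <cassert>

void example_private_pool_wiring() {
  PrivatePool pool(/*id=*/{1, 0});
  assert(pool.large_blocks.owner_PrivatePool == &pool);
  assert(pool.small_blocks.owner_PrivatePool == &pool);
  assert(!pool.large_blocks.is_small && pool.small_blocks.is_small);
}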
class DeviceCachingAllocator {
@ -365,6 +409,13 @@ class DeviceCachingAllocator {
bool set_fraction = false;
std::vector<ExpandableSegment*> expandable_segments;
std::vector<c10::DeviceIndex> devices_with_peer_access; // reserved
std::vector<std::pair<MempoolId_t, std::function<bool(sycl::queue*)>>>
captures_underway;
ska::flat_hash_map<MempoolId_t, std::unique_ptr<PrivatePool>, MempoolIdHash>
graph_pools;
// Pools no longer referenced by any graph.
ska::flat_hash_map<MempoolId_t, PrivatePool*, MempoolIdHash>
graph_pools_freeable;
size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) {
if (!src || src->allocated || src->event_count > 0 ||
@ -463,7 +514,22 @@ class DeviceCachingAllocator {
}
}
BlockPool& get_pool(size_t size) {
BlockPool& get_pool(size_t size, sycl::queue* queue) {
if (C10_UNLIKELY(!captures_underway.empty())) {
for (auto& entry : captures_underway) {
// lookup for mempool id matching current capture graph
if (entry.second(queue)) {
auto it1 = graph_pools.find(entry.first);
// lookup mempool
TORCH_INTERNAL_ASSERT(it1 != graph_pools.end());
if (size <= kSmallSize) {
return it1->second->small_blocks;
} else {
return it1->second->large_blocks;
}
}
}
}
if (size < kSmallSize) {
return small_blocks;
} else {
@ -669,6 +735,10 @@ class DeviceCachingAllocator {
if (!ptr) {
return false;
}
if (p.pool->owner_PrivatePool) {
p.pool->owner_PrivatePool->allocation_count++;
}
p.block = new Block(device, p.queue(), size, p.pool, ptr);
for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
stats.reserved_bytes[stat_type].increase(size);
@ -677,11 +747,14 @@ class DeviceCachingAllocator {
return true;
}
void synchronize_and_free_events() {
void synchronize_and_free_events(PrivatePool* pool = nullptr) {
for (auto& xe : xpu_events) {
for (auto& e : xe.second) {
auto event = e.first;
auto* block = e.second;
if (pool && block->pool->owner_PrivatePool != pool) {
continue;
}
event.wait();
block->event_count--;
if (block->event_count == 0) {
@ -785,6 +858,13 @@ class DeviceCachingAllocator {
for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
stats.reserved_bytes[stat_type].decrease(unmapped.size);
});
if (block->pool->owner_PrivatePool) {
// The Freed block belonged to a XPU graph's PrivatePool.
TORCH_INTERNAL_ASSERT(
block->pool->owner_PrivatePool->allocation_count > 0);
block->pool->owner_PrivatePool->allocation_count--;
}
}
void release_blocks(BlockPool& pool) {
@ -812,13 +892,41 @@ class DeviceCachingAllocator {
}
}
bool release_cached_blocks() {
synchronize_and_free_events();
// See Note [Safe to Free Blocks on BlockPool]
c10::xpu::syncStreamsOnDevice(device_index);
bool release_cached_blocks(MempoolId_t mempool_id) {
if (mempool_id.first == 0 && mempool_id.second == 0 &&
captures_underway.empty()) {
synchronize_and_free_events();
// See Note [Safe to Free Blocks on BlockPool]
c10::xpu::syncStreamsOnDevice(device_index);
release_blocks(large_blocks);
release_blocks(small_blocks);
}
for (auto it = graph_pools_freeable.begin();
it != graph_pools_freeable.end();) {
if (mempool_id.first != 0 || mempool_id.second != 0) {
if (it->first == mempool_id) {
// If there is an active mempool, we sync only the events
// associated with the pool
synchronize_and_free_events(it->second);
} else {
// otherwise we move on
++it;
continue;
}
}
TORCH_INTERNAL_ASSERT(it->second->use_count == 0);
release_blocks(it->second->small_blocks);
release_blocks(it->second->large_blocks);
if (it->second->allocation_count == 0) {
auto erase_count = graph_pools.erase(it->first);
TORCH_INTERNAL_ASSERT(erase_count == 1);
it = graph_pools_freeable.erase(it);
} else {
++it;
}
}
return true;
}
@ -903,6 +1011,30 @@ class DeviceCachingAllocator {
}
}
void create_or_incref_pool(
MempoolId_t mempool_id,
XPUAllocator* allocator = nullptr) {
auto it = graph_pools.find(mempool_id);
if (it == graph_pools.end()) {
// mempool_id does not reference an existing pool.
// Make a new pool for XPU graph capture or memory pool usage.
graph_pools.emplace(
mempool_id, std::make_unique<PrivatePool>(mempool_id, allocator));
} else {
// mempool_id references an existing pool, which the current XPU graph
// capture will share.
TORCH_INTERNAL_ASSERT(it->second->use_count > 0);
TORCH_INTERNAL_ASSERT(allocator == nullptr);
it->second->use_count++;
}
}
PrivatePool* get_private_pool(MempoolId_t mempool_id) {
auto it = graph_pools.find(mempool_id);
TORCH_INTERNAL_ASSERT(it != graph_pools.end());
return it->second.get();
}
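
A hedged sketch of how graph capture might drive this bookkeeping; beginAllocateToPool is a hypothetical member of DeviceCachingAllocator, since the actual begin/end-capture entry points are not part of this diff:

void beginAllocateToPool(
    MempoolId_t mempool_id,
    std::function<bool(sycl::queue*)> filter) {
  std::scoped_lock<std::recursive_mutex> lock(mutex);
  // Create the graph's PrivatePool (or bump its use_count if shared).
  create_or_incref_pool(mempool_id);
  // From now on, get_pool() routes allocations whose queue matches `filter`
  // into that pool's small_blocks/large_blocks.
  captures_underway.emplace_back(mempool_id, std::move(filter));
}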
public:
DeviceCachingAllocator(DeviceIndex device_index)
: large_blocks(/* small */ false),
@ -911,9 +1043,11 @@ class DeviceCachingAllocator {
Block* malloc(DeviceIndex device, size_t orig_size, sycl::queue& queue) {
std::scoped_lock<std::recursive_mutex> lock(mutex);
process_events();
if (C10_LIKELY(captures_underway.empty())) {
process_events();
}
size_t size = round_size(orig_size);
auto& pool = get_pool(size);
auto& pool = get_pool(size, &queue);
const size_t alloc_size = get_allocation_size(size);
AllocParams params(device, size, &queue, &pool, alloc_size);
params.stat_types = get_stat_types_for_pool(pool);
@ -923,7 +1057,7 @@ class DeviceCachingAllocator {
// Can't reuse an existing block, try to get a new one.
if (!block_found) {
block_found = alloc_block(params, false) ||
(release_cached_blocks() && alloc_block(params, true));
(release_cached_blocks({0, 0}) && alloc_block(params, true));
}
if (!block_found) {
const auto& raw_device = c10::xpu::get_raw_device(device);
@ -1016,9 +1150,9 @@ class DeviceCachingAllocator {
block->stream_uses.insert(stream);
}
void emptyCache() {
void emptyCache(MempoolId_t mempool_id) {
std::scoped_lock<std::recursive_mutex> lock(mutex);
release_cached_blocks();
release_cached_blocks(mempool_id);
}
DeviceStats getStats() {
@ -1172,9 +1306,9 @@ class XPUAllocator : public DeviceAllocator {
}
}
void emptyCache(MempoolId_t mempool_id [[maybe_unused]] = {0, 0}) override {
void emptyCache(MempoolId_t mempool_id) override {
for (auto& da : device_allocators) {
da->emptyCache();
da->emptyCache(mempool_id);
}
}
@ -1290,8 +1424,8 @@ void init(DeviceIndex device_count) {
return allocator.init(device_count);
}
void emptyCache() {
return allocator.emptyCache();
void emptyCache(MempoolId_t mempool_id) {
return allocator.emptyCache(mempool_id);
}
void resetPeakStats(DeviceIndex device) {

View File

@ -10,7 +10,7 @@ C10_XPU_API Allocator* get();
C10_XPU_API void init(DeviceIndex device_count);
C10_XPU_API void emptyCache();
C10_XPU_API void emptyCache(MempoolId_t mempool_id = {0, 0});
C10_XPU_API void resetPeakStats(DeviceIndex device);
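
A hedged usage sketch of the new emptyCache signature, assuming the usual c10::xpu::XPUCachingAllocator namespace for these declarations; the default {0, 0} keeps the previous behavior, while a specific (illustrative) id releases only blocks owned by that graph's private pool.

// Release everything not tied to an active capture (previous behavior).
c10::xpu::XPUCachingAllocator::emptyCache(/*mempool_id=*/{0, 0});
// Release only the blocks owned by one graph's PrivatePool.
c10::xpu::XPUCachingAllocator::emptyCache(/*mempool_id=*/{1, 0});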

View File

@ -773,8 +773,20 @@ void PyTorchStreamWriter::writeRecord(
bool compress) {
AT_ASSERT(!finalized_);
AT_ASSERT(!archive_name_plus_slash_.empty());
TORCH_INTERNAL_ASSERT(
files_written_.count(name) == 0, "Tried to serialize file twice: ", name);
if (files_written_.count(name) > 0) {
// Allow multiple writes for triton binaries
bool is_triton_extension =
c10::ends_with(name, ".so") ||
c10::ends_with(name, ".cubin") ||
c10::ends_with(name, ".hsaco");
if (is_triton_extension) {
LOG(WARNING) << "File '" << name << "' is being serialized multiple times";
return;
}
TORCH_INTERNAL_ASSERT(false, "Tried to serialize file twice: ", name);
}
if (name == kSerializationIdRecordName && serialization_id_.empty()) {
// In case of copying records from another file, skip writing a different
// serialization_id than the one computed in this writer.
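
A hedged sketch of the relaxed duplicate-write rule above in use (writer construction and record payloads elided; the record names are illustrative):

void example_duplicate_record_writes(
    caffe2::serialize::PyTorchStreamWriter& writer,
    const void* data,
    size_t size) {
  // Second write of a Triton artifact: warning only, the write is skipped.
  writer.writeRecord("kernels/add.cubin", data, size);
  writer.writeRecord("kernels/add.cubin", data, size);
  // Any other duplicate name still trips the internal assert:
  writer.writeRecord("model/data.pkl", data, size);
  // writer.writeRecord("model/data.pkl", data, size);  // would assert
}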

View File

@ -118,11 +118,6 @@ if(INTERN_BUILD_ATEN_OPS)
list(APPEND _file_compile_flags "-gencode;arch=compute_120a,code=sm_120a")
endif()
endif()
if("${_arch}" STREQUAL "121a")
if(_existing_arch_flags MATCHES ".*compute_120.*")
list(APPEND _file_compile_flags "-gencode;arch=compute_121a,code=sm_121a")
endif()
endif()
endforeach()
list(JOIN _file_compile_flags " " _file_compile_flags)
@ -131,7 +126,7 @@ if(INTERN_BUILD_ATEN_OPS)
_BUILD_FOR_ADDITIONAL_ARCHS(
"${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/RowwiseScaledMM.cu"
"89;90a;100a;103a;120a;121a")
"89;90a;100a;103a;120a")
_BUILD_FOR_ADDITIONAL_ARCHS(
"${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/ScaledGroupMM.cu"
"90a")

View File

@ -10,7 +10,7 @@ API. This API can roughly be divided into five parts:
- **TorchScript**: An interface to the TorchScript JIT compiler and interpreter.
- **C++ Extensions**: A means of extending the Python API with custom C++ and CUDA routines.
Combining, these building blocks form a research and
Combined, these building blocks form a research and
production ready C++ library for tensor computation and dynamic neural
networks with strong emphasis on GPU acceleration as well as fast CPU
performance. It is currently in use at Facebook in research and
@ -76,7 +76,7 @@ C++ Frontend
------------
The PyTorch C++ frontend provides a high level, pure C++ modeling interface for
neural network and general ML(Machine Learning) research and production use cases,
neural networks and general ML (Machine Learning) research and production use cases,
largely following the Python API in design and provided functionality. The C++
frontend includes the following:

View File

@ -254,7 +254,7 @@ To toggle the reduced precision reduction flags in C++, one can do
.. _fp16accumulation:
Full FP16 Accmumulation in FP16 GEMMs
Full FP16 Accumulation in FP16 GEMMs
-------------------------------------
Certain GPUs have increased performance when doing _all_ FP16 GEMM accumulation

View File

@ -32,7 +32,7 @@ project-excludes = [
"torch/utils/tensorboard/summary.py",
# formatting issues, will turn on after adjusting where suppressions can be
# in import statements
"tools/flight_recorder/components/types.py",
"torch/distributed/flight_recorder/components/types.py",
"torch/linalg/__init__.py",
"torch/package/importer.py",
"torch/package/_package_pickler.py",

View File

@ -1632,7 +1632,7 @@ def configure_extension_build() -> tuple[
if cmake_cache_vars["USE_DISTRIBUTED"]:
# Only enable fr_trace command if distributed is enabled
entry_points["console_scripts"].append(
"torchfrtrace = tools.flight_recorder.fr_trace:main",
"torchfrtrace = torch.distributed.flight_recorder.fr_trace:main",
)
return ext_modules, cmdclass, packages, entry_points, extra_install_requires

View File

@ -8,6 +8,7 @@ set(AOTI_ABI_CHECK_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_abi_check)
# Build the cpp gtest binary containing the cpp-only tests.
set(AOTI_ABI_CHECK_TEST_SRCS
${AOTI_ABI_CHECK_TEST_ROOT}/main.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_accessor.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_cast.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_devicetype.cpp
${AOTI_ABI_CHECK_TEST_ROOT}/test_dispatch.cpp

View File

@ -0,0 +1,50 @@
#include <gtest/gtest.h>
#include <torch/headeronly/core/TensorAccessor.h>
#include <string>
TEST(TestAccessor, HeaderOnlyTensorAccessor) {
std::vector<int32_t> v = {11, 12, 13, 21, 22, 23};
std::vector<int64_t> sizes = {2, 3};
std::vector<int64_t> strides = {3, 1};
auto acc = torch::headeronly::HeaderOnlyTensorAccessor<int32_t, 2>(
v.data(), sizes.data(), strides.data());
EXPECT_EQ(acc[0][0], 11);
EXPECT_EQ(acc[0][1], 12);
EXPECT_EQ(acc[0][2], 13);
EXPECT_EQ(acc[1][0], 21);
EXPECT_EQ(acc[1][1], 22);
EXPECT_EQ(acc[1][2], 23);
}
TEST(TestAccessor, HeaderOnlyGenericPackedTensorAccessor) {
std::vector<int32_t> v = {11, 12, 13, 21, 22, 23};
std::vector<int64_t> sizes = {2, 3};
std::vector<int64_t> strides = {3, 1};
auto acc =
torch::headeronly::HeaderOnlyGenericPackedTensorAccessor<int32_t, 2>(
v.data(), sizes.data(), strides.data());
EXPECT_EQ(acc[0][0], 11);
EXPECT_EQ(acc[0][1], 12);
EXPECT_EQ(acc[0][2], 13);
EXPECT_EQ(acc[1][0], 21);
EXPECT_EQ(acc[1][1], 22);
EXPECT_EQ(acc[1][2], 23);
auto tacc = acc.transpose(0, 1);
EXPECT_EQ(tacc[0][0], 11);
EXPECT_EQ(tacc[0][1], 21);
EXPECT_EQ(tacc[1][0], 12);
EXPECT_EQ(tacc[1][1], 22);
EXPECT_EQ(tacc[2][0], 13);
EXPECT_EQ(tacc[2][1], 23);
try {
acc.transpose(0, 2);
} catch (const std::exception& e) {
EXPECT_TRUE(
std::string(e.what()).find("HeaderOnlyIndexBoundsCheck") !=
std::string::npos);
}
}

View File

@ -13,6 +13,17 @@ TEST(TestScalarType, ScalarTypeToCPPTypeT) {
#undef DEFINE_CHECK
}
TEST(TestScalarType, CppTypeToScalarType) {
using torch::headeronly::CppTypeToScalarType;
using torch::headeronly::ScalarType;
#define DEFINE_CHECK(TYPE, SCALARTYPE) \
EXPECT_EQ(CppTypeToScalarType<TYPE>::value, ScalarType::SCALARTYPE);
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CHECK);
#undef DEFINE_CHECK
}
#define DEFINE_CHECK(TYPE, SCALARTYPE) \
{ \
EXPECT_EQ( \
@ -90,3 +101,14 @@ TEST(TestScalarType, toUnderlying) {
AT_FORALL_FLOAT8_TYPES(DEFINE_CHECK);
#undef DEFINE_CHECK
}
TEST(TestScalarType, isQIntType) {
using torch::headeronly::isQIntType;
using torch::headeronly::ScalarType;
#define DEFINE_CHECK(_, name) EXPECT_TRUE(isQIntType(ScalarType::name));
AT_FORALL_QINT_TYPES(DEFINE_CHECK);
#undef DEFINE_CHECK
#define DEFINE_CHECK(_, name) EXPECT_FALSE(isQIntType(ScalarType::name));
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CHECK);
#undef DEFINE_CHECK
}

View File

@ -15,7 +15,7 @@ namespace jit {
TEST(CustomOperatorTest, InferredSchema) {
torch::RegisterOperators reg(
"foo::bar", [](double a, at::Tensor b) { return a + b; });
auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar"));
auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar"));
ASSERT_EQ(ops.size(), 1);
auto& op = ops.front();
@ -43,8 +43,7 @@ TEST(CustomOperatorTest, ExplicitSchema) {
"foo::bar_with_schema(float a, Tensor b) -> Tensor",
[](double a, at::Tensor b) { return a + b; });
auto& ops =
getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema"));
auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema"));
ASSERT_EQ(ops.size(), 1);
auto& op = ops.front();
@ -77,7 +76,7 @@ TEST(CustomOperatorTest, ListParameters) {
torch::List<c10::complex<double>> complexdoubles,
torch::List<at::Tensor> tensors) { return floats; });
auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists"));
auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists"));
ASSERT_EQ(ops.size(), 1);
auto& op = ops.front();
@ -123,7 +122,7 @@ TEST(CustomOperatorTest, ListParameters2) {
"foo::lists2(Tensor[] tensors) -> Tensor[]",
[](torch::List<at::Tensor> tensors) { return tensors; });
auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2"));
auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2"));
ASSERT_EQ(ops.size(), 1);
auto& op = ops.front();
@ -213,7 +212,7 @@ TEST(TestCustomOperator, OperatorGeneratorUndeclared) {
},
aliasAnalysisFromSchema())});
auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist"));
auto ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist"));
ASSERT_EQ(ops.size(), 0);
}
@ -232,7 +231,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) {
},
aliasAnalysisFromSchema())});
auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar"));
auto ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar"));
ASSERT_EQ(ops.size(), 1);
auto& op = ops.front();

View File

@ -0,0 +1,30 @@
#include "kernel.h"
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>
#include <cuda_runtime.h>
using torch::stable::Tensor;
Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device");
Tensor res = new_empty(m, {m.size(0)});
THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cuda",
AT_WRAP(([&]() {
auto resa = Accessor_cuda<scalar_t, 1>(reinterpret_cast<scalar_t*>(res.data_ptr()), res.sizes().data(), res.strides().data());
auto ma = Accessor_cuda<scalar_t, 2>(reinterpret_cast<scalar_t*>(m.data_ptr()), m.sizes().data(), m.strides().data());
auto va = Accessor_cuda<scalar_t, 1>(reinterpret_cast<scalar_t*>(v.data_ptr()), v.sizes().data(), v.strides().data());
mv_tensor_accessor_kernel<Accessor_cuda, scalar_t><<<1, 1, 0, 0>>>(resa, ma, va);
})),
AT_FLOATING_TYPES);
return res;
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CUDA, m) {
m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
}

View File

@ -1,3 +1,5 @@
#include "kernel.h"
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <torch/csrc/stable/accelerator.h>
#include <torch/csrc/stable/device.h>
@ -308,7 +310,7 @@ STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("my_amax(Tensor a) -> Tensor");
m.def("my_amax_vec(Tensor a) -> Tensor");
m.def("my_is_cpu(Tensor t) -> bool");
m.def("test_default_constructor(bool undefined) -> bool");
m.def("test_default_constructor(bool undefined) -> bool");
}
bool test_default_constructor(bool defined) {
@ -330,12 +332,47 @@ bool test_default_constructor(bool defined) {
return out.defined();
}
uint64_t get_any_data_ptr(Tensor t, bool mutable_) {
if (mutable_) {
return reinterpret_cast<uint64_t>(t.mutable_data_ptr());
} else {
return reinterpret_cast<uint64_t>(t.const_data_ptr());
}
}
uint64_t get_template_any_data_ptr(Tensor t, c10::ScalarType dtype, bool mutable_) {
#define DEFINE_CASE(T, name) \
case torch::headeronly::ScalarType::name: { \
if (mutable_) { \
return reinterpret_cast<uint64_t>(t.mutable_data_ptr<T>()); \
} else { \
return reinterpret_cast<uint64_t>(t.const_data_ptr<T>()); \
} \
}
switch (dtype) {
// per aten/src/ATen/templates/TensorMethods.cpp:
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE)
DEFINE_CASE(uint16_t, UInt16)
DEFINE_CASE(uint32_t, UInt32)
DEFINE_CASE(uint64_t, UInt64)
default:
return 0;
}
#undef DEFINE_CASE
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("get_any_data_ptr(Tensor t, bool mutable_) -> int");
m.def("get_template_any_data_ptr(Tensor t, ScalarType dtype, bool mutable_) -> int");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("my_zero_", TORCH_BOX(&my_zero_));
m.impl("my_amax", TORCH_BOX(&my_amax));
m.impl("my_amax_vec", TORCH_BOX(&my_amax_vec));
m.impl("test_default_constructor", TORCH_BOX(&test_default_constructor));
m.impl("get_any_data_ptr", TORCH_BOX(&get_any_data_ptr));
m.impl("get_template_any_data_ptr", TORCH_BOX(&get_template_any_data_ptr));
}
std::vector<Tensor> my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
@ -514,6 +551,32 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("test_device_is_cpu", &boxed_test_device_is_cpu);
}
Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device");
Tensor res = new_empty(m, {m.size(0)});
THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cpu",
AT_WRAP(([&]() {
auto resa = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(res.data_ptr()), res.sizes().data(), res.strides().data());
auto ma = Accessor_cpu<scalar_t, 2>(reinterpret_cast<scalar_t*>(m.data_ptr()), m.sizes().data(), m.strides().data());
auto va = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(v.data_ptr()), v.sizes().data(), v.strides().data());
mv_tensor_accessor_kernel<Accessor_cpu, scalar_t>(resa, ma, va);
})),
AT_FLOATING_TYPES);
return res;
}
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("mv_tensor_accessor(Tensor m, Tensor v) -> Tensor");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cpu));
}
// Test functions for torch::stable::accelerator APIs
#ifdef LAE_USE_CUDA

View File

@ -0,0 +1,26 @@
#include <torch/headeronly/core/Dispatch_v2.h>
#include <torch/headeronly/core/TensorAccessor.h>
template <typename T, size_t N>
using Accessor_cpu = torch::headeronly::HeaderOnlyTensorAccessor<T, N>;
#if defined(__CUDACC__) || defined(__HIPCC__)
#define MAYBE_GLOBAL __global__
template <typename T, size_t N>
using Accessor_cuda = torch::headeronly::HeaderOnlyGenericPackedTensorAccessor<T, N, torch::headeronly::RestrictPtrTraits>;
#else
#define MAYBE_GLOBAL
#endif
template <template <typename, size_t> class Accessor, typename scalar_t>
MAYBE_GLOBAL void mv_tensor_accessor_kernel(Accessor<scalar_t, 1> resa, Accessor<scalar_t, 2> ma, Accessor<scalar_t, 1> va) {
for (int64_t i = 0; i < resa.size(0); i++) {
scalar_t val = 0;
for (int64_t j = 0; j < ma.size(1); j++) {
val += ma[i][j] * va[j];
}
resa[i] = val;
}
}

View File

@ -227,6 +227,37 @@ def test_tensor_device(t):
return torch.ops.libtorch_agnostic.test_tensor_device.default(t)
def get_any_data_ptr(t, mutable) -> int:
"""
Return data pointer value of the tensor.
Args:
t: Input tensor
mutable: whether data pointer qualifier is mutable or const
Returns: int - pointer value
"""
return torch.ops.libtorch_agnostic.get_any_data_ptr.default(t, mutable)
def get_template_any_data_ptr(t, dtype, mutable) -> int:
"""
Return data pointer value of the tensor iff it has dtype.
Args:
t: Input tensor
dtype: Input dtype
mutable: whether data pointer qualifier is mutable or const
Returns: int - pointer value
Raises RuntimeError when t.dtype() != dtype.
"""
return torch.ops.libtorch_agnostic.get_template_any_data_ptr.default(
t, dtype, mutable
)
def my_pad(t) -> Tensor:
"""
Pads the input tensor with hardcoded padding parameters.
@ -542,3 +573,17 @@ def my_view(t, size) -> Tensor:
Returns: Tensor - tensor with new view
"""
return torch.ops.libtorch_agnostic.my_view.default(t, size)
def mv_tensor_accessor(m, v) -> Tensor:
"""
Returns matrix-vector product.
Args:
m: any 2-D Tensor with shape (N, M)
v: any 1-D Tensor with shape (M,)
Returns:
a 1-D Tensor with shape (N,)
"""
return torch.ops.libtorch_agnostic.mv_tensor_accessor.default(m, v)

View File

@ -35,14 +35,15 @@ def get_extension():
extra_compile_args = {
"cxx": ["-fdiagnostics-color=always", "-DTORCH_STABLE_ONLY"],
}
sources = list(CSRC_DIR.glob("**/*.cpp"))
extension = CppExtension
# allow including <cuda_runtime.h>
if torch.cuda.is_available():
extra_compile_args["cxx"].append("-DLAE_USE_CUDA")
extra_compile_args["nvcc"] = ["-O2"]
extension = CUDAExtension
sources = list(CSRC_DIR.glob("**/*.cpp"))
sources.extend(CSRC_DIR.glob("**/*.cu"))
return [
extension(

View File

@ -14,11 +14,38 @@ from torch.testing._internal.common_utils import (
install_cpp_extension,
IS_WINDOWS,
run_tests,
skipIfTorchDynamo,
TestCase,
xfailIfTorchDynamo,
)
def get_supported_dtypes():
"""Return a list of dtypes that are supported by torch stable ABI."""
return [
torch.int8,
torch.int16,
torch.int32,
torch.int64,
torch.uint8,
torch.uint16,
torch.uint32,
torch.uint64,
torch.bfloat16,
torch.float16,
torch.float32,
torch.float64,
torch.float8_e5m2,
torch.float8_e4m3fn,
torch.float8_e5m2fnuz,
torch.float8_e4m3fnuz,
torch.complex32,
torch.complex64,
torch.complex128,
torch.bool,
]
# TODO: Fix this error in Windows:
# LINK : error LNK2001: unresolved external symbol PyInit__C
if not IS_WINDOWS:
@ -274,6 +301,43 @@ if not IS_WINDOWS:
expected0 = torch.narrow(t, dim0, start0, length0)
self.assertEqual(out0, expected0)
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
def test_get_any_data_ptr(self, device):
import libtorch_agnostic
t = torch.empty(2, 5, device=device, dtype=torch.float32)
expected_p = t.data_ptr()
for mutable in [True, False]:
p = libtorch_agnostic.ops.get_any_data_ptr(t, mutable)
self.assertEqual(p, expected_p)
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
def test_get_template_any_data_ptr(self, device):
import libtorch_agnostic
supported_dtypes = get_supported_dtypes()
for dtype in supported_dtypes:
t = torch.empty(2, 5, device=device, dtype=dtype)
expected_p = t.data_ptr()
for rdtype in supported_dtypes:
if dtype == rdtype:
for mutable in [True, False]:
p = libtorch_agnostic.ops.get_template_any_data_ptr(
t, rdtype, mutable
)
self.assertEqual(p, expected_p)
else:
for mutable in [True, False]:
with self.assertRaisesRegex(
RuntimeError, "expected scalar type.* but found"
):
libtorch_agnostic.ops.get_template_any_data_ptr(
t, rdtype, mutable
)
@onlyCUDA
@deviceCountAtLeast(2)
def test_device_guard(self, device):
@ -616,6 +680,22 @@ if not IS_WINDOWS:
expected_flat = t.view([-1])
self.assertEqual(result_flat, expected_flat)
def test_mv_tensor_accessor(self, device):
import libtorch_agnostic
m = torch.rand(3, 5, device=device)
v = torch.rand(5, device=device)
result = libtorch_agnostic.ops.mv_tensor_accessor(m, v)
expected = torch.mv(m, v)
self.assertEqual(result, expected)
# non-contiguous inputs
m = torch.rand(3 * 2, 5 * 3, device=device)[::2, ::3]
v = torch.rand(5 * 4, device=device)[::4]
result = libtorch_agnostic.ops.mv_tensor_accessor(m, v)
expected = torch.mv(m, v)
self.assertEqual(result, expected)
instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)
if __name__ == "__main__":

View File

@ -22,7 +22,7 @@ void check_all_parameters(
template<class Result, class... Args>
Result get_operator_from_registry_and_execute(const char* op_name, Args&&... args) {
auto& ops = torch::jit::getAllOperatorsFor(
auto ops = torch::jit::getAllOperatorsFor(
torch::jit::Symbol::fromQualString(op_name));
TORCH_INTERNAL_ASSERT(ops.size() == 1);

View File

@ -65,7 +65,6 @@ from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
curr_backend = dist.get_default_backend_for_device(device_type)
class SimpleModel(nn.Module):
@ -423,10 +422,10 @@ class TestFullyShard2DStateDict(DTensorTestBase):
@property
def backend(self):
# need to specify gloo backend for testing cpu offload
return f"cpu:gloo,{device_type}:{curr_backend}"
return "cpu:gloo,xpu:xccl" if TEST_XPU else "cpu:gloo,cuda:nccl"
@skip_if_lt_x_gpu(4)
@with_comms
@skip_if_lt_x_gpu(4)
def test_fully_shard_tp_2d_set_full_state_dict(self):
dummy_model = SimpleModel().to(device_type)
mesh_2d = init_device_mesh(
@ -515,8 +514,8 @@ class Test2dFSDP1ParallelIntegration(DTensorTestBase):
).to_local()
self.assertEqual(param_m2, param_m1)
@skip_if_lt_x_gpu(4)
@with_comms
@skip_if_lt_x_gpu(4)
def test_2d_ddp_integration_functionality(self) -> None:
model, twod_model, dp_pg = self.init_model(self.device_type)
optim = torch.optim.Adam(model.parameters(), lr=3e-5)
@ -567,8 +566,8 @@ class TestNew2dParallelTraining(DTensorTestBase):
p2 = p2.redistribute(p2.device_mesh, [Replicate()]).to_local()
self.assertTrue(torch.allclose(p1, p2), f"{p1} vs {p2}")
@skip_if_lt_x_gpu(4)
@with_comms
@skip_if_lt_x_gpu(4)
def test_2d_fsdp_state_enable_extension(self):
mesh_2d = init_device_mesh(
self.device_type, (2, self.world_size // 2), mesh_dim_names=("dp", "tp")
@ -643,18 +642,18 @@ class TestNew2dParallelTraining(DTensorTestBase):
# Ensure all params are still the same after optimizer update.
self._compare_params(model, model_2d)
@skip_if_lt_x_gpu(4)
@with_comms
@skip_if_lt_x_gpu(4)
def test_2d_e2e_training_default(self):
self._test_2d_e2e_training()
@skip_if_lt_x_gpu(4)
@with_comms
@skip_if_lt_x_gpu(4)
def test_2d_e2e_training_use_orig_params(self):
self._test_2d_e2e_training(use_orig_params=True)
@skip_if_lt_x_gpu(4)
@with_comms
@skip_if_lt_x_gpu(4)
def test_2d_e2e_training_not_use_orig_params(self):
# TODO: need to revisit input_reshard API about why it failed multi-gpu tests.
# self._test_2d_e2e_training(recompute_activation=True)
@ -667,10 +666,10 @@ class TestNew2dParallelStateDict(DTensorTestBase):
@property
def backend(self):
# need to specify gloo backend for testing cpu offload
return f"cpu:gloo,{device_type}:{curr_backend}"
return "cpu:gloo,xpu:xccl" if TEST_XPU else "cpu:gloo,cuda:nccl"
@skip_if_lt_x_gpu(4)
@with_comms
@skip_if_lt_x_gpu(4)
def test_fsdp_2d_extension(self):
"""
Test whether _fsdp_extension from FSDPstate has been set correctly.
@ -701,8 +700,8 @@ class TestNew2dParallelStateDict(DTensorTestBase):
model_1d_fsdp_state = _get_module_fsdp_state(model_1d)
self.assertEqual(model_1d_fsdp_state._fsdp_extension, None)
@skip_if_lt_x_gpu(4)
@with_comms
@skip_if_lt_x_gpu(4)
@parametrize("is_even_sharded_model", [True, False])
def test_2d_state_dict(self, is_even_sharded_model):
simple_model = SimpleModel if is_even_sharded_model else SimpleModelUneven
@ -757,8 +756,8 @@ class TestNew2dParallelStateDict(DTensorTestBase):
torch.allclose(no_wrap_v, all_gather_two_d_v.to_local()), True
)
@skip_if_lt_x_gpu(4)
@with_comms
@skip_if_lt_x_gpu(4)
@parametrize("is_even_sharded_model", [True, False])
def test_2d_load_state_dict(self, is_even_sharded_model):
simple_model = SimpleModel if is_even_sharded_model else SimpleModelUneven
@ -812,8 +811,8 @@ class TestNew2dParallelStateDict(DTensorTestBase):
self.assertEqual(v1.device_mesh, v2.device_mesh)
self.assertEqual(v1.placements, v2.placements)
@skip_if_lt_x_gpu(4)
@with_comms
@skip_if_lt_x_gpu(4)
@parametrize("is_even_sharded_model", [True, False])
def test_2d_optim_state_dict(self, is_even_sharded_model):
simple_model = SimpleModel if is_even_sharded_model else SimpleModelUneven
@ -900,9 +899,9 @@ class TestNew2dParallelStateDict(DTensorTestBase):
else:
self.assertEqual(new_state, state)
@skip_if_lt_x_gpu(4)
@with_comms
@with_temp_dir
@skip_if_lt_x_gpu(4)
def test_fsdp1_tp_2d_set_full_state_dict(self):
"""
This is a workaround for loading full state dict into a FSDP1+TP 2D model.

View File

@ -29,8 +29,8 @@ from torch.distributed.tensor.parallel import (
parallelize_module,
RowwiseParallel,
)
from torch.testing._internal.common_cuda import TEST_MULTIGPU
from torch.testing._internal.common_distributed import (
at_least_x_gpu,
MultiProcessTestCase,
requires_accelerator_dist_backend,
skip_if_lt_x_gpu,
@ -40,6 +40,7 @@ from torch.testing._internal.common_utils import (
parametrize,
run_tests,
skip_but_pass_in_sandcastle_if,
TEST_XPU,
)
from torch.testing._internal.distributed.checkpoint_utils import with_temp_dir
@ -106,9 +107,11 @@ class ComposabilityTest(MultiProcessTestCase):
def device(self):
return self.rank
@requires_accelerator_dist_backend()
@requires_accelerator_dist_backend(["nccl", "xccl"])
@skip_if_lt_x_gpu(8)
@skip_but_pass_in_sandcastle_if(not at_least_x_gpu(8), "Test requires 8+ GPUs")
@skip_but_pass_in_sandcastle_if(
not TEST_MULTIGPU and not TEST_XPU, "Test requires 4+ GPUs"
)
def test_pp_and_dcp(self):
"""
Test that pipeline parallelism and distributed checkpointing can be used together and
@ -198,9 +201,11 @@ class ComposabilityTest(MultiProcessTestCase):
_dcp_test(self)
@requires_accelerator_dist_backend()
@requires_accelerator_dist_backend(["nccl", "xccl"])
@skip_if_lt_x_gpu(8)
@skip_but_pass_in_sandcastle_if(not at_least_x_gpu(8), "Test requires 8+ GPUs")
@skip_but_pass_in_sandcastle_if(
not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs"
)
@parametrize(
"ScheduleClass",
[
@ -350,9 +355,11 @@ class ComposabilityTest(MultiProcessTestCase):
torch.distributed.destroy_process_group()
@requires_accelerator_dist_backend()
@requires_accelerator_dist_backend(["nccl", "xccl"])
@skip_if_lt_x_gpu(8)
@skip_but_pass_in_sandcastle_if(not at_least_x_gpu(8), "Test requires 8+ GPUs")
@skip_but_pass_in_sandcastle_if(
not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs"
)
@parametrize(
"ScheduleClass",
[
@ -543,9 +550,11 @@ class ComposabilityTest(MultiProcessTestCase):
torch.distributed.destroy_process_group()
@requires_accelerator_dist_backend()
@requires_accelerator_dist_backend(["nccl", "xccl"])
@skip_if_lt_x_gpu(8)
@skip_but_pass_in_sandcastle_if(not at_least_x_gpu(8), "Test requires 8+ GPUs")
@skip_but_pass_in_sandcastle_if(
not TEST_MULTIGPU and not TEST_XPU, "Test requires 8+ GPUs"
)
@parametrize(
"ScheduleClass",
[

View File

@ -1,5 +1,6 @@
# Owner(s): ["oncall: distributed"]
import os
import sys
import torch
@ -17,8 +18,8 @@ from torch.distributed.algorithms.ddp_comm_hooks import (
)
from torch.nn.parallel import DistributedDataParallel
from torch.testing._internal.common_distributed import (
DistributedTestBase,
requires_accelerator_dist_backend,
MultiProcessTestCase,
requires_nccl,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
@ -29,12 +30,9 @@ if TEST_WITH_DEV_DBG_ASAN:
sys.exit(0)
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
def gpus_for_rank(world_size):
visible_devices = list(range(torch.accelerator.device_count()))
gpus_per_process = torch.accelerator.device_count() // world_size
visible_devices = list(range(torch.cuda.device_count()))
gpus_per_process = torch.cuda.device_count() // world_size
gpus_for_rank = []
for rank in range(world_size):
gpus_for_rank.append(
@ -62,7 +60,27 @@ class TestDdpCommHook(nn.Module):
return self.t0(x ** (1 + rank))
class DistributedDataParallelCommHookTest(DistributedTestBase):
class DistributedDataParallelCommHookTest(MultiProcessTestCase):
def setUp(self):
super().setUp()
self._spawn_processes()
def tearDown(self):
try:
os.remove(self.file_name)
except OSError:
pass
def _get_process_group_nccl(self):
store = dist.FileStore(self.file_name, self.world_size)
dist.init_process_group(
backend="nccl",
world_size=self.world_size,
rank=self.rank,
store=store,
)
return dist.distributed_c10d._get_default_group()
@property
def world_size(self):
return 2
@ -101,14 +119,14 @@ class DistributedDataParallelCommHookTest(DistributedTestBase):
param = next(model.parameters())
return param.grad
@requires_accelerator_dist_backend()
@requires_nccl()
@skip_if_lt_x_gpu(2)
def test_ddp_comm_hook_allreduce_hook(self):
"""
This unit test verifies the ``allreduce`` hook registered case gives same result
with no hook registered case.
"""
process_group = self.create_pg(device_type)
process_group = self._get_process_group_nccl()
# No hook registered case, get the reference grads.
reference_grads = self._get_grads(process_group, None)
@ -117,14 +135,14 @@ class DistributedDataParallelCommHookTest(DistributedTestBase):
torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=0)
@requires_accelerator_dist_backend()
@requires_nccl()
@skip_if_lt_x_gpu(2)
def test_ddp_comm_hook_fp16compress_hook(self):
"""
This unit test verifies the ``fp16 compress`` hook registered case
gives close result with no hook registered case.
"""
process_group = self.create_pg(device_type)
process_group = self._get_process_group_nccl()
# No hook registered case, get the reference grads.
reference_grads = self._get_grads(process_group, None)
@ -133,14 +151,14 @@ class DistributedDataParallelCommHookTest(DistributedTestBase):
torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
@requires_accelerator_dist_backend()
@requires_nccl()
@skip_if_lt_x_gpu(2)
def test_ddp_comm_hook_quantize_per_tensor_hook(self):
"""
This unit test verifies the ``quantize per tensor`` hook registered case
gives close result with no hook registered case.
"""
process_group = self.create_pg(device_type)
process_group = self._get_process_group_nccl()
# No hook registered case, get the reference grads.
reference_grads = self._get_grads(process_group, None)
@ -149,14 +167,14 @@ class DistributedDataParallelCommHookTest(DistributedTestBase):
torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
@requires_accelerator_dist_backend()
@requires_nccl()
@skip_if_lt_x_gpu(2)
def test_ddp_comm_hook_quantize_per_channel_hook(self):
"""
This unit test verifies the ``quantize per channel`` hook registered case
gives close result with no hook registered case.
"""
process_group = self.create_pg(device_type)
process_group = self._get_process_group_nccl()
# No hook registered case, get the reference grads.
reference_grads = self._get_grads(process_group, None)
@ -167,14 +185,14 @@ class DistributedDataParallelCommHookTest(DistributedTestBase):
torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
@requires_accelerator_dist_backend()
@requires_nccl()
@skip_if_lt_x_gpu(2)
def test_ddp_comm_hook_noop_hook(self):
"""
This unit test verifies the ``noop`` hook registered case and a subsequent allreduce
gives same result with no hook registered case.
"""
process_group = self.create_pg(device_type)
process_group = self._get_process_group_nccl()
# No hook registered case, get the reference grads.
reference_grads = self._get_grads(process_group, None)
@ -186,10 +204,10 @@ class DistributedDataParallelCommHookTest(DistributedTestBase):
torch.testing.assert_close(hook_grads, reference_grads, rtol=1e-5, atol=0)
@requires_accelerator_dist_backend()
@requires_nccl()
@skip_if_lt_x_gpu(2)
def test_is_last_hook(self):
process_group = self.create_pg(device_type)
process_group = self._get_process_group_nccl()
def hook(flags, bucket):
flags.append(bucket.is_last())

View File

@ -32,7 +32,7 @@ from torch.testing._internal.distributed._tensor.common_dtensor import (
class TestStateDictUtils(DTensorTestBase):
@property
def world_size(self):
return min(4, torch.accelerator.device_count())
return min(4, torch.cuda.device_count())
@with_comms
@skip_if_lt_x_gpu(2)
@@ -49,7 +49,7 @@ class TestStateDictUtils(DTensorTestBase):
dist_tensor.to_local(), gather_dim=0, group=(device_mesh, 0)
)
self.assertEqual(expected_gathered_dtensor, gathered_state_dict["dtensor"])
self.assertEqual(gathered_state_dict["dtensor"].device.type, self.device_type)
self.assertTrue(gathered_state_dict["dtensor"].is_cuda)
@with_comms
@skip_if_lt_x_gpu(4)
@@ -69,16 +69,14 @@ class TestStateDictUtils(DTensorTestBase):
)
if dist.get_rank() in (0, 2):
self.assertEqual(expected_gathered_dtensor, gathered_state_dict["dtensor"])
self.assertNotEqual(
gathered_state_dict["dtensor"].device.type, self.device_type
)
self.assertFalse(gathered_state_dict["dtensor"].is_cuda)
else:
self.assertEqual(gathered_state_dict, {})
@with_comms
@skip_if_lt_x_gpu(4)
def test_cpu_and_ranks_only(self):
device = torch.device(self.device_type)
device = torch.device("cuda")
state_dict = {
"tensor1": torch.arange(10, device=device),
"tensor2": torch.ones(10, device=device),
@@ -87,7 +85,7 @@ class TestStateDictUtils(DTensorTestBase):
cpu_state_dict = _offload_state_dict_to_cpu(state_dict, ranks_only=(0, 2))
if dist.get_rank() in (0, 2):
for v in cpu_state_dict.values():
self.assertNotEqual(v.device.type, self.device_type)
self.assertFalse(v.is_cuda)
self.assertEqual(cpu_state_dict["tensor1"], torch.arange(10))
self.assertEqual(cpu_state_dict["tensor2"], torch.ones(10))
else:
@@ -111,27 +109,27 @@ class TestStateDictUtils(DTensorTestBase):
for _ in range(10):
tensor, dtensor = create_dtensor()
ltensor.append(tensor)
ltensor.append(torch.ones(10, device=torch.device(self.device_type)))
ltensor.append(torch.ones(10, device=torch.device("cuda")))
ldtensor.append(dtensor)
ldtensor.append(torch.ones(10, device=torch.device(self.device_type)))
ldtensor.append(torch.ones(10, device=torch.device("cuda")))
tensor, dtensor = create_dtensor()
dist_state_dict = {
"local": dtensor,
"list": ldtensor,
"arange": torch.arange(10, device=torch.device(self.device_type)),
"arange": torch.arange(10, device=torch.device("cuda")),
}
state_dict = {
"local": tensor,
"list": ltensor,
"arange": torch.arange(10, device=torch.device(self.device_type)),
"arange": torch.arange(10, device=torch.device("cuda")),
}
self.assertEqual(state_dict, _gather_state_dict(dist_state_dict))
@with_comms
@skip_if_lt_x_gpu(2)
def test_create_cpu_state_dict(self):
device = torch.device(self.device_type)
device = torch.device("cuda")
rank = dist.get_rank()
# Scale tensors based on world size
# to fit in the tensor shards accurately.
@@ -151,7 +149,7 @@ class TestStateDictUtils(DTensorTestBase):
metadata=ShardMetadata(
shard_offsets=[5 * rank, 0],
shard_sizes=[5, 10],
placement=f"rank:{rank}/{self.device_type}:{rank}",
placement=f"rank:{rank}/cuda:{rank}",
),
)
],
@@ -161,7 +159,7 @@ class TestStateDictUtils(DTensorTestBase):
torch.arange(50 * scale_factor, device=device).reshape(
5 * scale_factor, 10
),
init_device_mesh(self.device_type, mesh_shape=(self.world_size,)),
init_device_mesh("cuda", mesh_shape=(self.world_size,)),
[Shard(0)],
),
"non_tensor_bytes_io": copy.deepcopy(buffer),
@@ -247,7 +245,7 @@ class TestStateDictUtils(DTensorTestBase):
even_tensor = torch.randn(self.world_size, 2)
uneven_tensor = torch.randn(1, 2)
mesh = init_device_mesh(self.device_type, mesh_shape=(self.world_size,))
mesh = init_device_mesh("cuda", mesh_shape=(self.world_size,))
even_dtensor = distribute_tensor(
torch.randn(self.world_size, 2), mesh, [Shard(0)]
)
@@ -275,10 +273,10 @@ class TestStateDictUtils(DTensorTestBase):
@with_comms
@skip_if_lt_x_gpu(2)
def test_cpu_offload_for_dtensor(self):
device_mesh = init_device_mesh(self.device_type, mesh_shape=(self.world_size,))
device_mesh = init_device_mesh("cuda", mesh_shape=(self.world_size,))
sd = {
"k": DTensor.from_local(
torch.ones(8, 8, device=self.device_type), device_mesh, [Shard(0)]
torch.ones(8, 8, device="cuda"), device_mesh, [Shard(0)]
)
}
cpu_sd = _create_cpu_state_dict(sd)
@@ -292,12 +290,12 @@ class TestStateDictUtils(DTensorTestBase):
self.assertFalse(torch.equal(sd["k"].cpu(), cpu_sd["k"]))
_copy_state_dict(sd, cpu_sd, non_blocking=True)
torch.accelerator.synchronize()
torch.cuda.synchronize()
self.assertTrue(torch.equal(sd["k"].cpu(), cpu_sd["k"]))
sd["k"] += 1
self.assertFalse(torch.equal(sd["k"].cpu(), cpu_sd["k"]))
_copy_state_dict(sd, cpu_sd, non_blocking=True)
torch.accelerator.synchronize()
torch.cuda.synchronize()
self.assertTrue(torch.equal(sd["k"].cpu(), cpu_sd["k"]))
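The synchronize-before-compare pattern in this hunk exists because a `non_blocking=True` device-to-host copy may still be in flight when the destination is read. A minimal sketch of that general pattern follows; the helper name and tensor size are illustrative, and only the pinned-memory plus synchronize sequence is the point.

import torch

def offload_to_cpu_non_blocking(gpu_tensor: torch.Tensor) -> torch.Tensor:
    # Non-blocking device-to-host copies only run asynchronously when
    # the destination lives in pinned (page-locked) host memory.
    cpu_buf = torch.empty(gpu_tensor.shape, dtype=gpu_tensor.dtype, device="cpu", pin_memory=True)
    cpu_buf.copy_(gpu_tensor, non_blocking=True)
    return cpu_buf

if torch.cuda.is_available():
    src = torch.randn(1 << 20, device="cuda")
    dst = offload_to_cpu_non_blocking(src)
    # Reading dst before synchronizing could observe stale data; the
    # tests above call torch.accelerator.synchronize() (or
    # torch.cuda.synchronize()) for the same reason.
    torch.cuda.synchronize()
    assert torch.equal(dst, src.cpu())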


@@ -2,23 +2,16 @@
import copy
import math
import pathlib
import sys
from typing import Any
REPO_ROOT = pathlib.Path(__file__).resolve().parent.parent.parent.parent
sys.path.insert(0, str(REPO_ROOT))
from tools.flight_recorder.components.builder import build_db
from tools.flight_recorder.components.config_manager import JobConfig
from tools.flight_recorder.components.types import COLLECTIVES, MatchInfo, MatchState
from tools.flight_recorder.components.utils import match_one_event
# Make sure to remove REPO_ROOT after import is done
sys.path.remove(str(REPO_ROOT))
from torch.distributed.flight_recorder.components.builder import build_db
from torch.distributed.flight_recorder.components.config_manager import JobConfig
from torch.distributed.flight_recorder.components.types import (
COLLECTIVES,
MatchInfo,
MatchState,
)
from torch.distributed.flight_recorder.components.utils import match_one_event
from torch.testing._internal.common_utils import run_tests, TestCase


@@ -7,7 +7,7 @@
import copy
import sys
from contextlib import contextmanager, nullcontext
from contextlib import nullcontext
from typing import Any, cast
import numpy as np
@@ -40,6 +40,7 @@ from torch.testing._internal.common_distributed import (
skip_if_rocm_multiprocess,
skip_if_win32,
)
from torch.testing._internal.common_fsdp import get_devtype
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
@@ -56,17 +57,7 @@ except ImportError:
HAS_TORCHVISION = False
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
@contextmanager
def deterministic_algorithms(enabled=True):
prev_state = torch.are_deterministic_algorithms_enabled()
torch.use_deterministic_algorithms(enabled)
try:
yield
finally:
torch.use_deterministic_algorithms(prev_state)
device_type = str(get_devtype())
class TestZeroRedundancyOptimizer(DistributedTestBase):
@@ -1250,7 +1241,7 @@ class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer):
enabled=True, deterministic=True, benchmark=False
)
if "cuda" in device
else deterministic_algorithms(True)
else torch.use_deterministic_algorithms(True)
)
with det_ctx:
device_ids = [rank] if requires_ddp_rank(device) else None


@@ -157,7 +157,6 @@ dtensor_fails = {
xfail("cholesky_solve"),
xfail("combinations"),
xfail("complex"),
xfail("convolution_backward"),
xfail("count_nonzero"),
xfail("cross"),
xfail("cummax"),


@@ -24,7 +24,7 @@ from torch.distributed._functional_collectives import (
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8
from torch.testing._internal.common_device_type import e4m3_type
from torch.testing._internal.common_distributed import (
DistributedTestBase,
MultiProcessTestCase,
requires_accelerator_dist_backend,
skip_if_lt_x_gpu,
)
@@ -59,8 +59,12 @@ if not dist.is_available():
sys.exit(0)
@requires_accelerator_dist_backend()
class TestWithNCCL(DistributedTestBase):
@requires_accelerator_dist_backend(["nccl", "xccl"])
class TestWithNCCL(MultiProcessTestCase):
def setUp(self) -> None:
super().setUp()
self._spawn_processes()
@property
def world_size(self) -> int:
return 2
@@ -74,7 +78,16 @@ class TestWithNCCL(DistributedTestBase):
return torch.device(self.rank)
def _init_process_group(self) -> None:
self.create_pg(self.device.type)
torch.accelerator.set_device_index(self.rank)
store = dist.FileStore(self.file_name, self.world_size)
backend = dist.get_default_backend_for_device(self.device.type)
dist.init_process_group(
backend=backend,
world_size=self.world_size,
rank=self.rank,
store=store,
)
torch._C._distributed_c10d._register_process_group("default", dist.group.WORLD)
@skip_if_lt_x_gpu(2)


@@ -11,10 +11,13 @@ if not dist.is_available():
print("Distributed not available, skipping tests", file=sys.stderr)
sys.exit(0)
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_distributed import DistributedTestBase, TEST_SKIPS
from torch.testing._internal.common_utils import (
run_tests,
skipIfHpu,
TEST_CUDA,
TEST_HPU,
TEST_WITH_DEV_DBG_ASAN,
)
@@ -26,8 +29,16 @@ if TEST_WITH_DEV_DBG_ASAN:
)
sys.exit(0)
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
device_count = torch.accelerator.device_count()
if TEST_HPU:
DEVICE = "hpu"
elif TEST_CUDA:
DEVICE = "cuda"
else:
DEVICE = "cpu"
device_module = torch.get_device_module(DEVICE)
device_count = device_module.device_count()
BACKEND = dist.get_default_backend_for_device(DEVICE)
def with_comms(func=None):
@@ -38,10 +49,11 @@ def with_comms(func=None):
@wraps(func)
def wrapper(self, *args, **kwargs):
if device_type != "cpu" and device_count < self.world_size:
if DEVICE != "cpu" and device_count < self.world_size:
sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
self.pg = self.create_pg(device=device_type)
kwargs["device"] = DEVICE
self.pg = self.create_pg(device=DEVICE)
try:
return func(self, *args, **kwargs)
finally:
@@ -52,7 +64,7 @@ class TestObjectCollectives(DistributedTestBase):
class TestObjectCollectives(DistributedTestBase):
@with_comms()
def test_all_gather_object(self):
def test_all_gather_object(self, device):
output = [None] * dist.get_world_size()
dist.all_gather_object(object_list=output, obj=self.rank)
@@ -60,7 +72,7 @@ class TestObjectCollectives(DistributedTestBase):
self.assertEqual(i, v, f"rank: {self.rank}")
@with_comms()
def test_gather_object(self):
def test_gather_object(self, device):
output = [None] * dist.get_world_size() if self.rank == 0 else None
dist.gather_object(obj=self.rank, object_gather_list=output)
@@ -70,7 +82,7 @@ class TestObjectCollectives(DistributedTestBase):
@skipIfHpu
@with_comms()
def test_send_recv_object_list(self):
def test_send_recv_object_list(self, device):
val = 99 if self.rank == 0 else None
object_list = [val] * dist.get_world_size()
if self.rank == 0:
@@ -84,7 +96,7 @@ class TestObjectCollectives(DistributedTestBase):
self.assertEqual(None, object_list[0])
@with_comms()
def test_broadcast_object_list(self):
def test_broadcast_object_list(self, device):
val = 99 if self.rank == 0 else None
object_list = [val] * dist.get_world_size()
# TODO test with broadcast_object_list's device argument
@@ -93,7 +105,7 @@ class TestObjectCollectives(DistributedTestBase):
self.assertEqual(99, object_list[0])
@with_comms()
def test_scatter_object_list(self):
def test_scatter_object_list(self, device):
input_list = list(range(dist.get_world_size())) if self.rank == 0 else None
output_list = [None]
dist.scatter_object_list(
@@ -111,30 +123,34 @@ class TestObjectCollectives(DistributedTestBase):
my_pg = dist.new_group(ranks, use_local_synchronization=True)
return rank, ranks, my_pg
@skipIfHpu
@with_comms()
def test_subpg_scatter_object(self):
def test_subpg_scatter_object(self, device):
rank, ranks, my_pg = self.setup_sub_pg()
out_list = [None]
dist.scatter_object_list(out_list, ranks, src=ranks[0], group=my_pg)
self.assertEqual(rank, out_list[0])
@skipIfHpu
@with_comms()
def test_subpg_all_gather_object(self):
def test_subpg_all_gather_object(self, device):
rank, ranks, my_pg = self.setup_sub_pg()
out_list = [None] * len(ranks)
dist.all_gather_object(out_list, rank, group=my_pg)
self.assertEqual(ranks, out_list)
@skipIfHpu
@with_comms()
def test_subpg_gather_object(self):
def test_subpg_gather_object(self, device):
rank, ranks, my_pg = self.setup_sub_pg()
out_list = [None] * len(ranks) if rank == ranks[0] else None
dist.gather_object(rank, out_list, dst=ranks[0], group=my_pg)
if rank == ranks[0]:
self.assertEqual(ranks, out_list)
@skipIfHpu
@with_comms()
def test_subpg_broadcast_object(self):
def test_subpg_broadcast_object(self, device):
rank, ranks, my_pg = self.setup_sub_pg()
out_list = [None]
if rank == ranks[0]:
@@ -143,5 +159,7 @@ class TestObjectCollectives(DistributedTestBase):
self.assertEqual(ranks[0], out_list[0])
devices = ("cpu", "cuda", "hpu")
instantiate_device_type_tests(TestObjectCollectives, globals(), only_for=devices)
if __name__ == "__main__":
run_tests()
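The object collectives exercised above move arbitrary picklable Python objects rather than tensors: each rank's object is pickled, exchanged, and unpickled on the receiving side. A minimal per-rank sketch, assuming a process group has already been initialized the way `with_comms` does; the payload shape is illustrative.

import torch.distributed as dist

def gather_rank_payloads() -> list:
    # Every rank contributes one picklable object; afterwards every
    # rank holds the full list, ordered by rank.
    payload = {"rank": dist.get_rank()}
    output = [None] * dist.get_world_size()
    dist.all_gather_object(object_list=output, obj=payload)
    return output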


@@ -29,7 +29,7 @@ from torch.distributed.tensor._collective_utils import (
)
from torch.distributed.tensor.placement_types import _Partial, Shard
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_utils import run_tests, TEST_HPU, TEST_XPU, TestCase
from torch.testing._internal.common_utils import run_tests, TEST_XPU, TestCase
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
with_comms,
@@ -58,7 +58,7 @@ def _set_env_var(addr="localhost", port="25364", world_size=1, rank=0, local_ran
os.environ["LOCAL_RANK"] = f"{local_rank}"
@unittest.skipIf(TEST_XPU or TEST_HPU, "XPU/HPU does not support gloo backend.")
@unittest.skipIf(TEST_XPU, "XPU does not support gloo backend.")
class DeviceMeshTestGlooBackend(DTensorTestBase):
@property
def backend(self):


@@ -40,6 +40,7 @@ from torch.testing._internal.common_distributed import (
DynamoDistributedSingleProcTestCase,
MultiProcessTestCase,
requires_accelerator_dist_backend,
requires_gloo,
skip_if_lt_x_gpu,
)
from torch.testing._internal.common_utils import (
@@ -1773,16 +1774,10 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase):
inputs = [x, w, ar_0, ar_1]
f(*inputs, **self.get_world_trs())
def _pass(g):
from torch._inductor.fx_passes.bucketing import bucket_all_reduce
bucket_all_reduce(g.owning_module, lambda _: 2000)
torch._inductor.config.post_grad_custom_post_pass = _pass
with torch._inductor.config.patch(
{
"reorder_for_compute_comm_overlap": False,
"bucket_all_reduces_fx": bucket_mode,
}
):
compiled = torch.compile(f)
@@ -2234,6 +2229,50 @@ class TestSyncDecisionCrossRanks(MultiProcessTestCase):
)
assert est_ms_nccl > 0
@skip_if_lt_x_gpu(2)
@requires_gloo()
def test_regression_use_nccl_estimate_with_gloo(self):
# Test checks that using the nccl estimator option does not hard fail
# with backends that do not support runtime estimation, e.g. gloo
store = c10d.FileStore(self.file_name, self.world_size)
c10d.init_process_group(
backend="gloo", store=store, rank=self.rank, world_size=self.world_size
)
group = c10d.distributed_c10d._get_default_group()
group_name = "default"
torch._C._distributed_c10d._register_process_group(
group_name, torch.distributed.group.WORLD
)
group_size = group.size()
def func(inp, group_size, group_name):
ag_0_out = torch.ops._c10d_functional.all_gather_into_tensor(
inp, group_size, group_name
)
ag_0_wait = torch.ops.c10d_functional.wait_tensor(ag_0_out)
ag_1_out = torch.ops._c10d_functional.all_gather_into_tensor(
ag_0_wait, group_size, group_name
)
ag_1_wait = torch.ops.c10d_functional.wait_tensor(ag_1_out)
return ag_1_wait
gm = make_fx(func)(torch.ones(4, 4), group_size, group_name)
g = gm.graph
for n in g.nodes:
if is_all_gather_into_tensor(n):
from torch._inductor.comm_analysis import (
estimate_nccl_collective_runtime_from_fx_node,
)
est_ms = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=False
)
assert est_ms > 0
est_ms_nccl = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=True
)
assert est_ms_nccl > 0
if __name__ == "__main__":
from torch._dynamo.test_case import run_tests


@@ -29,6 +29,8 @@ from torch.testing._internal.common_utils import (
MY_LAMBDA = lambda x: x + 1 # noqa: E731
EPS = torch.tensor(1e-7)
class CustomCompiledFunction(torch._dynamo.aot_compile.SerializableCallable):
def __init__(self, gm: torch.fx.GraphModule, example_inputs: list[torch.Tensor]):
@@ -587,6 +589,18 @@ from user code:
actual = compiled_fn(fn, *inputs)
self.assertEqual(expected, actual)
def test_aot_compile_with_global_tensor(self):
def fn(x, y):
return x + y + EPS
def make_inputs():
return (torch.randn(3, 4), torch.randn(3, 4))
compiled_fn = torch.compile(fn, fullgraph=True).aot_compile((make_inputs(), {}))
test_inputs = make_inputs()
self.assertEqual(compiled_fn(*test_inputs), fn(*test_inputs))
def test_aot_compile_with_default_args(self):
def fn(x, y=1):
return x + x


@@ -10,10 +10,6 @@ import torch.utils.checkpoint
from torch._dynamo.backends.common import aot_autograd
from torch._functorch._aot_autograd.autograd_cache import BundledCompiledForward
from torch._guards import detect_fake_mode
from torch._higher_order_ops.invoke_subgraph import (
NestedCompileBackend,
NestedCompileRegionOptions,
)
from torch._inductor.output_code import RegionalOutputCode
from torch._inductor.test_case import run_tests
from torch._inductor.utils import run_fw_bw_and_get_code
@@ -472,86 +468,6 @@ class RegionalInductorTests(torch._inductor.test_case.TestCase):
# flex in forward and flex_backward in backward
self.assertEqual(len(codes), 2)
@parametrize("serialize", [True, False])
def test_invoke_subgraph_regional_compile(self, serialize):
call_test_partitioner_ct = 0
original_default_partitioner = torch._functorch.partitioners.default_partition
def test_partitioner(
*args, **kwargs
) -> tuple[torch.fx.GraphModule, torch.fx.GraphModule]:
nonlocal call_test_partitioner_ct
call_test_partitioner_ct += 1
return original_default_partitioner(*args, **kwargs)
# pyrefly: ignore [not-iterable]
if serialize:
# Callable cannot be serialized
torch._functorch.partitioners.default_partition = test_partitioner
partitioner = "default_partition"
else:
partitioner = test_partitioner
backend = NestedCompileRegionOptions(
backend=NestedCompileBackend.INDUCTOR,
inductor_configs={
"max_autotune": True,
"triton.cudagraphs": False,
},
partitioner=partitioner,
)
@torch.compiler.nested_compile_region(backend_options=backend)
def gn_with_backend(x):
return torch.sin(x)
@torch.compiler.nested_compile_region
def gn_without_backend(x):
return torch.cos(x)
def fn(x):
return gn_with_backend(x) + gn_without_backend(x)
backend = aot_eager_regional_inductor(serialize=serialize)
opt_fn = torch.compile(fn, backend=backend, fullgraph=True)
import torch._inductor.config as inductor_config
# Hook to verify options
original_compile = torch._inductor.standalone_compile
captured_options = []
def verify_options(*args, **kwargs):
options = kwargs.get("options", {})
captured_options.append(options)
# Verify config is set as expected from explicit options
assert inductor_config.max_autotune, "max_autotune should be True"
assert not inductor_config.triton.cudagraphs, (
"triton.cudagraphs should be False"
)
return original_compile(*args, **kwargs)
torch._inductor.standalone_compile = verify_options
try:
x = torch.randn(8, 8, requires_grad=True)
# opt_fn(x)
res, codes = run_fw_bw_and_get_code(lambda: opt_fn(x))
self.assertEqual(len(codes), 2)
self.assertTrue("repeated_subgraph0" in codes[0])
self.assertTrue("repeated_subgraph1" not in codes[0])
self.assertTrue("repeated_subgraph0" in codes[1])
self.assertTrue("repeated_subgraph1" not in codes[1])
self.assertEqual(call_test_partitioner_ct, 1)
true_res = fn(x)
self.assertEqual(res, true_res)
finally:
torch._inductor.standalone_compile = original_compile
torch._functorch.partitioners.default_partition = (
original_default_partitioner
)
@skipIfTorchDynamo("Not a suitable dynamo wrapped test")
class TestRegionalOutputCode(torch._inductor.test_case.TestCase):


@@ -920,7 +920,7 @@ class TestDynamoTimed(TestCase):
first, second = {
(3, 9): (10, 6),
(3, 10): (10, 6),
(3, 11): (10, 6),
(3, 11): (11, 7),
(3, 12): (11, 7),
(3, 13): (11, 7),
(3, 14): (11, 7),


@@ -456,6 +456,31 @@ def forward(self, x):
test_inputs = make_inputs()
self.assertEqual(gm(*test_inputs), foo(*test_inputs))
def test_dynamo_graph_capture_with_call_override(self):
class _InterestingModule(torch.nn.Module):
def __init__(self, module):
super().__init__()
self._module = module
def __call__(self, *args, **kwargs):
return self._module(*args, **kwargs)
class MyModel(torch.nn.Module):
def forward(self, x):
return x + 1
foo = _InterestingModule(MyModel())
def make_inputs():
return (torch.randn(2, 3),)
trace_inputs = make_inputs()
gm = dynamo_graph_capture_for_export(foo)(*trace_inputs)
test_inputs = make_inputs()
self.assertEqual(gm(*test_inputs), foo(*test_inputs))
self.assertEqual(len(list(gm.buffers())), len(list(foo.buffers())))
self.assertEqual(len(list(gm.parameters())), len(list(foo.parameters())))
def test_dynamo_graph_capture_custom_pytree_type(self):
import torch.utils._pytree as pytree


@@ -38,6 +38,7 @@ from torch._export.serde.serialize import (
_to_json_bytes,
canonicalize,
deserialize,
deserialize_torch_artifact,
ExportedProgramDeserializer,
ExportedProgramSerializer,
GraphModuleSerializer,
@@ -1904,6 +1905,16 @@ class TestSaveLoad(TestCase):
self.assertTrue(torch.allclose(ep.module()(*inp), loaded_ep.module()(*inp)))
def test_deserialize_torch_artifact_dict(self):
data = {"key": torch.tensor([1, 2, 3])}
buf = io.BytesIO()
torch.save(data, buf)
serialized = buf.getvalue()
result = deserialize_torch_artifact(serialized)
self.assertIsInstance(result, dict)
self.assertTrue(torch.equal(result["key"], torch.tensor([1, 2, 3])))
@unittest.skipIf(IS_WINDOWS, "Cannot modify file in windows")
def test_save_file(self):
class Foo(torch.nn.Module):
@@ -2010,7 +2021,6 @@ class TestSaveLoad(TestCase):
save(ep, buffer)
buffer.seek(0)
loaded_ep = load(buffer)
inp = (torch.tensor(1),)
self.assertTrue(torch.allclose(ep.module()(*inp), loaded_ep.module()(*inp)))


@@ -8218,9 +8218,6 @@ symbolic_aot_autograd_failures = {
"nn.functional.fractional_max_pool3d", ""
), # rand() received an invalid combination of arguments - g...
xfail("trace", ""), # Cannot call sizes() on tensor with symbolic sizes/strides
xfail(
"convolution_backward", ""
), # Cannot call sizes() on tensor with symbolic sizes/strides
decorate(
"linalg.householder_product",
decorator=unittest.skipIf(IS_MACOS and IS_X86, "flaky"),


@@ -21,10 +21,6 @@ from torch._dynamo.testing import (
InductorAndRecordGraphs,
normalize_gm,
)
from torch._higher_order_ops.invoke_subgraph import (
NestedCompileBackend,
NestedCompileRegionOptions,
)
from torch._higher_order_ops.schema import find_hop_schema
from torch._inductor import config as inductor_config
from torch._inductor.pattern_matcher import (
@@ -1560,101 +1556,6 @@ class GraphModule(torch.nn.Module):
res = opt_fn(x)
self.assertEqual(ref, res)
def test_unbacked_expr(self):
@nested_compile_region
def gn(x):
return x + 1
def fn(c):
d = torch.concat([c, c], dim=0)
d = gn(d)
return d
c = torch.randn((64, 32))
torch._dynamo.decorators.mark_unbacked(c, 0)
ref = fn(c)
opt_fn = torch.compile(fn, backend="inductor", fullgraph=True)
res = opt_fn(c)
self.assertEqual(ref, res)
def test_grad_accumulation(self):
mod1 = torch.nn.Linear(8, 8)
mod2 = torch.nn.Linear(8, 8)
mod3 = torch.nn.Linear(8, 8)
@nested_compile_region
def gn(x):
return mod1(x) - mod2(x)
def fn(c):
d = gn(c) - mod3(c)
return d * 2
c = torch.randn((8, 8), requires_grad=True)
backend = AotEagerAndRecordGraphs()
opt_fn = torch.compile(fn, backend=backend, fullgraph=True)
res = opt_fn(c)
res.sum().backward()
# fw_add_nodes = backend.fw_graphs[0].graph.find_nodes(op="call_function", target = torch.ops.aten.add.Tensor)
# The gradient addition node for mod3 is not in the subgraph.
bw_add_nodes = backend.bw_graphs[0].graph.find_nodes(
op="call_function", target=torch.ops.aten.add.Tensor
)
self.assertEqual(len(bw_add_nodes), 1)
subgraph_node = backend.bw_graphs[0].graph.find_nodes(op="get_attr")[0]
subgraph_name = subgraph_node.target
# The gradient addition node between mod1 and mod2 will be in the subgraph
bw_add_nodes = getattr(backend.bw_graphs[0], subgraph_name).graph.find_nodes(
op="call_function", target=torch.ops.aten.add.Tensor
)
self.assertEqual(len(bw_add_nodes), 1)
def test_backend_parameter(self):
backend = NestedCompileRegionOptions(NestedCompileBackend.INDUCTOR)
# Test that backend parameter is properly set in node.meta
@nested_compile_region(backend_options=backend)
def gn_with_backend(x):
return torch.sin(x)
@nested_compile_region
def gn_without_backend(x):
return torch.cos(x)
def fn(x):
return gn_with_backend(x) + gn_without_backend(x)
backend = EagerAndRecordGraphs()
opt_fn = torch.compile(fn, backend=backend, fullgraph=True)
x = torch.randn(8, 8, requires_grad=False)
opt_fn(x)
# Check that we captured the graph
self.assertEqual(len(backend.graphs), 1)
graph = backend.graphs[0]
# Find invoke_subgraph nodes and check their backend metadata
invoke_subgraph_nodes = [
node
for node in graph.graph.nodes
if node.op == "call_function"
and node.target == torch.ops.higher_order.invoke_subgraph
]
# We should have 2 invoke_subgraph calls
self.assertEqual(len(invoke_subgraph_nodes), 2)
# First invoke_subgraph (gn_with_backend) should have backend
self.assertIn("custom", invoke_subgraph_nodes[0].meta)
# Second invoke_subgraph (gn_without_backend) should have custom=None or no custom
backend_value = invoke_subgraph_nodes[1].meta.get("custom", None)
self.assertIsNone(backend_value)
def test_complex(self):
# Observed in Wan2.1
@nested_compile_region


@@ -4,6 +4,7 @@ from unittest.mock import patch
import torch
from torch._dynamo.utils import counters
from torch._functorch.aot_autograd import aot_export_module
from torch.fx.experimental.proxy_tensor import make_fx
from torch.testing._internal.common_utils import run_tests, TestCase
@@ -90,6 +91,99 @@ def forward(self, arg0_1):
self.assertEqual(printed_output, f"moo 1 2\nmoo {new_inp}\nmoo 1 2\nyeehop 4")
def test_print_with_side_effect(self):
class M(torch.nn.Module):
def forward(self, x):
torch._higher_order_ops.print("moo {x} {y}", x=1, y=2)
res = x + x
torch._higher_order_ops.print("moo {x} {y}", x=1, y=2)
return (res,)
inputs = (torch.randn(3),)
# With functionalization, it should appear wrapped with with_effects()
gm, gs = aot_export_module(M(), inputs, trace_joint=False)
self.assertExpectedInline(
str(gm.code).strip(),
"""\
def forward(self, arg0_1, arg1_1):
with_effects = torch.ops.higher_order.with_effects(arg0_1, torch.ops.higher_order.print, 'moo {x} {y}', x = 1, y = 2); \
arg0_1 = None
getitem = with_effects[0]; with_effects = None
add = torch.ops.aten.add.Tensor(arg1_1, arg1_1); arg1_1 = None
with_effects_1 = torch.ops.higher_order.with_effects(getitem, torch.ops.higher_order.print, 'moo {x} {y}', x = 1, y = 2); \
getitem = None
getitem_2 = with_effects_1[0]; with_effects_1 = None
return (getitem_2, add)""",
)
self.assertEqual(len(gs.input_tokens), 1)
self.assertEqual(len(gs.output_tokens), 1)
def test_print_with_input_mutations(self):
class M(torch.nn.Module):
def __init__(self) -> None:
super().__init__()
def forward(self, x):
torch._higher_order_ops.print("moo {x} {y}", x=x, y=2)
res = x + x
x.add_(res)
res = x + x
torch._higher_order_ops.print("moo {x} {y}", x=x, y=res)
return (res,)
inputs = (torch.randn(3),)
# With functionalization, it should appear wrapped with with_effects()
gm, gs = aot_export_module(M(), inputs, trace_joint=False)
self.assertEqual(len(gs.input_tokens), 1)
self.assertEqual(len(gs.output_tokens), 1)
self.assertEqual(len(gs.user_inputs_to_mutate), 1)
self.assertExpectedInline(
str(gm.code).strip(),
"""\
def forward(self, arg0_1, arg1_1):
with_effects = torch.ops.higher_order.with_effects(arg0_1, torch.ops.higher_order.print, 'moo {x} {y}', \
x = arg1_1, y = 2); arg0_1 = None
getitem = with_effects[0]; with_effects = None
add = torch.ops.aten.add.Tensor(arg1_1, arg1_1)
add_1 = torch.ops.aten.add.Tensor(arg1_1, add); arg1_1 = add = None
add_2 = torch.ops.aten.add.Tensor(add_1, add_1)
with_effects_1 = torch.ops.higher_order.with_effects(getitem, torch.ops.higher_order.print, 'moo {x} {y}', \
x = add_1, y = add_2); getitem = None
getitem_2 = with_effects_1[0]; with_effects_1 = None
return (getitem_2, add_1, add_2)""",
)
def test_print_gen_schema(self):
from torch._higher_order_ops.print import print as print_op
# Test basic schema generation with simple kwargs int
format_str = "Hello {x} {y}"
schema = print_op.gen_schema(format_str, x=1, y=2)
self.assertExpectedInline(
str(schema),
"""print(str format_str, *, int x, int y) -> ()""",
)
# Test schema generation with different types of inputs
# Tensor input
tensor = torch.randn(2, 2)
schema_tensor = print_op.gen_schema("Tensor: {x}", x=tensor)
self.assertExpectedInline(
str(schema_tensor),
"""print(str format_str, *, Tensor x) -> ()""",
)
# TODO: Add schema support for kwargs with list-type values
# No kwargs
schema_no_kwargs = print_op.gen_schema("Simple message")
self.assertExpectedInline(
str(schema_no_kwargs),
"""print(str format_str) -> ()""",
)
if __name__ == "__main__":
run_tests()


@@ -235,7 +235,6 @@ class TestCKBackend(TestCase):
Y_eager = a @ b
torch.testing.assert_close(Y_compiled, Y_eager, equal_nan=True)
@unittest.skip("Autotune Mismatch being investigated")
@unittest.skipIf(not torch.version.hip, "ROCM only")
@unittest.mock.patch.dict(os.environ, _test_env)
@parametrize("max_autotune_gemm_backends", ("CK", "ATen,Triton,CK"))


@@ -479,17 +479,14 @@ class TestFxGraphCache(TestCase):
if device == GPU_TYPE and not HAS_GPU:
raise unittest.SkipTest(f"requires {GPU_TYPE}")
if (
device == "cuda"
and torch.version.hip is None
and dtype == torch.bfloat16
and not SM80OrLater
):
if device == "cuda" and dtype == torch.bfloat16 and not SM80OrLater:
raise unittest.SkipTest("requires SM80 or later")
if use_static_cuda_launcher and not (device == "cuda" and bundle_triton):
raise unittest.SkipTest(
"Static cuda launcher requires cuda and triton bundling"
)
if use_static_cuda_launcher and TEST_WITH_ROCM:
raise unittest.SkipTest("Static cuda launcher doesn't work with ROCM")
def fn(x, y):
return (x * 2, y @ y)


@@ -62,6 +62,8 @@ test_failures = {
"test_remove_noop_slice_scatter": TestFailure(("xpu"), is_skip=True),
"test_remove_noop_view_default": TestFailure(("xpu"), is_skip=True),
"test_remove_noop_view_dtype": TestFailure(("xpu"), is_skip=True),
# cannot pickle ParametrizedConv2d
"test_weight_norm_conv2d": TestFailure(("cpu", "cuda"), is_skip=True),
}


@@ -4449,16 +4449,17 @@ class CPUReproTests(TestCase):
def forward(self, x):
return self.gn(x)
for dynamic in [True, False]:
torch._dynamo.reset()
metrics.reset()
mod = M().eval()
x = torch.randn(1, 32, 128, 128, 128)
with torch.no_grad():
expected = mod(x)
compiled_m = torch.compile(mod, dynamic=dynamic)
actual = compiled_m(x)
self.assertEqual(expected, actual)
for simdlen, dynamic in itertools.product([None, 0], [True, False]):
with config.patch({"cpp.simdlen": simdlen}):
torch._dynamo.reset()
metrics.reset()
mod = M().eval()
x = torch.randn(1, 32, 128, 128, 128)
with torch.no_grad():
expected = mod(x)
compiled_m = torch.compile(mod, dynamic=dynamic)
actual = compiled_m(x)
self.assertEqual(expected, actual)
@torch._dynamo.config.patch(
capture_scalar_outputs=True, capture_dynamic_output_shape_ops=True


@@ -1,5 +1,9 @@
# Owner(s): ["module: inductor"]
import contextlib
import os
import subprocess
import sys
import tempfile
import unittest
import torch
@@ -104,6 +108,64 @@ class DeterministicTest(TestCase):
else:
self.assertTrue(counters["inductor"]["coordesc_tuning_bench"] > 0)
@parametrize("model_name", ["GoogleFnet", "BertForMaskedLM", "DistillGPT2"])
@parametrize("training_or_inference", ["training", "inference"])
@parametrize("precision", ["float32", "bfloat16", "float16", "amp"])
def test_run2run_determinism(self, model_name, training_or_inference, precision):
"""
Test run2run determinism for a few huggingface models.
The test assumes benchmarks/dynamo/huggingface.py can be found from
the current working directory.
"""
if not os.path.exists("benchmarks/dynamo/huggingface.py"):
self.skipTest("Skip due to benchmarks/dynamo/huggingface.py not found.")
def _setup_env(env):
env["TORCHINDUCTOR_FORCE_DISABLE_CACHES"] = "1" # disable autotune cache
env["TORCHINDUCTOR_FX_GRAPH_REMOTE_CACHE"] = "0"
env["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "0"
if enable_determinism:
env["TORCHINDUCTOR_DETERMINISTIC"] = "1"
# set to false if you want to check how the test fails without
# the deterministic mode
enable_determinism = True
with tempfile.TemporaryDirectory() as tmpdir:
saved_pkl = os.path.join(tmpdir, "saved.pkl")
cmd = (
f"{sys.executable} benchmarks/dynamo/huggingface.py --backend inductor"
+ f" --{precision} --accuracy --only {model_name} --{training_or_inference}"
+ f" --disable-cudagraphs --save-model-outputs-to={saved_pkl}"
)
print("Command", cmd)
env = os.environ.copy()
_setup_env(env)
out = subprocess.run(cmd.split(), capture_output=True, env=env)
# We don't check the accuracy against eager here because some
# combinations of model and precision cannot pass that accuracy
# test. But it's still valuable to make sure we generate
# bitwise-equivalent results from run to run.
# self.assertTrue("pass" in out.stdout.decode())
cmd = (
f"{sys.executable} benchmarks/dynamo/huggingface.py --backend inductor"
+ f" --{precision} --accuracy --only {model_name} --{training_or_inference}"
+ f" --disable-cudagraphs --compare-model-outputs-with={saved_pkl}"
)
print("Command", cmd)
# distort benchmarking results
env["TORCHINDUCTOR_DISTORT_BENCHMARKING_RESULT"] = "inverse"
out = subprocess.run(cmd.split(), capture_output=True, env=env)
self.assertTrue(
"The result is bitwise equivalent to the previously saved result"
in out.stdout.decode(),
f"stdout: {out.stdout.decode()}, stderr: {out.stderr.decode()}",
)
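The --save-model-outputs-to / --compare-model-outputs-with flags used above automate a simple idea: two runs are deterministic only if their saved outputs are bitwise identical. A hypothetical standalone check of that idea follows; the flat-list-of-tensors layout is an assumption made for illustration, not the benchmark harness's actual pickle format.

import torch

def outputs_bitwise_equal(path_a: str, path_b: str) -> bool:
    # Hypothetical layout: each file holds a list of output tensors.
    outs_a = torch.load(path_a)
    outs_b = torch.load(path_b)
    # torch.equal requires exact elementwise equality, unlike
    # torch.allclose, which would hide small run-to-run differences.
    return len(outs_a) == len(outs_b) and all(
        torch.equal(a, b) for a, b in zip(outs_a, outs_b)
    )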
if __name__ == "__main__":
if HAS_CUDA_AND_TRITON:

Some files were not shown because too many files have changed in this diff.