mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-19 18:14:54 +08:00
Compare commits: optimizer_… → ciflow/tru… (1 commit, 5591f46dc5)

@@ -1,19 +0,0 @@
# Aarch64 (ARM/Graviton) Support Scripts

Scripts for building aarch64 PyTorch PIP Wheels. These scripts build the following wheels:

* torch
* torchvision
* torchaudio
* torchtext
* torchdata

## aarch64_ci_build.sh

This script is designed to support CD operations within the PyPI manylinux aarch64 container and is executed inside that container. It prepares the container and then executes __aarch64_wheel_ci_build.py__ to build the wheels. The script assumes the PyTorch repo is located at ```/pytorch``` and puts the wheels into ```/artifacts```.

### Usage

```DESIRED_PYTHON=<PythonVersion> aarch64_ci_build.sh```
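
For example, to build wheels for CPython 3.13 inside the container (the version value here is illustrative):

```DESIRED_PYTHON=3.13 aarch64_ci_build.sh```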

__NOTE:__ The CI build is currently __EXPERIMENTAL__.

## build_aarch64_wheel.py

This app builds the wheels on AWS EC2 resources; it requires AWS-CLI and Boto3 with AWS credentials in order to launch EC2 instances for the wheel builds. It can be used from a CodeBuild CD pipeline or from a local system.

### Usage

```build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>```
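
The script also exposes fleet-management flags (taken from its argument parser), for example:

```build_aarch64_wheel.py --list-instances --instance-type t4g.2xlarge```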

@@ -1,53 +0,0 @@
#!/bin/bash
set -eux -o pipefail

GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

# Set CUDA architecture lists to match x86 build_cuda.sh
if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
fi

# Compress the fatbin with -compress-mode=size for CUDA 13
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
    export TORCH_NVCC_FLAGS="-compress-mode=size"
    # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
    export BUILD_BUNDLE_PTXAS=1
fi

SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source "$SCRIPTPATH/aarch64_ci_setup.sh"

###############################################################################
# Run aarch64 builder python
###############################################################################
cd /
# Add a safe directory for git, as the permissions will be
# different on the mounted pytorch repo.
git config --global --add safe.directory /pytorch
pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
    echo "DESIRED_CUDA is set to cpu. Building cpu wheel."
    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
    echo "DESIRED_CUDA is set to: $DESIRED_CUDA"
    export USE_SYSTEM_NCCL=1

    # Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic)
    if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
        echo "Bundling CUDA libraries with wheel for aarch64."
    else
        echo "Using nvidia libs from pypi for aarch64."
        echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
        export USE_NVIDIA_PYPI_LIBS=1
    fi

    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi

@@ -1,21 +0,0 @@
#!/bin/bash
set -eux -o pipefail

# This script is used to prepare the Docker container for the aarch64_ci_wheel_build.py python script
# by creating symlinks from the desired /opt/python to /usr/local/bin/

NUMPY_VERSION=2.0.2
if [[ "$DESIRED_PYTHON" == "3.13" || "$DESIRED_PYTHON" == "3.13t" ]]; then
    NUMPY_VERSION=2.1.2
fi

SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
source "$SCRIPTPATH/../manywheel/set_desired_python.sh"

pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2

for tool in python python3 pip pip3 ninja scons patchelf; do
    ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
done

python --version

@@ -1,333 +0,0 @@
#!/usr/bin/env python3
# encoding: UTF-8

import os
import shutil
from subprocess import check_call, check_output


def list_dir(path: str) -> list[str]:
    """
    Helper for getting paths for Python
    """
    return check_output(["ls", "-1", path]).decode().split("\n")


def replace_tag(filename) -> None:
    with open(filename) as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        if line.startswith("Tag:"):
            lines[i] = line.replace("-linux_", "-manylinux_2_28_")
            print(f"Updated tag from {line} to {lines[i]}")
            break

    with open(filename, "w") as f:
        f.writelines(lines)
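

# Illustration (hypothetical wheel metadata): replace_tag rewrites a WHEEL line such as
#   Tag: cp311-cp311-linux_aarch64
# into
#   Tag: cp311-cp311-manylinux_2_28_aarch64
# so installers treat the repackaged wheel as manylinux_2_28 compliant.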


def patch_library_rpath(
    folder: str,
    lib_name: str,
    use_nvidia_pypi_libs: bool = False,
    desired_cuda: str = "",
) -> None:
    """Apply patchelf to set RPATH for a library in torch/lib"""
    lib_path = f"{folder}/tmp/torch/lib/{lib_name}"

    if use_nvidia_pypi_libs:
        # For PyPI NVIDIA libraries, construct CUDA RPATH
        cuda_rpaths = [
            "$ORIGIN/../../nvidia/cudnn/lib",
            "$ORIGIN/../../nvidia/nvshmem/lib",
            "$ORIGIN/../../nvidia/nccl/lib",
            "$ORIGIN/../../nvidia/cusparselt/lib",
        ]

        if "130" in desired_cuda:
            cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib")
        else:
            cuda_rpaths.extend(
                [
                    "$ORIGIN/../../nvidia/cublas/lib",
                    "$ORIGIN/../../nvidia/cuda_cupti/lib",
                    "$ORIGIN/../../nvidia/cuda_nvrtc/lib",
                    "$ORIGIN/../../nvidia/cuda_runtime/lib",
                    "$ORIGIN/../../nvidia/cufft/lib",
                    "$ORIGIN/../../nvidia/curand/lib",
                    "$ORIGIN/../../nvidia/cusolver/lib",
                    "$ORIGIN/../../nvidia/cusparse/lib",
                    "$ORIGIN/../../nvidia/nvtx/lib",
                    "$ORIGIN/../../nvidia/cufile/lib",
                ]
            )

        # Add $ORIGIN for local torch libs
        rpath = ":".join(cuda_rpaths) + ":$ORIGIN"
    else:
        # For bundled libraries, just use $ORIGIN
        rpath = "$ORIGIN"

    if os.path.exists(lib_path):
        os.system(
            f"cd {folder}/tmp/torch/lib/; "
            f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}"
        )
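
    # In the PyPI-libs case the resulting RPATH looks like (illustrative):
    #   $ORIGIN/../../nvidia/cudnn/lib:...:$ORIGIN/../../nvidia/cusparselt/lib:$ORIGIN
    # so torch/lib ELFs resolve CUDA libraries from the sibling nvidia/* packages
    # first and fall back to the library's own directory.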


def copy_and_patch_library(
    src_path: str,
    folder: str,
    use_nvidia_pypi_libs: bool = False,
    desired_cuda: str = "",
) -> None:
    """Copy a library to torch/lib and patch its RPATH"""
    if os.path.exists(src_path):
        lib_name = os.path.basename(src_path)
        shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}")
        patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)


def package_cuda_wheel(wheel_path, desired_cuda) -> None:
    """
    Package the cuda wheel libraries
    """
    folder = os.path.dirname(wheel_path)
    os.mkdir(f"{folder}/tmp")
    os.system(f"unzip {wheel_path} -d {folder}/tmp")
    # Delete original wheel since it will be repackaged
    os.system(f"rm {wheel_path}")

    # Check if we should use PyPI NVIDIA libraries or bundle system libraries
    use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"

    if use_nvidia_pypi_libs:
        print("Using nvidia libs from pypi - skipping CUDA library bundling")
        # For the PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages
        # We only need to bundle non-NVIDIA libraries
        minimal_libs_to_copy = [
            "/lib64/libgomp.so.1",
            "/usr/lib64/libgfortran.so.5",
            "/acl/build/libarm_compute.so",
            "/acl/build/libarm_compute_graph.so",
            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
            "/usr/local/lib/libnvpl_lapack_core.so.0",
            "/usr/local/lib/libnvpl_blas_core.so.0",
        ]

        # Copy minimal libraries to unzipped_folder/torch/lib
        for lib_path in minimal_libs_to_copy:
            copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)

        # Patch torch libraries used for searching libraries
        torch_libs_to_patch = [
            "libtorch.so",
            "libtorch_cpu.so",
            "libtorch_cuda.so",
            "libtorch_cuda_linalg.so",
            "libtorch_global_deps.so",
            "libtorch_python.so",
            "libtorch_nvshmem.so",
            "libc10.so",
            "libc10_cuda.so",
            "libcaffe2_nvrtc.so",
            "libshm.so",
        ]
        for lib_name in torch_libs_to_patch:
            patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
    else:
        print("Bundling CUDA libraries with wheel")
        # Original logic for bundling system CUDA libraries
        # Common libraries for all CUDA versions
        common_libs = [
            # Non-NVIDIA system libraries
            "/lib64/libgomp.so.1",
            "/usr/lib64/libgfortran.so.5",
            "/acl/build/libarm_compute.so",
            "/acl/build/libarm_compute_graph.so",
            # Common CUDA libraries (same for all versions)
            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
            "/usr/local/lib/libnvpl_lapack_core.so.0",
            "/usr/local/lib/libnvpl_blas_core.so.0",
            "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
            "/usr/local/cuda/lib64/libcudnn.so.9",
            "/usr/local/cuda/lib64/libcusparseLt.so.0",
            "/usr/local/cuda/lib64/libcurand.so.10",
            "/usr/local/cuda/lib64/libnccl.so.2",
            "/usr/local/cuda/lib64/libnvshmem_host.so.3",
            "/usr/local/cuda/lib64/libcudnn_adv.so.9",
            "/usr/local/cuda/lib64/libcudnn_cnn.so.9",
            "/usr/local/cuda/lib64/libcudnn_graph.so.9",
            "/usr/local/cuda/lib64/libcudnn_ops.so.9",
            "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
            "/usr/local/cuda/lib64/libcufile.so.0",
            "/usr/local/cuda/lib64/libcufile_rdma.so.1",
            "/usr/local/cuda/lib64/libcusparse.so.12",
        ]

        # CUDA version-specific libraries
        if "13" in desired_cuda:
            minor_version = desired_cuda[-1]
            version_specific_libs = [
                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
                "/usr/local/cuda/lib64/libcublas.so.13",
                "/usr/local/cuda/lib64/libcublasLt.so.13",
                "/usr/local/cuda/lib64/libcudart.so.13",
                "/usr/local/cuda/lib64/libcufft.so.12",
                "/usr/local/cuda/lib64/libcusolver.so.12",
                "/usr/local/cuda/lib64/libnvJitLink.so.13",
                "/usr/local/cuda/lib64/libnvrtc.so.13",
                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
            ]
        elif "12" in desired_cuda:
            # Get the last character for the libnvrtc-builtins version (e.g., "129" -> "9")
            minor_version = desired_cuda[-1]
            version_specific_libs = [
                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
                "/usr/local/cuda/lib64/libcublas.so.12",
                "/usr/local/cuda/lib64/libcublasLt.so.12",
                "/usr/local/cuda/lib64/libcudart.so.12",
                "/usr/local/cuda/lib64/libcufft.so.11",
                "/usr/local/cuda/lib64/libcusolver.so.11",
                "/usr/local/cuda/lib64/libnvJitLink.so.12",
                "/usr/local/cuda/lib64/libnvrtc.so.12",
                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
            ]
        else:
            raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")

        # Combine all libraries
        libs_to_copy = common_libs + version_specific_libs

        # Copy libraries to unzipped_folder/torch/lib
        for lib_path in libs_to_copy:
            copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)

    # Make sure the wheel is tagged with manylinux_2_28
    for f in os.scandir(f"{folder}/tmp/"):
        if f.is_dir() and f.name.endswith(".dist-info"):
            replace_tag(f"{f.path}/WHEEL")
            break

    os.system(f"wheel pack {folder}/tmp/ -d {folder}")
    os.system(f"rm -rf {folder}/tmp/")


def complete_wheel(folder: str) -> str:
    """
    Complete wheel build and put in artifact location
    """
    wheel_name = list_dir(f"/{folder}/dist")[0]

    # Note: for CUDA builds we don't run auditwheel, since the custom
    # package_cuda_wheel() above already packages the CUDA dependencies into the wheel.
    # However, we need to make sure the filename reflects the correct manylinux platform.
    if "pytorch" in folder and not enable_cuda:  # enable_cuda is a global set in __main__
        print("Repairing Wheel with AuditWheel")
        check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
        repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0]

        print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
        os.rename(
            f"/{folder}/wheelhouse/{repaired_wheel_name}",
            f"/{folder}/dist/{repaired_wheel_name}",
        )
    else:
        repaired_wheel_name = list_dir(f"/{folder}/dist")[0]

    print(f"Copying {repaired_wheel_name} to artifacts")
    shutil.copy2(
        f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}"
    )

    return repaired_wheel_name


def parse_arguments():
    """
    Parse command-line arguments
    """
    from argparse import ArgumentParser

    parser = ArgumentParser("AARCH64 wheels python CD")
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--build-only", action="store_true")
    parser.add_argument("--test-only", type=str)
    parser.add_argument("--enable-mkldnn", action="store_true")
    parser.add_argument("--enable-cuda", action="store_true")
    return parser.parse_args()
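

# Typical CI invocations (the CUDA variant mirrors what aarch64_ci_build.sh runs):
#   python aarch64_wheel_ci_build.py --enable-mkldnn
#   python aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda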


if __name__ == "__main__":
    """
    Entry Point
    """
    args = parse_arguments()
    enable_mkldnn = args.enable_mkldnn
    enable_cuda = args.enable_cuda
    branch = check_output(
        ["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd="/pytorch"
    ).decode().strip()

    print("Building PyTorch wheel")
    build_vars = ""
    # MAX_JOBS=5 is not required for the CPU backend (see commit 465d98b)
    if enable_cuda:
        build_vars += "MAX_JOBS=5 "

    # Handle PyPI NVIDIA libraries vs bundled libraries
    use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
    if use_nvidia_pypi_libs:
        print("Configuring build for PyPI NVIDIA libraries")
        # Configure for dynamic linking (matching x86 logic)
        build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 "
    else:
        print("Configuring build for bundled NVIDIA libraries")
        # Keep existing static linking approach - already configured above

    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
    desired_cuda = os.getenv("DESIRED_CUDA")
    if override_package_version is not None:
        version = override_package_version
        build_vars += (
            f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
        )
    elif branch in ["nightly", "main"]:
        build_date = (
            check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
            .decode()
            .replace("-", "")
        )
        version = (
            check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
        )
        if enable_cuda:
            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
        else:
            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
    elif branch.startswith(("v1.", "v2.")):
        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
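    # Illustrative outcome (hypothetical date/versions): on nightly/main with CUDA
    # enabled this produces e.g. PYTORCH_BUILD_VERSION=2.6.0.dev20250101+cu130
    # (suffix taken from $DESIRED_CUDA); on a release branch such as v2.6.0-rc1
    # it produces PYTORCH_BUILD_VERSION=2.6.0.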

    if enable_mkldnn:
        print("build pytorch with mkldnn+acl backend")
        build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
        build_vars += "ACL_ROOT_DIR=/acl "
        if enable_cuda:
            build_vars += "BLAS=NVPL "
        else:
            build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
    else:
        print("build pytorch without mkldnn backend")

    os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
    if enable_cuda:
        print("Updating CUDA dependencies")
        filename = os.listdir("/pytorch/dist/")
        wheel_path = f"/pytorch/dist/{filename[0]}"
        package_cuda_wheel(wheel_path, desired_cuda)
    pytorch_wheel_name = complete_wheel("/pytorch/")
    print(f"Build Complete. Created {pytorch_wheel_name}.")

@@ -1,999 +0,0 @@
#!/usr/bin/env python3

# This script is for building AARCH64 wheels using AWS EC2 instances.
# To generate binaries for the release follow these steps:
# 1. Update mappings for each of the Domain Libraries by adding a new row to a table like this:
#    "v1.11.0": ("0.11.0", "rc1"),
# 2. Run the script with the following arguments for each of the supported python versions and the required tag, for example:
#    build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch v1.11.0-rc3


import os
import subprocess
import sys
import time
from typing import Optional, Union

import boto3


# AMI images for us-east-1, change the following based on your ~/.aws/config
os_amis = {
    "ubuntu20_04": "ami-052eac90edaa9d08f",  # login_name: ubuntu
    "ubuntu22_04": "ami-0c6c29c5125214c77",  # login_name: ubuntu
    "redhat8": "ami-0698b90665a2ddcf1",  # login_name: ec2-user
}

ubuntu20_04_ami = os_amis["ubuntu20_04"]


def compute_keyfile_path(key_name: Optional[str] = None) -> tuple[str, str]:
    if key_name is None:
        key_name = os.getenv("AWS_KEY_NAME")
        if key_name is None:
            return os.getenv("SSH_KEY_PATH", ""), ""

    homedir_path = os.path.expanduser("~")
    default_path = os.path.join(homedir_path, ".ssh", f"{key_name}.pem")
    return os.getenv("SSH_KEY_PATH", default_path), key_name


ec2 = boto3.resource("ec2")


def ec2_get_instances(filter_name, filter_value):
    return ec2.instances.filter(
        Filters=[{"Name": filter_name, "Values": [filter_value]}]
    )


def ec2_instances_of_type(instance_type="t4g.2xlarge"):
    return ec2_get_instances("instance-type", instance_type)


def ec2_instances_by_id(instance_id):
    rc = list(ec2_get_instances("instance-id", instance_id))
    return rc[0] if len(rc) > 0 else None


def start_instance(
    key_name, ami=ubuntu20_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50
):
    inst = ec2.create_instances(
        ImageId=ami,
        InstanceType=instance_type,
        SecurityGroups=["ssh-allworld"],
        KeyName=key_name,
        MinCount=1,
        MaxCount=1,
        BlockDeviceMappings=[
            {
                "DeviceName": "/dev/sda1",
                "Ebs": {
                    "DeleteOnTermination": True,
                    "VolumeSize": ebs_size,
                    "VolumeType": "standard",
                },
            }
        ],
    )[0]
    print(f"Create instance {inst.id}")
    inst.wait_until_running()
    running_inst = ec2_instances_by_id(inst.id)
    print(f"Instance started at {running_inst.public_dns_name}")
    return running_inst


class RemoteHost:
    addr: str
    keyfile_path: str
    login_name: str
    container_id: Optional[str] = None
    ami: Optional[str] = None

    def __init__(self, addr: str, keyfile_path: str, login_name: str = "ubuntu"):
        self.addr = addr
        self.keyfile_path = keyfile_path
        self.login_name = login_name

    def _gen_ssh_prefix(self) -> list[str]:
        return [
            "ssh",
            "-o",
            "StrictHostKeyChecking=no",
            "-i",
            self.keyfile_path,
            f"{self.login_name}@{self.addr}",
            "--",
        ]

    @staticmethod
    def _split_cmd(args: Union[str, list[str]]) -> list[str]:
        return args.split() if isinstance(args, str) else args

    def run_ssh_cmd(self, args: Union[str, list[str]]) -> None:
        subprocess.check_call(self._gen_ssh_prefix() + self._split_cmd(args))

    def check_ssh_output(self, args: Union[str, list[str]]) -> str:
        return subprocess.check_output(
            self._gen_ssh_prefix() + self._split_cmd(args)
        ).decode("utf-8")

    def scp_upload_file(self, local_file: str, remote_file: str) -> None:
        subprocess.check_call(
            [
                "scp",
                "-i",
                self.keyfile_path,
                local_file,
                f"{self.login_name}@{self.addr}:{remote_file}",
            ]
        )

    def scp_download_file(
        self, remote_file: str, local_file: Optional[str] = None
    ) -> None:
        if local_file is None:
            local_file = "."
        subprocess.check_call(
            [
                "scp",
                "-i",
                self.keyfile_path,
                f"{self.login_name}@{self.addr}:{remote_file}",
                local_file,
            ]
        )

    def start_docker(self, image="quay.io/pypa/manylinux2014_aarch64:latest") -> None:
        self.run_ssh_cmd("sudo apt-get install -y docker.io")
        self.run_ssh_cmd(f"sudo usermod -a -G docker {self.login_name}")
        self.run_ssh_cmd("sudo service docker start")
        self.run_ssh_cmd(f"docker pull {image}")
        self.container_id = self.check_ssh_output(
            f"docker run -t -d -w /root {image}"
        ).strip()

    def using_docker(self) -> bool:
        return self.container_id is not None

    def run_cmd(self, args: Union[str, list[str]]) -> None:
        if not self.using_docker():
            return self.run_ssh_cmd(args)
        assert self.container_id is not None
        docker_cmd = self._gen_ssh_prefix() + [
            "docker",
            "exec",
            "-i",
            self.container_id,
            "bash",
        ]
        p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE)
        p.communicate(
            input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
                "utf-8"
            )
        )
        rc = p.wait()
        if rc != 0:
            raise subprocess.CalledProcessError(rc, docker_cmd)

    def check_output(self, args: Union[str, list[str]]) -> str:
        if not self.using_docker():
            return self.check_ssh_output(args)
        assert self.container_id is not None
        docker_cmd = self._gen_ssh_prefix() + [
            "docker",
            "exec",
            "-i",
            self.container_id,
            "bash",
        ]
        p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        (out, err) = p.communicate(
            input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
                "utf-8"
            )
        )
        rc = p.wait()
        if rc != 0:
            raise subprocess.CalledProcessError(rc, docker_cmd, output=out, stderr=err)
        return out.decode("utf-8")

    def upload_file(self, local_file: str, remote_file: str) -> None:
        if not self.using_docker():
            return self.scp_upload_file(local_file, remote_file)
        tmp_file = os.path.join("/tmp", os.path.basename(local_file))
        self.scp_upload_file(local_file, tmp_file)
        self.run_ssh_cmd(
            ["docker", "cp", tmp_file, f"{self.container_id}:/root/{remote_file}"]
        )
        self.run_ssh_cmd(["rm", tmp_file])

    def download_file(self, remote_file: str, local_file: Optional[str] = None) -> None:
        if not self.using_docker():
            return self.scp_download_file(remote_file, local_file)
        tmp_file = os.path.join("/tmp", os.path.basename(remote_file))
        self.run_ssh_cmd(
            ["docker", "cp", f"{self.container_id}:/root/{remote_file}", tmp_file]
        )
        self.scp_download_file(tmp_file, local_file)
        self.run_ssh_cmd(["rm", tmp_file])

    def download_wheel(
        self, remote_file: str, local_file: Optional[str] = None
    ) -> None:
        if self.using_docker() and local_file is None:
            basename = os.path.basename(remote_file)
            local_file = basename.replace(
                "-linux_aarch64.whl", "-manylinux2014_aarch64.whl"
            )
        self.download_file(remote_file, local_file)

    def list_dir(self, path: str) -> list[str]:
        return self.check_output(["ls", "-1", path]).split("\n")


def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
    import socket

    for i in range(attempt_cnt):
        try:
            with socket.create_connection((addr, port), timeout=timeout):
                return
        except (ConnectionRefusedError, TimeoutError):  # noqa: PERF203
            if i == attempt_cnt - 1:
                raise
            time.sleep(timeout)


def update_apt_repo(host: RemoteHost) -> None:
    time.sleep(5)
    host.run_cmd("sudo systemctl stop apt-daily.service || true")
    host.run_cmd("sudo systemctl stop unattended-upgrades.service || true")
    host.run_cmd(
        "while systemctl is-active --quiet apt-daily.service; do sleep 1; done"
    )
    host.run_cmd(
        "while systemctl is-active --quiet unattended-upgrades.service; do sleep 1; done"
    )
    host.run_cmd("sudo apt-get update")
    time.sleep(3)
    host.run_cmd("sudo apt-get update")


def install_condaforge(
    host: RemoteHost, suffix: str = "latest/download/Miniforge3-Linux-aarch64.sh"
) -> None:
    print("Install conda-forge")
    host.run_cmd(f"curl -OL https://github.com/conda-forge/miniforge/releases/{suffix}")
    host.run_cmd(f"sh -f {os.path.basename(suffix)} -b")
    host.run_cmd(f"rm -f {os.path.basename(suffix)}")
    if host.using_docker():
        host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc")
    else:
        host.run_cmd(
            [
                "sed",
                "-i",
                "'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH'",
                ".bashrc",
            ]
        )


def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
    if python_version == "3.6":
        # Python-3.6 EOLed and not compatible with conda-4.11
        install_condaforge(
            host, suffix="download/4.10.3-10/Miniforge3-4.10.3-10-Linux-aarch64.sh"
        )
        host.run_cmd(f"conda install -y python={python_version} numpy pyyaml")
    else:
        install_condaforge(
            host, suffix="download/4.11.0-4/Miniforge3-4.11.0-4-Linux-aarch64.sh"
        )
        # PyTorch-1.10 or older are not compatible with setuptools=59.6 or newer
        host.run_cmd(
            f"conda install -y python={python_version} numpy pyyaml setuptools>=59.5.0"
        )


def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
    host.run_cmd("pip3 install auditwheel")
    host.run_cmd(
        "conda install -y patchelf" if use_conda else "sudo apt-get install -y patchelf"
    )
    from tempfile import NamedTemporaryFile

    with NamedTemporaryFile() as tmp:
        tmp.write(embed_library_script.encode("utf-8"))
        tmp.flush()
        host.upload_file(tmp.name, "embed_library.py")

    print("Embedding libgomp into wheel")
    if host.using_docker():
        host.run_cmd(f"python3 embed_library.py {wheel_name} --update-tag")
    else:
        host.run_cmd(f"python3 embed_library.py {wheel_name}")


def checkout_repo(
    host: RemoteHost,
    *,
    branch: str = "main",
    url: str,
    git_clone_flags: str,
    mapping: dict[str, tuple[str, str]],
) -> Optional[str]:
    for prefix in mapping:
        if not branch.startswith(prefix):
            continue
        tag = f"v{mapping[prefix][0]}-{mapping[prefix][1]}"
        host.run_cmd(f"git clone {url} -b {tag} {git_clone_flags}")
        return mapping[prefix][0]

    host.run_cmd(f"git clone {url} -b {branch} {git_clone_flags}")
    return None
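

# For illustration: with the torchvision mapping defined below, calling
# checkout_repo(host, branch="v2.0.1", url="https://github.com/pytorch/vision", ...)
# matches the "v2.0.1" prefix, clones tag "v0.15.2-rc2", and returns "0.15.2".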


def build_torchvision(
    host: RemoteHost,
    *,
    branch: str = "main",
    use_conda: bool = True,
    git_clone_flags: str,
    run_smoke_tests: bool = True,
) -> str:
    print("Checking out TorchVision repo")
    build_version = checkout_repo(
        host,
        branch=branch,
        url="https://github.com/pytorch/vision",
        git_clone_flags=git_clone_flags,
        mapping={
            "v1.7.1": ("0.8.2", "rc2"),
            "v1.8.0": ("0.9.0", "rc3"),
            "v1.8.1": ("0.9.1", "rc1"),
            "v1.9.0": ("0.10.0", "rc1"),
            "v1.10.0": ("0.11.1", "rc1"),
            "v1.10.1": ("0.11.2", "rc1"),
            "v1.10.2": ("0.11.3", "rc1"),
            "v1.11.0": ("0.12.0", "rc1"),
            "v1.12.0": ("0.13.0", "rc4"),
            "v1.12.1": ("0.13.1", "rc6"),
            "v1.13.0": ("0.14.0", "rc4"),
            "v1.13.1": ("0.14.1", "rc2"),
            "v2.0.0": ("0.15.1", "rc2"),
            "v2.0.1": ("0.15.2", "rc2"),
        },
    )
    print("Building TorchVision wheel")

    # Please note libpng and jpeg are required to build the image.so extension
    if use_conda:
        host.run_cmd("conda install -y libpng jpeg")
        # Remove .so files to force static linking
        host.run_cmd(
            "rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so"
        )
        # And patch setup.py to include the libz dependency for libpng
        host.run_cmd(
            [
                'sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py'
            ]
        )

    build_vars = ""
    if branch == "nightly":
        version = host.check_output(
            ["if [ -f vision/version.txt ]; then cat vision/version.txt; fi"]
        ).strip()
        if len(version) == 0:
            # In older revisions, version was embedded in setup.py
            version = (
                host.check_output(["grep", '"version = \'"', "vision/setup.py"])
                .strip()
                .split("'")[1][:-2]
            )
        build_date = (
            host.check_output("cd vision && git log --pretty=format:%s -1")
            .strip()
            .split()[0]
            .replace("-", "")
        )
        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
    elif build_version is not None:
        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

    host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
    vision_wheel_name = host.list_dir("vision/dist")[0]
    embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))

    print("Copying TorchVision wheel")
    host.download_wheel(os.path.join("vision", "dist", vision_wheel_name))
    if run_smoke_tests:
        host.run_cmd(
            f"pip3 install {os.path.join('vision', 'dist', vision_wheel_name)}"
        )
        host.run_cmd("python3 vision/test/smoke_test.py")
    print("Delete vision checkout")
    host.run_cmd("rm -rf vision")

    return vision_wheel_name


def build_torchdata(
    host: RemoteHost,
    *,
    branch: str = "main",
    use_conda: bool = True,
    git_clone_flags: str = "",
) -> str:
    print("Checking out TorchData repo")
    git_clone_flags += " --recurse-submodules"
    build_version = checkout_repo(
        host,
        branch=branch,
        url="https://github.com/pytorch/data",
        git_clone_flags=git_clone_flags,
        mapping={
            "v1.13.1": ("0.5.1", ""),
            "v2.0.0": ("0.6.0", "rc5"),
            "v2.0.1": ("0.6.1", "rc1"),
        },
    )
    print("Building TorchData wheel")
    build_vars = ""
    if branch == "nightly":
        version = host.check_output(
            ["if [ -f data/version.txt ]; then cat data/version.txt; fi"]
        ).strip()
        build_date = (
            host.check_output("cd data && git log --pretty=format:%s -1")
            .strip()
            .split()[0]
            .replace("-", "")
        )
        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
    elif build_version is not None:
        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

    host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")
    wheel_name = host.list_dir("data/dist")[0]
    embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))

    print("Copying TorchData wheel")
    host.download_wheel(os.path.join("data", "dist", wheel_name))

    return wheel_name


def build_torchtext(
    host: RemoteHost,
    *,
    branch: str = "main",
    use_conda: bool = True,
    git_clone_flags: str = "",
) -> str:
    print("Checking out TorchText repo")
    git_clone_flags += " --recurse-submodules"
    build_version = checkout_repo(
        host,
        branch=branch,
        url="https://github.com/pytorch/text",
        git_clone_flags=git_clone_flags,
        mapping={
            "v1.9.0": ("0.10.0", "rc1"),
            "v1.10.0": ("0.11.0", "rc2"),
            "v1.10.1": ("0.11.1", "rc1"),
            "v1.10.2": ("0.11.2", "rc1"),
            "v1.11.0": ("0.12.0", "rc1"),
            "v1.12.0": ("0.13.0", "rc2"),
            "v1.12.1": ("0.13.1", "rc5"),
            "v1.13.0": ("0.14.0", "rc3"),
            "v1.13.1": ("0.14.1", "rc1"),
            "v2.0.0": ("0.15.1", "rc2"),
            "v2.0.1": ("0.15.2", "rc2"),
        },
    )
    print("Building TorchText wheel")
    build_vars = ""
    if branch == "nightly":
        version = host.check_output(
            ["if [ -f text/version.txt ]; then cat text/version.txt; fi"]
        ).strip()
        build_date = (
            host.check_output("cd text && git log --pretty=format:%s -1")
            .strip()
            .split()[0]
            .replace("-", "")
        )
        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
    elif build_version is not None:
        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

    host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")
    wheel_name = host.list_dir("text/dist")[0]
    embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))

    print("Copying TorchText wheel")
    host.download_wheel(os.path.join("text", "dist", wheel_name))

    return wheel_name


def build_torchaudio(
    host: RemoteHost,
    *,
    branch: str = "main",
    use_conda: bool = True,
    git_clone_flags: str = "",
) -> str:
    print("Checking out TorchAudio repo")
    git_clone_flags += " --recurse-submodules"
    build_version = checkout_repo(
        host,
        branch=branch,
        url="https://github.com/pytorch/audio",
        git_clone_flags=git_clone_flags,
        mapping={
            "v1.9.0": ("0.9.0", "rc2"),
            "v1.10.0": ("0.10.0", "rc5"),
            "v1.10.1": ("0.10.1", "rc1"),
            "v1.10.2": ("0.10.2", "rc1"),
            "v1.11.0": ("0.11.0", "rc1"),
            "v1.12.0": ("0.12.0", "rc3"),
            "v1.12.1": ("0.12.1", "rc5"),
            "v1.13.0": ("0.13.0", "rc4"),
            "v1.13.1": ("0.13.1", "rc2"),
            "v2.0.0": ("2.0.1", "rc3"),
            "v2.0.1": ("2.0.2", "rc2"),
        },
    )
    print("Building TorchAudio wheel")
    build_vars = ""
    if branch == "nightly":
        version = (
            host.check_output(["grep", '"version = \'"', "audio/setup.py"])
            .strip()
            .split("'")[1][:-2]
        )
        build_date = (
            host.check_output("cd audio && git log --pretty=format:%s -1")
            .strip()
            .split()[0]
            .replace("-", "")
        )
        build_vars += f"BUILD_VERSION={version}.dev{build_date}"
    elif build_version is not None:
        build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

    host.run_cmd(
        f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
        && ./packaging/ffmpeg/build.sh \
        && {build_vars} python3 -m build --wheel --no-isolation"
    )

    wheel_name = host.list_dir("audio/dist")[0]
    embed_libgomp(host, use_conda, os.path.join("audio", "dist", wheel_name))

    print("Copying TorchAudio wheel")
    host.download_wheel(os.path.join("audio", "dist", wheel_name))

    return wheel_name


def configure_system(
    host: RemoteHost,
    *,
    compiler: str = "gcc-8",
    use_conda: bool = True,
    python_version: str = "3.8",
) -> None:
    if use_conda:
        install_condaforge_python(host, python_version)

    print("Configuring the system")
    if not host.using_docker():
        update_apt_repo(host)
        host.run_cmd("sudo apt-get install -y ninja-build g++ git cmake gfortran unzip")
    else:
        host.run_cmd("yum install -y sudo")
        host.run_cmd("conda install -y ninja scons")

    if not use_conda:
        host.run_cmd(
            "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
        )
    host.run_cmd("pip3 install dataclasses typing-extensions")
    if not use_conda:
        print("Installing Cython + numpy from PyPI")
        host.run_cmd("sudo pip3 install Cython")
        host.run_cmd("sudo pip3 install numpy")


def build_domains(
    host: RemoteHost,
    *,
    branch: str = "main",
    use_conda: bool = True,
    git_clone_flags: str = "",
) -> tuple[str, str, str, str]:
    vision_wheel_name = build_torchvision(
        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
    )
    audio_wheel_name = build_torchaudio(
        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
    )
    data_wheel_name = build_torchdata(
        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
    )
    text_wheel_name = build_torchtext(
        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
    )
    return (vision_wheel_name, audio_wheel_name, data_wheel_name, text_wheel_name)


def start_build(
    host: RemoteHost,
    *,
    branch: str = "main",
    compiler: str = "gcc-8",
    use_conda: bool = True,
    python_version: str = "3.8",
    pytorch_only: bool = False,
    pytorch_build_number: Optional[str] = None,
    shallow_clone: bool = True,
    enable_mkldnn: bool = False,
) -> tuple[str, str, str, str, str]:
    git_clone_flags = " --depth 1 --shallow-submodules" if shallow_clone else ""
    if host.using_docker() and not use_conda:
        print("Auto-selecting conda option for docker images")
        use_conda = True
    if not host.using_docker():
        print("Disable mkldnn for host builds")
        enable_mkldnn = False

    configure_system(
        host, compiler=compiler, use_conda=use_conda, python_version=python_version
    )

    if host.using_docker():
        print("Move libgfortran.a into a standard location")
        # HACK: pypa's gfortran.a is compiled without PIC, which leads to the following error
        # libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17'  # noqa: E501, B950
        # Workaround by copying the gfortran library from the host
        host.run_ssh_cmd("sudo apt-get install -y gfortran-8")
        host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8")
        host.run_ssh_cmd(
            [
                "docker",
                "cp",
                "/usr/lib/gcc/aarch64-linux-gnu/8/libgfortran.a",
                f"{host.container_id}:/opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/",
            ]
        )

    print("Checking out PyTorch repo")
    host.run_cmd(
        f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
    )

    host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")

    print("Building PyTorch wheel")
    build_opts = ""
    if pytorch_build_number is not None:
        build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
    # Breakpad build fails on aarch64
    build_vars = "USE_BREAKPAD=0 "
    if branch == "nightly":
        build_date = (
            host.check_output("cd pytorch && git log --pretty=format:%s -1")
            .strip()
            .split()[0]
            .replace("-", "")
        )
        version = host.check_output("cat pytorch/version.txt").strip()[:-2]
        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
    if branch.startswith(("v1.", "v2.")):
        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
    if enable_mkldnn:
        host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
        print("build pytorch with mkldnn+acl backend")
        build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
        build_vars += " BLAS=OpenBLAS"
        build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
        build_vars += " ACL_ROOT_DIR=/acl"
        host.run_cmd(
            f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
        )
        print("Repair the wheel")
        pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
        ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
        host.run_cmd(
            f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
        )
        print("Replace the original wheel with the repaired one")
        pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0]
        host.run_cmd(
            f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}"
        )
    else:
        print("build pytorch without mkldnn backend")
        host.run_cmd(
            f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
        )

    print("Deleting build folder")
    host.run_cmd("cd pytorch && rm -rf build")
    pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
    embed_libgomp(host, use_conda, os.path.join("pytorch", "dist", pytorch_wheel_name))
    print("Copying the wheel")
    host.download_wheel(os.path.join("pytorch", "dist", pytorch_wheel_name))

    print("Installing PyTorch wheel")
    host.run_cmd(f"pip3 install pytorch/dist/{pytorch_wheel_name}")

    if pytorch_only:
        return (pytorch_wheel_name, None, None, None, None)
    domain_wheels = build_domains(
        host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
    )

    return (pytorch_wheel_name, *domain_wheels)


embed_library_script = """
#!/usr/bin/env python3

from auditwheel.patcher import Patchelf
from auditwheel.wheeltools import InWheelCtx
from auditwheel.elfutils import elf_file_filter
from auditwheel.repair import copylib
from auditwheel.lddtree import lddtree
from subprocess import check_call
import os
import shutil
import sys
from tempfile import TemporaryDirectory


def replace_tag(filename):
    with open(filename, 'r') as f:
        lines = f.read().split("\\n")
    for i, line in enumerate(lines):
        if not line.startswith("Tag: "):
            continue
        lines[i] = line.replace("-linux_", "-manylinux2014_")
        print(f'Updated tag from {line} to {lines[i]}')

    with open(filename, 'w') as f:
        f.write("\\n".join(lines))


class AlignedPatchelf(Patchelf):
    def set_soname(self, file_name: str, new_soname: str) -> None:
        check_call(['patchelf', '--page-size', '65536', '--set-soname', new_soname, file_name])

    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
        check_call(['patchelf', '--page-size', '65536', '--replace-needed', soname, new_soname, file_name])


def embed_library(whl_path, lib_soname, update_tag=False):
    patcher = AlignedPatchelf()
    out_dir = TemporaryDirectory()
    whl_name = os.path.basename(whl_path)
    tmp_whl_name = os.path.join(out_dir.name, whl_name)
    with InWheelCtx(whl_path) as ctx:
        torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib')
        ctx.out_wheel = tmp_whl_name
        new_lib_path, new_lib_soname = None, None
        for filename, elf in elf_file_filter(ctx.iter_files()):
            if not filename.startswith('torch/lib'):
                continue
            libtree = lddtree(filename)
            if lib_soname not in libtree['needed']:
                continue
            lib_path = libtree['libs'][lib_soname]['path']
            if lib_path is None:
                print(f"Can't embed {lib_soname} as it could not be found")
                break
            if lib_path.startswith(torchlib_path):
                continue

            if new_lib_path is None:
                new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
            patcher.replace_needed(filename, lib_soname, new_lib_soname)
            print(f'Replacing {lib_soname} with {new_lib_soname} for {filename}')
        if update_tag:
            # Add manylinux2014 tag
            for filename in ctx.iter_files():
                if os.path.basename(filename) != 'WHEEL':
                    continue
                replace_tag(filename)
    shutil.move(tmp_whl_name, whl_path)


if __name__ == '__main__':
    embed_library(sys.argv[1], 'libgomp.so.1', len(sys.argv) > 2 and sys.argv[2] == '--update-tag')
"""


def run_tests(host: RemoteHost, whl: str, branch="main") -> None:
    print("Configuring the system")
    update_apt_repo(host)
    host.run_cmd("sudo apt-get install -y python3-pip git")
    host.run_cmd("sudo pip3 install Cython")
    host.run_cmd("sudo pip3 install numpy")
    host.upload_file(whl, ".")
    host.run_cmd(f"sudo pip3 install {whl}")
    host.run_cmd("python3 -c 'import torch;print(torch.rand((3,3)))'")
    host.run_cmd(f"git clone -b {branch} https://github.com/pytorch/pytorch")
    host.run_cmd("cd pytorch/test; python3 test_torch.py -v")


def get_instance_name(instance) -> Optional[str]:
    if instance.tags is None:
        return None
    for tag in instance.tags:
        if tag["Key"] == "Name":
            return tag["Value"]
    return None


def list_instances(instance_type: str) -> None:
    print(f"All instances of type {instance_type}")
    for instance in ec2_instances_of_type(instance_type):
        ifaces = instance.network_interfaces
        az = ifaces[0].subnet.availability_zone if len(ifaces) > 0 else None
        print(
            f"{instance.id} {get_instance_name(instance)} {instance.public_dns_name} {instance.state['Name']} {az}"
        )


def terminate_instances(instance_type: str) -> None:
    print(f"Terminating all instances of type {instance_type}")
    instances = list(ec2_instances_of_type(instance_type))
    for instance in instances:
        print(f"Terminating {instance.id}")
        instance.terminate()
    print("Waiting for termination to complete")
    for instance in instances:
        instance.wait_until_terminated()


def parse_arguments():
    from argparse import ArgumentParser

    parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
    parser.add_argument("--key-name", type=str)
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--build-only", action="store_true")
    parser.add_argument("--test-only", type=str)
    group = parser.add_mutually_exclusive_group()
    group.add_argument("--os", type=str, choices=list(os_amis.keys()))
    group.add_argument("--ami", type=str)
    parser.add_argument(
        "--python-version",
        type=str,
        choices=[f"3.{d}" for d in range(6, 12)],
        default=None,
    )
    parser.add_argument("--alloc-instance", action="store_true")
    parser.add_argument("--list-instances", action="store_true")
    parser.add_argument("--pytorch-only", action="store_true")
    parser.add_argument("--keep-running", action="store_true")
    parser.add_argument("--terminate-instances", action="store_true")
    parser.add_argument("--instance-type", type=str, default="t4g.2xlarge")
    parser.add_argument("--ebs-size", type=int, default=50)
    parser.add_argument("--branch", type=str, default="main")
    parser.add_argument("--use-docker", action="store_true")
    parser.add_argument(
        "--compiler",
        type=str,
        choices=["gcc-7", "gcc-8", "gcc-9", "clang"],
        default="gcc-8",
    )
    parser.add_argument("--use-torch-from-pypi", action="store_true")
    parser.add_argument("--pytorch-build-number", type=str, default=None)
    parser.add_argument("--disable-mkldnn", action="store_true")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_arguments()
    ami = (
        args.ami
        if args.ami is not None
        else os_amis[args.os]
        if args.os is not None
        else ubuntu20_04_ami
    )
    keyfile_path, key_name = compute_keyfile_path(args.key_name)

    if args.list_instances:
        list_instances(args.instance_type)
        sys.exit(0)

    if args.terminate_instances:
        terminate_instances(args.instance_type)
        sys.exit(0)

    if len(key_name) == 0:
        raise RuntimeError("""
            Cannot start build without key_name, please specify
            --key-name argument or AWS_KEY_NAME environment variable.""")
    if len(keyfile_path) == 0 or not os.path.exists(keyfile_path):
        raise RuntimeError(f"""
            Cannot find keyfile with name: [{key_name}] in path: [{keyfile_path}], please
            check `~/.ssh/` folder or manually set SSH_KEY_PATH environment variable.""")

    # Starting the instance
    inst = start_instance(
        key_name, ami=ami, instance_type=args.instance_type, ebs_size=args.ebs_size
    )
    instance_name = f"{args.key_name}-{args.os}"
    if args.python_version is not None:
        instance_name += f"-py{args.python_version}"
    inst.create_tags(
        DryRun=False,
        Tags=[
            {
                "Key": "Name",
                "Value": instance_name,
            }
        ],
    )
    addr = inst.public_dns_name
    wait_for_connection(addr, 22)
    host = RemoteHost(addr, keyfile_path)
    host.ami = ami
    if args.use_docker:
        update_apt_repo(host)
        host.start_docker()

    if args.test_only:
        run_tests(host, args.test_only)
        sys.exit(0)

    if args.alloc_instance:
        if args.python_version is None:
            sys.exit(0)
        install_condaforge_python(host, args.python_version)
        sys.exit(0)

    python_version = args.python_version if args.python_version is not None else "3.10"

    if args.use_torch_from_pypi:
        configure_system(host, compiler=args.compiler, python_version=python_version)
        print("Installing PyTorch wheel")
        host.run_cmd("pip3 install torch")
        build_domains(
            host, branch=args.branch, git_clone_flags=" --depth 1 --shallow-submodules"
        )
    else:
        start_build(
            host,
            branch=args.branch,
            compiler=args.compiler,
            python_version=python_version,
            pytorch_only=args.pytorch_only,
            pytorch_build_number=args.pytorch_build_number,
            enable_mkldnn=not args.disable_mkldnn,
        )
    if not args.keep_running:
        print(f"Waiting for instance {inst.id} to terminate")
        inst.terminate()
        inst.wait_until_terminated()

@@ -1,87 +0,0 @@
#!/usr/bin/env python3

import os
import shutil
import sys
from subprocess import check_call
from tempfile import TemporaryDirectory

from auditwheel.elfutils import elf_file_filter
from auditwheel.lddtree import lddtree
from auditwheel.patcher import Patchelf
from auditwheel.repair import copylib
from auditwheel.wheeltools import InWheelCtx


def replace_tag(filename):
    with open(filename) as f:
        lines = f.read().split("\n")
    for i, line in enumerate(lines):
        if not line.startswith("Tag: "):
            continue
        lines[i] = line.replace("-linux_", "-manylinux2014_")
        print(f"Updated tag from {line} to {lines[i]}")

    with open(filename, "w") as f:
        f.write("\n".join(lines))


class AlignedPatchelf(Patchelf):
    def set_soname(self, file_name: str, new_soname: str) -> None:
        check_call(
            ["patchelf", "--page-size", "65536", "--set-soname", new_soname, file_name]
        )

    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
        check_call(
            [
                "patchelf",
                "--page-size",
                "65536",
                "--replace-needed",
                soname,
                new_soname,
                file_name,
            ]
        )


def embed_library(whl_path, lib_soname, update_tag=False):
    patcher = AlignedPatchelf()
    out_dir = TemporaryDirectory()
    whl_name = os.path.basename(whl_path)
    tmp_whl_name = os.path.join(out_dir.name, whl_name)
    with InWheelCtx(whl_path) as ctx:
        torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib")
        ctx.out_wheel = tmp_whl_name
        new_lib_path, new_lib_soname = None, None
        for filename, _ in elf_file_filter(ctx.iter_files()):
            if not filename.startswith("torch/lib"):
                continue
            libtree = lddtree(filename)
            if lib_soname not in libtree["needed"]:
                continue
            lib_path = libtree["libs"][lib_soname]["path"]
            if lib_path is None:
                print(f"Can't embed {lib_soname} as it could not be found")
                break
            if lib_path.startswith(torchlib_path):
                continue

            if new_lib_path is None:
                new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
            patcher.replace_needed(filename, lib_soname, new_lib_soname)
            print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}")
        if update_tag:
            # Add manylinux2014 tag
            for filename in ctx.iter_files():
                if os.path.basename(filename) != "WHEEL":
                    continue
                replace_tag(filename)
    shutil.move(tmp_whl_name, whl_path)


if __name__ == "__main__":
    embed_library(
        sys.argv[1], "libgomp.so.1", len(sys.argv) > 2 and sys.argv[2] == "--update-tag"
    )

@@ -188,7 +188,7 @@ case "$tag" in
fi
GCC_VERSION=11
VISION=yes
ROCM_VERSION=7.1
ROCM_VERSION=7.0
NINJA_VERSION=1.9.0
TRITON=yes
KATEX=yes

@@ -60,16 +60,14 @@ EOF
    DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
fi

if [[ $(ver $ROCM_VERSION) -lt $(ver 7.1) ]]; then
    # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5, removed in ROCm 7.1
    # search for all unversioned packages
    # if search fails it will abort this script; use true to avoid case where search fails
    MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
    if [[ "x${MIOPENHIPGFX}" = x ]]; then
        echo "miopen-hip-gfx package not available" && exit 1
    else
        DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
    fi
fi
# precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
# search for all unversioned packages
# if search fails it will abort this script; use true to avoid case where search fails
MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
    echo "miopen-hip-gfx package not available" && exit 1
else
    DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
fi

# ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
@ -12,8 +12,8 @@ function do_install() {

    rocm_version_nodot=${rocm_version//./}

    # https://github.com/icl-utk-edu/magma/pull/65
    MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
    # post merge of https://github.com/icl-utk-edu/magma/pull/65
    MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
    magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"

    rocm_dir="/opt/rocm"
@ -4,14 +4,17 @@ set -ex

SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

# Source the common build script for architecture-specific configurations (MKLDNN, ACL, etc.)
source "${SCRIPTPATH}/../pytorch/build.sh" || true

case "${GPU_ARCH_TYPE:-BLANK}" in
    cuda)
    cuda | cuda-aarch64)
        bash "${SCRIPTPATH}/build_cuda.sh"
        ;;
    rocm)
        bash "${SCRIPTPATH}/build_rocm.sh"
        ;;
    cpu | cpu-cxx11-abi | cpu-s390x)
    cpu | cpu-cxx11-abi | cpu-aarch64 | cpu-s390x)
        bash "${SCRIPTPATH}/build_cpu.sh"
        ;;
    xpu)
@ -18,12 +18,31 @@ retry () {
    $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}

# Detect architecture first
ARCH=$(uname -m)
echo "Detected architecture: $ARCH"

PLATFORM=""
# TODO move this into the Docker images
OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    retry yum install -q -y zip openssl
    PLATFORM="manylinux_2_28_x86_64"
    # Set platform based on architecture
    case $ARCH in
        x86_64)
            PLATFORM="manylinux_2_28_x86_64"
            ;;
        aarch64)
            PLATFORM="manylinux_2_28_aarch64"
            ;;
        s390x)
            PLATFORM="manylinux_2_28_s390x"
            ;;
        *)
            echo "Unsupported architecture: $ARCH"
            exit 1
            ;;
    esac
elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    retry dnf install -q -y zip openssl
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
@ -38,6 +57,8 @@ else
    exit 1
fi

echo "Platform set to: $PLATFORM"

# We use the package name to test the package by passing this to 'pip install'
# This is the env variable that setup.py uses to name the package. Note that
# pip 'normalizes' the name first by changing all - to _
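The `retry` helper above attempts a command five times with exponential backoff (1, 2, 4, 8 second sleeps between attempts). A rough Python equivalent, for illustration only (the `yum` command line mirrors the hunk above):

```python
import subprocess
import time

def retry(cmd, delays=(1, 2, 4, 8)):
    """Run cmd, retrying with exponential backoff like the bash retry() above."""
    for delay in delays:
        if subprocess.call(cmd) == 0:
            return
        time.sleep(delay)
    # Final (fifth) attempt; raises CalledProcessError on failure.
    subprocess.check_call(cmd)

retry(["yum", "install", "-q", "-y", "zip", "openssl"])
```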
@ -299,8 +320,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
    # ROCm workaround for roctracer dlopens
    if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
        patchedpath=$(fname_without_so_number $destpath)
    # Keep the so number for XPU dependencies and libgomp.so.1 to avoid loading them twice
    elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
    # Keep the so number for XPU dependencies, libgomp.so.1, ACL libraries, and NVPL libraries to avoid loading them twice
    elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" || "$filename" == libarm_compute* || "$filename" == libnvpl* || "$filename" == "libgfortran.so.5" ]]; then
        patchedpath=$destpath
    else
        patchedpath=$(fname_with_sha256 $destpath)
@ -346,9 +367,22 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
    done

    # create Manylinux 2_28 tag; this needs to happen before regenerating the RECORD
    if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
    # Support all architectures (x86_64, aarch64, s390x)
    if [[ "$IS_MANYLINUX2_28" == "1" && $GPU_ARCH_TYPE != "xpu" ]]; then
        wheel_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/WHEEL/g')
        sed -i -e s#linux_x86_64#"${PLATFORM}"# $wheel_file;
        echo "Updating wheel tag for $ARCH architecture"
        # Replace linux_* with manylinux_2_28_* based on architecture
        case $ARCH in
            x86_64)
                sed -i -e 's#linux_x86_64#manylinux_2_28_x86_64#g' $wheel_file
                ;;
            aarch64)
                sed -i -e 's#linux_aarch64#manylinux_2_28_aarch64#g' $wheel_file
                ;;
            s390x)
                sed -i -e 's#linux_s390x#manylinux_2_28_s390x#g' $wheel_file
                ;;
        esac
    fi

    # regenerate the RECORD file with new hashes
@ -15,6 +15,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    EXTRA_CAFFE2_CMAKE_FLAGS=()
fi

# Detect architecture
ARCH=$(uname -m)
echo "Building CPU wheel for architecture: $ARCH"

WHEELHOUSE_DIR="wheelhousecpu"
LIBTORCH_HOUSE_DIR="libtorch_housecpu"
if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
@ -34,8 +38,10 @@ elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
    if [[ "$(uname -m)" == "s390x" ]]; then
    if [[ "$ARCH" == "s390x" ]]; then
        LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
    elif [[ "$ARCH" == "aarch64" ]]; then
        LIBGOMP_PATH="/usr/lib/aarch64-linux-gnu/libgomp.so.1"
    else
        LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
    fi
@ -49,6 +55,32 @@ DEPS_SONAME=(
    "libgomp.so.1"
)

# Add ARM-specific library dependencies for CPU builds
if [[ "$ARCH" == "aarch64" ]]; then
    echo "Adding ARM-specific CPU library dependencies"

    # ARM Compute Library (if available)
    if [[ -d "/acl/build" ]]; then
        echo "Adding ARM Compute Library for CPU"
        DEPS_LIST+=(
            "/acl/build/libarm_compute.so"
            "/acl/build/libarm_compute_graph.so"
        )
        DEPS_SONAME+=(
            "libarm_compute.so"
            "libarm_compute_graph.so"
        )
    fi

    # ARM system libraries
    DEPS_LIST+=(
        "/usr/lib64/libgfortran.so.5"
    )
    DEPS_SONAME+=(
        "libgfortran.so.5"
    )
fi

rm -rf /usr/local/cuda*

SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
@ -29,6 +29,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    EXTRA_CAFFE2_CMAKE_FLAGS=()
fi

# Detect architecture
ARCH=$(uname -m)
echo "Building for architecture: $ARCH"

# Determine CUDA version and architectures to build for
#
# NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
@ -53,34 +57,60 @@ fi
cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")

# Function to remove architectures from a list
remove_archs() {
    local result="$1"
    shift
    for arch in "$@"; do
        result="${result//${arch};/}"
    done
    echo "$result"
}

# Function to filter CUDA architectures for aarch64
# aarch64 ARM GPUs only support certain compute capabilities
# Keep: 8.0 (A100), 9.0+ (Hopper, Grace Hopper, newer)
# Remove: < 8.0 (no ARM GPUs), 8.6 (x86_64 RTX 3090/A6000 only)
filter_aarch64_archs() {
    local arch_list="$1"
    # Explicitly remove architectures not needed on aarch64
    arch_list=$(remove_archs "$arch_list" "5.0" "6.0" "7.0" "7.5" "8.6")
    echo "$arch_list"
}

# Base: Common architectures across all modern CUDA versions
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0"

case ${CUDA_VERSION} in
    #removing sm_50-sm_60 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
    #however we would like to keep sm_70 architecture see: https://github.com/pytorch/pytorch/issues/157517
    12.8)
        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0"
        ;;
    12.9)
        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX"
        # WAR to resolve the ld error in libtorch build with CUDA 12.9
    12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;${TORCH_CUDA_ARCH_LIST}" ;; # Only 12.6 includes Legacy Maxwell/Pascal that will be removed in future releases
    12.8) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0" ;; # +Hopper/Blackwell support
    12.9) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0+PTX" # +Hopper/Blackwell support + PTX for forward compatibility
        if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
            TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//7.0;/}" # Remove 7.0 to resolve the ld error
            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//8.6;/}" # Remove 8.6 for libtorch
        fi
        ;;
    13.0)
        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
        ;;
    12.6)
        TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
        ;;
    *)
        echo "unknown cuda version $CUDA_VERSION"
        exit 1
        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0+PTX"
        export TORCH_NVCC_FLAGS="-compress-mode=size"
        export BUILD_BUNDLE_PTXAS=1
        ;;
    *) echo "unknown cuda version $CUDA_VERSION"; exit 1 ;;
esac

# Filter for aarch64: Remove < 8.0 and 8.6
[[ "$ARCH" == "aarch64" ]] && TORCH_CUDA_ARCH_LIST=$(filter_aarch64_archs "$TORCH_CUDA_ARCH_LIST")

echo "TORCH_CUDA_ARCH_LIST set to: $TORCH_CUDA_ARCH_LIST"
export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
echo "${TORCH_CUDA_ARCH_LIST}"

# Disable MAGMA for aarch64 as pre-built libraries are x86-64 only
if [[ "$ARCH" == "aarch64" ]]; then
    echo "Disabling MAGMA for aarch64 architecture"
    export USE_MAGMA=0
fi

# Package directories
WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot"
LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot"
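The `remove_archs`/`filter_aarch64_archs` pair above is plain string surgery on a `;`-separated capability list. A Python sketch of the same filtering (illustrative; note that, like the bash substitution, it only strips entries followed by a `;`, so a trailing entry is never removed):

```python
def remove_archs(arch_list: str, *archs: str) -> str:
    """Drop the given compute capabilities from a ';'-separated arch list."""
    for arch in archs:
        arch_list = arch_list.replace(arch + ";", "")
    return arch_list

def filter_aarch64_archs(arch_list: str) -> str:
    """Keep 8.0 (A100) and 9.0+ for aarch64; drop pre-Ampere and 8.6."""
    return remove_archs(arch_list, "5.0", "6.0", "7.0", "7.5", "8.6")

assert filter_aarch64_archs("7.0;7.5;8.0;8.6;9.0") == "8.0;9.0"
```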
@ -244,6 +274,51 @@ else
    exit 1
fi

# Add ARM-specific library dependencies
if [[ "$ARCH" == "aarch64" ]]; then
    echo "Adding ARM-specific library dependencies"

    # ARM Compute Library (if available)
    if [[ -d "/acl/build" ]]; then
        echo "Adding ARM Compute Library"
        DEPS_LIST+=(
            "/acl/build/libarm_compute.so"
            "/acl/build/libarm_compute_graph.so"
        )
        DEPS_SONAME+=(
            "libarm_compute.so"
            "libarm_compute_graph.so"
        )
    fi

    # ARM system libraries
    DEPS_LIST+=(
        "/lib64/libgomp.so.1"
        "/usr/lib64/libgfortran.so.5"
    )
    DEPS_SONAME+=(
        "libgomp.so.1"
        "libgfortran.so.5"
    )

    # NVPL libraries (ARM optimized BLAS/LAPACK)
    if [[ -d "/usr/local/lib" && -f "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0" ]]; then
        echo "Adding NVPL libraries for ARM"
        DEPS_LIST+=(
            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0"
            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0"
            "/usr/local/lib/libnvpl_lapack_core.so.0"
            "/usr/local/lib/libnvpl_blas_core.so.0"
        )
        DEPS_SONAME+=(
            "libnvpl_lapack_lp64_gomp.so.0"
            "libnvpl_blas_lp64_gomp.so.0"
            "libnvpl_lapack_core.so.0"
            "libnvpl_blas_core.so.0"
        )
    fi
fi

# run_tests.sh requires DESIRED_CUDA to know what tests to exclude
export DESIRED_CUDA="$cuda_version_nodot"

@ -251,9 +326,11 @@ export DESIRED_CUDA="$cuda_version_nodot"
rm -rf /usr/local/cuda || true
ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda

# Switch `/usr/local/magma` to the desired CUDA version
rm -rf /usr/local/magma || true
ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
# Switch `/usr/local/magma` to the desired CUDA version (skip for aarch64)
if [[ "$ARCH" != "aarch64" ]]; then
    rm -rf /usr/local/magma || true
    ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
fi

export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0
@ -21,87 +21,3 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
fi

mkdir -p "$pytest_reports_dir" || true

##########################################
# copied from .ci/pytorch/common_utils.sh
##########################################

function get_pinned_commit() {
    cat .github/ci_commit_pins/"${1}".txt
}

function pip_install_whl() {
    # This is used to install PyTorch and other build artifacts wheel locally
    # without using any network connection

    # Convert the input arguments into an array
    local args=("$@")

    # Check if the first argument contains multiple paths separated by spaces
    if [[ "${args[0]}" == *" "* ]]; then
        # Split the string by spaces into an array
        IFS=' ' read -r -a paths <<< "${args[0]}"
        # Loop through each path and install individually
        for path in "${paths[@]}"; do
            echo "Installing $path"
            python3 -mpip install --no-index --no-deps "$path"
        done
    else
        # Loop through each argument and install individually
        for path in "${args[@]}"; do
            echo "Installing $path"
            python3 -mpip install --no-index --no-deps "$path"
        done
    fi
}

function pip_build_and_install() {
    local build_target=$1
    local wheel_dir=$2

    local found_whl=0
    for file in "${wheel_dir}"/*.whl
    do
        if [[ -f "${file}" ]]; then
            found_whl=1
            break
        fi
    done

    # Build the wheel if it doesn't exist
    if [ "${found_whl}" == "0" ]; then
        python3 -m pip wheel \
            --no-build-isolation \
            --no-deps \
            -w "${wheel_dir}" \
            "${build_target}"
    fi

    for file in "${wheel_dir}"/*.whl
    do
        pip_install_whl "${file}"
    done
}

function install_torchvision() {
    local orig_preload
    local commit
    commit=$(get_pinned_commit vision)
    orig_preload=${LD_PRELOAD}
    if [ -n "${LD_PRELOAD}" ]; then
        # Silence dlerror to work-around glibc ASAN bug, see https://sourceware.org/bugzilla/show_bug.cgi?id=27653#c9
        echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c -
        LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so
    fi

    if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
        # Not sure if both are needed, but why not
        export FORCE_CUDA=1
        export WITH_CUDA=1
    fi
    pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision

    if [ -n "${LD_PRELOAD}" ]; then
        LD_PRELOAD=${orig_preload}
    fi
}
@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace

if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
    # TODO: This can be removed later once vision is also part of the Docker image
    install_torchvision
    pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
    # JIT C++ extensions require ninja, so put it into PATH.
    export PATH="/var/lib/jenkins/.local/bin:$PATH"
    # NB: ONNX test is fast (~15m) so it's ok to retry it a few more times to avoid any flaky issue, we
@ -86,10 +86,20 @@ else
    fi
fi

# Enable MKLDNN with ARM Compute Library for ARM builds
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
    export USE_MKLDNN=1

    # ACL is required for aarch64 builds
    if [[ ! -d "/acl" ]]; then
        echo "ERROR: ARM Compute Library not found at /acl"
        echo "ACL is required for aarch64 builds. Check Docker image setup."
        exit 1
    fi

    export USE_MKLDNN_ACL=1
    export ACL_ROOT_DIR=/acl
    echo "ARM Compute Library enabled for MKLDNN: ACL_ROOT_DIR=/acl"
fi

if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then
@ -1250,97 +1250,6 @@ test_custom_script_ops() {
    assert_git_not_dirty
}

test_libtorch_agnostic_targetting() {
    echo "Testing libtorch_agnostic runs correctly on TORCH_TARGET_VERSION"

    REPO_DIR=$(pwd)
    WHEEL_DIR="${REPO_DIR}/test/cpp_extensions/.wheels"

    # Build wheel with current PyTorch (this has TORCH_TARGET_VERSION 2_9_0)
    echo "Building 2.9 extension wheel with current PyTorch..."
    pushd test/cpp_extensions/libtorch_agnostic_2_9_extension
    time python setup.py bdist_wheel

    # Save the wheel
    mkdir -p "$WHEEL_DIR"
    cp dist/*.whl "$WHEEL_DIR/"
    WHEEL_FILE=$(find "$WHEEL_DIR" -maxdepth 1 -name "*.whl" -type f | head -1)
    echo "Built wheel: $(basename "$WHEEL_FILE")"
    popd

    # Create venv and install PyTorch 2.9
    python -m venv venv_pytorch_2_9
    # shellcheck disable=SC1091
    . venv_pytorch_2_9/bin/activate

    # Clear PYTHONPATH to avoid using the development PyTorch
    echo "Clearing PYTHONPATH to use only venv packages..."
    unset PYTHONPATH

    # Upgrade pip to latest version
    echo "Upgrading pip to latest version..."
    pip install --upgrade pip
    pip --version

    echo "Installing PyTorch 2.9..."

    # Install from release channel only
    PYTORCH_VERSION="2.9.0"

    # Extract CUDA version from BUILD_ENVIRONMENT (e.g., "cuda12.1" -> "cu121")
    if [[ "$BUILD_ENVIRONMENT" =~ cuda([0-9]+)\.([0-9]+) ]]; then
        CUDA_MAJOR="${BASH_REMATCH[1]}"
        CUDA_MINOR="${BASH_REMATCH[2]}"
        CUDA_VERSION="cu${CUDA_MAJOR}${CUDA_MINOR}"
        echo "  Detected CUDA ${CUDA_MAJOR}.${CUDA_MINOR} from BUILD_ENVIRONMENT, using ${CUDA_VERSION}"
    else
        # Default to CPU build
        CUDA_VERSION="cpu"
        echo "  No CUDA detected in BUILD_ENVIRONMENT, using CPU build"
    fi

    if pip install torch=="${PYTORCH_VERSION}" --index-url https://download.pytorch.org/whl/${CUDA_VERSION}/; then
        echo "Installed PyTorch ${PYTORCH_VERSION} from release channel (${CUDA_VERSION})"
    else
        echo "  FAILED to install PyTorch 2.9.0 from release channel"
        echo "  URL: https://download.pytorch.org/whl/${CUDA_VERSION}/"
        deactivate
        rm -rf venv_pytorch_2_9
        return 1
    fi

    INSTALLED_VERSION=$(python -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown")
    echo "  Installed version: $INSTALLED_VERSION"

    # Install test dependencies
    echo "Installing test dependencies..."
    pip install expecttest numpy unittest-xml-reporting

    # Install the pre-built wheel
    echo ""
    echo "Installing pre-built 2.9 extension wheel (built with PyTorch 2.10)..."
    pip install "$WHEEL_FILE"
    echo "Installed $(basename "$WHEEL_FILE") into PyTorch 2.9 environment"

    # Run tests with PyTorch 2.9 runtime (2.10 tests will be skipped automatically)
    echo ""
    echo "Running tests with PyTorch 2.9 runtime (using wheel built on PyTorch 2.10)..."
    if time python test/cpp_extensions/test_libtorch_agnostic.py -v; then
        echo ""
        echo "  Wheel built with current torch and TORCH_TARGET_VERSION 2_9_0 works with PyTorch 2.9 runtime!"
    else
        echo "targeting test failed"
        deactivate
        rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
        return 1
    fi

    deactivate
    rm -rf venv_pytorch_2_9 "$WHEEL_DIR"

    assert_git_not_dirty
}

test_jit_hooks() {
    echo "Testing jit hooks in cpp"
    HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
@ -1768,7 +1677,7 @@ test_operator_microbenchmark() {

    cd "${TEST_DIR}"/benchmarks/operator_benchmark

    for OP_BENCHMARK_TESTS in optimizer; do
    for OP_BENCHMARK_TESTS in matmul mm addmm bmm conv; do
        $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
            --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
            --benchmark-name "PyTorch operator microbenchmark" --use-compile
@ -1813,8 +1722,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]];
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
    test_forward_backward_compatibility
    # Do NOT add tests after bc check tests, see its comment.
elif [[ "${TEST_CONFIG}" == *libtorch_agnostic_targetting* ]]; then
    test_libtorch_agnostic_targetting
elif [[ "${TEST_CONFIG}" == *xla* ]]; then
    install_torchvision
    build_xla
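The CUDA-channel detection inside `test_libtorch_agnostic_targetting` maps a `BUILD_ENVIRONMENT` string such as `cuda12.1` to a wheel index suffix such as `cu121`. A standalone sketch of that mapping (the sample environment strings are hypothetical):

```python
import re

def cuda_channel(build_environment: str) -> str:
    """Map e.g. 'linux-focal-cuda12.1-py3' -> 'cu121'; fall back to 'cpu'."""
    m = re.search(r"cuda(\d+)\.(\d+)", build_environment)
    return f"cu{m.group(1)}{m.group(2)}" if m else "cpu"

assert cuda_channel("linux-focal-cuda12.1-py3.10-gcc9") == "cu121"
assert cuda_channel("linux-jammy-py3.9-clang12") == "cpu"
```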
.github/ci_commit_pins/xla.txt
@ -1 +1 @@
94631807d22c09723dd006f7be5beb649d5f88d0
e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
.github/labeler.yml
@ -91,6 +91,13 @@
"ciflow/trunk":
- .ci/docker/ci_commit_pins/triton.txt

"oncall: distributed":
- torch/csrc/distributed/**
- torch/distributed/**
- torch/nn/parallel/**
- test/distributed/**
- torch/testing/_internal/distributed/**

"release notes: distributed (checkpoint)":
- torch/distributed/checkpoint/**
- test/distributed/checkpoint/**
.github/workflows/_binary-build-linux.yml
@ -260,11 +260,8 @@ jobs:
            "${DOCKER_IMAGE}"
          )
          docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
          if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
          else
            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
          fi
          # Unified build script for all architectures (x86_64, aarch64, s390x)
          docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"

      - name: Chown artifacts
        if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
.github/workflows/pull.yml
@ -70,7 +70,6 @@ jobs:
          { config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
          { config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
          { config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
          { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
        ]}
    secrets: inherit
.github/workflows/trunk.yml
@ -83,7 +83,6 @@ jobs:
          { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
          { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
          { config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
          { config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
        ]}
    secrets: inherit
@ -245,9 +245,6 @@ class TORCH_API TensorBase {
  size_t weak_use_count() const noexcept {
    return impl_.weak_use_count();
  }
  bool is_uniquely_owned() const noexcept {
    return impl_.is_uniquely_owned();
  }

  std::string toString() const;
@ -3,7 +3,6 @@

#include <cstdint>
#include <map>
#include <shared_mutex>

#include <cuda_runtime_api.h>
#include <cusparse.h>
@ -89,13 +88,8 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();

TORCH_CUDA_CPP_API void clearCublasWorkspaces();
struct WorkspaceMapWithMutex {
  std::map<std::tuple<void*, void*>, at::DataPtr> map;
  std::shared_mutex mutex;
};

TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace();
TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace();
TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize();
TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize();
TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace();
@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) {
// - Comments of @soumith copied from cuDNN handle pool implementation
#ifdef NO_CUDNN_DESTROY_HANDLE
#else
    cublasDestroy(handle);
    cublasDestroy(handle);
#endif
}
@ -107,27 +107,19 @@ using CuBlasPoolType = DeviceThreadHandlePool<cublasHandle_t, createCublasHandle

} // namespace

WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() {
  static auto& instance = *new WorkspaceMapWithMutex;
std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace() {
  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
  return instance;
}

WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() {
  static auto& instance = *new WorkspaceMapWithMutex;
std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace() {
  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
  return instance;
}

void clearCublasWorkspaces() {
  {
    auto& workspace = cublas_handle_stream_to_workspace();
    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
    workspace.map.clear();
  }
  {
    auto& workspace = cublaslt_handle_stream_to_workspace();
    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
    workspace.map.clear();
  }
  cublas_handle_stream_to_workspace().clear();
  cublaslt_handle_stream_to_workspace().clear();
}

size_t parseChosenWorkspaceSize() {
@ -241,38 +233,6 @@ at::DataPtr getNewCUDABlasLtWorkspace() {
  return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize());
}

void setWorkspaceForHandle(cublasHandle_t handle, c10::cuda::CUDAStream stream) {
  cudaStream_t _stream = stream;
  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));

  auto& workspace = cublas_handle_stream_to_workspace();

  size_t workspace_size = getChosenWorkspaceSize();

  // Fast path: check if workspace already exists
  {
    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
    auto workspace_it = workspace.map.find(key);
    if (workspace_it != workspace.map.end()) {
      TORCH_CUDABLAS_CHECK(cublasSetWorkspace(
          handle, workspace_it->second.get(), workspace_size));
      return;
    }
  }

  // Slow path: allocate workspace outside the lock
  auto new_workspace = getNewWorkspace();

  // Insert with lock (double-check in case another thread inserted while we
  // were allocating)
  {
    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
    auto workspace_it = workspace.map.try_emplace(key, std::move(new_workspace)).first;
    TORCH_CUDABLAS_CHECK(
        cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size));
  }
}

void* getCUDABlasLtWorkspace() {
#ifndef USE_ROCM
  static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true;
@ -281,10 +241,8 @@ void* getCUDABlasLtWorkspace() {
    auto stream = c10::cuda::getCurrentCUDAStream();
    cudaStream_t _stream = stream;
    auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
    auto& workspace = at::cuda::cublas_handle_stream_to_workspace();
    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
    auto workspace_it = workspace.map.find(key);
    TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end());
    auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
    TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
    return workspace_it->second.mutable_get();
  }
#endif
@ -292,29 +250,11 @@ void* getCUDABlasLtWorkspace() {
  auto stream = c10::cuda::getCurrentCUDAStream();
  cudaStream_t _stream = stream;
  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));

  auto& workspace = cublaslt_handle_stream_to_workspace();

  // Fast path: check if workspace already exists
  {
    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
    auto workspace_it = workspace.map.find(key);
    if (workspace_it != workspace.map.end()) {
      return workspace_it->second.mutable_get();
    }
  }

  // Slow path: allocate workspace outside the lock
  auto new_workspace = getNewCUDABlasLtWorkspace();

  // Insert with lock (double-check in case another thread inserted while we
  // were allocating)
  {
    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
    auto workspace_it =
        workspace.map.try_emplace(key, std::move(new_workspace)).first;
    return workspace_it->second.mutable_get();
  auto workspace_it = cublaslt_handle_stream_to_workspace().find(key);
  if (workspace_it == cublaslt_handle_stream_to_workspace().end()) {
    workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()});
  }
  return workspace_it->second.mutable_get();
}

cublasHandle_t getCurrentCUDABlasHandle() {
@ -358,8 +298,13 @@ cublasHandle_t getCurrentCUDABlasHandle() {
  // will allocate memory dynamically (even if they're cheap) outside
  // PyTorch's CUDA caching allocator. It's possible that CCA used up
  // all the memory and cublas's cudaMallocAsync will return OOM
  setWorkspaceForHandle(handle, stream);

  cudaStream_t _stream = stream;
  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
  auto workspace_it = cublas_handle_stream_to_workspace().find(key);
  if (workspace_it == cublas_handle_stream_to_workspace().end()) {
    workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()});
  }
  TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize()));
#if !defined(USE_ROCM)
  // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup
  // FP32 data type calculations based on the value of the allow_tf32 flag.
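One side of this diff guards the per-`(handle, stream)` workspace map with a `std::shared_mutex`: lookups take a shared lock on the fast path, the allocation happens outside any lock, and the insert re-checks under an exclusive lock in case another thread raced ahead. A minimal Python sketch of that double-checked pattern (names are illustrative; the Python stdlib has no shared/exclusive lock, so a single mutex stands in for the writer side):

```python
import threading

class WorkspaceCache:
    """Per-(handle, stream) workspace cache with double-checked insertion."""

    def __init__(self, allocate):
        self._map = {}
        self._lock = threading.Lock()
        self._allocate = allocate  # called outside the lock, like getNewWorkspace()

    def get(self, key):
        # Fast path: a plain dict read never mutates the map, so a hit needs no lock.
        ws = self._map.get(key)
        if ws is not None:
            return ws
        # Slow path: allocate outside the lock, then re-check before inserting,
        # in case another thread inserted the same key while we were allocating.
        new_ws = self._allocate()
        with self._lock:
            return self._map.setdefault(key, new_ws)
```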
@ -296,7 +296,7 @@ template <typename scalar_t, typename res_scalar_t = scalar_t>
bool launchGemmAndBiasCublasLt(
    // args contains result which is modified
    cublasCommonArgs& args,
    const std::optional<Tensor>& self,
    const Tensor& self,
    const Scalar& alpha,
    Activation activation = Activation::None
) {
@ -304,8 +304,12 @@ bool launchGemmAndBiasCublasLt(
  // or when it can be squeezed to 1D.
  // self_ptr == nullptr implies ignore bias epilogue
  // and use standard gemm-like API.
  const auto* self_ptr = self.has_value() ? self.value().const_data_ptr<scalar_t>() : static_cast<const scalar_t*>(nullptr);

  const auto* self_ptr = [&]() -> auto {
    if (self.dim() == 1 || self.squeeze().dim() == 1) {
      return self.const_data_ptr<scalar_t>();
    }
    return static_cast<const scalar_t*>(nullptr);
  }();

  const auto tuning_ctx = at::cuda::tunable::getTuningContext();
  if (tuning_ctx->IsTunableOpEnabled()) {
@ -388,30 +392,35 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
#ifdef USE_ROCM
  // Conditioned on the device index, which is not persistent
  disable_addmm_cuda_lt = disable_addmm_cuda_lt || isGloballyDisabledAddmmCudaLt(self.device());
  disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
#endif
  // Condition on the input
  disable_addmm_cuda_lt = disable_addmm_cuda_lt || !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation);
  disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
  // }

  at::ScalarType scalar_type = mat1.scalar_type();
  bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;

#ifdef USE_ROCM
  disable_addmm_cuda_lt = disable_addmm_cuda_lt || is_float_output_with_half_input;
#endif

  bool use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
  // for float output with half input cublasLT with bias produces wrong results
  use_bias_ptr_lt &= !is_float_output_with_half_input;

  // Handle result/self shapes
  if (!result.is_same(self)) {
    at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});

    // We do not copy bias only when we need the bias ptr
    // We use bias ptr in the Lt path only when bias is 1D
    const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
    const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
      if (!use_bias_ptr_lt) {
        // We do expand self even before
        // check for beta != 0.0 to make sure that
        // test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
        // runs green.
        return expand_size(self, result.sizes(), "addmm");
      }
      return c10::MaybeOwned<Tensor>::borrowed(self);
    }();
    // We do not copy bias only when we need the bias ptr
    if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
      // NOTE: self should broadcast over result
      at::native::copy_(result, *expand_size(self, result.sizes(), "addmm"));
      at::native::copy_(result, *self_maybe_expanded);
    }
  }

@ -459,7 +468,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
          scalar_type,
          "addmm_cuda_lt",
          [&] {
            lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
            lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, self, alpha, activation);
          }
      );
#endif
@ -471,7 +480,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
          scalar_type,
          "addmm_cuda_lt",
          [&] {
            lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
            lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, self, alpha, activation);
          }
      );
    } // end is_float_output_with_half_input
@ -927,7 +936,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
  return _int_mm_out_cuda(self, mat2, result);
}

static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
  // ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm
  TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor");
  TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor");
@ -951,7 +960,7 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
    (out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
    "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");

  if (self_baddbmm.has_value()) {
  if (!is_bmm && self_baddbmm.has_value()) {
    const auto& self = self_baddbmm.value();
    TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor");
    TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output");
@ -959,12 +968,15 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
}

Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
  Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
  IntArrayRef batch1_sizes = batch1.sizes();
  IntArrayRef batch2_sizes = batch2.sizes();

  Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype));
  return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out);
}

Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) {
  baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype);
  baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true);
  Scalar beta(0.0);
  Scalar alpha(1.0);
  {
@ -976,16 +988,14 @@ Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at
}

Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
  TORCH_CHECK(self.scalar_type() == out_dtype || self.scalar_type() == batch1.dtype(),
    "self dtype must match either out_dtype or batch1 dtype");
  Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
  return _baddbmm_out_dtype_cuda(self, batch1, batch2, out_dtype, beta, alpha, out);
  // We need to copy the tensor
  Tensor out = self.clone().to(self.options().dtype(out_dtype));

  return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out);
}

Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
  baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, out);
  // We need to copy the tensor
  out.copy_(self);
  baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self);
  {
    NoNamesGuard guard;
    baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
@ -1020,27 +1030,24 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca
}

Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
  Tensor result = at::empty({mat1.size(0), mat2.size(1)}, self.options().dtype(out_dtype));
  Tensor result = at::empty(self.sizes(), self.options().dtype(out_dtype));
  return _addmm_dtype_out_cuda(self, mat1, mat2, out_dtype, beta, alpha, result);
}

Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
  // repeat dimensionality checks for direct calls to `out` overload
  TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type());
  TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
  TORCH_CHECK(
      mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
      mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
  TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
  TORCH_CHECK(out_dtype == mat1.scalar_type() ||
      (out_dtype == at::ScalarType::Float && (mat1.scalar_type() == at::ScalarType::Half || mat1.scalar_type() == at::ScalarType::BFloat16)),
      "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");

  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
  TORCH_CHECK(out_dtype == self.scalar_type() || self.scalar_type() == mat1.scalar_type(),
      "self dtype must match either out_dtype or mat1 dtype");
  TORCH_CHECK(out_dtype == self.scalar_type() ||
      (out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),
      "out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
  TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");

  addmm_out_cuda_impl(out, self, mat1, mat2, beta, alpha);

@ -607,8 +607,6 @@ _scaled_grouped_mm_cuda_v2(
  // scale shape checks
  _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
  _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
  // swizzle checks
  TORCH_CHECK_VALUE(swizzle_a_enum.size() == 1 && swizzle_b_enum.size() == 1, "Expected single swizzle argument");
  return _mx8_mx8_bf16_grouped_mm_fbgemm(
      mat_a,
      mat_b,
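At the Python level, the bias handling these `addmm` hunks negotiate corresponds to the 1-D `input` argument of `torch.addmm` broadcasting across rows; whether the backend takes the cuBLASLt bias-epilogue path or a copy-then-GEMM path is an internal decision, not user-visible. A small usage sketch (requires a CUDA device):

```python
import torch

# addmm computes beta * input + alpha * (mat1 @ mat2); a 1-D `input` (bias)
# broadcasts over rows, which is the shape the Lt bias path is eligible for.
mat1 = torch.randn(4, 3, device="cuda")
mat2 = torch.randn(3, 5, device="cuda")
bias = torch.randn(5, device="cuda")  # 1-D bias, broadcast over 4 rows
out = torch.addmm(bias, mat1, mat2)
assert torch.allclose(out, mat1 @ mat2 + bias, atol=1e-5)
```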
@ -2,250 +2,18 @@
#include <ATen/Dispatch.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/cuda/JitLoops.cuh>
#include <ATen/native/cuda/jit_utils.h>
#include <ATen/native/cuda/ScanUtils.cuh>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/BinaryOps.h>
#include <ATen/OpMathType.h>
#include <c10/util/MathConstants.h>
#include <c10/util/complex.h>

#include <cmath>
#include <limits>

// NOTE: CUDA on Windows requires that the enclosing function
// of a __device__ lambda not have internal linkage.

namespace at::native {

// custom min and max to be used in logaddexp for complex arguments
template <typename scalar_t, bool min>
__host__ __device__ c10::complex<scalar_t> _logaddexp_minmax(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
  scalar_t xr = std::real(x);
  scalar_t yr = std::real(y);
  if (::isnan(yr) || (::isnan(std::imag(y)))) {
    return y;
  } else if (::isnan(xr) || (::isnan(std::imag(x)))) {
    return x;
  } else if (min) { // min
    return (xr < yr) ? x : y;
  } else { // max
    return (xr >= yr) ? x : y;
  }
}

template <typename scalar_t>
__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) {
  // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
  // Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM
  const auto isnan_x = at::_isnan(x);
  const auto isnan_y = at::_isnan(y);
  scalar_t min = isnan_y ? y : (isnan_x ? x : std::min(x, y));
  scalar_t max = isnan_y ? y : (isnan_x ? x : std::max(x, y));
  if (min != max || ::isfinite(min)) {
    // nan will be propagated here
    return ::log1p(std::exp(min - max)) + max;
  } else {
    // special case to correctly handle infinite cases
    return x;
  }
}

template <typename scalar_t>
__host__ __device__ c10::complex<scalar_t> _fast_build_exp(const c10::complex<scalar_t>& x) {
  // complex exponential function, but implemented manually to get fast compilation time
  // this function only handles the case where the x is finite (not inf nor nan)
  const auto xreal = std::real(x);
  const auto ximag = std::imag(x);
  const auto exp_x_abs = std::exp(xreal);
  auto exp_x_real = exp_x_abs * std::cos(ximag);
  auto exp_x_imag = exp_x_abs * std::sin(ximag);
  return {exp_x_real, exp_x_imag};
}

template <typename scalar_t>
__host__ __device__ c10::complex<scalar_t> _fast_build_exp_inf(const c10::complex<scalar_t>& x) {
  // complex exponential function, but implemented manually to get fast compilation time
  // this function only handles the case where the real part of x is infinite
  const auto ximag = std::imag(x);
  constexpr auto exp_x_abs = std::numeric_limits<scalar_t>::infinity();
  if (!::isfinite(ximag)) { // add this to make consistent with std::exp(x+yi)
    return {exp_x_abs, std::numeric_limits<scalar_t>::quiet_NaN()};
  }
  const auto sin = std::sin(ximag);
  const auto cos = std::cos(ximag);
  // special case if the angle is exactly the multiple of pi/2
  auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos;
  auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin;
  return {exp_x_real, exp_x_imag};
}

template <typename scalar_t>
__host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
  c10::complex<scalar_t> min = _logaddexp_minmax<scalar_t, /*min=*/true>(x, y);
  c10::complex<scalar_t> max = _logaddexp_minmax<scalar_t, /*min=*/false>(x, y);
  scalar_t min_real = std::real(min);
  scalar_t max_real = std::real(max);

  if (::isnan(min_real) || ::isnan(std::imag(min))) {
    // handling the "infectious" NaNs
    return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
  }
  else if ((!::isfinite(min_real)) && (min_real == max_real)) {
    if (min_real < 0) {
      // handle the -inf case, the imaginary part here does not really matter as the exp(value)
      // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
      // It does not matter if we're taking the exp of this value
      return min;
    } else {
      // handle the +inf case, we don't need the special precision for log1p for small values
      // and to avoid producing nan in case of real(max) == real(min) == +inf
      const auto exp_min = _fast_build_exp_inf(min);
      const auto exp_max = _fast_build_exp_inf(max);
      return ::log1p(exp_min + exp_max - 1); // log1p(x - 1) builds faster than log
    }
  } else {
    const auto minmax = min - max;
    c10::complex<scalar_t> exp_minmax;
    if (!::isfinite(minmax.real())) {
      exp_minmax = minmax.real() < 0 ? c10::complex<scalar_t>{0.0, 0.0} : _fast_build_exp_inf(minmax);
    } else {
      exp_minmax = _fast_build_exp(minmax);
    }
    return ::log1p(exp_minmax) + max;
  }
}

// Complex logaddexp jiterator string
const auto logaddexp_complex_string = jiterator_stringify(
  template<typename T>
  std::complex<T> log1p(const std::complex<T>& z)
  {
    using complex_t = std::complex<T>;
    T x = z.real();
    T y = z.imag();
    T zabs = abs(z);
    T theta = atan2(y, x + T(1));
    if (zabs < 0.5) {
      T r = x * (T(2) + x) + y * y;
      if (r == 0) { // handle underflow
        return complex_t(x, theta);
      }
      return complex_t(T(0.5) * std::log1p(r), theta);
    } else {
      T z0 = std::hypot(x + 1, y);
      return complex_t(log(z0), theta);
    }
  }

  // separated _logaddexp_minmax into 2 different functions for jiterator_string
  template <typename T>
  std::complex<T> logaddexp_min(const std::complex<T>& x, const std::complex<T>& y) {
    T xr = x.real();
    T yr = y.real();
    if (isnan(yr) || isnan(y.imag())) {
      return y;
    } else if (isnan(xr) || isnan(x.imag())) {
      return x;
    } else {
      return (xr < yr) ? x : y;
    }
  }

  template <typename T>
  std::complex<T> logaddexp_max(const std::complex<T>& x, const std::complex<T>& y) {
    T xr = x.real();
    T yr = y.real();
    if (isnan(yr) || isnan(y.imag())) {
      return y;
    } else if (isnan(xr) || isnan(x.imag())) {
      return x;
    } else {
      return (xr >= yr) ? x : y;
    }
  }

  template <typename T>
  std::complex<T> fast_build_exp(const std::complex<T>& x) {
    const auto xreal = x.real();
    const auto ximag = x.imag();
    const auto exp_x_abs = exp(xreal);
    auto exp_x_real = exp_x_abs * cos(ximag);
    auto exp_x_imag = exp_x_abs * sin(ximag);
    return std::complex<T>(exp_x_real, exp_x_imag);
  }

  template <typename T>
  std::complex<T> fast_build_exp_inf(const std::complex<T>& x) {
    using complex_t = std::complex<T>;
    const auto ximag = x.imag();
    const T exp_x_abs = INFINITY;
    if (!isfinite(ximag)) {
      return complex_t(exp_x_abs, NAN);
    }
    const auto sin_val = sin(ximag);
    const auto cos_val = cos(ximag);
    auto exp_x_real = (cos_val == T(0)) ? T(0) : exp_x_abs * cos_val;
    auto exp_x_imag = (sin_val == T(0)) ? T(0) : exp_x_abs * sin_val;
    return complex_t(exp_x_real, exp_x_imag);
  }

  template <typename complex_t>
  complex_t logaddexp_complex(complex_t x, complex_t y) {
    using T = typename complex_t::value_type;
    complex_t min_val = logaddexp_min(x, y);
    complex_t max_val = logaddexp_max(x, y);
    T min_real = min_val.real();
    T max_real = max_val.real();

    if (isnan(min_real) || isnan(min_val.imag())) {
      return complex_t(NAN, NAN);
    }
    else if ((!isfinite(min_real)) && (min_real == max_real)) {
      if (min_real < T(0)) {
        return min_val;
      } else {
        const auto exp_min = fast_build_exp_inf<T>(min_val);
        const auto exp_max = fast_build_exp_inf<T>(max_val);
        return log1p(exp_min + exp_max - complex_t(1, 0));
      }
    } else {
      const auto minmax = min_val - max_val;
      complex_t exp_minmax;
      if (!isfinite(minmax.real())) {
        exp_minmax = (minmax.real() < T(0)) ? complex_t(0, 0) : fast_build_exp_inf<T>(minmax);
      } else {
        exp_minmax = fast_build_exp<T>(minmax);
      }
      return log1p(exp_minmax) + max_val;
    }
  }
);

constexpr char logaddexp_complex_name[] = "logaddexp_complex";
void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
  if (at::isComplexType(iter.dtype())) {
#if AT_USE_JITERATOR()
    AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
      jitted_gpu_kernel<
          /*name=*/logaddexp_complex_name,
          /*return_dtype=*/scalar_t,
          /*common_dtype=*/scalar_t,
          /*arity=*/2>(iter, logaddexp_complex_string);
    });
#else
    AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
      using opmath_t = at::opmath_type<scalar_t>;
      gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t {
        const auto a = static_cast<opmath_t>(a_);
        const auto b = static_cast<opmath_t>(b_);
        return static_cast<scalar_t>(_log_add_exp_helper(a, b));
      });
    });
#endif
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND2(
  AT_DISPATCH_FLOATING_TYPES_AND2(
      ScalarType::BFloat16, ScalarType::Half,
      iter.dtype(), "logaddexp_cuda",
      [&]() {
@ -261,7 +29,6 @@ void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
        }
      });
  });
  }
}

void logaddexp2_kernel_cuda(TensorIteratorBase& iter) {
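The real-valued `_log_add_exp_helper` above relies on the identity `log(exp(x) + exp(y)) = max + log1p(exp(min - max))`, which avoids overflow for large inputs because `min - max <= 0`. A scalar Python sketch of the same trick (NaN propagation omitted for brevity):

```python
import math

def logaddexp(x: float, y: float) -> float:
    """Stable log(exp(x) + exp(y)) via log1p(exp(min - max)) + max."""
    lo, hi = min(x, y), max(x, y)
    if lo == hi and not math.isfinite(lo):
        # Both +inf or both -inf: exp(lo - hi) would be nan, but the answer is x.
        return x
    return math.log1p(math.exp(lo - hi)) + hi

# Naive log(exp(1000) + exp(1000)) overflows; the stable form does not.
assert abs(logaddexp(1000.0, 1000.0) - (1000.0 + math.log(2.0))) < 1e-9
```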
@ -1101,19 +1101,6 @@ _scaled_mxfp8_mxfp8(
|
||||
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
|
||||
}
|
||||
|
||||
void
|
||||
_check_mxfp4_support() {
|
||||
#ifndef USE_ROCM
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
// Only on B200 GPUs
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
// B200 = 10.0, B300 = 10.3
|
||||
dprops->major == 10,
|
||||
"MXFP4 scaling only supported in CUDA for B200/B300"
|
||||
);
|
||||
#endif
|
||||
}

Tensor&
_scaled_mxfp4_mxfp4(
@ -1126,7 +1113,6 @@ _scaled_mxfp4_mxfp4(
#if defined(_WIN32) || (!defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI))
  TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only");
#else
  _check_mxfp4_support();
  // Restrictions:
  // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
  TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",

@ -5,7 +5,6 @@
#include <ATen/native/Resize.h>
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include <ATen/native/xpu/Blas.h>
#include <ATen/xpu/XPUScaledBlas.h>
#include <torch/library.h>

#ifndef AT_PER_OPERATOR_HEADERS

@ -340,399 +339,4 @@ Tensor _scaled_mm_xpu(
      out);
}

using acceptance_fn = std::function<bool(
    c10::ScalarType,
    std::vector<ScalingType>&,
    ArrayRef<Tensor>&,
    c10::ScalarType,
    std::vector<ScalingType>&,
    ArrayRef<Tensor>&)>;
using namespace std::placeholders;

namespace scaled_blas = at::native::onednn::scaled;
using scaled_blas::convert_int_to_enum;
using scaled_blas::ScaledGemmImplementation;

std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2>
    scale_kernel_dispatch = {{
        {"tensorwise_tensorwise",
         scaled_blas::check_tensorwise_recipe,
         ScaledGemmImplementation::TENSORWISE_TENSORWISE},
        {"rowwise_rowwise",
         scaled_blas::check_rowwise_recipe,
         ScaledGemmImplementation::ROWWISE_ROWWISE},
    }};

Tensor& _scaled_tensorwise_tensorwise(
    const Tensor& mat_a,
    const Tensor& mat_b,
    const Tensor& scale_a,
    const Tensor& scale_b,
    const std::optional<Tensor>& bias,
    const c10::ScalarType out_dtype,
    bool use_fast_accum,
    Tensor& out) {
  // Restrictions:
  // A, B are FP8, scales are fp32

  TORCH_CHECK_VALUE(
      isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
      "mat_a and mat_b must be fp8 types, got: ",
      mat_a.scalar_type(),
      mat_b.scalar_type());
  TORCH_CHECK_VALUE(
      scale_a.numel() == 1 && scale_a.scalar_type() == kFloat,
      "scale_a must have 1 Float element")
  TORCH_CHECK_VALUE(
      scale_b.numel() == 1 && scale_b.scalar_type() == kFloat,
      "scale_b must have 1 Float element")

  auto scaling_choice_a = ScalingType::TensorWise;
  auto scaling_choice_b = ScalingType::TensorWise;

  _scaled_gemm(
      mat_a,
      mat_b,
      scale_a,
      scale_b,
      scaling_choice_a,
      scaling_choice_b,
      bias,
      use_fast_accum,
      out);

  return out;
}

Tensor& _scaled_rowwise_rowwise(
    const Tensor& mat_a,
    const Tensor& mat_b,
    const Tensor& scale_a,
    const Tensor& scale_b,
    const std::optional<Tensor>& bias,
    const c10::ScalarType out_dtype,
    bool use_fast_accum,
    Tensor& out) {
  // Restrictions:
  // A, B are FP8, scales are fp32, shape M/N for A/B
  TORCH_CHECK_VALUE(
      isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
      "mat_a and mat_b must be fp8 types, got: ",
      mat_a.scalar_type(),
      mat_b.scalar_type());
  TORCH_CHECK_VALUE(
      scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1,
      "scale_a must have shape [",
      mat_a.size(0),
      ", 1], got [",
      scale_a.sizes(),
      "]");
  TORCH_CHECK_VALUE(
      scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat,
      "scale_a must have ",
      mat_a.size(0),
      " Float elements, got ",
      scale_a.numel())
  TORCH_CHECK_VALUE(
      scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat,
      "scale_b must have ",
      mat_b.size(1),
      " Float elements, got ",
      scale_b.numel())

  TORCH_CHECK_VALUE(
      scale_a.stride(1) == 1,
      "expected scale_a.stride(1) to be 1, but got ",
      scale_a.stride(1));
  TORCH_CHECK_VALUE(
      scale_b.stride(1) == 1,
      "expected scale_b.stride(1) to be 1, but got ",
      scale_b.stride(1));

  auto scaling_choice_a = ScalingType::RowWise;
  auto scaling_choice_b = ScalingType::RowWise;

  _scaled_gemm(
      mat_a,
      mat_b,
      scale_a,
      scale_b,
      scaling_choice_a,
      scaling_choice_b,
      bias,
      use_fast_accum,
      out);

  return out;
}
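For concreteness, a hedged sketch of the scale shapes the two recipes above accept, written against the private `torch._scaled_mm` entry point on an XPU build. This is a private API whose exact signature can differ between PyTorch versions, so treat it as illustrative only:

```python
import torch

M, K, N = 128, 64, 32
a = torch.randn(M, K, device="xpu").to(torch.float8_e4m3fn)
b = torch.randn(N, K, device="xpu").to(torch.float8_e4m3fn).t()  # column-major (K, N)

# Tensorwise: one fp32 scale per operand.
out_tw = torch._scaled_mm(
    a, b,
    scale_a=torch.ones((), device="xpu"),
    scale_b=torch.ones((), device="xpu"),
    out_dtype=torch.bfloat16)

# Rowwise: fp32 scales of shape (M, 1) for A and (1, N) for B, contiguous in
# their last dimension, matching the shape and stride checks above.
out_rw = torch._scaled_mm(
    a, b,
    scale_a=torch.ones(M, 1, device="xpu"),
    scale_b=torch.ones(1, N, device="xpu"),
    out_dtype=torch.bfloat16)
```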

// V2: Computes matrix multiply + bias while applying scaling to input and
// output matrices. Scales are only applicable when matrices are of Float8 type
// and assumed to be equal to 1.0 by default. If output matrix type is 16 or
// 32-bit type, scale_result is not applied. Known limitations:
//  - Only works if mat1 is row-major and mat2 is column-major
//  - Only works if matrix sizes are divisible by 32
//  - If 1-dimensional tensors are used then scale_a should be size =
//  mat1.size(0)
//  and scale_b should have size equal to mat2.size(1)
// Arguments:
//  - `mat_a`: the first operand of the matrix multiply, can be type
//  `torch.float8_e4m3fn` or `torch.float8_e5m2`
//  - `mat_b`: the second operand of the matrix multiply, can be type
//  `torch.float8_e4m3fn` or `torch.float8_e5m2`
//  - `scale_a`: a tensor with the inverse scale of `mat1`, whose
//  shape/strides/dtype depend on the scaling scheme
//  - `scale_recipe_a`: An integer corresponding to an enum describing the
//  scaling scheme used for `scale_a`
//  - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing
//  the swizzling scheme for `scale_a`.
//  Not supported for XPU for now.
//  - `scale_b`: a tensor with the inverse scale of `mat2`, whose
//  shape/strides/dtype depend on the scaling scheme
//  - `scale_recipe_b`: An integer corresponding to an enum describing the
//  scaling scheme used for `scale_b`
//  - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing
//  the swizzling scheme for `scale_b`.
//  Not supported for XPU for now.
//  - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
//  - `out_dtype`: the output dtype, can either be a float8 or a higher
//  precision floating point type
//  - `contraction_dim`: describe which dimensions are `K` in the matmul.
//  Not supported for XPU. Should always be empty.
//  - `use_fast_accum`: Not supported for XPU, should always be false.
//  - `out`: a reference to the output tensor
Tensor& _scaled_mm_xpu_v2_out(
    const Tensor& mat_a,
    const Tensor& mat_b,
    ArrayRef<Tensor> scale_a,
    IntArrayRef scale_recipe_a,
    IntArrayRef swizzle_a,
    ArrayRef<Tensor> scale_b,
    IntArrayRef scale_recipe_b,
    IntArrayRef swizzle_b,
    const std::optional<Tensor>& bias,
    const std::optional<c10::ScalarType> out_dtype,
    IntArrayRef contraction_dim,
    bool use_fast_accum,
    Tensor& out) {
  TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix");
  TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix");

  // If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm
  // kernels do not support this case).
  if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) {
    // `out` was created with `at::empty`. In the case where we are multiplying
    // MxK by KxN and K is the zero dim, we need to initialize here to properly
    // return a tensor of zeros.
    at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
    if (mat_a.size(1) == 0) {
      out.zero_();
    }

    return out;
  }

  // Note: The `contraction_dim` is not actually used for now. We will need to
  // align this code when upstreamed CUDA code is done. Currently, only keeps
  // the code here for check.

  // Check if the input matrix sizes can be multiplied
  // - if optional contraction dims are provided, use those
  //   -- mostly for < 1B formats (i.e. nvfp4x2) where cheap .t() is not
  //   available.
  if (contraction_dim.size() > 0) {
    TORCH_CHECK_VALUE(
        contraction_dim.size() == 2,
        "contraction_dim must have exactly 2 elements");
    auto mat_a_dim = contraction_dim[0];
    auto mat_b_dim = contraction_dim[1];
    TORCH_CHECK_VALUE(
        mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim),
        "mat_a and mat_b shapes cannot be multiplied (",
        mat_a.size(0),
        "x",
        mat_a.size(1),
        " and ",
        mat_b.size(0),
        "x",
        mat_b.size(1),
        ") ",
        "with contraction dims mat_a: ",
        mat_a_dim,
        ", mat_b: ",
        mat_b_dim);
  } else {
    TORCH_CHECK_VALUE(
        mat_a.size(1) == mat_b.size(0),
        "mat_a and mat_b shapes cannot be multiplied (",
        mat_a.size(0),
        "x",
        mat_a.size(1),
        " and ",
        mat_b.size(0),
        "x",
        mat_b.size(1),
        ")");
  }

  TORCH_CHECK_VALUE(
      !bias || bias->numel() == mat_b.sizes()[1],
      "Bias must be size ",
      mat_b.sizes()[1],
      " but got ",
      bias->numel());

  TORCH_CHECK_VALUE(
      !out_dtype || *out_dtype == out.scalar_type(),
      "out_dtype must match output matrix type");

  if (bias) {
    TORCH_CHECK_VALUE(
        bias->scalar_type() == kFloat ||
            bias->scalar_type() == c10::ScalarType::BFloat16 ||
            bias->scalar_type() == c10::ScalarType::Half,
        "Bias must be Float32 or BFloat16 or Half, but got ",
        bias->scalar_type());
  }
  {
    auto bias_ = bias.value_or(Tensor());
    // NOLINTNEXTLINE(*c-array*)
    TensorArg targs[]{
        {out, "out", 0},
        {mat_a, "mat_a", 1},
        {mat_b, "mat_b", 2},
        {bias_, "bias", 3},
        {scale_a[0], "scale_a", 4},
        {scale_b[0], "scale_b", 5}};
    checkAllSameGPU(__func__, targs);
  }
  // Align with CUDA's default out to be bf16
  auto out_dtype_ = out_dtype.value_or(c10::ScalarType::BFloat16);

  // Conversion of implicitly-defined enums to explicit
  auto scale_recipe_a_enum = convert_int_to_enum<ScalingType>(scale_recipe_a);
  auto swizzle_a_enum = convert_int_to_enum<SwizzleType>(swizzle_a);
  auto scale_recipe_b_enum = convert_int_to_enum<ScalingType>(scale_recipe_b);
  auto swizzle_b_enum = convert_int_to_enum<SwizzleType>(swizzle_b);

  // XPU does not support swizzle for now. So directly return false.
  TORCH_CHECK_VALUE(
      swizzle_a_enum[0] == at::blas::SwizzleType::NO_SWIZZLE &&
          swizzle_b_enum[0] == at::blas::SwizzleType::NO_SWIZZLE,
      "XPU does not support swizzle yet.");

  // at this point we can start working out what we want to be doing
  // Try to do as few steps as possible.
  // NOTE: support is deliberately sparse, can explicitly enumerate all
  // combinations allowed. Do this via a list of defined (name, acceptance,
  // concrete_impl) tuples.
  bool found_impl = false;
  ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE;

  for (const auto& fn_entry : scale_kernel_dispatch) {
    const auto [name, accept_fn, scaled_gemm_impl] = fn_entry;
    bool ok = accept_fn(
        mat_a.scalar_type(),
        scale_recipe_a_enum,
        scale_a,
        mat_b.scalar_type(),
        scale_recipe_b_enum,
        scale_b);
    if (ok) {
      gemm_impl = scaled_gemm_impl;
      found_impl = true;
      break;
    }
  }
  TORCH_CHECK_VALUE(
      found_impl,
      "Invalid scaling configuration.\n"
      "- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n"
      "- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (",
      mat_a.size(0),
      ", 1) and scale_b should be (1, ",
      mat_b.size(1),
      "), and both should be contiguous.\n"
      "Got mat_a.dtype()=",
      mat_a.scalar_type(),
      ", scale_a[0].dtype()=",
      scale_a[0].scalar_type(),
      ", scale_a[0].size()=",
      scale_a[0].sizes(),
      ", scale_a[0].stride()=",
      scale_a[0].strides(),
      ", ",
      "mat_b.dtype()=",
      mat_b.scalar_type(),
      ", scale_b[0].dtype()=",
      scale_b[0].scalar_type(),
      ", scale_b[0].size()=",
      scale_b[0].sizes(),
      " and scale_b[0].stride()=",
      scale_b[0].strides());

  at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});

  auto bias_ = bias.value_or(Tensor());

  // dispatch to appropriate lower-level calls for error checking & execution
  if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) {
    return _scaled_tensorwise_tensorwise(
        mat_a,
        mat_b,
        scale_a[0],
        scale_b[0],
        bias,
        out_dtype_,
        use_fast_accum,
        out);
  } else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) {
    return _scaled_rowwise_rowwise(
        mat_a,
        mat_b,
        scale_a[0],
        scale_b[0],
        bias,
        out_dtype_,
        use_fast_accum,
        out);
  } else {
    TORCH_CHECK_VALUE(
        false, "Invalid state - found an implementation, but not really");
  }
}

Tensor _scaled_mm_xpu_v2(
    const Tensor& mat_a,
    const Tensor& mat_b,
    ArrayRef<Tensor> scale_a,
    IntArrayRef scale_recipe_a,
    IntArrayRef swizzle_a,
    ArrayRef<Tensor> scale_b,
    IntArrayRef scale_recipe_b,
    IntArrayRef swizzle_b,
    const std::optional<Tensor>& bias,
    const std::optional<c10::ScalarType> out_dtype,
    IntArrayRef contraction_dim,
    bool use_fast_accum) {
  const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
  Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));

  return _scaled_mm_xpu_v2_out(
      mat_a,
      mat_b,
      scale_a,
      scale_recipe_a,
      swizzle_a,
      scale_b,
      scale_recipe_b,
      swizzle_b,
      bias,
      out_dtype,
      contraction_dim,
      use_fast_accum,
      out);
}

} // namespace at::native
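The dispatch in `_scaled_mm_xpu_v2_out` walks a fixed table of (name, acceptance function, implementation) tuples and takes the first entry whose predicate accepts the inputs. A self-contained Python paraphrase of that control flow, with the predicate signature simplified to a single stand-in argument:

```python
# First-match dispatch over a table of (name, acceptance_fn, implementation),
# mirroring the scale_kernel_dispatch loop above.
TENSORWISE, ROWWISE = 0, 1

scale_kernel_dispatch = [
    ("tensorwise_tensorwise", lambda recipe: recipe == TENSORWISE, "tensorwise impl"),
    ("rowwise_rowwise", lambda recipe: recipe == ROWWISE, "rowwise impl"),
]

def pick_impl(recipe):
    for name, accept_fn, impl in scale_kernel_dispatch:
        if accept_fn(recipe):
            return impl
    raise ValueError("Invalid scaling configuration.")

print(pick_impl(ROWWISE))  # rowwise impl
```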

@ -91,30 +91,25 @@ static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#include <ATen/native/mps/Repeat_metallib.h>
#endif

Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
  TORCH_CHECK(repeat.dim() == 1, "repeat_interleave only accept 1D vector as repeat");
template <typename index_t>
void computeRepeatIndices(const index_t* repeat_ptr,
                          const int64_t* cumsum_ptr,
                          index_t* result_ptr,
                          int64_t size,
                          int64_t result_size) {
  id<MTLBuffer> repeatBuffer = reinterpret_cast<id<MTLBuffer>>(repeat_ptr);
  id<MTLBuffer> cumsumBuffer = reinterpret_cast<id<MTLBuffer>>(cumsum_ptr);
  id<MTLBuffer> resultBuffer = reinterpret_cast<id<MTLBuffer>>(result_ptr);
  TORCH_CHECK(repeatBuffer && cumsumBuffer && resultBuffer);

  std::string scalar_type;
  if (repeat.scalar_type() == kInt) {
  if constexpr (std::is_same_v<index_t, int32_t>) {
    scalar_type = "int32_t";
  } else if (repeat.scalar_type() == kLong) {
  } else if constexpr (std::is_same_v<index_t, int64_t>) {
    scalar_type = "int64_t";
  } else {
    TORCH_CHECK(false, "repeats has to be Long or Int tensor");
    TORCH_CHECK(false, "repeat_interleave: unsupported indexing data type");
  }
  if (repeat.size(0) == 0) {
    return at::empty_like(repeat, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
  }
  Tensor repeat_ = repeat.contiguous();
  Tensor cumsum = repeat.cumsum(0);
  int64_t total = 0;
  if (output_size.has_value()) {
    total = output_size.value();
  } else {
    total = cumsum[-1].item<int64_t>();
    TORCH_CHECK((repeat >= 0).all().item<uint8_t>(), "repeats can not be negative");
  }

  auto result = at::empty({total}, repeat.options());

  MPSStream* mpsStream = getCurrentMPSStream();
  dispatch_sync(mpsStream->queue(), ^() {
@ -126,13 +121,20 @@ Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output
      getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false);

      [computeEncoder setComputePipelineState:pipelineState];
      mps::mtl_setArgs(computeEncoder, repeat_, cumsum, result, repeat.size(0));
      mps::mtl_dispatch1DJob(computeEncoder, pipelineState, repeat.size(0));
      mps::mtl_setArgs(computeEncoder, repeatBuffer, cumsumBuffer, resultBuffer, size);
      mps::mtl_dispatch1DJob(computeEncoder, pipelineState, size);

      getMPSProfiler().endProfileKernel(pipelineState);
    }
  });
  return result;
}

Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
  Tensor output;
  AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() {
    output = repeat_interleave_common<index_t, computeRepeatIndices<index_t>>(repeat, output_size);
  });
  return output;
}

} // namespace at::native
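For orientation, the operator's Python-visible semantics; supplying `output_size` (equal to `repeats.sum()`) lets the backend skip the synchronizing `cumsum[-1].item()` readback used to size the result:

```python
import torch

x = torch.tensor([10, 20, 30])
repeats = torch.tensor([2, 0, 3])

print(torch.repeat_interleave(x, repeats))
# tensor([10, 10, 30, 30, 30])

# output_size = repeats.sum() = 5 avoids sizing the result on the host.
print(torch.repeat_interleave(x, repeats, output_size=5))
```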

@ -4225,7 +4225,7 @@
    MTIA: mm_out_mtia
    MPS: mm_out_mps
    XPU: mm_out_xpu
    SparseCPU, SparseCUDA, SparseMPS: _sparse_mm_out
    SparseCPU, SparseCUDA: _sparse_mm_out
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out

- func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor

@ -61,7 +61,6 @@ list(APPEND ATen_CUDA_TEST_SRCS
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_math_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cub_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cublas_handle_pool_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_device_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_dlconvertor_test.cpp
@ -1,77 +0,0 @@
#include <gtest/gtest.h>

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAGuard.h>

#include <atomic>
#include <chrono>
#include <thread>
#include <vector>

// Test concurrent access to getCurrentCUDABlasHandle and getCUDABlasLtWorkspace
// to verify that the data race fix is working correctly

TEST(CUDABlasHandlePoolTest, ConcurrentGetAndClearWorkspaces) {
  if (!at::cuda::is_available()) {
    return;
  }

  constexpr int num_accessor_threads = 15;
  constexpr int num_clear_threads = 5;
  constexpr int iterations_per_thread = 50;

  std::atomic<bool> stop{false};
  std::atomic<int> error_count{0};
  std::vector<std::thread> threads;
  threads.reserve(num_accessor_threads + num_clear_threads);

  // Launch accessor threads
  for (int i = 0; i < num_accessor_threads; ++i) {
    threads.emplace_back([&stop, &error_count]() {
      try {
        at::cuda::CUDAGuard device_guard(0);

        while (!stop.load(std::memory_order_relaxed)) {
          const auto handle = at::cuda::getCurrentCUDABlasHandle();
          const auto workspace = at::cuda::getCUDABlasLtWorkspace();

          if (handle == nullptr || workspace == nullptr) {
            error_count++;
          }
        }
      } catch (const std::exception& e) {
        error_count++;
      }
    });
  }

  // Launch threads that clear workspaces
  for (int i = 0; i < num_clear_threads; ++i) {
    threads.emplace_back([&error_count]() {
      try {
        for (int j = 0; j < iterations_per_thread; ++j) {
          at::cuda::clearCublasWorkspaces();
          std::this_thread::yield();
        }
      } catch (const std::exception& e) {
        error_count++;
      }
    });
  }

  // Let them run for a bit
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
  stop.store(true, std::memory_order_relaxed);

  for (auto& thread : threads) {
    thread.join();
  }

  EXPECT_EQ(error_count.load(), 0);
}

int main(int argc, char* argv[]) {
  ::testing::InitGoogleTest(&argc, argv);
  c10::cuda::CUDACachingAllocator::init(1);
  return RUN_ALL_TESTS();
}

@ -1,122 +0,0 @@
#include <c10/core/Scalar.h>
#include <c10/core/ScalarType.h>
#include <c10/util/Exception.h>
#include <c10/util/SmallVector.h>
#include <c10/util/typeid.h>
#include <cstdint>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/BlasBackend.h>
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/OpMathType.h>
#include <ATen/TensorUtils.h>
#include <ATen/core/NamedTensor.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/GroupedMMUtils.h>
#include <ATen/native/Resize.h>
#include <c10/util/MaybeOwned.h>

#include <ATen/ceil_div.h>
#include <ATen/xpu/XPUScaledBlas.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_addmm_activation_native.h>
#include <ATen/ops/_efficientzerotensor.h>
#include <ATen/ops/_scaled_mm_native.h>
#include <ATen/ops/_unsafe_view_native.h>
#include <ATen/ops/abs.h>
#include <ATen/ops/addmm_native.h>
#include <ATen/ops/addmv_native.h>
#include <ATen/ops/baddbmm_native.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/copy_native.h>
#include <ATen/ops/dot_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_strided.h>
#include <ATen/ops/gelu.h>
#include <ATen/ops/max.h>
#include <ATen/ops/mm_native.h>
#include <ATen/ops/mul.h>
#include <ATen/ops/ones.h>
#include <ATen/ops/relu.h>
#include <ATen/ops/scalar_tensor_native.h>
#include <ATen/ops/vdot_native.h>
#endif

using at::blas::ScalingType;

namespace at::native::onednn::scaled {

/**
 * Both inputs must be fp8,
 * Each needs a single scale, {Tensorwise (float)}
 */
bool check_tensorwise_recipe(
    c10::ScalarType type_a,
    std::vector<ScalingType>& recipe_a,
    ArrayRef<Tensor>& scales_a,
    c10::ScalarType type_b,
    std::vector<ScalingType>& recipe_b,
    ArrayRef<Tensor>& scales_b) {
  // both types must be fp8
  if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
    return false;
  }

  // 1 scale each, {Tensorwise, float}
  if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 ||
      recipe_b.size() != 1) {
    return false;
  }
  // Need {TensorWise, float} for A & B
  if (recipe_a[0] != ScalingType::TensorWise)
    return false;
  if (scales_a[0].scalar_type() != ScalarType::Float)
    return false;
  if (recipe_b[0] != ScalingType::TensorWise)
    return false;
  if (scales_b[0].scalar_type() != ScalarType::Float)
    return false;

  return true;
}

/**
 * Both inputs must be fp8,
 * Each needs scales, {Rowwise (float)}
 */
bool check_rowwise_recipe(
    c10::ScalarType type_a,
    std::vector<ScalingType>& recipe_a,
    ArrayRef<Tensor>& scales_a,
    c10::ScalarType type_b,
    std::vector<ScalingType>& recipe_b,
    ArrayRef<Tensor>& scales_b) {
  // both types must be fp8
  if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
    return false;
  }

  // 1 scale each, {RowWise, float}
  if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 ||
      recipe_b.size() != 1) {
    return false;
  }

  // Need {RowWise, fp32} for A & B
  if (recipe_a[0] != ScalingType::RowWise)
    return false;
  if (scales_a[0].scalar_type() != ScalarType::Float)
    return false;
  if (recipe_b[0] != ScalingType::RowWise)
    return false;
  if (scales_b[0].scalar_type() != ScalarType::Float)
    return false;

  return true;
}

} // namespace at::native::onednn::scaled

@ -1,95 +0,0 @@
#include <c10/core/Scalar.h>
#include <c10/core/ScalarType.h>
#include <c10/util/Exception.h>
#include <c10/util/SmallVector.h>
#include <c10/util/typeid.h>
#include <cstdint>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/Dispatch.h>
#include <ATen/ExpandUtils.h>
#include <ATen/OpMathType.h>
#include <ATen/TensorUtils.h>
#include <ATen/core/NamedTensor.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/Resize.h>
#include <c10/util/MaybeOwned.h>

#include <ATen/BlasBackend.h>
#include <ATen/ceil_div.h>

#ifdef USE_FBGEMM_GENAI
#include <fbgemm_gpu/torch_ops.h>
#endif

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_addmm_activation_native.h>
#include <ATen/ops/_efficientzerotensor.h>
#include <ATen/ops/_scaled_mm_native.h>
#include <ATen/ops/_unsafe_view_native.h>
#include <ATen/ops/abs.h>
#include <ATen/ops/addmm_native.h>
#include <ATen/ops/addmv_native.h>
#include <ATen/ops/baddbmm_native.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/copy_native.h>
#include <ATen/ops/dot_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_strided.h>
#include <ATen/ops/gelu.h>
#include <ATen/ops/max.h>
#include <ATen/ops/mm_native.h>
#include <ATen/ops/mul.h>
#include <ATen/ops/ones.h>
#include <ATen/ops/relu.h>
#include <ATen/ops/scalar_tensor_native.h>
#include <ATen/ops/vdot_native.h>
#endif

using at::blas::ScalingType;

namespace at::native::onednn::scaled {

/**
 * Track concrete implementations available
 */
enum class ScaledGemmImplementation {
  NONE = 0,
  TENSORWISE_TENSORWISE = 1,
  ROWWISE_ROWWISE = 2,
};

/**
 * Convert passed int (enum) from python back into a
 * strictly-typed enum
 */
template <class EnumType, class ArrayType>
std::vector<EnumType> convert_int_to_enum(ArrayType& v) {
  std::vector<EnumType> converted;
  converted.reserve(v.size());

  for (auto vi : v) {
    converted.push_back(static_cast<EnumType>(vi));
  }
  return converted;
}

bool check_tensorwise_recipe(
    c10::ScalarType,
    std::vector<ScalingType>&,
    ArrayRef<Tensor>&,
    c10::ScalarType,
    std::vector<ScalingType>&,
    ArrayRef<Tensor>&);

bool check_rowwise_recipe(
    c10::ScalarType,
    std::vector<ScalingType>&,
    ArrayRef<Tensor>&,
    c10::ScalarType,
    std::vector<ScalingType>&,
    ArrayRef<Tensor>&);

} // namespace at::native::onednn::scaled

@ -10,13 +10,6 @@
   ...
}

{
   ignore_empty_generic_uninitialised_conditional_jump
   Memcheck:Cond
   fun:_ZN2at6detail13empty_genericEN3c108ArrayRefIlEEPNS1_9AllocatorENS1_14DispatchKeySetENS1_10ScalarTypeESt8optionalINS1_12MemoryFormatEE
   ...
}

{
   Cond_cuda
   Memcheck:Cond
@ -9,61 +9,28 @@ def check_perf_csv(filename, threshold, threshold_scale):
    """
    Basic performance checking.
    """
    try:
        df = pd.read_csv(filename)
    except FileNotFoundError:
        print(f"Error: File {filename} not found")
        sys.exit(1)

    effective_threshold = threshold * threshold_scale
    print(f"Checking {filename} (speedup threshold >= {effective_threshold:.2f}x)\n")
    df = pd.read_csv(filename)

    failed = []
    for _, row in df.iterrows():
        model_name = row["name"]
        speedup = float(row["speedup"])
        abs_latency = float(row["abs_latency"])
        compilation_latency = float(row["compilation_latency"])
        compression_ratio = float(row["compression_ratio"])
        eager_peak_mem = float(row["eager_peak_mem"])
        dynamo_peak_mem = float(row["dynamo_peak_mem"])
        speedup = row["speedup"]
        if speedup < threshold * threshold_scale:
            failed.append(model_name)

        perf_summary = f"{model_name:34} speedup={speedup:.3f}x"
        if pd.notna(abs_latency):
            perf_summary += f", latency={abs_latency:.1f} ms/iter"
        if pd.notna(compilation_latency):
            perf_summary += f", compile={compilation_latency:.3f}s"
        if pd.notna(compression_ratio):
            perf_summary += f", mem_ratio={1 / compression_ratio:.2f}x"
        if pd.notna(eager_peak_mem) and pd.notna(dynamo_peak_mem):
            perf_summary += (
                f" (eager={eager_peak_mem:.1f} GB, dynamo={dynamo_peak_mem:.1f} GB)"
            )

        if speedup < effective_threshold:
            failed.append((model_name, speedup))

        print(perf_summary)
        print(f"{model_name:34} {speedup}")

    if failed:
        print(
            textwrap.dedent(
                f"""
                Error {len(failed)} model(s) performance regressed
                    {" ".join([name for name, _ in failed])}
                Error {len(failed)} models performance regressed
                    {" ".join(failed)}
                """
            )
        )
        for name, sp in sorted(failed, key=lambda x: x[1]):
            pct_from_target = (sp / effective_threshold - 1.0) * 100.0
            print(
                f" - {name}: {sp:.3f}x (< {effective_threshold:.2f}x; {pct_from_target:.1f}% from target)"
            )
        sys.exit(1)
    else:
        print(
            f"\nAll {len(df)} model(s) passed threshold check (>= {effective_threshold:.2f}x)"
        )


if __name__ == "__main__":
@ -77,7 +44,7 @@ if __name__ == "__main__":
        "-s",
        type=float,
        default=1.0,
        help="multiply threshold by this value to relax the check",
        help="multiple threshold by this value to relax the check",
    )
    args = parser.parse_args()
    check_perf_csv(args.file, args.threshold, args.threshold_scale)
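A hedged usage sketch for the checker above: write a one-row CSV containing exactly the columns the loop reads, then run the check. Everything besides those column names is illustrative:

```python
import pandas as pd

pd.DataFrame([{
    "name": "resnet50",
    "speedup": 1.25,
    "abs_latency": 42.0,
    "compilation_latency": 3.1,
    "compression_ratio": 0.9,
    "eager_peak_mem": 4.2,
    "dynamo_peak_mem": 3.8,
}]).to_csv("perf.csv", index=False)

# Passes: 1.25 >= 1.1 * 1.0
check_perf_csv("perf.csv", threshold=1.1, threshold_scale=1.0)
```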

@ -2379,9 +2379,7 @@ class BenchmarkRunner:
            print(
                f"Load model outputs from {self.args.compare_model_outputs_with} to compare"
            )
            saved_result = torch.load(
                self.args.compare_model_outputs_with, weights_only=False
            )
            saved_result = torch.load(self.args.compare_model_outputs_with)
            is_bitwise_same = bitwise_same(saved_result, new_result)
            if not is_bitwise_same:
                print(
@ -266,11 +266,7 @@ class BenchmarkRunner:
        print(
            f"{mode} Execution Time (us) : {results['reported_run_time_us'][0]:.3f}"
        )
        print(f"Peak Memory (KB) : {results['peak_memory']}")
        # Calculate and print memory bandwidth if operator provides memory traffic
        if results.get('memory_bandwidth_gb_s') is not None:
            print(f"Memory Bandwidth (GB/s) : {results['memory_bandwidth_gb_s']:.2f}")
        print()
        print(f"Peak Memory (KB) : {results['peak_memory']}\n")

    def _perf_result_to_dict(self, results, test_case):
        """This function is the parallel of _print_perf_result, which instead of
@ -715,15 +711,6 @@ class BenchmarkRunner:
        result_dict = dict()
        result_dict["reported_run_time_us"] = [r[0] for r in results]
        result_dict["peak_memory"] = results[0][1]

        # Calculate memory bandwidth if operator provides memory traffic
        memory_traffic_bytes = test_case.op_bench.get_memory_traffic_bytes()
        if memory_traffic_bytes is not None:
            execution_time_s = result_dict["reported_run_time_us"][0] / 1e6
            result_dict["memory_bandwidth_gb_s"] = memory_traffic_bytes / execution_time_s / 1e9
        else:
            result_dict["memory_bandwidth_gb_s"] = None

        self._print_perf_result(results=result_dict, test_case=test_case)

        # output results to csv
@ -118,54 +118,6 @@ class TorchBenchmarkBase(torch.nn.Module):
        name = (self.module_name() + "_" + "_".join(test_name_str)).replace(" ", "")
        return name

    def get_memory_traffic_bytes(self):
        """Return the number of bytes read/written by this operator.

        Override this method in subclasses to enable memory bandwidth calculation.
        The framework will use this value along with execution time to compute
        and report memory bandwidth in GB/s.

        This provides automatic calculation for matmul-like operations by
        inferring dimensions from input tensor shapes:
        - 2D inputs: (M, N) @ (N, K) → matmul, mm
        - 3D inputs: (B, M, N) @ (B, N, K) → bmm, baddbmm

        For custom memory patterns, override this method.

        Returns:
            int or None: Total bytes transferred (reads + writes), or None if not applicable
        """
        if not hasattr(self, 'inputs') or not self.inputs:
            return None

        input_tensors = [v for v in self.inputs.values() if isinstance(v, torch.Tensor)]
        if len(input_tensors) < 2:
            return None

        input_a, input_b = input_tensors[0], input_tensors[1]

        if input_a.dim() != input_b.dim() or input_a.dim() not in (2, 3):
            return None

        bytes_per_element = input_a.element_size()

        if input_a.dim() == 3:
            B_a, M, N_a = input_a.shape
            B_b, N_b, K = input_b.shape
            if B_a != B_b or N_a != N_b:
                return None
            B = B_a
        else:
            M, N_a = input_a.shape
            N_b, K = input_b.shape
            if N_a != N_b:
                return None
            B = 1

        N = N_a
        total_elements = B * (M * N + N * K + M * K)
        return total_elements * bytes_per_element
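A worked instance of the traffic formula above for a square fp32 mm; the 100 µs timing is an assumed figure, present only to make the bandwidth arithmetic concrete:

```python
# (M, N) @ (N, K) in fp32: read A (M*N) and B (N*K), write C (M*K).
M, N, K = 1024, 1024, 1024
bytes_per_element = 4

total_elements = M * N + N * K + M * K              # 3,145,728 elements
traffic_bytes = total_elements * bytes_per_element  # 12,582,912 bytes (~12 MiB)

execution_time_s = 100e-6                           # assume 100 us per iteration
print(traffic_bytes / execution_time_s / 1e9)       # ~125.8 GB/s
```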


class PyTorchOperatorTestCase:
    """This class includes all the information needed to benchmark an operator.
@ -1,79 +0,0 @@
import operator_benchmark as op_bench

import torch
import torch.optim as optim


"""Microbenchmarks for optimizer operators."""


optimizer_list = op_bench.op_list(
    attr_names=["op_name", "op_func"],
    attrs=[
        ["adamw", optim.AdamW],
        ["adam", optim.Adam],
        ["sgd", optim.SGD],
        ["rmsprop", optim.RMSprop],
        ["adagrad", optim.Adagrad],
    ],
)

optimizer_configs_long = op_bench.cross_product_configs(
    num_params=[1, 10, 100],
    param_size=[100000, 1000000, 10000000],
    device=["cuda"],
    tags=["long"],
)


class OptimizerBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, op_func, device, shape=None, num_params=None, param_size=None):
        if shape is not None:
            num_params = num_params if num_params is not None else 1
            self.params = [
                torch.randn(shape, device=device, requires_grad=True)
                for _ in range(num_params)
            ]
            for param in self.params:
                param.grad = torch.randn(shape, device=device)
        else:
            self.params = [
                torch.randn(param_size, device=device, requires_grad=True)
                for _ in range(num_params)
            ]
            for param in self.params:
                param.grad = torch.randn_like(param)

        kwargs = {"momentum": 0.9} if op_func == optim.SGD else {}
        self.optimizer = op_func(self.params, lr=0.001, **kwargs)

        # Memory traffic calculation for bandwidth
        self.total_elements = sum(p.numel() for p in self.params)
        self.bytes_per_element = self.params[0].element_size()
        # SGD w/ momentum: read(param, grad, momentum) + write(param, momentum) = 5x
        # Adam/AdamW: read(param, grad, exp_avg, exp_avg_sq) + write(param, exp_avg, exp_avg_sq) = 7x
        # Adagrad/RMSprop: read(param, grad, state) + write(param, state) = 5x
        if op_func in (optim.Adam, optim.AdamW):
            self.memory_multiplier = 7
        else:
            self.memory_multiplier = 5

        self.inputs = {"dummy": self.params[0]}

    def forward(self, dummy):
        self.optimizer.step()
        for param in self.params:
            param.grad = torch.randn_like(param)
        return self.params[0]

    def get_memory_traffic_bytes(self):
        return self.total_elements * self.bytes_per_element * self.memory_multiplier


op_bench.generate_pt_tests_from_op_list(
    optimizer_list, optimizer_configs_long, OptimizerBenchmark
)


if __name__ == "__main__":
    op_bench.benchmark_runner.main()
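A worked instance of the multipliers in the comments above; the 1 ms step time is an assumed figure for the arithmetic only:

```python
# Adam on 10 params x 1e6 fp32 elements each: every step reads param/grad/
# exp_avg/exp_avg_sq and writes param/exp_avg/exp_avg_sq, i.e. 7x the bytes.
total_elements = 10 * 1_000_000
bytes_per_element = 4
memory_multiplier = 7

bytes_per_step = total_elements * bytes_per_element * memory_multiplier  # 280,000,000
step_time_s = 1e-3                         # assume a 1 ms optimizer step
print(bytes_per_step / step_time_s / 1e9)  # 280.0 GB/s
```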

@ -44,7 +44,7 @@ struct C10_API SafePyObject {
      (*other.pyinterpreter_)->incref(other.data_);
    }
    if (data_ != nullptr) {
      (*pyinterpreter_)->decref(data_);
      (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
    }
    data_ = other.data_;
    pyinterpreter_ = other.pyinterpreter_;
@ -53,7 +53,7 @@ struct C10_API SafePyObject {

  ~SafePyObject() {
    if (data_ != nullptr) {
      (*pyinterpreter_)->decref(data_);
      (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
    }
  }

@ -92,6 +92,13 @@ inline bool isComplexType(ScalarType t) {
      t == ScalarType::ComplexDouble);
}

inline bool isQIntType(ScalarType t) {
  // Don't forget to extend this when adding new QInt types
  return t == ScalarType::QInt8 || t == ScalarType::QUInt8 ||
      t == ScalarType::QInt32 || t == ScalarType::QUInt4x2 ||
      t == ScalarType::QUInt2x4;
}

inline bool isBitsType(ScalarType t) {
  return t == ScalarType::Bits1x8 || t == ScalarType::Bits2x4 ||
      t == ScalarType::Bits4x2 || t == ScalarType::Bits8 ||
@ -48,30 +48,6 @@ void warnDeprecatedDataPtr() {
  TORCH_CHECK(false, "Cannot access data pointer of Storage that is invalid.");
}

void StorageImpl::incref_pyobject() const {
  // Because intrusive_ptr incref uses relaxed memory order, we need to
  // do an acquire fence to ensure that the kHasPyObject bit was
  // observed before the load of the PyObject* below.
  // NB: This is a no-op on x86/x86-64
  std::atomic_thread_fence(std::memory_order_acquire);

  PyObject* obj = pyobj_slot_.load_pyobj();
  (*pyobj_slot_.pyobj_interpreter())->incref(obj);
}

void StorageImpl::decref_pyobject() const {
  PyObject* obj = pyobj_slot_.load_pyobj();
  (*pyobj_slot_.pyobj_interpreter())->decref(obj);
}

bool StorageImpl::try_incref_pyobject() const {
  c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
  if (C10_UNLIKELY(!interp)) {
    return false;
  }
  return (*interp)->try_incref(pyobj_slot_);
}

void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr) {
  // Allowlist verification.
  // Only if the devicetype is in the allowlist,
@ -105,12 +105,6 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target {
    data_ptr_.clear();
  }

  void incref_pyobject() const override final;

  void decref_pyobject() const override final;

  bool try_incref_pyobject() const override final;

  size_t nbytes() const {
    // OK to do this instead of maybe_as_int as nbytes is guaranteed positive
    TORCH_CHECK(!size_bytes_is_heap_allocated_);
@ -376,18 +370,4 @@ C10_API c10::intrusive_ptr<c10::StorageImpl> make_storage_impl(
    bool resizable,
    std::optional<at::Device> device_opt);

namespace detail {

#ifndef C10_MOBILE
template <class T>
struct TargetTraits<
    T,
    std::enable_if_t<
        std::is_base_of_v<c10::StorageImpl, std::remove_cv_t<T>>>> {
  static constexpr bool can_have_pyobject = true;
};
#endif

} // namespace detail

} // namespace c10

@ -277,6 +277,7 @@ void TensorImpl::release_resources() {
  if (storage_) {
    storage_ = {};
  }
  pyobj_slot_.maybe_destroy_pyobj();
}

#ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY
@ -988,30 +989,6 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) {
  }
}

void TensorImpl::incref_pyobject() const {
  // Because intrusive_ptr incref uses relaxed memory order, we need to
  // do an acquire fence to ensure that the kHasPyObject bit was
  // observed before the load of the PyObject* below.
  // NB: This is a no-op on x86/x86-64
  std::atomic_thread_fence(std::memory_order_acquire);

  PyObject* obj = pyobj_slot_.load_pyobj();
  (*pyobj_slot_.pyobj_interpreter())->incref(obj);
}

void TensorImpl::decref_pyobject() const {
  PyObject* obj = pyobj_slot_.load_pyobj();
  (*pyobj_slot_.pyobj_interpreter())->decref(obj);
}

bool TensorImpl::try_incref_pyobject() const {
  c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
  if (C10_UNLIKELY(!interp)) {
    return false;
  }
  return (*interp)->try_incref(pyobj_slot_);
}

namespace impl {

namespace {

|
||||
return &pyobj_slot_;
|
||||
}
|
||||
|
||||
void incref_pyobject() const override final;
|
||||
|
||||
void decref_pyobject() const override final;
|
||||
|
||||
bool try_incref_pyobject() const override final;
|
||||
|
||||
private:
|
||||
// See NOTE [std::optional operator usage in CUDA]
|
||||
// We probably don't want to expose this publicly until
|
||||
@ -3085,19 +3079,6 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
|
||||
friend class C10_TensorImpl_Size_Check_Dummy_Class;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
#ifndef C10_MOBILE
|
||||
template <class T>
|
||||
struct TargetTraits<
|
||||
T,
|
||||
std::enable_if_t<std::is_base_of_v<c10::TensorImpl, std::remove_cv_t<T>>>> {
|
||||
static constexpr bool can_have_pyobject = true;
|
||||
};
|
||||
#endif
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Note [TensorImpl size constraints]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// Changed the size of TensorImpl? If the size went down, good for
|
||||
|
||||
@ -11,11 +11,8 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {

  void incref(PyObject* pyobj) const override {} // do nothing

  void decref(PyObject* pyobj) const override {} // do nothing

  bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const override {
    return false;
  }
  void decref(PyObject* pyobj, bool has_pyobj_slot) const override {
  } // do nothing

#define PANIC(m) \
  TORCH_INTERNAL_ASSERT( \
@ -23,10 +20,6 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
      "attempted to call " #m \
      " on a Tensor with nontrivial PyObject after corresponding interpreter died")

  size_t refcnt(PyObject* pyobj) const override {
    PANIC(refcnt);
  }

  c10::intrusive_ptr<TensorImpl> detach(const TensorImpl* self) const override {
    PANIC(detach);
  }

@ -18,9 +18,6 @@ namespace c10 {
struct IValue;
class OperatorHandle;
struct TensorImpl;
namespace impl {
struct PyObjectSlot;
} // namespace impl
} // namespace c10

namespace torch::jit {
@ -129,12 +126,9 @@ struct C10_API PyInterpreterVTable {

  // Run Py_INCREF on a PyObject.
  virtual void incref(PyObject* pyobj) const = 0;
  // Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call.
  virtual void decref(PyObject* pyobj) const = 0;
  // Run PyUnstable_TryIncRef on a PyObject if it's not NULL.
  virtual bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const = 0;
  // Run Py_REFCNT on a PyObject.
  virtual size_t refcnt(PyObject* pyobj) const = 0;
  // Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call
  // See NOTE [PyInterpreter::decref takes a `has_pyobj_slot` arg]
  virtual void decref(PyObject* pyobj, bool has_pyobj_slot) const = 0;

  // Perform a detach by deferring to the __torch_dispatch__ implementation of
  // detach, which will also arrange for the PyObject to get copied in this
c10/core/impl/PyObjectSlot.cpp (new file, 56 lines)
@ -0,0 +1,56 @@
#include <c10/core/impl/PyObjectSlot.h>

namespace c10::impl {

PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}

PyObjectSlot::~PyObjectSlot() {
  maybe_destroy_pyobj();
}

void PyObjectSlot::maybe_destroy_pyobj() {
  if (owns_pyobj()) {
    TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
    TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
    (*pyobj_interpreter_.load(std::memory_order_acquire))
        ->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
    // NB: this destructor can only be entered when there are no
    // references to this C++ object (obviously), NOR any references
    // to the PyObject (if there are references to the PyObject,
    // then the PyObject holds an owning reference to the tensor).
    // So it is OK to clear pyobj_ here as it is impossible for it to
    // be used again (modulo weak reference races)
    pyobj_ = nullptr; // for safety
  }
}

PyInterpreter* PyObjectSlot::pyobj_interpreter() {
  return pyobj_interpreter_.load(std::memory_order_acquire);
}

PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
  // NOLINTNEXTLINE(performance-no-int-to-ptr)
  return reinterpret_cast<PyObject*>(
      reinterpret_cast<uintptr_t>(pyobj_) & ~0x1ULL);
}

PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
  auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
  if (interpreter) {
    return *interpreter;
  }
  TORCH_CHECK(false, "cannot access PyObject for Tensor - no interpreter set");
}

bool PyObjectSlot::owns_pyobj() {
  // NOLINTNEXTLINE(performance-no-int-to-ptr)
  return reinterpret_cast<uintptr_t>(pyobj_) & 1;
}

void PyObjectSlot::set_owns_pyobj(bool b) {
  // NOLINTNEXTLINE(performance-no-int-to-ptr)
  pyobj_ = reinterpret_cast<PyObject*>(
      reinterpret_cast<uintptr_t>(_unchecked_untagged_pyobj()) | b);
}

} // namespace c10::impl
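An arithmetic paraphrase of the tag bit managed above: bit 0 of the stored pointer value records whether the C++ object owns the PyObject, and the real pointer is recovered by masking that bit off (valid because the pointee is at least 2-byte aligned, so bit 0 of a genuine address is always 0):

```python
# Stand-in address for a PyObject* value (any even integer works).
pyobj_addr = 0x7F3A_2C00_10A0

tagged = pyobj_addr | 1          # set_owns_pyobj(True)
owns_pyobj = bool(tagged & 1)    # owns_pyobj()
untagged = tagged & ~0x1         # _unchecked_untagged_pyobj()

assert owns_pyobj and untagged == pyobj_addr
```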

@ -8,58 +8,117 @@

#include <atomic>

namespace torch::utils {
class PyObjectPreservation;
}

namespace c10::impl {

struct C10_API PyObjectSlot {
 public:
  PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
  PyObjectSlot();

  ~PyObjectSlot();

  void maybe_destroy_pyobj();

  // Associate the TensorImpl with the specified PyObject, and, if necessary,
  // also tag the interpreter.
  //
  // NB: This lives in a header so that we can inline away the switch on status
  //
  // NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
  // PyObject if necessary!
  void init_pyobj(PyObject* pyobj) {
    pyobj_interpreter_.store(
        getGlobalPyInterpreter(), std::memory_order_relaxed);
    pyobj_ = pyobj;
  }

  // Query the PyObject interpreter. This may return null if there is no
  // interpreter.
  PyInterpreter* pyobj_interpreter() const {
    return pyobj_interpreter_.load(std::memory_order_acquire);
  // interpreter. This is racy!
  PyInterpreter* pyobj_interpreter();

  PyObject* _unchecked_untagged_pyobj() const;

  // Test the interpreter tag. If tagged for the current interpreter, return
  // a non-nullopt (but possibly null) PyObject. If (possibly) untagged,
  // returns a nullopt. If it is definitely invalid, raises an error.
  //
  // If `ignore_hermetic_tls` is false and this function is called from a
  // hermetic context (ie, `HermeticPyObjectTLS::get_state()` is true), then
  // nullopt is returned. If `ignore_hermetic_tls` is true, then the hermetic
  // context is ignored, allowing you to check the interpreter tag of a
  // nonhermetic PyObject from within a hermetic context. This is necessary
  // because there are some cases where the deallocator function of a
  // nonhermetic PyObject is called from within a hermetic context, so it must
  // be properly treated as a nonhermetic PyObject.
  //
  // NB: this lives in header so that we can avoid actually creating the
  // std::optional

  // @todo alban: I'm not too sure what's going on here, we can probably delete
  // it but it's worthwhile making sure
  std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
    impl::PyInterpreter* interpreter =
        pyobj_interpreter_.load(std::memory_order_acquire);
    if (interpreter == nullptr) {
      return std::nullopt;
    }

    if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
      return std::nullopt;
    } else {
      return _unchecked_untagged_pyobj();
    }
  }

  PyInterpreter& load_pyobj_interpreter() const {
    auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
    TORCH_INTERNAL_ASSERT(
        interpreter, "cannot access PyObject for Tensor - no interpreter set");
    return *interpreter;
  }
  PyInterpreter& load_pyobj_interpreter() const;

  PyObject* load_pyobj() const {
    return pyobj_.load(std::memory_order_acquire);
  }
  bool owns_pyobj();

  void store_pyobj(PyObject* obj) {
    pyobj_.store(obj, std::memory_order_release);
  }

  bool has_unique_reference() const {
    PyObject* pyobj = load_pyobj();
    return pyobj != nullptr && load_pyobj_interpreter()->refcnt(pyobj) == 1;
  }

  void clear() {
    pyobj_.store(nullptr, std::memory_order_relaxed);
    pyobj_interpreter_.store(nullptr, std::memory_order_relaxed);
  }
  void set_owns_pyobj(bool b);

 private:
  // This is now always the global interpreter if the PyObject is set.
  // Maybe we can remove this field some day...
  // This field contains the interpreter tag for this object. See
  // Note [Python interpreter tag] for general context
  //
  // Note [Memory ordering on Python interpreter tag]
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // What memory_order do we need when accessing this atomic? We don't
  // need a single total modification order (as provided by
  // memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
  // transition from -1 to some positive integer and never changes afterwards.
  // Because there is only one modification, it trivially already has a total
  // modification order (e.g., we don't need fences or locked instructions on
  // x86)
  //
  // In fact, one could make a reasonable argument that relaxed reads are OK,
  // due to the presence of external locking (GIL) to ensure that interactions
  // with other data structures are still correctly synchronized, so that
  // we fall in the "Single-Location Data Structures" case as described in
  // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
  // However, on x86, it doesn't matter if I use acquire or relaxed on the load
  // as I get the same assembly in both cases. So I just use the more
  // conservative acquire (which will impede compiler optimizations but I don't
  // care)
  std::atomic<PyInterpreter*> pyobj_interpreter_;

  // The PyObject representing this Tensor or nullptr. Ownership is managed
  // by intrusive_ptr. By the time the PyObjectSlot is destroyed, this
  // reference is already dead.
  std::atomic<PyObject*> pyobj_;

  friend class torch::utils::PyObjectPreservation;
  // This field contains a reference to a PyObject representing this Tensor.
  // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
  // PyObject for it and set this field. This field does not have to be
  // protected by an atomic as it is only allowed to be accessed when you hold
  // the GIL, or during destruction of the tensor.
  //
  // When a PyObject dies, you are obligated to clear this field
  // (otherwise, you will try to use-after-free the pyobj); this currently
  // occurs in THPVariable_clear in torch/csrc/autograd/python_variable.cpp
  //
  // NB: Ordinarily, this should not be a strong reference, as if the
  // PyObject owns the Tensor, this would create a reference cycle.
  // However, sometimes this ownership flips. To track who owns
  // who, this has a single pointer tag indicating whether or not the
  // C++ object owns the PyObject (the common case, zero, means PyObject
  // owns the C++ object); see _unchecked_untagged_pyobj for raw access
  // or check_pyobj for checked access. See references to PyObject
  // resurrection in torch/csrc/autograd/python_variable.cpp
  PyObject* pyobj_;
};

} // namespace c10::impl

@ -20,22 +20,6 @@
  } \
} while (0)

#define C10_CUDA_DRIVER_CHECK_GOTO(EXPR, NEXT)                             \
  do {                                                                     \
    CUresult __err = EXPR;                                                 \
    if (__err != CUDA_SUCCESS) {                                           \
      const char* err_str;                                                 \
      CUresult get_error_str_err [[maybe_unused]] =                        \
          c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
      if (get_error_str_err != CUDA_SUCCESS) {                             \
        TORCH_WARN("CUDA driver error: unknown error");                    \
      } else {                                                             \
        TORCH_WARN("CUDA driver error: ", err_str);                        \
      }                                                                    \
      goto NEXT;                                                           \
    }                                                                      \
  } while (0)

// The integer in the second column specifies the requested CUDA Driver API
// version. The dynamic loader will accept a driver with a newer version, but it
// ensures that the requested symbol exists in *at least* the specified version
@ -1,4 +1,5 @@
#include <c10/util/Exception.h>
#include <c10/util/FileSystem.h>
#include <c10/util/Logging.h>
#include <c10/util/Type.h>

@ -27,7 +28,7 @@ Error::Error(
    const void* caller)
    : Error(
          str("[enforce fail at ",
              detail::StripBasename(file),
              c10::filesystem::path(file).filename(),
              ":",
              line,
              "] ",
@ -1,4 +1,5 @@
#include <c10/util/Backtrace.h>
#include <c10/util/FileSystem.h>
#include <c10/util/Flags.h>
#include <c10/util/Lazy.h>
#include <c10/util/Logging.h>
@ -478,8 +479,7 @@ MessageLogger::MessageLogger(
      << std::setfill('0') << ' ' << std::setw(2) << timeinfo->tm_hour
      << ':' << std::setw(2) << timeinfo->tm_min << ':' << std::setw(2)
      << timeinfo->tm_sec << '.' << std::setw(9) << ns << ' '
      << c10::detail::StripBasename(std::string(file)) << ':' << line
      << "] ";
      << c10::filesystem::path(file).filename() << ':' << line << "] ";
}

// Output the contents of the stream to the proper channel on destruction.
@@ -12,10 +12,6 @@ template <typename, typename...>
class class_;
}

namespace torch::utils {
class PyObjectPreservation;
}

namespace c10 {
class intrusive_ptr_target;
namespace raw {
@@ -37,8 +33,6 @@ constexpr uint64_t kImpracticallyHugeWeakReferenceCount =
constexpr uint64_t kReferenceCountOne = 1;
constexpr uint64_t kWeakReferenceCountOne = (kReferenceCountOne << 32);
constexpr uint64_t kUniqueRef = (kReferenceCountOne | kWeakReferenceCountOne);
// Indicates whether the object has a PyObject wrapper.
constexpr uint64_t kHasPyObject = (uint64_t(1) << 63);

template <class TTarget>
struct intrusive_target_default_null_type final {
@@ -61,11 +55,7 @@ inline uint32_t refcount(uint64_t combined_refcount) {
}

inline uint32_t weakcount(uint64_t combined_refcount) {
  return static_cast<uint32_t>((combined_refcount & ~kHasPyObject) >> 32);
}

inline bool has_pyobject(uint64_t combined_refcount) {
  return (combined_refcount & kHasPyObject) != 0;
  return static_cast<uint32_t>(combined_refcount >> 32);
}

// The only requirement for refcount increment is that it happens-before
@@ -76,6 +66,12 @@ inline uint64_t atomic_combined_refcount_increment(
  return combined_refcount.fetch_add(inc, std::memory_order_relaxed) + inc;
}

inline uint32_t atomic_refcount_increment(
    std::atomic<uint64_t>& combined_refcount) {
  return detail::refcount(atomic_combined_refcount_increment(
      combined_refcount, kReferenceCountOne));
}

inline uint32_t atomic_weakcount_increment(
    std::atomic<uint64_t>& combined_refcount) {
  return detail::weakcount(atomic_combined_refcount_increment(
@@ -103,11 +99,6 @@ inline uint32_t atomic_weakcount_decrement(
      combined_refcount, kWeakReferenceCountOne));
}

template <class T, class = void>
struct TargetTraits {
  static constexpr bool can_have_pyobject = false;
};

} // namespace detail
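These constants pack both counts and the wrapper flag into one 64-bit word: the low 32 bits hold the strong refcount, bits 32-62 the weak refcount, and bit 63 (`kHasPyObject`) the wrapper flag, so one `fetch_add` can bump either count. A self-contained sketch of the arithmetic, mirroring the helpers above but independent of c10:

```cpp
#include <cassert>
#include <cstdint>

// Minimal standalone sketch of the combined_refcount_ packing:
// low 32 bits = strong refcount, bits 32..62 = weak refcount,
// bit 63 = "has a PyObject wrapper" flag.
constexpr uint64_t kRefOne = 1;
constexpr uint64_t kWeakOne = kRefOne << 32;
constexpr uint64_t kHasPy = uint64_t(1) << 63;

int main() {
  uint64_t combined = kRefOne | kWeakOne | kHasPy;  // refcount 1, weakcount 1, flag set
  assert(static_cast<uint32_t>(combined) == 1);                    // refcount()
  assert(static_cast<uint32_t>((combined & ~kHasPy) >> 32) == 1);  // weakcount()
  assert((combined & kHasPy) != 0);                                // has_pyobject()
  combined += kRefOne;  // a single add bumps only the strong count
  assert(static_cast<uint32_t>(combined) == 2);
}
```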
/**
@@ -164,23 +155,6 @@ class C10_API intrusive_ptr_target {
  // we can atomically operate on both at the same time for performance
  // and defined behaviors.
  //
  // Note [PyObject preservation for Tensor and Storages]
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // intrusive_ptr has special support for preserving PyObject wrappers
  // for TensorImpl and StorageImpl. The most significant bit (kHasPyObject) of
  // the combined_refcount_ is used to indicate whether the object has a
  // PyObject wrapper.
  //
  // - The PyObject, if it exists, holds a strong reference to the
  //   intrusive_ptr_target.
  //
  // - When the refcount goes from 1 to 2, we incref the PyObject.
  //
  // - When the refcount goes from 2 to 1, we decref the PyObject.
  //
  // In other words, the intrusive_ptr keeps the PyObject alive as long as there
  // are other C++ references to the intrusive_ptr_target.

  mutable std::atomic<uint64_t> combined_refcount_;
  static_assert(sizeof(std::atomic<uint64_t>) == 8);
  static_assert(alignof(std::atomic<uint64_t>) == 8);
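A compact restatement of the preservation rule from the note above, as a standalone sketch with a hypothetical hook interface (the real code keys these calls off `detail::has_pyobject` and the refcount value returned by the atomic update, not a free function):

```cpp
#include <cstdint>

// Hypothetical interface, for exposition only; not the real intrusive_ptr_target.
struct HasPyObjectHooks {
  virtual void incref_pyobject() const {}
  virtual void decref_pyobject() const {}
  virtual ~HasPyObjectHooks() = default;
};

void on_refcount_change(
    uint32_t old_refcount,
    uint32_t new_refcount,
    bool has_pyobject,
    const HasPyObjectHooks& target) {
  if (!has_pyobject) {
    return;
  }
  if (old_refcount == 1 && new_refcount == 2) {
    target.incref_pyobject();  // a second C++ reference appeared: pin the wrapper
  } else if (old_refcount == 2 && new_refcount == 1) {
    target.decref_pyobject();  // only the wrapper's own reference remains: unpin
  }
}
```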
@@ -198,8 +172,6 @@ class C10_API intrusive_ptr_target {
  template <typename T>
  friend struct ExclusivelyOwnedTensorTraits;

  friend class torch::utils::PyObjectPreservation;

 protected:
  // protected destructor. We never want to destruct intrusive_ptr_target*
  // directly.
@@ -283,16 +255,6 @@ class C10_API intrusive_ptr_target {
   */
  virtual void release_resources() {}

  /**
   * These two methods are called when the refcount transitions between one
   * and two and the object has a PyObject wrapper.
   */
  virtual void incref_pyobject() const {}
  virtual void decref_pyobject() const {}
  virtual bool try_incref_pyobject() const {
    return false;
  }

  uint32_t refcount(std::memory_order order = std::memory_order_relaxed) const {
    return detail::refcount(combined_refcount_.load(order));
  }
@@ -303,19 +265,6 @@ class C10_API intrusive_ptr_target {
  }
};

namespace detail {

#ifndef C10_MOBILE
template <>
struct TargetTraits<c10::intrusive_ptr_target> {
  // A generic intrusive_ptr<intrusive_ptr_target> may actually be a TensorImpl
  // or StorageImpl, so we have to allow for PyObject support.
  static constexpr bool can_have_pyobject = true;
};
#endif

} // namespace detail

template <class TTarget, class NullType>
class weak_intrusive_ptr;
@@ -365,34 +314,18 @@ class intrusive_ptr final {

  void retain_() {
    if (target_ != NullType::singleton()) {
      uint64_t combined = detail::atomic_combined_refcount_increment(
          target_->combined_refcount_, detail::kReferenceCountOne);
      uint32_t new_refcount = detail::refcount(combined);
      uint32_t new_refcount =
          detail::atomic_refcount_increment(target_->combined_refcount_);
      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
          new_refcount != 1,
          "intrusive_ptr: Cannot increase refcount after it reached zero.");

      if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
        // If the refcount transitioned from 1 to 2, we need to incref the
        // PyObject. In other words, we need to ensure that the PyObject stays
        // alive now that we have a C++ reference to this object in addition to
        // the PyObject itself.
        if (C10_UNLIKELY(
                detail::has_pyobject(combined) &&
                detail::refcount(combined) == 2)) {
          target_->incref_pyobject();
        }
      } else {
        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
            !detail::has_pyobject(combined),
            "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
      }
    }
  }

  void reset_() noexcept {
    if (target_ != NullType::singleton()) {
      if (is_uniquely_owned()) {
      if (target_->combined_refcount_.load(std::memory_order_acquire) ==
          detail::kUniqueRef) {
        // Both counts are 1, so there are no weak references and
        // we are releasing the last strong reference. No other
        // threads can observe the effects of this target_ deletion
@@ -404,10 +337,9 @@ class intrusive_ptr final {

      auto combined_refcount = detail::atomic_combined_refcount_decrement(
          target_->combined_refcount_, detail::kReferenceCountOne);
      uint32_t new_refcount = detail::refcount(combined_refcount);
      bool has_pyobject = detail::has_pyobject(combined_refcount);
      if (new_refcount == 0) {
        bool should_delete = detail::weakcount(combined_refcount) == 1;
      if (detail::refcount(combined_refcount) == 0) {
        bool should_delete =
            (combined_refcount == detail::kWeakReferenceCountOne);
        // See comment above about weakcount. As long as refcount>0,
        // weakcount is one larger than the actual number of weak references.
        // So we need to decrement it here.
@@ -424,18 +356,6 @@ class intrusive_ptr final {
        if (should_delete) {
          delete target_;
        }
      } else if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
        // If the refcount transitioned from 2 to 1, we need to decref the
        // PyObject. In other words, we don't want to keep the PyObject alive if
        // there are no C++ references to this object other than the PyObject
        // itself.
        if (C10_UNLIKELY(has_pyobject && new_refcount == 1)) {
          target_->decref_pyobject();
        }
      } else {
        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
            !has_pyobject,
            "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
      }
    }
  }
@@ -602,16 +522,6 @@ class intrusive_ptr final {
    return use_count() == 1;
  }

  /**
   * Stronger than unique() in that it must not have any weakrefs as well.
   */
  bool is_uniquely_owned() const noexcept {
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(target_ != NullType::singleton());
    uint64_t combined =
        target_->combined_refcount_.load(std::memory_order_acquire);
    return (combined & ~detail::kHasPyObject) == detail::kUniqueRef;
  }

  /**
   * Returns an owning (!) pointer to the underlying object and makes the
   * intrusive_ptr instance invalid. That means the refcount is not decreased.
@@ -1022,7 +932,6 @@ class weak_intrusive_ptr final {
    if (target_ == NullType::singleton()) {
      return intrusive_ptr<TTarget, NullType>();
    } else {
      bool increfed = false;
      auto combined_refcount =
          target_->combined_refcount_.load(std::memory_order_relaxed);
      do {
@@ -1031,31 +940,12 @@ class weak_intrusive_ptr final {
          // Return nullptr.
          return intrusive_ptr<TTarget, NullType>();
        }
        if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
          if (detail::has_pyobject(combined_refcount) &&
              detail::refcount(combined_refcount) == 1 && !increfed) {
            // Object has a python wrapper with no other C++ references.
            // We need to incref the Python object before we acquire a
            // strong reference to the C++ object to avoid a situation
            // where the Python object is deallocated concurrently.
            if (!target_->try_incref_pyobject()) {
              return intrusive_ptr<TTarget, NullType>();
            }
            increfed = true;
          }
        }
      } while (!target_->combined_refcount_.compare_exchange_weak(
          combined_refcount,
          combined_refcount + detail::kReferenceCountOne,
          std::memory_order_acquire,
          std::memory_order_relaxed));

      if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
        if (increfed && detail::refcount(combined_refcount) != 1) {
          target_->decref_pyobject();
        }
      }

      return intrusive_ptr<TTarget, NullType>(
          target_, raw::DontIncreaseRefcount{});
    }
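The `lock()` body above is a classic compare-exchange retry loop: promote a weak reference to a strong one only if the strong count is still nonzero. Its generic shape, stripped of the PyObject handling (a sketch, not the c10 implementation):

```cpp
#include <atomic>
#include <cstdint>

// Generic shape of the lock() loop: atomically move the strong count
// (low 32 bits) from r to r+1, but only while r > 0; retry if another
// thread changed the word between our load and the CAS.
bool try_lock_strong(std::atomic<uint64_t>& combined) {
  uint64_t cur = combined.load(std::memory_order_relaxed);
  do {
    if (static_cast<uint32_t>(cur) == 0) {
      return false;  // object already dead; the weak ref cannot be promoted
    }
  } while (!combined.compare_exchange_weak(
      cur, cur + 1, std::memory_order_acquire, std::memory_order_relaxed));
  return true;
}
```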
@@ -1170,18 +1060,7 @@ namespace intrusive_ptr {
// NullType::singleton to this function
inline void incref(intrusive_ptr_target* self) {
  if (self) {
    uint64_t combined = detail::atomic_combined_refcount_increment(
        self->combined_refcount_, detail::kReferenceCountOne);

#ifndef C10_MOBILE
    if (C10_UNLIKELY(
            detail::has_pyobject(combined) &&
            detail::refcount(combined) == 2)) {
      self->incref_pyobject();
    }
#else
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!detail::has_pyobject(combined));
#endif
    detail::atomic_refcount_increment(self->combined_refcount_);
  }
}
@@ -15,8 +15,6 @@ using namespace c10::CachingDeviceAllocator;
// newly allocated memory with 512-byte alignment.
constexpr size_t kDeviceAlignment = 512;

class XPUAllocator;

namespace {
using stream_set = ska::flat_hash_set<xpu::XPUStream>;

@@ -25,19 +23,14 @@ typedef bool (*Comparison)(const Block*, const Block*);
bool BlockComparatorSize(const Block* a, const Block* b);
bool BlockComparatorAddress(const Block* a, const Block* b);

struct PrivatePool;

struct BlockPool {
  BlockPool(bool small, PrivatePool* private_pool = nullptr)
  BlockPool(bool small)
      : blocks(BlockComparatorSize),
        unmapped(BlockComparatorAddress),
        is_small(small),
        owner_PrivatePool(private_pool) {}

        is_small(small) {}
  std::set<Block*, Comparison> blocks;
  std::set<Block*, Comparison> unmapped;
  const bool is_small;
  PrivatePool* owner_PrivatePool;
};

struct ExpandableSegment;
@@ -356,43 +349,6 @@ struct AllocParams {
  StatTypes stat_types = {};
};

// Internal implementation that manages actual memory blocks; the
// high-level MemPool interface wraps PrivatePool via MempoolId.
struct PrivatePool {
  PrivatePool(MempoolId_t id, XPUAllocator* allocator = nullptr)
      : id(std::move(id)),
        allocator_(allocator),
        large_blocks(/*small=*/false, this),
        small_blocks(/*small=*/true, this) {}
  PrivatePool(const PrivatePool&) = delete;
  PrivatePool(PrivatePool&&) = delete;
  PrivatePool& operator=(const PrivatePool&) = delete;
  PrivatePool& operator=(PrivatePool&&) = delete;
  ~PrivatePool() = default;

  // default Mempool when no Mempool is specified
  MempoolId_t id{0, 0};
  // Number of live graphs using this pool
  int use_count{1};
  // Number of unfreed allocations made for this pool. When use_count and
  // allocation_count drop to zero, we can delete this PrivatePool from
  // graph_pools.
  int allocation_count{0};
  XPUAllocator* allocator_;
  BlockPool large_blocks;
  BlockPool small_blocks;

 public:
  XPUAllocator* allocator() {
    return allocator_;
  }
};
struct MempoolIdHash {
  std::size_t operator()(const MempoolId_t& mempool_id) const noexcept {
    return mempool_id.first != 0 ? mempool_id.first : mempool_id.second;
  }
};

} // anonymous namespace
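For reference, `graph_pools` and `graph_pools_freeable` (below) key pools by `MempoolId_t`, a pair of ids of which one side is expected to be nonzero; the hash above simply picks the nonzero half. A minimal standalone sketch of that behavior (`MempoolIdHashSketch` and the map are illustrative, not the c10 types):

```cpp
#include <cstdint>
#include <unordered_map>
#include <utility>

// Sketch: a pair-of-ids key where exactly one slot is populated,
// hashed by whichever half is nonzero, as in MempoolIdHash above.
using MempoolId = std::pair<uint64_t, uint64_t>;
struct MempoolIdHashSketch {
  std::size_t operator()(const MempoolId& id) const noexcept {
    return id.first != 0 ? id.first : id.second;
  }
};

int main() {
  std::unordered_map<MempoolId, int, MempoolIdHashSketch> pools;
  pools[{1, 0}] = 42;  // one id family uses the first slot
  pools[{0, 7}] = 7;   // the other family uses the second slot
  return pools.at({1, 0}) == 42 ? 0 : 1;
}
```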
class DeviceCachingAllocator {
@@ -409,13 +365,6 @@ class DeviceCachingAllocator {
  bool set_fraction = false;
  std::vector<ExpandableSegment*> expandable_segments;
  std::vector<c10::DeviceIndex> devices_with_peer_access; // reserved
  std::vector<std::pair<MempoolId_t, std::function<bool(sycl::queue*)>>>
      captures_underway;
  ska::flat_hash_map<MempoolId_t, std::unique_ptr<PrivatePool>, MempoolIdHash>
      graph_pools;
  // Pools no longer referenced by any graph.
  ska::flat_hash_map<MempoolId_t, PrivatePool*, MempoolIdHash>
      graph_pools_freeable;

  size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) {
    if (!src || src->allocated || src->event_count > 0 ||
@@ -514,22 +463,7 @@ class DeviceCachingAllocator {
    }
  }

  BlockPool& get_pool(size_t size, sycl::queue* queue) {
    if (C10_UNLIKELY(!captures_underway.empty())) {
      for (auto& entry : captures_underway) {
        // look up the mempool id matching the current capture graph
        if (entry.second(queue)) {
          auto it1 = graph_pools.find(entry.first);
          // look up the mempool
          TORCH_INTERNAL_ASSERT(it1 != graph_pools.end());
          if (size <= kSmallSize) {
            return it1->second->small_blocks;
          } else {
            return it1->second->large_blocks;
          }
        }
      }
    }
  BlockPool& get_pool(size_t size) {
    if (size < kSmallSize) {
      return small_blocks;
    } else {
@@ -735,10 +669,6 @@ class DeviceCachingAllocator {
    if (!ptr) {
      return false;
    }

    if (p.pool->owner_PrivatePool) {
      p.pool->owner_PrivatePool->allocation_count++;
    }
    p.block = new Block(device, p.queue(), size, p.pool, ptr);
    for_each_selected_stat_type(p.stat_types, [&](size_t stat_type) {
      stats.reserved_bytes[stat_type].increase(size);
@@ -747,14 +677,11 @@ class DeviceCachingAllocator {
    return true;
  }

  void synchronize_and_free_events(PrivatePool* pool = nullptr) {
  void synchronize_and_free_events() {
    for (auto& xe : xpu_events) {
      for (auto& e : xe.second) {
        auto event = e.first;
        auto* block = e.second;
        if (pool && block->pool->owner_PrivatePool != pool) {
          continue;
        }
        event.wait();
        block->event_count--;
        if (block->event_count == 0) {
@@ -858,13 +785,6 @@ class DeviceCachingAllocator {
    for_each_selected_stat_type(stat_types, [&](size_t stat_type) {
      stats.reserved_bytes[stat_type].decrease(unmapped.size);
    });

    if (block->pool->owner_PrivatePool) {
      // The freed block belonged to an XPU graph's PrivatePool.
      TORCH_INTERNAL_ASSERT(
          block->pool->owner_PrivatePool->allocation_count > 0);
      block->pool->owner_PrivatePool->allocation_count--;
    }
  }

  void release_blocks(BlockPool& pool) {
@@ -892,41 +812,13 @@ class DeviceCachingAllocator {
    }
  }

  bool release_cached_blocks(MempoolId_t mempool_id) {
    if (mempool_id.first == 0 && mempool_id.second == 0 &&
        captures_underway.empty()) {
      synchronize_and_free_events();
      // See Note [Safe to Free Blocks on BlockPool]
      c10::xpu::syncStreamsOnDevice(device_index);
  bool release_cached_blocks() {
    synchronize_and_free_events();
    // See Note [Safe to Free Blocks on BlockPool]
    c10::xpu::syncStreamsOnDevice(device_index);

      release_blocks(large_blocks);
      release_blocks(small_blocks);
    }

    for (auto it = graph_pools_freeable.begin();
         it != graph_pools_freeable.end();) {
      if (mempool_id.first != 0 || mempool_id.second != 0) {
        if (it->first == mempool_id) {
          // If there is an active mempool, we sync only the events
          // associated with the pool
          synchronize_and_free_events(it->second);
        } else {
          // otherwise we move on
          ++it;
          continue;
        }
      }
      TORCH_INTERNAL_ASSERT(it->second->use_count == 0);
      release_blocks(it->second->small_blocks);
      release_blocks(it->second->large_blocks);
      if (it->second->allocation_count == 0) {
        auto erase_count = graph_pools.erase(it->first);
        TORCH_INTERNAL_ASSERT(erase_count == 1);
        it = graph_pools_freeable.erase(it);
      } else {
        ++it;
      }
    }
    release_blocks(large_blocks);
    release_blocks(small_blocks);
    return true;
  }

@@ -1011,30 +903,6 @@ class DeviceCachingAllocator {
    }
  }

  void create_or_incref_pool(
      MempoolId_t mempool_id,
      XPUAllocator* allocator = nullptr) {
    auto it = graph_pools.find(mempool_id);
    if (it == graph_pools.end()) {
      // mempool_id does not reference an existing pool.
      // Make a new pool for XPU graph capture or memory pool usage.
      graph_pools.emplace(
          mempool_id, std::make_unique<PrivatePool>(mempool_id, allocator));
    } else {
      // mempool_id references an existing pool, which the current XPU graph
      // capture will share.
      TORCH_INTERNAL_ASSERT(it->second->use_count > 0);
      TORCH_INTERNAL_ASSERT(allocator == nullptr);
      it->second->use_count++;
    }
  }

  PrivatePool* get_private_pool(MempoolId_t mempool_id) {
    auto it = graph_pools.find(mempool_id);
    TORCH_INTERNAL_ASSERT(it != graph_pools.end());
    return it->second.get();
  }

 public:
  DeviceCachingAllocator(DeviceIndex device_index)
      : large_blocks(/* small */ false),
@@ -1043,11 +911,9 @@ class DeviceCachingAllocator {

  Block* malloc(DeviceIndex device, size_t orig_size, sycl::queue& queue) {
    std::scoped_lock<std::recursive_mutex> lock(mutex);
    if (C10_LIKELY(captures_underway.empty())) {
      process_events();
    }
    process_events();
    size_t size = round_size(orig_size);
    auto& pool = get_pool(size, &queue);
    auto& pool = get_pool(size);
    const size_t alloc_size = get_allocation_size(size);
    AllocParams params(device, size, &queue, &pool, alloc_size);
    params.stat_types = get_stat_types_for_pool(pool);
@@ -1057,7 +923,7 @@ class DeviceCachingAllocator {
    // Can't reuse an existing block, try to get a new one.
    if (!block_found) {
      block_found = alloc_block(params, false) ||
          (release_cached_blocks({0, 0}) && alloc_block(params, true));
          (release_cached_blocks() && alloc_block(params, true));
    }
    if (!block_found) {
      const auto& raw_device = c10::xpu::get_raw_device(device);
@@ -1150,9 +1016,9 @@ class DeviceCachingAllocator {
    block->stream_uses.insert(stream);
  }

  void emptyCache(MempoolId_t mempool_id) {
  void emptyCache() {
    std::scoped_lock<std::recursive_mutex> lock(mutex);
    release_cached_blocks(mempool_id);
    release_cached_blocks();
  }

  DeviceStats getStats() {
@@ -1306,9 +1172,9 @@ class XPUAllocator : public DeviceAllocator {
    }
  }

  void emptyCache(MempoolId_t mempool_id) override {
  void emptyCache(MempoolId_t mempool_id [[maybe_unused]] = {0, 0}) override {
    for (auto& da : device_allocators) {
      da->emptyCache(mempool_id);
      da->emptyCache();
    }
  }

@@ -1424,8 +1290,8 @@ void init(DeviceIndex device_count) {
  return allocator.init(device_count);
}

void emptyCache(MempoolId_t mempool_id) {
  return allocator.emptyCache(mempool_id);
void emptyCache() {
  return allocator.emptyCache();
}

void resetPeakStats(DeviceIndex device) {
@@ -10,7 +10,7 @@ C10_XPU_API Allocator* get();

C10_XPU_API void init(DeviceIndex device_count);

C10_XPU_API void emptyCache(MempoolId_t mempool_id = {0, 0});
C10_XPU_API void emptyCache();

C10_XPU_API void resetPeakStats(DeviceIndex device);
@@ -1643,8 +1643,6 @@ if(USE_CUDA)
    target_link_libraries(torch_cuda PUBLIC c10_cuda)
    if(TARGET torch::nvtx3)
      target_link_libraries(torch_cuda PRIVATE torch::nvtx3)
    else()
      target_link_libraries(torch_cuda PUBLIC torch::nvtoolsext)
    endif()

    target_include_directories(
@@ -1741,9 +1739,6 @@ if(BUILD_SHARED_LIBS)
  if(USE_CUDA)
    target_link_libraries(torch_global_deps ${Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS})
    target_link_libraries(torch_global_deps torch::cudart)
    if(TARGET torch::nvtoolsext)
      target_link_libraries(torch_global_deps torch::nvtoolsext)
    endif()
  endif()
  install(TARGETS torch_global_deps DESTINATION "${TORCH_INSTALL_LIB_DIR}")
endif()
@@ -118,12 +118,6 @@ if(INTERN_BUILD_ATEN_OPS)
        list(APPEND _file_compile_flags "-gencode;arch=compute_120a,code=sm_120a")
      endif()
    endif()
    # We need to gate on the CUDA version here, since sm_121a was introduced in CUDA 12.9
    if("${_arch}" STREQUAL "121a" AND CUDA_VERSION VERSION_GREATER_EQUAL 12.9)
      if(_existing_arch_flags MATCHES ".*compute_120.*")
        list(APPEND _file_compile_flags "-gencode;arch=compute_121a,code=sm_121a")
      endif()
    endif()
  endforeach()
  list(JOIN _file_compile_flags " " _file_compile_flags)

@@ -132,7 +126,7 @@ if(INTERN_BUILD_ATEN_OPS)

  _BUILD_FOR_ADDITIONAL_ARCHS(
    "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/RowwiseScaledMM.cu"
    "89;90a;100a;103a;120a;121a")
    "89;90a;100a;103a;120a")
  _BUILD_FOR_ADDITIONAL_ARCHS(
    "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/ScaledGroupMM.cu"
    "90a")
@@ -968,11 +968,8 @@ find_package_handle_standard_args(nvtx3 DEFAULT_MSG nvtx3_dir)
if(nvtx3_FOUND)
  add_library(torch::nvtx3 INTERFACE IMPORTED)
  target_include_directories(torch::nvtx3 INTERFACE "${nvtx3_dir}")
  target_compile_definitions(torch::nvtx3 INTERFACE TORCH_CUDA_USE_NVTX3)
else()
  message(WARNING "Cannot find NVTX3, find old NVTX instead")
  add_library(torch::nvtoolsext INTERFACE IMPORTED)
  set_property(TARGET torch::nvtoolsext PROPERTY INTERFACE_LINK_LIBRARIES CUDA::nvToolsExt)
  message(FATAL_ERROR "Cannot find NVTX3!")
endif()
cmake/External/aotriton.cmake (vendored)
@@ -15,14 +15,12 @@ if(NOT __AOTRITON_INCLUDED)
      "manylinux_2_28" # rocm6.3
      "manylinux_2_28" # rocm6.4
      "manylinux_2_28" # rocm7.0
      "manylinux_2_28" # rocm7.1
  )
  set(__AOTRITON_ROCM_LIST
      "rocm6.2"
      "rocm6.3"
      "rocm6.4"
      "rocm7.0"
      "rocm7.1"
  )
  set(__AOTRITON_CI_COMMIT "972223c501ffc22068bb035ac5d64cf54318d895")
  set(__AOTRITON_SHA256_LIST
@@ -30,7 +28,6 @@ if(NOT __AOTRITON_INCLUDED)
      "72a153549ea20707331e8a1f1e3d1b8de2913f9d5af2b900c56235d578b57efe" # rocm6.3
      "c7f319dd7448cbbbab81889dd8a37d47dbc25ebcbd89760f09e6a0904e556393" # rocm6.4
      "a2a974e0ad929a5e5827c0f896c59bda4872459cbaf8dd8e0a00407f404491cf" # rocm7.0
      "d4eb24c9f1a0cfedb35f9292efb41d16589cf5a4b98c3c0940181bbefc49d722" # rocm7.1
  )
  set(__AOTRITON_IMAGE_LIST
      "amd-gfx90a"
@@ -132,9 +132,6 @@ if(@USE_CUDA@)
  else()
    set(TORCH_CUDA_LIBRARIES ${CUDA_NVRTC_LIB})
  endif()
  if(TARGET torch::nvtoolsext)
    list(APPEND TORCH_CUDA_LIBRARIES torch::nvtoolsext)
  endif()

  if(@BUILD_SHARED_LIBS@)
    find_library(C10_CUDA_LIBRARY c10_cuda PATHS "${TORCH_INSTALL_PREFIX}/lib")
@@ -101,14 +101,3 @@ TEST(TestScalarType, toUnderlying) {
  AT_FORALL_FLOAT8_TYPES(DEFINE_CHECK);
#undef DEFINE_CHECK
}

TEST(TestScalarType, isQIntType) {
  using torch::headeronly::isQIntType;
  using torch::headeronly::ScalarType;
#define DEFINE_CHECK(_, name) EXPECT_TRUE(isQIntType(ScalarType::name));
  AT_FORALL_QINT_TYPES(DEFINE_CHECK);
#undef DEFINE_CHECK
#define DEFINE_CHECK(_, name) EXPECT_FALSE(isQIntType(ScalarType::name));
  AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CHECK);
#undef DEFINE_CHECK
}
@@ -15,7 +15,7 @@ namespace jit {
TEST(CustomOperatorTest, InferredSchema) {
  torch::RegisterOperators reg(
      "foo::bar", [](double a, at::Tensor b) { return a + b; });
  auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar"));
  auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar"));
  ASSERT_EQ(ops.size(), 1);

  auto& op = ops.front();
@@ -43,7 +43,8 @@ TEST(CustomOperatorTest, ExplicitSchema) {
      "foo::bar_with_schema(float a, Tensor b) -> Tensor",
      [](double a, at::Tensor b) { return a + b; });

  auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema"));
  auto& ops =
      getAllOperatorsFor(Symbol::fromQualString("foo::bar_with_schema"));
  ASSERT_EQ(ops.size(), 1);

  auto& op = ops.front();
@@ -76,7 +77,7 @@ TEST(CustomOperatorTest, ListParameters) {
      torch::List<c10::complex<double>> complexdoubles,
      torch::List<at::Tensor> tensors) { return floats; });

  auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists"));
  auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists"));
  ASSERT_EQ(ops.size(), 1);

  auto& op = ops.front();
@@ -122,7 +123,7 @@ TEST(CustomOperatorTest, ListParameters2) {
      "foo::lists2(Tensor[] tensors) -> Tensor[]",
      [](torch::List<at::Tensor> tensors) { return tensors; });

  auto ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2"));
  auto& ops = getAllOperatorsFor(Symbol::fromQualString("foo::lists2"));
  ASSERT_EQ(ops.size(), 1);

  auto& op = ops.front();
@@ -212,7 +213,7 @@ TEST(TestCustomOperator, OperatorGeneratorUndeclared) {
      },
      aliasAnalysisFromSchema())});

  auto ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist"));
  auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::not_exist"));
  ASSERT_EQ(ops.size(), 0);
}

@@ -231,7 +232,7 @@ TEST(TestCustomOperator, OperatorGeneratorBasic) {
      },
      aliasAnalysisFromSchema())});

  auto ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar"));
  auto& ops = getAllOperatorsFor(Symbol::fromQualString("foofoo::bar"));
  ASSERT_EQ(ops.size(), 1);

  auto& op = ops.front();
@@ -1,20 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>

using torch::stable::Tensor;

uint64_t get_any_data_ptr(Tensor t, bool mutable_) {
  if (mutable_) {
    return reinterpret_cast<uint64_t>(t.mutable_data_ptr());
  } else {
    return reinterpret_cast<uint64_t>(t.const_data_ptr());
  }
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("get_any_data_ptr(Tensor t, bool mutable_) -> int");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("get_any_data_ptr", TORCH_BOX(&get_any_data_ptr));
}
@@ -1,34 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/headeronly/core/ScalarType.h>

using torch::stable::Tensor;

uint64_t get_template_any_data_ptr(Tensor t, torch::headeronly::ScalarType dtype, bool mutable_) {
#define DEFINE_CASE(T, name)                                      \
  case torch::headeronly::ScalarType::name: {                     \
    if (mutable_) {                                               \
      return reinterpret_cast<uint64_t>(t.mutable_data_ptr<T>()); \
    } else {                                                      \
      return reinterpret_cast<uint64_t>(t.const_data_ptr<T>());   \
    }                                                             \
  }
  switch (dtype) {
    // per aten/src/ATen/templates/TensorMethods.cpp:
    AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE)
    DEFINE_CASE(uint16_t, UInt16)
    DEFINE_CASE(uint32_t, UInt32)
    DEFINE_CASE(uint64_t, UInt64)
    default:
      return 0;
  }
#undef DEFINE_CASE
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("get_template_any_data_ptr(Tensor t, ScalarType dtype, bool mutable_) -> int");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("get_template_any_data_ptr", TORCH_BOX(&get_template_any_data_ptr));
}
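The `DEFINE_CASE` / `AT_FORALL_*` pattern above is a textbook X-macro: a FORALL macro stamps out one `switch` case per (C type, ScalarType name) pair. A self-contained toy version of the same pattern (all names here are illustrative, not PyTorch APIs):

```cpp
#include <cstdio>

// Sketch of the X-macro dispatch pattern: MY_FORALL_TYPES invokes the
// given macro once per (ctype, Name) pair, generating the switch cases.
#define MY_FORALL_TYPES(_) _(float, Float) _(double, Double)

enum class Tag { Float, Double };

const char* tag_name(Tag tag) {
#define DEFINE_CASE(ctype, Name) \
  case Tag::Name:                \
    return #ctype;
  switch (tag) { MY_FORALL_TYPES(DEFINE_CASE) }
#undef DEFINE_CASE
  return "unknown";
}

int main() {
  std::printf("%s\n", tag_name(Tag::Double));  // prints "double"
}
```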
@@ -1,41 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/ops.h>
#include <torch/csrc/stable/tensor.h>

#include <vector>

using torch::stable::Tensor;

// Declare my__foreach_mul (defined in my__foreach_mul.cpp)
extern std::vector<Tensor> my__foreach_mul(
    torch::headeronly::HeaderOnlyArrayRef<Tensor> self,
    torch::headeronly::HeaderOnlyArrayRef<Tensor> other);

// Helper function for cloning
Tensor my_clone(Tensor t) {
  return clone(t);
}

std::vector<Tensor> make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) {
  // This function tests that my__foreach_mul can take in std::initializer_lists
  // in addition to std::vectors.
  Tensor t1_1 = my_clone(t1);
  Tensor t1_2 = my_clone(t1);
  Tensor t2_1 = my_clone(t2);
  Tensor t2_2 = my_clone(t2);
  return my__foreach_mul({t1_1, t2_1}, {t1_2, t2_2});
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def(
      "make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]");
}

STABLE_TORCH_LIBRARY_IMPL(
    libtorch_agnostic_2_10,
    CompositeExplicitAutograd,
    m) {
  m.impl(
      "make_tensor_clones_and_call_foreach",
      TORCH_BOX(&make_tensor_clones_and_call_foreach));
}
@@ -1,40 +0,0 @@
// This is duplicated from the libtorch_agnostic_2_9_extension
// as a negative test for test_version_compatibility.py

#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>
#include <torch/headeronly/util/Exception.h>
#include <torch/headeronly/core/ScalarType.h>
#include <torch/headeronly/core/Dispatch_v2.h>
#include <torch/headeronly/core/TensorAccessor.h>

#include "tensor_accessor_kernel.h"

using torch::stable::Tensor;

Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
  STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
  STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
  STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
  STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
  STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device");
  Tensor res = new_empty(m, {m.size(0)});
  THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cpu",
    AT_WRAP(([&]() {
      auto resa = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(res.data_ptr()), res.sizes().data(), res.strides().data());
      auto ma = Accessor_cpu<scalar_t, 2>(reinterpret_cast<scalar_t*>(m.data_ptr()), m.sizes().data(), m.strides().data());
      auto va = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(v.data_ptr()), v.sizes().data(), v.strides().data());
      mv_tensor_accessor_kernel<Accessor_cpu, scalar_t>(resa, ma, va);
    })),
    AT_FLOATING_TYPES);
  return res;
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("mv_tensor_accessor_cpu(Tensor res, Tensor m, Tensor v) -> Tensor");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("mv_tensor_accessor_cpu", TORCH_BOX(&mv_tensor_accessor_cpu));
}
@@ -1,47 +0,0 @@
// This is duplicated from the libtorch_agnostic_2_9_extension
// as a negative test for test_version_compatibility.py

#include "tensor_accessor_kernel.h"

#include <cuda_runtime.h>
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/ops.h>
#include <torch/csrc/stable/tensor.h>

using torch::stable::Tensor;

Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
  STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
  STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
  STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
  STD_TORCH_CHECK(
      m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
  STD_TORCH_CHECK(
      m.device() == v.device(), "m and v must be on the same device");
  Tensor res = new_empty(m, {m.size(0)});
  THO_DISPATCH_V2(
      m.scalar_type(),
      "mv_tensor_accessor_cuda",
      AT_WRAP(([&]() {
        auto resa = Accessor_cuda<scalar_t, 1>(
            reinterpret_cast<scalar_t*>(res.data_ptr()),
            res.sizes().data(),
            res.strides().data());
        auto ma = Accessor_cuda<scalar_t, 2>(
            reinterpret_cast<scalar_t*>(m.data_ptr()),
            m.sizes().data(),
            m.strides().data());
        auto va = Accessor_cuda<scalar_t, 1>(
            reinterpret_cast<scalar_t*>(v.data_ptr()),
            v.sizes().data(),
            v.strides().data());
        mv_tensor_accessor_kernel<Accessor_cuda, scalar_t>
            <<<1, 1, 0, 0>>>(resa, ma, va);
      })),
      AT_FLOATING_TYPES);
  return res;
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CUDA, m) {
  m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
}
@@ -1,20 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <vector>

using torch::stable::Tensor;

std::vector<Tensor> my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
  std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
  aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack.data());
  return torch::stable::detail::to<std::vector<Tensor>>(stack[0]);
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("my__foreach_mul", TORCH_BOX(&my__foreach_mul));
}
@@ -1,19 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/stableivalue_conversions.h>
#include <torch/csrc/inductor/aoti_torch/c/shim.h>

using torch::stable::Tensor;

void my__foreach_mul_(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
  std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
  aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack.data());
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("my__foreach_mul_", TORCH_BOX(&my__foreach_mul_));
}
@@ -1,25 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/device.h>
#include <torch/csrc/stable/ops.h>

#include <optional>

using torch::stable::Tensor;

Tensor my_empty(
    torch::headeronly::HeaderOnlyArrayRef<int64_t> size,
    std::optional<torch::headeronly::ScalarType> dtype,
    std::optional<torch::stable::Device> device,
    std::optional<bool> pin_memory) {
  return empty(size, dtype, device, pin_memory);
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def(
      "my_empty(int[] size, ScalarType? dtype=None, Device? device=None, bool? pin_memory=None) -> Tensor");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("my_empty", TORCH_BOX(&my_empty));
}
@@ -1,17 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>

using torch::stable::Tensor;

Tensor my_reshape(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> shape) {
  return reshape(t, shape);
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("my_reshape(Tensor t, int[] shape) -> Tensor");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("my_reshape", TORCH_BOX(&my_reshape));
}
@@ -1,20 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>

using torch::stable::Tensor;

Tensor my_view(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> size) {
  return view(t, size);
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("my_view(Tensor t, int[] size) -> Tensor");
}

STABLE_TORCH_LIBRARY_IMPL(
    libtorch_agnostic_2_10,
    CompositeExplicitAutograd,
    m) {
  m.impl("my_view", TORCH_BOX(&my_view));
}
@@ -1,31 +0,0 @@
// This is duplicated from the libtorch_agnostic_2_9_extension
// as a negative test for test_version_compatibility.py

#pragma once

#include <torch/headeronly/core/Dispatch_v2.h>
#include <torch/headeronly/core/TensorAccessor.h>

template <typename T, size_t N>
using Accessor_cpu = torch::headeronly::HeaderOnlyTensorAccessor<T, N>;

#if defined(__CUDACC__) || defined(__HIPCC__)
#define MAYBE_GLOBAL __global__

template <typename T, size_t N>
using Accessor_cuda = torch::headeronly::HeaderOnlyGenericPackedTensorAccessor<T, N, torch::headeronly::RestrictPtrTraits>;

#else
#define MAYBE_GLOBAL
#endif

template <template <typename, size_t> class Accessor, typename scalar_t>
MAYBE_GLOBAL void mv_tensor_accessor_kernel(Accessor<scalar_t, 1> resa, Accessor<scalar_t, 2> ma, Accessor<scalar_t, 1> va) {
  for (int64_t i = 0; i < resa.size(0); i++) {
    scalar_t val = 0;
    for (int64_t j = 0; j < ma.size(1); j++) {
      val += ma[i][j] * va[j];
    }
    resa[i] = val;
  }
}
@@ -1,37 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/device.h>

#include <string>

torch::stable::Device test_device_constructor(
    bool is_cuda,
    torch::stable::DeviceIndex index,
    bool use_str) {
  using torch::stable::Device;
  using torch::stable::DeviceType;

  if (use_str) {
    std::string device_str;
    if (is_cuda) {
      device_str = "cuda:" + std::to_string(index);
    } else {
      device_str = "cpu";
    }
    return Device(device_str);
  } else {
    if (is_cuda) {
      return Device(DeviceType::CUDA, index);
    } else {
      return Device(DeviceType::CPU);
    }
  }
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def(
      "test_device_constructor(bool is_cuda, DeviceIndex index, bool use_str) -> Device");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("test_device_constructor", TORCH_BOX(&test_device_constructor));
}
@@ -1,14 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/device.h>

bool test_device_equality(torch::stable::Device d1, torch::stable::Device d2) {
  return d1 == d2;
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("test_device_equality(Device d1, Device d2) -> bool");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("test_device_equality", TORCH_BOX(&test_device_equality));
}
@@ -1,14 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/device.h>

torch::stable::DeviceIndex test_device_index(torch::stable::Device device) {
  return device.index();
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("test_device_index(Device device) -> DeviceIndex");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("test_device_index", TORCH_BOX(&test_device_index));
}
@@ -1,14 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/device.h>

bool test_device_is_cpu(torch::stable::Device device) {
  return device.is_cpu();
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("test_device_is_cpu(Device device) -> bool");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("test_device_is_cpu", TORCH_BOX(&test_device_is_cpu));
}
@@ -1,14 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/device.h>

bool test_device_is_cuda(torch::stable::Device device) {
  return device.is_cuda();
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("test_device_is_cuda(Device device) -> bool");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("test_device_is_cuda", TORCH_BOX(&test_device_is_cuda));
}
@@ -1,17 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/device.h>

torch::stable::Device test_device_set_index(
    torch::stable::Device device,
    torch::stable::DeviceIndex index) {
  device.set_index(index);
  return device;
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("test_device_set_index(Device device, DeviceIndex index) -> Device");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("test_device_set_index", TORCH_BOX(&test_device_set_index));
}
@@ -1,14 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/ops.h>

uint32_t test_get_num_threads() {
  return torch::stable::get_num_threads();
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("test_get_num_threads() -> int");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("test_get_num_threads", TORCH_BOX(&test_get_num_threads));
}
@@ -1,49 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>
#include <torch/csrc/stable/device.h>
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h>

using torch::stable::Tensor;

Tensor test_parallel_for(int64_t size, int64_t grain_size) {
  AtenTensorHandle tensor_handle;
  int64_t stride = 1;

  aoti_torch_empty_strided(
      1,
      &size,
      &stride,
      aoti_torch_dtype_int64(),
      aoti_torch_device_type_cpu(),
      0,
      &tensor_handle);

  Tensor tensor(tensor_handle);
  int64_t* data_ptr = reinterpret_cast<int64_t*>(tensor.data_ptr());

  torch::stable::zero_(tensor);

  // Use parallel_for to fill each element with its index
  // If using a parallel path, the thread id is encoded in the upper 32 bits
  torch::stable::parallel_for(
      0, size, grain_size, [data_ptr](int64_t begin, int64_t end) {
        for (auto i = begin; i < end; i++) {
          STD_TORCH_CHECK(i <= UINT32_MAX);
          uint32_t thread_id;
          torch_get_thread_idx(&thread_id);
          data_ptr[i] = i | (static_cast<int64_t>(thread_id) << 32);
        }
      });

  return tensor;
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("test_parallel_for(int size, int grain_size) -> Tensor");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("test_parallel_for", TORCH_BOX(&test_parallel_for));
}
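Given the encoding used by `test_parallel_for` above, a caller can recover both the index and the writing thread from each element. A small standalone sketch of the decoding (assumed layout, per the comment above: index in the low 32 bits, thread id in the high 32):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Pretend thread 3 wrote index 17, matching the encoding in the kernel above.
  int64_t element = (int64_t(3) << 32) | 17;
  uint32_t index = static_cast<uint32_t>(element);            // low 32 bits
  uint32_t thread_id = static_cast<uint32_t>(element >> 32);  // high 32 bits
  std::printf("index=%u thread=%u\n", index, thread_id);
}
```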
@@ -1,17 +0,0 @@
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/device.h>

using torch::stable::Tensor;

torch::stable::Device test_tensor_device(torch::stable::Tensor tensor) {
  return tensor.device();
}

STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
  m.def("test_tensor_device(Tensor t) -> Device");
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
  m.impl("test_tensor_device", TORCH_BOX(&test_tensor_device));
}
@@ -1,225 +0,0 @@
import torch
from torch import Tensor


def my__foreach_mul_(tensors, others) -> ():
    """
    Updates tensors to be the result of pointwise multiplying with others.

    Args:
        tensors: list of tensors
        others: list of tensors (with the same corresponding shapes as tensors)

    Returns: nothing, tensors is updated in place.
    """
    torch.ops.libtorch_agnostic_2_10.my__foreach_mul_.default(tensors, others)


def my__foreach_mul(tensors, others) -> list[Tensor]:
    """
    Returns a list of tensors that are the results of pointwise multiplying
    tensors and others.

    Args:
        tensors: list of tensors
        others: list of tensors (with the same corresponding shapes as tensors)

    Returns: list of multiplied tensors
    """
    return torch.ops.libtorch_agnostic_2_10.my__foreach_mul.default(tensors, others)


def make_tensor_clones_and_call_foreach(t1, t2) -> list[Tensor]:
    """
    Returns a list of 2 tensors corresponding to the square of the inputs.

    Args:
        t1: Tensor
        t2: Tensor

    Returns: list of [t1^2, t2^2]
    """
    return torch.ops.libtorch_agnostic_2_10.make_tensor_clones_and_call_foreach.default(
        t1, t2
    )


def test_tensor_device(t):
    """
    Tests Tensor device() method.

    Args:
        t: Tensor - tensor to get device from

    Returns: Device - device of the tensor
    """
    return torch.ops.libtorch_agnostic_2_10.test_tensor_device.default(t)


def test_device_constructor(is_cuda, index, use_str):
    """
    Tests creating a Device from DeviceType and index, or from a string.

    Args:
        is_cuda: bool - if True, creates CUDA device; if False, creates CPU device
        index: int - device index
        use_str: bool - if True, constructs from string; if False, constructs from DeviceType

    Returns: Device - A device with the specified type and index
    """
    return torch.ops.libtorch_agnostic_2_10.test_device_constructor.default(
        is_cuda, index, use_str
    )


def test_device_equality(d1, d2) -> bool:
    """
    Tests Device equality operator.

    Args:
        d1: Device - first device
        d2: Device - second device

    Returns: bool - True if devices are equal
    """
    return torch.ops.libtorch_agnostic_2_10.test_device_equality.default(d1, d2)


def test_device_set_index(device, index):
    """
    Tests Device set_index() method.

    Args:
        device: Device - device to modify
        index: int - new device index

    Returns: Device - device with updated index
    """
    return torch.ops.libtorch_agnostic_2_10.test_device_set_index.default(device, index)


def test_device_index(device) -> int:
    """
    Tests Device index() method.

    Args:
        device: Device - device to query

    Returns: int - device index
    """
    return torch.ops.libtorch_agnostic_2_10.test_device_index.default(device)


def test_device_is_cuda(device) -> bool:
    """
    Tests Device is_cuda() method.

    Args:
        device: Device - device to check

    Returns: bool - True if device is CUDA
    """
    return torch.ops.libtorch_agnostic_2_10.test_device_is_cuda.default(device)


def test_device_is_cpu(device) -> bool:
    """
    Tests Device is_cpu() method.

    Args:
        device: Device - device to check

    Returns: bool - True if device is CPU
    """
    return torch.ops.libtorch_agnostic_2_10.test_device_is_cpu.default(device)


def test_parallel_for(size, grain_size) -> Tensor:
    """
    Tests the parallel_for functionality by using it to fill a tensor with indices.

    Args:
        size: int - size of the tensor to create
        grain_size: int - grain size for parallel_for

    Returns: Tensor - a 1D int64 tensor where each element contains its index
        (if multiple threads are used, the thread id is encoded in the upper 32 bits)
    """
    return torch.ops.libtorch_agnostic_2_10.test_parallel_for.default(size, grain_size)


def test_get_num_threads() -> int:
    """
    Tests the get_num_threads functionality by returning the number of threads
    for the parallel backend.

    Returns: int - the number of threads for the parallel backend
    """
    return torch.ops.libtorch_agnostic_2_10.test_get_num_threads.default()


def my_empty(size, dtype=None, device=None, pin_memory=None) -> Tensor:
    """
    Creates an empty tensor with the specified size, dtype, device, and pin_memory.

    Args:
        size: list[int] - size of the tensor to create
        dtype: ScalarType or None - data type of the tensor
        device: Device or None - device on which to create the tensor
        pin_memory: bool or None - whether to use pinned memory

    Returns: Tensor - an uninitialized tensor with the specified properties
    """
    return torch.ops.libtorch_agnostic_2_10.my_empty.default(
        size, dtype, device, pin_memory
    )


def my_reshape(t, shape) -> Tensor:
    """
    Returns a tensor with the same data but a different shape.

    Args:
        t: Tensor - tensor to reshape
        shape: list[int] - new shape for the tensor

    Returns: Tensor - reshaped tensor
    """
    return torch.ops.libtorch_agnostic_2_10.my_reshape.default(t, shape)


def my_view(t, size) -> Tensor:
    """
    Returns a new tensor with the same data as the input tensor but of a different shape.

    Args:
        t: Tensor - tensor to view
        size: list[int] - new size for the tensor

    Returns: Tensor - tensor with new view
    """
    return torch.ops.libtorch_agnostic_2_10.my_view.default(t, size)


def get_any_data_ptr(t, mutable) -> int:
    """
    Returns the data pointer value of the tensor.

    Args:
        t: Input tensor
        mutable: whether the data pointer qualifier is mutable or const

    Returns: int - pointer value
    """
    return torch.ops.libtorch_agnostic_2_10.get_any_data_ptr.default(t, mutable)


def get_template_any_data_ptr(t, dtype, mutable) -> int:
    """
    Returns the data pointer value of the tensor iff its dtype matches dtype.

    Args:
        t: Input tensor
        dtype: Input dtype
        mutable: whether the data pointer qualifier is mutable or const

    Returns: int - pointer value

    Raises RuntimeError when t.dtype() != dtype.
    """
    return torch.ops.libtorch_agnostic_2_10.get_template_any_data_ptr.default(
        t, dtype, mutable
    )
@@ -1,308 +0,0 @@
# Owner(s): ["module: cpp"]

"""
Unit tests to verify that each function file requires PyTorch 2.10+.

This test suite compiles each .cpp file in the csrc directory with
TORCH_TARGET_VERSION=2.9.0 and expects compilation to fail.
If compilation succeeds, it means that either

(1) The test function works with 2.9.0 and should not be in this directory.
(2) The test function tests APIs that do not have proper TORCH_FEATURE_VERSION
    guards. If this is the case and you incorrectly move the test function into
    libtorch_agnostic_2_9_extension, the libtorch_agnostic_targetting CI
    workflow will catch this.

Run this script with VERSION_COMPAT_DEBUG=1 to see compilation errors.
"""

import os
import subprocess
import tempfile
from pathlib import Path

from torch.testing._internal.common_utils import IS_WINDOWS, run_tests, TestCase
from torch.utils.cpp_extension import CUDA_HOME, include_paths as torch_include_paths


# TODO: Fix this error on Windows:
# numba.cuda.cudadrv.driver:driver.py:384 Call to cuInit results in CUDA_ERROR_NO_DEVICE
if not IS_WINDOWS:

    class FunctionVersionCompatibilityTest(TestCase):
        """Test that all function files require PyTorch 2.10+."""

        @classmethod
        def setUpClass(cls):
            """Set up test environment once for all tests."""
            cls.csrc_dir = Path(__file__).parent / "libtorch_agnostic_2_10" / "csrc"
            cls.build_dir = Path(tempfile.mkdtemp(prefix="version_check_"))

            cls.pytorch_includes = [
                f"-I{path}" for path in torch_include_paths(device_type="cpu")
            ]
            cls.cuda_includes = []
            if CUDA_HOME:
                cuda_include_path = os.path.join(CUDA_HOME, "include")
                if os.path.exists(cuda_include_path):
                    cls.cuda_includes = [f"-I{cuda_include_path}"]

            cls.cuda_available = cls._check_cuda_available()

        @classmethod
        def tearDownClass(cls):
            """Clean up build directory."""
            import shutil

            if cls.build_dir.exists():
                shutil.rmtree(cls.build_dir)

        @staticmethod
        def _check_cuda_available() -> bool:
            """Check if CUDA is available."""
            try:
                import torch

                return torch.cuda.is_available()
            except ImportError:
                return False

        def _compile_cpp_file(
            self, source_file: Path, output_file: Path
        ) -> tuple[bool, str]:
            """
            Compile a C++ file with TORCH_TARGET_VERSION=2.9.0.
            Returns (success, error_message).
            """
            torch_version_2_9 = "0x0209000000000000"

            cmd = [
                "g++",
                "-c",
                "-std=c++17",
                f"-DTORCH_TARGET_VERSION={torch_version_2_9}",
                f"-I{source_file.parent}",  # for includes in the same directory
                *self.pytorch_includes,
            ]

            # Add CUDA include paths if available
            if self.cuda_available:
                cmd.extend(self.cuda_includes)

            cmd.extend([str(source_file), "-o", str(output_file)])

            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

            if result.returncode == 0:
                return True, ""
            else:
                return False, result.stderr

        def _compile_cu_file(
            self, source_file: Path, output_file: Path
        ) -> tuple[bool, str]:
            """
            Compile a CUDA file with TORCH_TARGET_VERSION=2.9.0.
            Returns (success, error_message).
            """
            if not CUDA_HOME:
                return False, "CUDA_HOME not set"

            torch_version_2_9 = "0x0209000000000000"

            cmd = [
                os.path.join(CUDA_HOME, "bin", "nvcc"),
                "-c",
                "-std=c++17",
                f"-DTORCH_TARGET_VERSION={torch_version_2_9}",
                f"-I{source_file.parent}",  # for includes in the same directory
                *self.pytorch_includes,
                *self.cuda_includes,
            ]

            cmd.extend([str(source_file), "-o", str(output_file)])

            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

            if result.returncode == 0:
                return True, ""
            else:
                return False, result.stderr

        def _test_function_file(self, source_file: Path):
            """Test that a function file fails to compile with TORCH_TARGET_VERSION=2.9.0."""
            func_name = source_file.stem
            obj_file = self.build_dir / f"{func_name}.o"

            # Choose the appropriate compiler based on file extension
            if source_file.suffix == ".cu":
                if not self.cuda_available:
                    self.skipTest(f"CUDA not available, skipping {source_file.name}")
                success, error_msg = self._compile_cu_file(source_file, obj_file)
            else:
                success, error_msg = self._compile_cpp_file(source_file, obj_file)

            obj_file.unlink(missing_ok=True)

            # Print error details for debugging
            if not success:
                relevant_errors = self._extract_relevant_errors(error_msg)
                if relevant_errors:
                    print(f"\n Compilation errors for {func_name} (requires 2.10+):")
                    for err in relevant_errors:
                        print(f" {err}")

            self.assertFalse(
                success,
                f"Function {func_name} compiled successfully with TORCH_TARGET_VERSION=2.9.0. "
                f"This could mean two things.\n\t1. It should run with 2.9.0 and should be "
                "moved to libtorch_agnostic_2_9_extension\n\t2. The function(s) it tests do not use the "
                "proper TORCH_FEATURE_VERSION guards\n\nThe libtorch_agnostic_targetting CI workflow will "
                "verify if you incorrectly move this to the 2_9 extension instead of adding "
                "the appropriate version guards.",
            )

        def test_mv_tensor_accessor_cpu_works_with_2_9(self):
            """Test that mv_tensor_accessor_cpu.cpp compiles successfully with 2.9.0.

            This is a negative test - it ensures that a file we expect to work with 2.9.0
            actually does compile. This validates that our test infrastructure correctly
            distinguishes between files that require 2.10+ and those that don't.
            """
            cpp_file = self.csrc_dir / "mv_tensor_accessor_cpu.cpp"

            if not cpp_file.exists():
                self.skipTest(f"{cpp_file} not found - this is a test file only")

            obj_file = self.build_dir / "mv_tensor_accessor_cpu.o"
            success, error_msg = self._compile_cpp_file(cpp_file, obj_file)

            # Clean up
            obj_file.unlink(missing_ok=True)

            if not success:
                relevant_errors = self._extract_relevant_errors(error_msg)
                if relevant_errors:
                    print(
                        "\n Unexpected compilation errors for mv_tensor_accessor_cpu:"
                    )
                    for err in relevant_errors:
                        print(f"{err}")

            self.assertTrue(
                success,
                f"mv_tensor_accessor_cpu.cpp failed to compile with TORCH_TARGET_VERSION=2.9.0. "
                f"This file is expected to work with 2.9.0 since it doesn't use 2.10+ features. "
                f"Error: {error_msg}",
            )

        def test_mv_tensor_accessor_cuda_works_with_2_9(self):
            """Test that mv_tensor_accessor_cuda.cu compiles successfully with 2.9.0.

            This is a negative test - it ensures that a .cu file we expect to work with 2.9.0
            actually does compile. This validates that our test infrastructure correctly
            compiles CUDA files and distinguishes between files that require 2.10+ and those
            that don't.
            """
            if not self.cuda_available:
                self.skipTest(
                    "CUDA not available, skipping mv_tensor_accessor_cuda.cu test"
                )

            cu_file = self.csrc_dir / "mv_tensor_accessor_cuda.cu"

            if not cu_file.exists():
                self.skipTest(f"{cu_file} not found - this is a test file only")

            obj_file = self.build_dir / "cuda_kernel.o"
            success, error_msg = self._compile_cu_file(cu_file, obj_file)

            # Clean up
            obj_file.unlink(missing_ok=True)

            if not success:
                relevant_errors = self._extract_relevant_errors(error_msg)
                if relevant_errors:
                    print(
                        "\n Unexpected compilation errors for mv_tensor_accessor_cuda.cu:"
                    )
                    for err in relevant_errors:
                        print(f"{err}")

            self.assertTrue(
                success,
                f"mv_tensor_accessor_cuda.cu failed to compile with TORCH_TARGET_VERSION=2.9.0. "
                f"This file is expected to work with 2.9.0 since it doesn't use 2.10+ features. "
                f"Error: {error_msg}",
            )

        @staticmethod
        def _extract_relevant_errors(error_msg: str) -> list[str]:
            """Extract the most relevant error messages."""
            error_lines = error_msg.strip().split("\n")
            relevant_errors = []

            for line in error_lines:
                line_lower = line.lower()
                if (
                    "error:" in line_lower
                    or "undefined" in line_lower
                    or "undeclared" in line_lower
                    or "no member named" in line_lower
                ):
                    relevant_errors.append(line.strip())

            return relevant_errors

    # Dynamically create test methods for each .cpp and .cu file

    def _create_test_method_for_file(source_file: Path):
        """Create a test method for a specific source file."""

        def test_method_impl(self):
            self._test_function_file(source_file)

        # Set a descriptive name and docstring
        func_name = source_file.stem
        file_ext = source_file.suffix
        test_method_impl.__name__ = f"test_{func_name}_requires_2_10"
        test_method_impl.__doc__ = (
            f"Test that {func_name}{file_ext} requires PyTorch 2.10+"
        )

        return test_method_impl

    # Test discovery: generate a test for each .cpp and .cu file
    _csrc_dir = Path(__file__).parent / "libtorch_agnostic_2_10" / "csrc"
    if _csrc_dir.exists():
        # Collect both .cpp and .cu files, excluding those used for the
        # negative tests already defined above
        _source_files = sorted(
            [
                f
                for f in _csrc_dir.rglob("*.cpp")
                if f.name not in ("mv_tensor_accessor_cpu.cpp",)
            ]
            + [
                f
                for f in _csrc_dir.rglob("*.cu")
                if f.name not in ("mv_tensor_accessor_cuda.cu",)
            ]
        )

        for _source_file in _source_files:
            _test_method = _create_test_method_for_file(_source_file)
            setattr(
                FunctionVersionCompatibilityTest, _test_method.__name__, _test_method
            )

        del (
            _create_test_method_for_file,
            _csrc_dir,
            _source_files,
            _source_file,
            _test_method,
        )

if __name__ == "__main__":
    run_tests()
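A note on the `TORCH_TARGET_VERSION` constant used throughout: `0x0209000000000000` appears to pack the release as major in the top byte and minor in the next byte, with the remaining bits reserved. A quick sketch of that reading (the exact field layout is an assumption, not something this file defines):

```python
# Hypothetical packing helper matching the constant used above.
def encode_target_version(major: int, minor: int) -> int:
    return (major << 56) | (minor << 48)

assert encode_target_version(2, 9) == 0x0209000000000000
```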
@@ -1,21 +0,0 @@
import ctypes
from pathlib import Path

import torch


so_files = list(Path(__file__).parent.glob("_C*.so"))
assert len(so_files) == 1, f"Expected one _C*.so file, found {len(so_files)}"

# Use ctypes.CDLL instead of load_library to be able to test the unload logic;
# the code below is reduced from the load_library code.
with torch._ops.dl_open_guard():
    loaded_lib = ctypes.CDLL(so_files[0])

from . import ops


__all__ = [
    "loaded_lib",
    "ops",
]
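Keeping the raw `ctypes.CDLL` handle (rather than going through `torch.ops.load_library`) is what makes an unload test possible. A hedged sketch of what unloading might look like on Linux — the `dlclose` route and the `_handle` attribute are assumptions, not part of this file:

```python
import ctypes

# Hypothetical unload sketch (POSIX only): resolve dlclose from the process
# namespace and close the handle that ctypes.CDLL opened above.
libdl = ctypes.CDLL(None)
libdl.dlclose.argtypes = [ctypes.c_void_p]
libdl.dlclose(loaded_lib._handle)
```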
@@ -1,44 +0,0 @@
#include "kernel.h"

#include <cuda_runtime.h>
#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/ops.h>
#include <torch/csrc/stable/tensor.h>

using torch::stable::Tensor;

Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
  STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
  STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
  STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
  STD_TORCH_CHECK(
      m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
  STD_TORCH_CHECK(
      m.device() == v.device(), "m and v must be on the same device");
  Tensor res = new_empty(m, {m.size(0)});
  THO_DISPATCH_V2(
      m.scalar_type(),
      "mv_tensor_accessor_cuda",
      AT_WRAP(([&]() {
        auto resa = Accessor_cuda<scalar_t, 1>(
            reinterpret_cast<scalar_t*>(res.data_ptr()),
            res.sizes().data(),
            res.strides().data());
        auto ma = Accessor_cuda<scalar_t, 2>(
            reinterpret_cast<scalar_t*>(m.data_ptr()),
            m.sizes().data(),
            m.strides().data());
        auto va = Accessor_cuda<scalar_t, 1>(
            reinterpret_cast<scalar_t*>(v.data_ptr()),
            v.sizes().data(),
            v.strides().data());
        mv_tensor_accessor_kernel<Accessor_cuda, scalar_t>
            <<<1, 1, 0, 0>>>(resa, ma, va);
      })),
      AT_FLOATING_TYPES);
  return res;
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CUDA, m) {
  m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
}
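Note the `<<<1, 1, 0, 0>>>` launch configuration: the kernel runs with a single block and a single thread, so this file exercises the stable accessor API rather than performance. A quick correctness sketch from Python (assuming the 2.9 extension is built, loaded, and CUDA is available):

```python
import torch

m = torch.rand(4, 3, device="cuda")
v = torch.rand(3, device="cuda")
out = torch.ops.libtorch_agnostic_2_9.mv_tensor_accessor.default(m, v)
torch.testing.assert_close(out, torch.mv(m, v))
```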
@@ -1,363 +0,0 @@
import torch
from torch import Tensor


def sgd_out_of_place(param, grad, weight_decay, lr, maximize) -> Tensor:
    """
    Computes a single step of SGD on a single parameter Tensor with grad.

    Assumes:
    - param and grad are the same shape and are 1D.
    - param and grad are float and on CPU

    Args:
        param: a 1D tensor of floats
        grad: a 1D tensor of floats
        weight_decay: a python double between 0 and 1
        lr: a python double
        maximize: a python bool

    Returns:
        a 1D float Tensor the same shape as param
    """
    return torch.ops.libtorch_agnostic_2_9.sgd_out_of_place.default(
        param, grad, weight_decay, lr, maximize
    )


def identity(t) -> Tensor:
    """
    Returns the input tensor

    Args:
        t: any Tensor

    Returns:
        a Tensor, the same as input.
    """
    return torch.ops.libtorch_agnostic_2_9.identity.default(t)


def my_abs(t) -> Tensor:
    """
    Returns abs on the input tensor, outputs a new Tensor

    Args:
        t: any Tensor

    Returns:
        a Tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_abs.default(t)


def my_is_cpu(t) -> bool:
    """
    Returns is_cpu on the input tensor.

    Args:
        t: any Tensor

    Returns:
        a bool
    """
    return torch.ops.libtorch_agnostic_2_9.my_is_cpu.default(t)


def my_ones_like(tensor, device) -> Tensor:
    """
    Returns a new Tensor like the input tensor, but with all ones

    Args:
        tensor: any Tensor
        device: a device string

    Returns:
        a ones Tensor with the same dtype and shape and other attributes
        like the input tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_ones_like.default(tensor, device)


def exp_neg_is_leaf(t1, t2, t3) -> tuple[Tensor, Tensor, bool]:
    """
    Returns a Tensor, Tensor, bool tuple corresponding to the respective inputs
    t1, t2, and t3.

    Args:
        t1: Tensor
        t2: Tensor
        t3: Tensor

    Returns:
        (exp(t1), neg(t2), is_leaf(t3))
    """
    return torch.ops.libtorch_agnostic_2_9.exp_neg_is_leaf.default(t1, t2, t3)


def neg_exp(t) -> Tensor:
    """
    Returns a Tensor composing neg of exp

    Args:
        t: Tensor

    Returns: neg(exp(t))
    """
    return torch.ops.libtorch_agnostic_2_9.neg_exp.default(t)


def divide_neg_exp(t) -> Tensor:
    """
    Returns a Tensor division of neg and exp

    Args:
        t: Tensor

    Returns: divide(neg(t), exp(t))
    """
    return torch.ops.libtorch_agnostic_2_9.divide_neg_exp.default(t)


def is_contiguous(t) -> bool:
    """
    Returns a bool indicating if the input tensor is contiguous

    Args:
        t: Tensor

    Returns: is_contiguous(t)
    """
    return torch.ops.libtorch_agnostic_2_9.is_contiguous.default(t)


def my_transpose(t, dim0, dim1) -> Tensor:
    """
    Returns t.transpose(dim0, dim1)

    Args:
        t: Tensor

    Returns: my_transpose(t, dim0, dim1)
    """
    return torch.ops.libtorch_agnostic_2_9.my_transpose.default(t, dim0, dim1)


def my_empty_like(t) -> Tensor:
    """
    Returns t.empty_like()

    Args:
        t: Tensor

    Returns: my_empty_like(t)
    """
    return torch.ops.libtorch_agnostic_2_9.my_empty_like.default(t)


def my_zero_(t) -> Tensor:
    """
    Returns t.zero_()

    Args:
        t: Tensor

    Returns: my_zero_(t)
    """
    return torch.ops.libtorch_agnostic_2_9.my_zero_.default(t)


def my_amax(t) -> Tensor:
    """
    Returns t.amax()

    Args:
        t: Tensor

    Returns: amax(t)
    """
    return torch.ops.libtorch_agnostic_2_9.my_amax.default(t)


def my_amax_vec(t) -> Tensor:
    """
    Returns t.amax()

    Args:
        t: Tensor

    Returns: amax(t)
    """
    return torch.ops.libtorch_agnostic_2_9.my_amax_vec.default(t)


def fill_infinity(t) -> Tensor:
    """
    Fills the tensor with inf.

    Args:
        t: Tensor to fill

    Returns: The modified tensor (same as input)
    """
    return torch.ops.libtorch_agnostic_2_9.fill_infinity.default(t)


def test_default_constructor(defined) -> bool:
    """
    Tests the default constructor for torch::stable::Tensor.

    Args:
        defined: bool - if True, tests defined tensor; if False, tests undefined tensor

    Returns: bool - result of calling .defined() on the tensor
    """
    return torch.ops.libtorch_agnostic_2_9.test_default_constructor.default(defined)


def mv_tensor_accessor(m, v) -> Tensor:
    """
    Returns matrix-vector product.

    Args:
        m: any 2-D Tensor with shape (N, M)
        v: any 1-D Tensor with shape (M,)

    Returns:
        a 1-D Tensor with shape (N,)
    """
    return torch.ops.libtorch_agnostic_2_9.mv_tensor_accessor.default(m, v)


def my_pad(t) -> Tensor:
    """
    Pads the input tensor with hardcoded padding parameters.

    Args:
        t: Input tensor

    Returns: Padded tensor with padding [1, 2, 2, 1], mode "constant", value 0.0
    """
    return torch.ops.libtorch_agnostic_2_9.my_pad.default(t)


def my_narrow(t, dim, start, length) -> Tensor:
    """
    Returns a new tensor that is a narrowed version of the input tensor.

    Args:
        t: Input tensor
        dim: Dimension along which to narrow
        start: Starting position
        length: Length of the narrowed section

    Returns: Narrowed tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_narrow.default(t, dim, start, length)


def my_copy_(dst, src, non_blocking) -> Tensor:
    """
    Returns tensor dst that is updated with src elements.

    Args:
        dst: Destination tensor
        src: Source tensor
        non_blocking: bool

    Returns: Updated tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_copy_.default(dst, src, non_blocking)


def my_clone(t) -> Tensor:
    """
    Returns a clone of input tensor.

    Args:
        t: Input tensor

    Returns: Cloned tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_clone.default(t)


def test_device_guard(device_index) -> int:
    """
    Tests the DeviceGuard functionality by creating a device guard and returning an empty tensor.

    Args:
        device_index: Device index to set the guard to

    Returns: result of cudaGetDevice() as an integer after using the guard
    """
    return torch.ops.libtorch_agnostic_2_9.test_device_guard.default(device_index)


def test_device_guard_set_index() -> int:
    """
    Tests the DeviceGuard set_index functionality by creating a device guard with index 1,
    then setting it to index 0, and returning the current device.

    Returns: result of cudaGetDevice() as an integer after using set_index
    """
    return torch.ops.libtorch_agnostic_2_9.test_device_guard_set_index.default()


def test_stream(device_index) -> int:
    """
    Tests the Stream functionality by getting the current stream ID for the specified device.

    Args:
        device_index: Device index to get the stream for

    Returns: Stream ID as an integer
    """
    return torch.ops.libtorch_agnostic_2_9.test_stream.default(device_index)


def test_get_current_device_index() -> int:
    """
    Tests the getCurrentDeviceIndex functionality by getting the current device index.

    Returns: Current device index as an integer
    """
    return torch.ops.libtorch_agnostic_2_9.test_get_current_device_index.default()


def my_new_empty_dtype_variant(t) -> Tensor:
    """
    Returns a new empty tensor with shape [2, 5] and dtype bfloat16

    Args:
        t: Input tensor used as a reference for device and other properties

    Returns: New empty tensor with shape [2, 5] and dtype bfloat16
    """
    return torch.ops.libtorch_agnostic_2_9.my_new_empty_dtype_variant.default(t)


def my_new_zeros_dtype_variant(t) -> Tensor:
    """
    Returns a new tensor filled with 0s with shape [2, 5] and dtype Float

    Args:
        t: Input tensor used as a reference for device and other properties

    Returns: New zeros tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_new_zeros_dtype_variant.default(t)


def my_flatten(t, start_dim=0, end_dim=-1) -> Tensor:
    """
    Flattens the input tensor from start_dim to end_dim into a single dimension.

    Args:
        t: Tensor - tensor to flatten
        start_dim: int - first dimension to flatten (default: 0)
        end_dim: int - last dimension to flatten (default: -1)

    Returns: Tensor - flattened tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_flatten.default(t, start_dim, end_dim)
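As a sanity check on `sgd_out_of_place`, a hedged sketch comparing it against the usual SGD-with-weight-decay update, `param - lr * (grad + weight_decay * param)` when `maximize` is False (the update rule is my assumption; the op's C++ implementation is the source of truth):

```python
import torch
import libtorch_agnostic_2_9  # assumption: extension built and importable

param, grad = torch.rand(5), torch.rand(5)
out = libtorch_agnostic_2_9.ops.sgd_out_of_place(param, grad, 0.01, 0.1, False)
expected = param - 0.1 * (grad + 0.01 * param)  # assumed update rule
torch.testing.assert_close(out, expected)
```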
@@ -1,82 +0,0 @@
import distutils.command.clean
import shutil
from pathlib import Path

from setuptools import find_packages, setup

import torch
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension


ROOT_DIR = Path(__file__).parent
CSRC_DIR = ROOT_DIR / "libtorch_agnostic_2_9" / "csrc"


class clean(distutils.command.clean.clean):
    def run(self):
        # Run default behavior first
        distutils.command.clean.clean.run(self)

        # Remove extension
        for path in (ROOT_DIR / "libtorch_agnostic_2_9").glob("**/*.so"):
            path.unlink()
        # Remove build, dist, and egg-info directories
        dirs = [
            ROOT_DIR / "build",
            ROOT_DIR / "dist",
            ROOT_DIR / "libtorch_agnostic_2_9.egg-info",
        ]
        for path in dirs:
            if path.exists():
                shutil.rmtree(str(path), ignore_errors=True)


def get_extension():
    extra_compile_args = {
        "cxx": [
            "-fdiagnostics-color=always",
            "-DTORCH_STABLE_ONLY",
            "-DTORCH_TARGET_VERSION=0x0209000000000000",
        ],
    }
    sources = list(CSRC_DIR.glob("**/*.cpp"))

    extension = CppExtension
    # allow including <cuda_runtime.h>
    if torch.cuda.is_available():
        extra_compile_args["cxx"].append("-DLAE_USE_CUDA")
        extra_compile_args["nvcc"] = [
            "-O2",
            "-DTORCH_TARGET_VERSION=0x0209000000000000",
        ]
        extension = CUDAExtension
        sources.extend(CSRC_DIR.glob("**/*.cu"))

    return [
        extension(
            "libtorch_agnostic_2_9._C",
            sources=sorted(str(s) for s in sources),
            py_limited_api=True,
            extra_compile_args=extra_compile_args,
            extra_link_args=[],
        )
    ]


setup(
    name="libtorch_agnostic_2_9",
    version="0.0",
    author="PyTorch Core Team",
    description="Example of libtorch agnostic extension for PyTorch 2.9",
    packages=find_packages(exclude=("test",)),
    package_data={"libtorch_agnostic_2_9": ["*.dll", "*.dylib", "*.so"]},
    install_requires=[
        "torch",
    ],
    ext_modules=get_extension(),
    cmdclass={
        "build_ext": BuildExtension.with_options(no_python_abi_suffix=True),
        "clean": clean,
    },
    options={"bdist_wheel": {"py_limited_api": "cp39"}},
)
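This setup.py builds like any other cpp_extension package, e.g. `pip install -e .` from the extension directory. `py_limited_api=True` together with the `{"bdist_wheel": {"py_limited_api": "cp39"}}` option produces a CPython-3.9+ stable-ABI wheel, while `-DTORCH_TARGET_VERSION=0x0209000000000000` restricts the extension to the libtorch surface that existed in 2.9.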
@@ -0,0 +1,30 @@
#include "kernel.h"

#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>
#include <cuda_runtime.h>

using torch::stable::Tensor;

Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
  STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
  STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
  STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
  STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
  STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device");
  Tensor res = new_empty(m, {m.size(0)});
  THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cuda",
                  AT_WRAP(([&]() {
                    auto resa = Accessor_cuda<scalar_t, 1>(reinterpret_cast<scalar_t*>(res.data_ptr()), res.sizes().data(), res.strides().data());
                    auto ma = Accessor_cuda<scalar_t, 2>(reinterpret_cast<scalar_t*>(m.data_ptr()), m.sizes().data(), m.strides().data());
                    auto va = Accessor_cuda<scalar_t, 1>(reinterpret_cast<scalar_t*>(v.data_ptr()), v.sizes().data(), v.strides().data());
                    mv_tensor_accessor_kernel<Accessor_cuda, scalar_t><<<1, 1, 0, 0>>>(resa, ma, va);
                  })),
                  AT_FLOATING_TYPES);
  return res;
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CUDA, m) {
  m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
}
@@ -1,6 +1,8 @@
 #include "kernel.h"

 #include <torch/csrc/inductor/aoti_torch/c/shim.h>
+#include <torch/csrc/stable/accelerator.h>
+#include <torch/csrc/stable/device.h>
 #include <torch/csrc/stable/library.h>
 #include <torch/csrc/stable/tensor.h>
 #include <torch/csrc/stable/ops.h>
@@ -9,7 +11,6 @@

 #ifdef LAE_USE_CUDA
 #include <cuda_runtime.h>
-#include <torch/csrc/stable/accelerator.h>
 #endif

 #include <optional>
@@ -67,11 +68,11 @@ Tensor sgd_out_of_place(
   return out;
 }

-STABLE_TORCH_LIBRARY(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY(libtorch_agnostic, m) {
   m.def("sgd_out_of_place(Tensor param, Tensor grad, float weight_decay, float lr, bool maximize) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CPU, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
   m.impl("sgd_out_of_place", TORCH_BOX(&sgd_out_of_place));
 }
@@ -80,15 +81,15 @@ Tensor identity(Tensor t) {
 }


-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
   m.def("identity(Tensor t) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CUDA, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CUDA, m) {
   m.impl("identity", TORCH_BOX(&identity));
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CPU, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
   m.impl("identity", TORCH_BOX(&identity));
 }
@@ -100,11 +101,11 @@ Tensor my_abs(Tensor t) {
   return torch::stable::detail::to<Tensor>(stack[0]);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
   m.def("my_abs(Tensor t) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
   m.impl("my_abs", TORCH_BOX(&my_abs));
 }
@@ -126,11 +127,11 @@ Tensor my_ones_like(Tensor t, StableIValue device) {
   return torch::stable::detail::to<Tensor>(stack[0]);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
   m.def("my_ones_like(Tensor t, Device d) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
   m.impl("my_ones_like", TORCH_BOX(&my_ones_like));
 }
@@ -153,11 +154,11 @@ std::tuple<Tensor, Tensor, bool> exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3
       torch::stable::detail::to<bool>(stack_is_leaf[0]));
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
   m.def("exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3) -> (Tensor, Tensor, bool)");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
   m.impl("exp_neg_is_leaf", TORCH_BOX(&exp_neg_is_leaf));
 }
@@ -169,11 +170,11 @@ Tensor neg_exp(Tensor t) {
   return torch::stable::detail::to<Tensor>(stack[0]);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
   m.def("neg_exp(Tensor t) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
   m.impl("neg_exp", TORCH_BOX(&neg_exp));
 }
@@ -193,11 +194,11 @@ Tensor divide_neg_exp(Tensor t) {
   return torch::stable::detail::to<Tensor>(stack_div[0]);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
   m.def("divide_neg_exp(Tensor t) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
   m.impl("divide_neg_exp", TORCH_BOX(&divide_neg_exp));
 }
@@ -205,11 +206,11 @@ bool is_contiguous(Tensor t) {
   return t.is_contiguous();
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
   m.def("is_contiguous(Tensor t) -> bool");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
   m.impl("is_contiguous", TORCH_BOX(&is_contiguous));
 }
@@ -264,7 +265,7 @@ Tensor my_clone(Tensor t) {
   return clone(t);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
   m.def("my_transpose(Tensor t, int dim0, int dim1) -> Tensor");
   m.def("my_empty_like(Tensor t) -> Tensor");
   m.def("fill_infinity(Tensor(a!) t) -> Tensor(a!)");
@@ -276,7 +277,7 @@ STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
   m.def("my_clone(Tensor t) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
   m.impl("my_transpose", TORCH_BOX(&my_transpose));
   m.impl("my_empty_like", TORCH_BOX(&my_empty_like));
   m.impl("fill_infinity", TORCH_BOX(&fill_infinity));
@@ -287,7 +288,7 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
   m.impl("my_clone", TORCH_BOX(&my_clone));
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeImplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeImplicitAutograd, m) {
   m.impl("my_pad", TORCH_BOX(&my_pad));
   m.impl("my_narrow", TORCH_BOX(&my_narrow));
 }
@@ -304,12 +305,12 @@ Tensor my_amax_vec(Tensor t) {
   return amax(t, {0,1}, false);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
   m.def("my_zero_(Tensor(a!) t) -> Tensor(a!)");
   m.def("my_amax(Tensor a) -> Tensor");
   m.def("my_amax_vec(Tensor a) -> Tensor");
   m.def("my_is_cpu(Tensor t) -> bool");
   m.def("test_default_constructor(bool undefined) -> bool");
 }

 bool test_default_constructor(bool defined) {
@@ -331,13 +332,190 @@ bool test_default_constructor(bool defined) {
   return out.defined();
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
   m.impl("my_zero_", TORCH_BOX(&my_zero_));
   m.impl("my_amax", TORCH_BOX(&my_amax));
   m.impl("my_amax_vec", TORCH_BOX(&my_amax_vec));
   m.impl("test_default_constructor", TORCH_BOX(&test_default_constructor));
 }

+std::vector<Tensor> my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
+  std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
+  aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack.data());
+  return torch::stable::detail::to<std::vector<Tensor>>(stack[0]);
+}
+
+void my__foreach_mul_(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
+  std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
+  aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack.data());
+}
+
+std::vector<Tensor> make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) {
+  // This function tests that my__foreach_mul can take in std::initializer_lists
+  // in addition to std::vectors.
+  Tensor t1_1 = my_clone(t1);
+  Tensor t1_2 = my_clone(t1);
+  Tensor t2_1 = my_clone(t2);
+  Tensor t2_2 = my_clone(t2);
+  return my__foreach_mul({t1_1, t2_1}, {t1_2, t2_2});
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
+  m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
+  m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
+  m.def("make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
+  m.impl("my__foreach_mul", TORCH_BOX(&my__foreach_mul));
+  m.impl("my__foreach_mul_", TORCH_BOX(&my__foreach_mul_));
+  m.impl("make_tensor_clones_and_call_foreach", TORCH_BOX(&make_tensor_clones_and_call_foreach));
+}
+
+// Test functions for torch::stable::Tensor device method
+
+torch::stable::Device test_tensor_device(torch::stable::Tensor tensor) {
+  return tensor.device();
+}
+
+void boxed_test_tensor_device(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  torch::stable::Device res = test_tensor_device(
+      torch::stable::detail::to<torch::stable::Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+// Test functions for torch::stable::Device
+
+torch::stable::Device test_device_constructor(
+    bool is_cuda,
+    torch::stable::DeviceIndex index,
+    bool use_str) {
+  using torch::stable::Device;
+  using torch::stable::DeviceType;
+
+  if (use_str) {
+    std::string device_str;
+    if (is_cuda) {
+      device_str = "cuda:" + std::to_string(index);
+    } else {
+      device_str = "cpu";
+    }
+    return Device(device_str);
+  } else {
+    if (is_cuda) {
+      return Device(DeviceType::CUDA, index);
+    } else {
+      return Device(DeviceType::CPU);
+    }
+  }
+}
+
+void boxed_test_device_constructor(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  torch::stable::Device res = test_device_constructor(
+      torch::stable::detail::to<bool>(stack[0]),
+      torch::stable::detail::to<torch::stable::DeviceIndex>(stack[1]),
+      torch::stable::detail::to<bool>(stack[2]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+bool test_device_equality(torch::stable::Device d1, torch::stable::Device d2) {
+  return d1 == d2;
+}
+
+void boxed_test_device_equality(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  bool res = test_device_equality(
+      torch::stable::detail::to<torch::stable::Device>(stack[0]),
+      torch::stable::detail::to<torch::stable::Device>(stack[1]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+torch::stable::Device test_device_set_index(
+    torch::stable::Device device,
+    torch::stable::DeviceIndex index) {
+  device.set_index(index);
+  return device;
+}
+
+void boxed_test_device_set_index(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  torch::stable::Device res = test_device_set_index(
+      torch::stable::detail::to<torch::stable::Device>(stack[0]),
+      torch::stable::detail::to<torch::stable::DeviceIndex>(stack[1]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+torch::stable::DeviceIndex test_device_index(torch::stable::Device device) {
+  return device.index();
+}
+
+void boxed_test_device_index(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  torch::stable::DeviceIndex res = test_device_index(
+      torch::stable::detail::to<torch::stable::Device>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+bool test_device_is_cuda(torch::stable::Device device) {
+  return device.is_cuda();
+}
+
+void boxed_test_device_is_cuda(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  bool res = test_device_is_cuda(
+      torch::stable::detail::to<torch::stable::Device>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+bool test_device_is_cpu(torch::stable::Device device) {
+  return device.is_cpu();
+}
+
+void boxed_test_device_is_cpu(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  bool res = test_device_is_cpu(
+      torch::stable::detail::to<torch::stable::Device>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
+  m.def("test_tensor_device(Tensor t) -> Device");
+  m.def(
+      "test_device_constructor(bool is_cuda, DeviceIndex index, bool use_str) -> Device");
+  m.def("test_device_equality(Device d1, Device d2) -> bool");
+  m.def("test_device_set_index(Device device, DeviceIndex index) -> Device");
+  m.def("test_device_index(Device device) -> DeviceIndex");
+  m.def("test_device_is_cuda(Device device) -> bool");
+  m.def("test_device_is_cpu(Device device) -> bool");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
+  m.impl("test_tensor_device", &boxed_test_tensor_device);
+  m.impl("test_device_constructor", &boxed_test_device_constructor);
+  m.impl("test_device_equality", &boxed_test_device_equality);
+  m.impl("test_device_set_index", &boxed_test_device_set_index);
+  m.impl("test_device_index", &boxed_test_device_index);
+  m.impl("test_device_is_cuda", &boxed_test_device_is_cuda);
+  m.impl("test_device_is_cpu", &boxed_test_device_is_cpu);
+}
+
 Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
   STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
   STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
@@ -356,11 +534,11 @@ Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
   return res;
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
   m.def("mv_tensor_accessor(Tensor m, Tensor v) -> Tensor");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CPU, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
   m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cpu));
 }
@@ -406,14 +584,14 @@ int64_t test_get_current_device_index() {
   return torch::stable::accelerator::getCurrentDeviceIndex();
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
   m.def("test_device_guard(int device_index) -> int");
   m.def("test_device_guard_set_index() -> int");
   m.def("test_stream(int device_index) -> int");
   m.def("test_get_current_device_index() -> int");
 }

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
   m.impl("test_device_guard", TORCH_BOX(&test_device_guard));
   m.impl("test_device_guard_set_index", TORCH_BOX(&test_device_guard_set_index));
   m.impl("test_stream", TORCH_BOX(&test_stream));
@@ -422,14 +600,100 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {

 #endif // LAE_USE_CUDA

+Tensor test_parallel_for(int64_t size, int64_t grain_size) {
+  AtenTensorHandle tensor_handle;
+  int64_t stride = 1;
+
+  aoti_torch_empty_strided(
+      1,
+      &size,
+      &stride,
+      aoti_torch_dtype_int64(),
+      aoti_torch_device_type_cpu(),
+      0,
+      &tensor_handle);
+
+  Tensor tensor(tensor_handle);
+  int64_t* data_ptr = reinterpret_cast<int64_t*>(tensor.data_ptr());
+
+  torch::stable::zero_(tensor);
+
+  // Use parallel_for to fill each element with its index
+  // If using a parallel path, the thread id is encoded in the upper 32 bits
+  torch::stable::parallel_for(
+      0, size, grain_size, [data_ptr](int64_t begin, int64_t end) {
+        for (auto i = begin; i < end; i++) {
+          STD_TORCH_CHECK(i <= UINT32_MAX);
+          uint32_t thread_id;
+          torch_get_thread_idx(&thread_id);
+          data_ptr[i] = i | (static_cast<int64_t>(thread_id) << 32);
+        }
+      });
+
+  return tensor;
+}
+
+void boxed_test_parallel_for(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  Tensor res = test_parallel_for(to<int64_t>(stack[0]), to<int64_t>(stack[1]));
+  stack[0] = from(res);
+}
+
+uint32_t test_get_num_threads() {
+  return torch::stable::get_num_threads();
+}
+
+void boxed_test_get_num_threads(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  uint32_t res = test_get_num_threads();
+  stack[0] = from(res);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
+  m.def("test_parallel_for(int size, int grain_size) -> Tensor");
+  m.def("test_get_num_threads() -> int");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
+  m.impl("test_parallel_for", &boxed_test_parallel_for);
+  m.impl("test_get_num_threads", &boxed_test_get_num_threads);
+}
+
+Tensor my_empty(
+    torch::headeronly::HeaderOnlyArrayRef<int64_t> size,
+    std::optional<torch::headeronly::ScalarType> dtype,
+    std::optional<torch::stable::Device> device,
+    std::optional<bool> pin_memory) {
+  return empty(size, dtype, device, pin_memory);
+}
+
 Tensor my_flatten(Tensor t, int64_t start_dim, int64_t end_dim) {
   return flatten(t, start_dim, end_dim);
 }

-STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
-  m.def("my_flatten(Tensor t, int start_dim=0, int end_dim=-1) -> Tensor");
-}
+Tensor my_reshape(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> shape) {
+  return reshape(t, shape);
+}

-STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
-  m.impl("my_flatten", TORCH_BOX(&my_flatten));
-}
+Tensor my_view(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> size) {
+  return view(t, size);
+}

+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
+  m.def(
+      "my_empty(int[] size, ScalarType? dtype=None, Device? device=None, bool? pin_memory=None) -> Tensor");
+  m.def("my_flatten(Tensor t, int start_dim=0, int end_dim=-1) -> Tensor");
+  m.def("my_reshape(Tensor t, int[] shape) -> Tensor");
+  m.def("my_view(Tensor t, int[] size) -> Tensor");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
+  m.impl("my_empty", TORCH_BOX(&my_empty));
+  m.impl("my_flatten", TORCH_BOX(&my_flatten));
+  m.impl("my_reshape", TORCH_BOX(&my_reshape));
+  m.impl("my_view", TORCH_BOX(&my_view));
+}
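On the Python side, the packing that `test_parallel_for` performs can be undone with plain bit operations; a short sketch (the ops are registered under `libtorch_agnostic`, as shown above):

```python
import torch

out = torch.ops.libtorch_agnostic.test_parallel_for.default(1000, 10)
indices = out & 0xFFFFFFFF  # low 32 bits: the element's own index
thread_ids = out >> 32      # high 32 bits: the worker thread that wrote it
assert torch.equal(indices, torch.arange(1000))
```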
@@ -0,0 +1,558 @@
import torch
from torch import Tensor


def sgd_out_of_place(param, grad, weight_decay, lr, maximize) -> Tensor:
    """
    Computes a single step of SGD on a single parameter Tensor with grad.

    Assumes:
    - param and grad are the same shape and are 1D.
    - param and grad are float and on CPU

    Args:
        param: a 1D tensor of floats
        grad: a 1D tensor of floats
        weight_decay: a python double between 0 and 1
        lr: a python double
        maximize: a python bool

    Returns:
        a 1D float Tensor the same shape as param
    """
    return torch.ops.libtorch_agnostic.sgd_out_of_place.default(
        param, grad, weight_decay, lr, maximize
    )


def identity(t) -> Tensor:
    """
    Returns the input tensor

    Args:
        t: any Tensor

    Returns:
        a Tensor, the same as input.
    """
    return torch.ops.libtorch_agnostic.identity.default(t)


def my_abs(t) -> Tensor:
    """
    Returns abs on the input tensor, outputs a new Tensor

    Args:
        t: any Tensor

    Returns:
        a Tensor
    """
    return torch.ops.libtorch_agnostic.my_abs.default(t)


def my_is_cpu(t) -> bool:
    """
    Returns is_cpu on the input tensor.

    Args:
        t: any Tensor

    Returns:
        a bool
    """
    return torch.ops.libtorch_agnostic.my_is_cpu.default(t)


def my_ones_like(tensor, device) -> Tensor:
    """
    Returns a new Tensor like the input tensor, but with all ones

    Args:
        tensor: any Tensor
        device: a device string

    Returns:
        a ones Tensor with the same dtype and shape and other attributes
        like the input tensor
    """
    return torch.ops.libtorch_agnostic.my_ones_like.default(tensor, device)


def exp_neg_is_leaf(t1, t2, t3) -> tuple[Tensor, Tensor, bool]:
    """
    Returns a Tensor, Tensor, bool tuple corresponding to the respective inputs
    t1, t2, and t3.

    Args:
        t1: Tensor
        t2: Tensor
        t3: Tensor

    Returns:
        (exp(t1), neg(t2), is_leaf(t3))
    """
    return torch.ops.libtorch_agnostic.exp_neg_is_leaf.default(t1, t2, t3)


def neg_exp(t) -> Tensor:
    """
    Returns a Tensor composing neg of exp

    Args:
        t: Tensor

    Returns: neg(exp(t))
    """
    return torch.ops.libtorch_agnostic.neg_exp.default(t)


def divide_neg_exp(t) -> Tensor:
    """
    Returns a Tensor division of neg and exp

    Args:
        t: Tensor

    Returns: divide(neg(t), exp(t))
    """
    return torch.ops.libtorch_agnostic.divide_neg_exp.default(t)


def is_contiguous(t) -> bool:
    """
    Returns a bool indicating if the input tensor is contiguous

    Args:
        t: Tensor

    Returns: is_contiguous(t)
    """
    return torch.ops.libtorch_agnostic.is_contiguous.default(t)


def my_transpose(t, dim0, dim1) -> Tensor:
    """
    Returns t.transpose(dim0, dim1)

    Args:
        t: Tensor

    Returns: my_transpose(t, dim0, dim1)
    """
    return torch.ops.libtorch_agnostic.my_transpose.default(t, dim0, dim1)


def my_empty_like(t) -> Tensor:
    """
    Returns t.empty_like()

    Args:
        t: Tensor

    Returns: my_empty_like(t)
    """
    return torch.ops.libtorch_agnostic.my_empty_like.default(t)


def my_zero_(t) -> Tensor:
    """
    Returns t.zero_()

    Args:
        t: Tensor

    Returns: my_zero_(t)
    """
    return torch.ops.libtorch_agnostic.my_zero_.default(t)


def my_amax(t) -> Tensor:
    """
    Returns t.amax()

    Args:
        t: Tensor

    Returns: amax(t)
    """
    return torch.ops.libtorch_agnostic.my_amax.default(t)


def my_amax_vec(t) -> Tensor:
    """
    Returns t.amax()

    Args:
        t: Tensor

    Returns: amax(t)
    """
    return torch.ops.libtorch_agnostic.my_amax_vec.default(t)


def fill_infinity(t) -> Tensor:
    """
    Fills the tensor with inf.

    Args:
        t: Tensor to fill

    Returns: The modified tensor (same as input)
    """
    return torch.ops.libtorch_agnostic.fill_infinity.default(t)


def test_default_constructor(defined) -> bool:
    """
    Tests the default constructor for torch::stable::Tensor.

    Args:
        defined: bool - if True, tests defined tensor; if False, tests undefined tensor

    Returns: bool - result of calling .defined() on the tensor
    """
    return torch.ops.libtorch_agnostic.test_default_constructor.default(defined)


def test_tensor_device(t):
    """
    Tests Tensor device() method.

    Args:
        t: Tensor - tensor to get device from

    Returns: Device - device of the tensor
    """
    return torch.ops.libtorch_agnostic.test_tensor_device.default(t)


def my_pad(t) -> Tensor:
    """
    Pads the input tensor with hardcoded padding parameters.

    Args:
        t: Input tensor

    Returns: Padded tensor with padding [1, 2, 2, 1], mode "constant", value 0.0
    """
    return torch.ops.libtorch_agnostic.my_pad.default(t)


def my_narrow(t, dim, start, length) -> Tensor:
    """
    Returns a new tensor that is a narrowed version of the input tensor.

    Args:
        t: Input tensor
        dim: Dimension along which to narrow
        start: Starting position
        length: Length of the narrowed section

    Returns: Narrowed tensor
    """
    return torch.ops.libtorch_agnostic.my_narrow.default(t, dim, start, length)


def my_copy_(dst, src, non_blocking) -> Tensor:
    """
    Returns tensor dst that is updated with src elements.

    Args:
        dst: Destination tensor
        src: Source tensor
        non_blocking: bool

    Returns: Updated tensor
    """
    return torch.ops.libtorch_agnostic.my_copy_.default(dst, src, non_blocking)


def my_clone(t) -> Tensor:
    """
    Returns a clone of input tensor.

    Args:
        t: Input tensor

    Returns: Cloned tensor
    """
    return torch.ops.libtorch_agnostic.my_clone.default(t)


def test_device_guard(device_index) -> int:
    """
    Tests the DeviceGuard functionality by creating a device guard and returning an empty tensor.

    Args:
        device_index: Device index to set the guard to

    Returns: result of cudaGetDevice() as an integer after using the guard
    """
    return torch.ops.libtorch_agnostic.test_device_guard.default(device_index)


def test_device_guard_set_index() -> int:
    """
    Tests the DeviceGuard set_index functionality by creating a device guard with index 1,
    then setting it to index 0, and returning the current device.

    Returns: result of cudaGetDevice() as an integer after using set_index
    """
    return torch.ops.libtorch_agnostic.test_device_guard_set_index.default()


def test_stream(device_index) -> int:
    """
    Tests the Stream functionality by getting the current stream ID for the specified device.

    Args:
        device_index: Device index to get the stream for

    Returns: Stream ID as an integer
    """
    return torch.ops.libtorch_agnostic.test_stream.default(device_index)


def test_get_current_device_index() -> int:
    """
    Tests the getCurrentDeviceIndex functionality by getting the current device index.

    Returns: Current device index as an integer
    """
    return torch.ops.libtorch_agnostic.test_get_current_device_index.default()


def my_new_empty_dtype_variant(t) -> Tensor:
    """
    Returns a new empty tensor with shape [2, 5] and dtype bfloat16

    Args:
        t: Input tensor used as a reference for device and other properties

    Returns: New empty tensor with shape [2, 5] and dtype bfloat16
    """
    return torch.ops.libtorch_agnostic.my_new_empty_dtype_variant.default(t)


def my_new_zeros_dtype_variant(t) -> Tensor:
    """
    Returns a new tensor filled with 0s with shape [2, 5] and dtype Float

    Args:
        t: Input tensor used as a reference for device and other properties

    Returns: New zeros tensor
    """
    return torch.ops.libtorch_agnostic.my_new_zeros_dtype_variant.default(t)


def my__foreach_mul_(tensors, others) -> ():
    """
    Updates tensors to be the result of pointwise multiplying with others.

    Args:
        tensors: list of tensors
        others: list of tensors (with the same corresponding shapes as tensors)

    Returns: nothing, tensors is updated in place.
    """
    torch.ops.libtorch_agnostic.my__foreach_mul_.default(tensors, others)


def my__foreach_mul(tensors, others) -> list[Tensor]:
    """
    Returns a list of tensors that are the results of pointwise multiplying
    tensors and others.

    Args:
        tensors: list of tensors
        others: list of tensors (with the same corresponding shapes as tensors)

    Returns: list of multiplied tensors
    """
    return torch.ops.libtorch_agnostic.my__foreach_mul.default(tensors, others)


def make_tensor_clones_and_call_foreach(t1, t2) -> list[Tensor]:
    """
    Returns a list of 2 tensors corresponding to the square of the inputs.

    Args:
        t1: Tensor
        t2: Tensor

    Returns: list of [t1^2, t2^2]
    """
    return torch.ops.libtorch_agnostic.make_tensor_clones_and_call_foreach.default(
        t1, t2
    )
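A short usage sketch for the foreach wrappers above (assuming the extension is importable as `libtorch_agnostic`, matching the __init__.py layout shown earlier):

```python
import torch
import libtorch_agnostic  # assumption: built and importable

a, b = torch.rand(3), torch.rand(3)
prods = libtorch_agnostic.ops.my__foreach_mul([a, b], [a, b])  # out of place
libtorch_agnostic.ops.my__foreach_mul_([a, b], [a, b])  # in place: a *= a, b *= b
torch.testing.assert_close(prods[0], a)  # a now equals its original square
```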
def test_device_constructor(is_cuda, index, use_str):
    """
    Tests creating a Device from DeviceType and index, or from a string.

    Args:
        is_cuda: bool - if True, creates CUDA device; if False, creates CPU device
        index: int - device index
        use_str: bool - if True, constructs from string; if False, constructs from DeviceType

    Returns: Device - A device with the specified type and index
    """
    return torch.ops.libtorch_agnostic.test_device_constructor.default(
        is_cuda, index, use_str
    )


def test_device_equality(d1, d2) -> bool:
    """
    Tests Device equality operator.

    Args:
        d1: Device - first device
        d2: Device - second device

    Returns: bool - True if devices are equal
    """
    return torch.ops.libtorch_agnostic.test_device_equality.default(d1, d2)


def test_device_set_index(device, index):
    """
    Tests Device set_index() method.

    Args:
        device: Device - device to modify
        index: int - new device index

    Returns: Device - device with updated index
    """
    return torch.ops.libtorch_agnostic.test_device_set_index.default(device, index)


def test_device_index(device) -> int:
    """
    Tests Device index() method.

    Args:
        device: Device - device to query

    Returns: int - device index
    """
    return torch.ops.libtorch_agnostic.test_device_index.default(device)


def test_device_is_cuda(device) -> bool:
    """
    Tests Device is_cuda() method.

    Args:
        device: Device - device to check

    Returns: bool - True if device is CUDA
    """
    return torch.ops.libtorch_agnostic.test_device_is_cuda.default(device)


def test_device_is_cpu(device) -> bool:
    """
    Tests Device is_cpu() method.

    Args:
        device: Device - device to check

    Returns: bool - True if device is CPU
    """
    return torch.ops.libtorch_agnostic.test_device_is_cpu.default(device)


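# Sketch tying the Device helpers together (illustrative; assumes the
# extension is loaded and at least two CUDA devices are visible):
#
#   d = test_device_constructor(is_cuda=True, index=1, use_str=False)
#   assert test_device_is_cuda(d) and not test_device_is_cpu(d)
#   assert test_device_index(d) == 1
#   d0 = test_device_set_index(d, 0)
#   assert test_device_equality(d0, test_device_constructor(True, 0, True))

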
def test_parallel_for(size, grain_size) -> Tensor:
    """
    Tests the parallel_for functionality by using it to fill a tensor with indices.

    Args:
        size: int - size of the tensor to create
        grain_size: int - grain size for parallel_for

    Returns: Tensor - a 1D int64 tensor where each element contains its index
        (if multiple threads are used the threadid will be encoded in the upper 32 bits)
    """
    return torch.ops.libtorch_agnostic.test_parallel_for.default(size, grain_size)


def test_get_num_threads() -> int:
    """
    Tests the get_num_threads functionality by returning the number of threads
    for the parallel backend.

    Returns: int - the number of threads for the parallel backend
    """
    return torch.ops.libtorch_agnostic.test_get_num_threads.default()


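# A worked sketch of the packing the docstring above describes (the exact
# layout is an assumption for illustration: element i holding
# (thread_id << 32) | i when more than one thread participates):
#
#   out = test_parallel_for(100, 10)
#   indices = out & 0xFFFFFFFF   # low 32 bits: each element's own index
#   thread_ids = out >> 32       # high 32 bits: id of the thread that wrote it
#   assert (indices == torch.arange(100)).all()
#   assert thread_ids.max().item() < test_get_num_threads()

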
def my_empty(size, dtype=None, device=None, pin_memory=None) -> Tensor:
    """
    Creates an empty tensor with the specified size, dtype, device, and pin_memory.

    Args:
        size: list[int] - size of the tensor to create
        dtype: ScalarType or None - data type of the tensor
        device: Device or None - device on which to create the tensor
        pin_memory: bool or None - whether to use pinned memory

    Returns: Tensor - an uninitialized tensor with the specified properties
    """
    return torch.ops.libtorch_agnostic.my_empty.default(size, dtype, device, pin_memory)


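# Usage sketch (illustrative; mirrors torch.empty, with None arguments left
# to the backend defaults):
#
#   a = my_empty([2, 3])                     # default dtype on default device
#   b = my_empty([2, 3], dtype=torch.float16, device=torch.device("cpu"))
#   assert b.dtype == torch.float16 and b.shape == (2, 3)

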
def my_flatten(t, start_dim=0, end_dim=-1) -> Tensor:
    """
    Flattens the input tensor from start_dim to end_dim into a single dimension.

    Args:
        t: Tensor - tensor to flatten
        start_dim: int - first dimension to flatten (default: 0)
        end_dim: int - last dimension to flatten (default: -1)

    Returns: Tensor - flattened tensor
    """
    return torch.ops.libtorch_agnostic.my_flatten.default(t, start_dim, end_dim)


def my_reshape(t, shape) -> Tensor:
    """
    Returns a tensor with the same data but different shape.

    Args:
        t: Tensor - tensor to reshape
        shape: list[int] - new shape for the tensor

    Returns: Tensor - reshaped tensor
    """
    return torch.ops.libtorch_agnostic.my_reshape.default(t, shape)


def my_view(t, size) -> Tensor:
    """
    Returns a new tensor with the same data as the input tensor but of a different shape.

    Args:
        t: Tensor - tensor to view
        size: list[int] - new size for the tensor

    Returns: Tensor - tensor with new view
    """
    return torch.ops.libtorch_agnostic.my_view.default(t, size)


def mv_tensor_accessor(m, v) -> Tensor:
    """
    Returns matrix-vector product.

    Args:
        m: any 2-D Tensor with shape (N, M)
        v: any 1-D Tensor with shape (M,)

    Returns:
        a 1-D Tensor with shape (N,)
    """
    return torch.ops.libtorch_agnostic.mv_tensor_accessor.default(m, v)


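# Shape-op sketch (illustrative; each helper mirrors its torch counterpart,
# so the torch calls double as the reference results):
#
#   t = torch.randn(2, 3, 4)
#   assert my_flatten(t).shape == (24,)
#   assert my_reshape(t, [4, 6]).shape == (4, 6)
#   assert my_view(t, [6, 4]).data_ptr() == t.data_ptr()  # a view shares storage
#   m, v = torch.rand(3, 5), torch.rand(5)
#   torch.testing.assert_close(mv_tensor_accessor(m, v), torch.mv(m, v))
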
@@ -9,7 +9,7 @@ from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension
 
 
 ROOT_DIR = Path(__file__).parent
-CSRC_DIR = ROOT_DIR / "libtorch_agnostic_2_10" / "csrc"
+CSRC_DIR = ROOT_DIR / "libtorch_agnostic" / "csrc"
 
 
 class clean(distutils.command.clean.clean):
@@ -18,13 +18,13 @@ class clean(distutils.command.clean.clean):
         distutils.command.clean.clean.run(self)
 
         # Remove extension
-        for path in (ROOT_DIR / "libtorch_agnostic_2_10").glob("**/*.so"):
+        for path in (ROOT_DIR / "libtorch_agnostic").glob("**/*.so"):
             path.unlink()
         # Remove build and dist and egg-info directories
         dirs = [
             ROOT_DIR / "build",
             ROOT_DIR / "dist",
-            ROOT_DIR / "libtorch_agnostic_2_10.egg-info",
+            ROOT_DIR / "libtorch_agnostic.egg-info",
         ]
         for path in dirs:
             if path.exists():
@@ -33,11 +33,7 @@ class clean(distutils.command.clean.clean):
 
 def get_extension():
     extra_compile_args = {
-        "cxx": [
-            "-fdiagnostics-color=always",
-            "-DTORCH_STABLE_ONLY",
-            "-DTORCH_TARGET_VERSION=0x020a000000000000",
-        ],
+        "cxx": ["-fdiagnostics-color=always"],
     }
     sources = list(CSRC_DIR.glob("**/*.cpp"))
 
@@ -51,7 +47,7 @@ def get_extension():
 
     return [
         extension(
-            "libtorch_agnostic_2_10._C",
+            "libtorch_agnostic._C",
             sources=sorted(str(s) for s in sources),
             py_limited_api=True,
             extra_compile_args=extra_compile_args,
@@ -61,12 +57,12 @@ def get_extension():
 
 
 setup(
-    name="libtorch_agnostic_2_10",
+    name="libtorch_agnostic",
     version="0.0",
     author="PyTorch Core Team",
-    description="Example of libtorch agnostic extension for PyTorch 2.10+",
+    description="Example of libtorch agnostic extension",
     packages=find_packages(exclude=("test",)),
-    package_data={"libtorch_agnostic_2_10": ["*.dll", "*.dylib", "*.so"]},
+    package_data={"libtorch_agnostic": ["*.dll", "*.dylib", "*.so"]},
     install_requires=[
         "torch",
     ],
@@ -1,7 +1,6 @@
 # Owner(s): ["module: cpp"]
 
 import math
-import unittest
 from pathlib import Path
 
 import torch
@@ -15,110 +14,26 @@ from torch.testing._internal.common_utils import (
     install_cpp_extension,
     IS_WINDOWS,
     run_tests,
-    skipIfTorchDynamo,
     TestCase,
     xfailIfTorchDynamo,
 )
 
 
-def get_supported_dtypes():
-    """Return a list of dtypes that are supported by torch stable ABI."""
-    return [
-        torch.int8,
-        torch.int16,
-        torch.int32,
-        torch.int64,
-        torch.uint8,
-        torch.uint16,
-        torch.uint32,
-        torch.uint64,
-        torch.bfloat16,
-        torch.float16,
-        torch.float32,
-        torch.float64,
-        torch.float8_e5m2,
-        torch.float8_e4m3fn,
-        torch.float8_e5m2fnuz,
-        torch.float8_e4m3fnuz,
-        torch.complex32,
-        torch.complex64,
-        torch.complex128,
-        torch.bool,
-    ]
-
-
-def skipIfTorchVersionLessThan(major, minor):
-    """Skip test if PyTorch version is less than specified version."""
-
-    def decorator(func):
-        version_parts = torch.__version__.split(".")
-        current_major = int(version_parts[0])
-        current_minor = int(
-            version_parts[1].split("+")[0].split("a")[0].split("b")[0].split("rc")[0]
-        )
-
-        should_skip = (current_major < major) or (
-            current_major == major and current_minor < minor
-        )
-        reason = f"Test requires PyTorch >= {major}.{minor}, current version is {torch.__version__}"
-
-        return unittest.skipIf(should_skip, reason)(func)
-
-    return decorator
-
-
 # TODO: Fix this error in Windows:
 # LINK : error LNK2001: unresolved external symbol PyInit__C
 if not IS_WINDOWS:
 
     class TestLibtorchAgnostic(TestCase):
-        """
-        Tests for versioned libtorch_agnostic extensions.
-
-        This test class supports testing both:
-
-        - libtorch_agnostic_2_9: Extension built with TORCH_TARGET_VERSION=2.9.0
-        - libtorch_agnostic_2_10: Extension built with TORCH_TARGET_VERSION=2.10.0
-
-        Tests should be decorated with @skipIfTorchVersionLessThan to indicate the
-        version that they target.
-        """
-
         @classmethod
         def setUpClass(cls):
-            # Build both 2.9 and 2.10 extensions
-            base_dir = Path(__file__).parent
-
             try:
-                import libtorch_agnostic_2_9  # noqa: F401
+                import libtorch_agnostic  # noqa: F401
             except Exception:
-                install_cpp_extension(
-                    extension_root=base_dir / "libtorch_agnostic_2_9_extension"
-                )
-
-            # Only build 2.10 extension if running on PyTorch 2.10+
-            import re
-
-            version_parts = torch.__version__.split(".")
-            current_major = int(version_parts[0])
-            # Extract just the numeric part of the minor version (handles "10+git", "10a1", etc.)
-            current_minor = int(re.match(r"\d+", version_parts[1]).group())
-
-            if (current_major > 2) or (current_major == 2 and current_minor >= 10):
-                try:
-                    import libtorch_agnostic_2_10  # noqa: F401
-                except Exception:
-                    install_cpp_extension(
-                        extension_root=base_dir / "libtorch_agnostic_2_10_extension"
-                    )
-            else:
-                print(
-                    f"Skipping 2.10 extension (running on PyTorch {torch.__version__})"
-                )
+                install_cpp_extension(extension_root=Path(__file__).parent.parent)
 
         @onlyCPU
         def test_slow_sgd(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             param = torch.rand(5, device=device)
             grad = torch.rand_like(param)
@@ -145,7 +60,7 @@ if not IS_WINDOWS:
 
         @onlyCUDA
         def test_identity_does_not_hog_memory(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             def _run_identity(prior_mem):
                 t = torch.rand(32, 32, device=device)
@@ -161,7 +76,7 @@ if not IS_WINDOWS:
             self.assertEqual(curr_mem, init_mem)
 
         def test_exp_neg_is_leaf(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t1 = torch.rand(2, 3, device=device)
             t2 = torch.rand(3, 2, device=device)
@@ -173,7 +88,7 @@ if not IS_WINDOWS:
             self.assertEqual(is_leaf, t3.is_leaf)
 
         def test_my_abs(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.rand(32, 16, device=device) - 0.5
             res = libtorch_agnostic.ops.my_abs(t)
@@ -192,7 +107,7 @@ if not IS_WINDOWS:
             self.assertEqual(curr_mem, init_mem)
 
         def test_neg_exp(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.rand(32, 16, device=device) - 0.5
             res = libtorch_agnostic.ops.neg_exp(t)
@@ -211,7 +126,7 @@ if not IS_WINDOWS:
             self.assertEqual(curr_mem, init_mem)
 
         def test_divide_neg_exp(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.zeros(2, 3, device=device) - 0.5
             res = libtorch_agnostic.ops.divide_neg_exp(t)
@@ -230,7 +145,7 @@ if not IS_WINDOWS:
             self.assertEqual(curr_mem, init_mem)
 
         def test_is_contiguous(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.rand(2, 7, device=device)
             self.assertTrue(libtorch_agnostic.ops.is_contiguous(t))
@@ -242,7 +157,7 @@ if not IS_WINDOWS:
         # **{}): got AssertionError("tensor's device must be `meta`, got cpu instead")
         @xfailIfTorchDynamo
         def test_my_ones_like(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.rand(3, 1, device=device) - 0.5
             cpu_t = libtorch_agnostic.ops.my_ones_like(t, "cpu")
@@ -261,7 +176,7 @@ if not IS_WINDOWS:
             self.assertEqual(curr_mem, init_mem)
 
         def test_my_transpose(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.rand(2, 7, device=device)
             out = libtorch_agnostic.ops.my_transpose(t, 0, 1)
@@ -271,7 +186,7 @@ if not IS_WINDOWS:
                 libtorch_agnostic.ops.my_transpose(t, 1, 2)
 
         def test_my_empty_like(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             deterministic = torch.are_deterministic_algorithms_enabled()
             try:
@@ -287,7 +202,7 @@ if not IS_WINDOWS:
 
         @onlyCPU
        def test_my_zero_(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.rand(2, 7, device=device)
             out = libtorch_agnostic.ops.my_zero_(t)
@@ -295,28 +210,28 @@ if not IS_WINDOWS:
             self.assertEqual(out, torch.zeros_like(t))
 
         def test_my_amax(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.rand(2, 7, device=device)
             out = libtorch_agnostic.ops.my_amax(t)
             self.assertEqual(out, torch.amax(t, 0))
 
         def test_my_amax_vec(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.rand(2, 7, 5, device=device)
             out = libtorch_agnostic.ops.my_amax_vec(t)
             self.assertEqual(out, torch.amax(t, (0, 1)))
 
         def test_my_is_cpu(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.rand(2, 7, device=device)
             out = libtorch_agnostic.ops.my_is_cpu(t)
             self.assertEqual(out, t.is_cpu)
 
         def test_fill_infinity(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.rand(3, 4, device=device)
             out = libtorch_agnostic.ops.fill_infinity(t)
@@ -327,7 +242,7 @@ if not IS_WINDOWS:
 
         @onlyCPU
         def test_default_constructor(self):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             defined_tensor_is_defined = libtorch_agnostic.ops.test_default_constructor(
                 True
@@ -340,7 +255,7 @@ if not IS_WINDOWS:
             self.assertFalse(undefined_tensor_is_defined)
 
         def test_my_pad(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.rand(2, 3, device=device)
             out = libtorch_agnostic.ops.my_pad(t)
@@ -348,7 +263,7 @@ if not IS_WINDOWS:
             self.assertEqual(out, expected)
 
         def test_my_narrow(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.randn(2, 5, device=device)
 
@@ -362,7 +277,7 @@ if not IS_WINDOWS:
         @onlyCUDA
         @deviceCountAtLeast(2)
         def test_device_guard(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             device_index = 1
             out = libtorch_agnostic.ops.test_device_guard(device_index)
@@ -371,7 +286,7 @@ if not IS_WINDOWS:
         @onlyCUDA
         @deviceCountAtLeast(2)
         def test_device_guard_set_index(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             # This test creates a DeviceGuard with index 1, then sets it to index 0
             # and returns the current device (should be 0)
@@ -380,7 +295,7 @@ if not IS_WINDOWS:
 
         @onlyCUDA
         def test_stream(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             stream = torch.cuda.Stream()
             device = torch.cuda.current_device()
@@ -394,7 +309,7 @@ if not IS_WINDOWS:
         @onlyCUDA
         @deviceCountAtLeast(2)
         def test_get_current_device_index(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             prev_device = torch.cuda.current_device()
 
@@ -408,7 +323,7 @@ if not IS_WINDOWS:
                 torch.cuda.set_device(prev_device)
 
         def test_my_new_empty_dtype_variant(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             deterministic = torch.are_deterministic_algorithms_enabled()
             try:
@@ -423,7 +338,7 @@ if not IS_WINDOWS:
                 torch.use_deterministic_algorithms(deterministic)
 
         def test_my_new_zeros_dtype_variant(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.randn(3, 4, device=device)
             out = libtorch_agnostic.ops.my_new_zeros_dtype_variant(t)
@@ -431,7 +346,7 @@ if not IS_WINDOWS:
             self.assertEqual(out, ref_out, exact_device=True)
 
         def test_my_copy_(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             dst = torch.empty(2, 5, device=device)
             src = torch.randn(2, 5, device=device)
@@ -442,7 +357,7 @@ if not IS_WINDOWS:
             self.assertEqual(result.data_ptr(), dst.data_ptr())
 
         def test_my_clone(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.randn(2, 5, device=device)
 
@@ -452,9 +367,8 @@ if not IS_WINDOWS:
             self.assertNotEqual(result.data_ptr(), expected.data_ptr())
             self.assertEqual(result.stride(), expected.stride())
 
-        @skipIfTorchVersionLessThan(2, 10)
         def test_my__foreach_mul_(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic
 
             N = 5
             tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
@@ -467,9 +381,8 @@ if not IS_WINDOWS:
             for tensor_t, expected_t in zip(tensors, expected_values):
                 self.assertEqual(tensor_t, expected_t)
 
-        @skipIfTorchVersionLessThan(2, 10)
         def test_my__foreach_mul(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic
 
             N = 5
             tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
@@ -496,9 +409,8 @@ if not IS_WINDOWS:
             curr_mem = torch.cuda.memory_allocated(device)
             self.assertEqual(curr_mem, init_mem)
 
-        @skipIfTorchVersionLessThan(2, 10)
         def test_make_tensor_clones_and_call_foreach(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic
 
             t1 = torch.rand(2, 5, device=device)
             t2 = torch.rand(3, 4, device=device)
@@ -506,10 +418,9 @@ if not IS_WINDOWS:
             self.assertEqual(result[0], t1 * t1)
             self.assertEqual(result[1], t2 * t2)
 
-        @skipIfTorchVersionLessThan(2, 10)
         @onlyCUDA
         def test_device(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic
 
             cuda_device = libtorch_agnostic.ops.test_device_constructor(
                 is_cuda=True, index=1, use_str=False
@@ -562,11 +473,10 @@ if not IS_WINDOWS:
             ):
                 libtorch_agnostic.ops.test_device_set_index(cuda_device, 129)
 
-        @skipIfTorchVersionLessThan(2, 10)
         @onlyCUDA
         @deviceCountAtLeast(2)
         def test_tensor_device(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.randn(2, 3)
             self.assertEqual(libtorch_agnostic.ops.test_tensor_device(t), t.device)
@@ -581,7 +491,6 @@ if not IS_WINDOWS:
                 libtorch_agnostic.ops.test_tensor_device(t_cuda_1), t_cuda_1.device
             )
 
-        @skipIfTorchVersionLessThan(2, 10)
         @onlyCPU
         # TODO: Debug this:
         # Dynamo failed to run FX node with fake tensors:
@@ -591,7 +500,7 @@ if not IS_WINDOWS:
         # Declaration: libtorch_agnostic::test_parallel_for(int size, int grain_size) -> Tensor')
         @xfailIfTorchDynamo
         def test_parallel_for(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic
 
             num_threads = torch.get_num_threads()
             size = 100
@@ -608,18 +517,16 @@ if not IS_WINDOWS:
             self.assertEqual(result_values, expected)
             self.assertEqual(result_thread_ids, torch.arange(expected_num_threads_used))
 
-        @skipIfTorchVersionLessThan(2, 10)
         @onlyCPU
         def test_get_num_threads(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic
 
             num_threads = libtorch_agnostic.ops.test_get_num_threads()
             expected_num_threads = torch.get_num_threads()
             self.assertEqual(num_threads, expected_num_threads)
 
-        @skipIfTorchVersionLessThan(2, 10)
         def test_my_empty(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic
 
             deterministic = torch.are_deterministic_algorithms_enabled()
             try:
@@ -660,7 +567,7 @@ if not IS_WINDOWS:
                 torch.use_deterministic_algorithms(deterministic)
 
         def test_my_flatten(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.randn(2, 3, 4, device=device)
             result = libtorch_agnostic.ops.my_flatten(t)
@@ -675,9 +582,8 @@ if not IS_WINDOWS:
             expected_range = torch.flatten(t, 2, -1)
             self.assertEqual(result_range, expected_range)
 
-        @skipIfTorchVersionLessThan(2, 10)
         def test_my_reshape(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.randn(2, 3, 4, device=device)
 
@@ -693,9 +599,8 @@ if not IS_WINDOWS:
             expected_flat = torch.reshape(t, [-1])
             self.assertEqual(result_flat, expected_flat)
 
-        @skipIfTorchVersionLessThan(2, 10)
         def test_my_view(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
+            import libtorch_agnostic
 
             t = torch.randn(2, 3, 4, device=device)
 
@@ -712,7 +617,7 @@ if not IS_WINDOWS:
             self.assertEqual(result_flat, expected_flat)
 
         def test_mv_tensor_accessor(self, device):
-            import libtorch_agnostic_2_9 as libtorch_agnostic
+            import libtorch_agnostic
 
             m = torch.rand(3, 5, device=device)
             v = torch.rand(5, device=device)
@@ -727,45 +632,6 @@ if not IS_WINDOWS:
             expected = torch.mv(m, v)
             self.assertEqual(result, expected)
 
-        @skipIfTorchVersionLessThan(2, 10)
-        @skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
-        def test_get_any_data_ptr(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
-
-            t = torch.empty(2, 5, device=device, dtype=torch.float32)
-            expected_p = t.data_ptr()
-
-            for mutable in [True, False]:
-                p = libtorch_agnostic.ops.get_any_data_ptr(t, mutable)
-                self.assertEqual(p, expected_p)
-
-        @skipIfTorchVersionLessThan(2, 10)
-        @skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
-        def test_get_template_any_data_ptr(self, device):
-            import libtorch_agnostic_2_10 as libtorch_agnostic
-
-            supported_dtypes = get_supported_dtypes()
-
-            for dtype in supported_dtypes:
-                t = torch.empty(2, 5, device=device, dtype=dtype)
-                expected_p = t.data_ptr()
-
-                for rdtype in supported_dtypes:
-                    if dtype == rdtype:
-                        for mutable in [True, False]:
-                            p = libtorch_agnostic.ops.get_template_any_data_ptr(
-                                t, rdtype, mutable
-                            )
-                            self.assertEqual(p, expected_p)
-                    else:
-                        for mutable in [True, False]:
-                            with self.assertRaisesRegex(
-                                RuntimeError, "expected scalar type.* but found"
-                            ):
-                                libtorch_agnostic.ops.get_template_any_data_ptr(
-                                    t, rdtype, mutable
-                                )
-
     instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)
 
 if __name__ == "__main__":
@@ -22,7 +22,7 @@ void check_all_parameters(
 
 template<class Result, class... Args>
 Result get_operator_from_registry_and_execute(const char* op_name, Args&&... args) {
-  auto ops = torch::jit::getAllOperatorsFor(
+  auto& ops = torch::jit::getAllOperatorsFor(
       torch::jit::Symbol::fromQualString(op_name));
   TORCH_INTERNAL_ASSERT(ops.size() == 1);
 
@@ -59,12 +59,7 @@ from torch.testing._internal.common_fsdp import (
     patch_reshard,
     patch_unshard,
 )
-from torch.testing._internal.common_utils import (
-    run_tests,
-    TEST_WITH_ROCM,
-    TEST_XPU,
-    xfailIf,
-)
+from torch.testing._internal.common_utils import run_tests, TEST_XPU, xfailIf
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     FeedForward,
     ModelArgs,
@@ -1663,17 +1658,10 @@ class TestFullyShardAllocFromPG(FSDPTest):
 class TestFullyShardForceSumReduction(FSDPTest):
     # The messages might change when we move to a different NCCL version.
     # Please update this test if it starts failing.
-
-    if TEST_WITH_ROCM and torch.cuda.nccl.version()[:2] >= (2, 27):
-        COLLECTIVE_RE = (
-            r"NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ acc \(nil\) "
-            "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+"
-        )
-    else:
-        COLLECTIVE_RE = (
-            "NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ "
-            "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+"
-        )
+    COLLECTIVE_RE = (
+        "NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ "
+        "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+"
+    )
     # See here for the numerical values for each reduction op:
     # https://github.com/NVIDIA/nccl/blob/72d2432094d6ae36abd6e511c3a16a2d052dbf94/src/nccl.h.in#L260-L275
     SUM_REDUCTION = 0
Some files were not shown because too many files have changed in this diff.