Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-19 10:04:58 +08:00)
Compare commits: 72 commits (dev/joona/... → ciflow/ind...)
19  .ci/aarch64_linux/README.md  Normal file
@@ -0,0 +1,19 @@
# Aarch64 (ARM/Graviton) Support Scripts

Scripts for building aarch64 PyTorch pip wheels. These scripts build the following wheels:

* torch
* torchvision
* torchaudio
* torchtext
* torchdata

## aarch64_ci_build.sh

This script is designed to support CD operations within a PyPA manylinux aarch64 container and is meant to be executed inside that container. It prepares the container and then runs __aarch64_wheel_ci_build.py__ to build the wheels. The script assumes the PyTorch repo is located at ```/pytorch``` and puts the built wheels into ```/artifacts```.

### Usage

```DESIRED_PYTHON=<PythonVersion> aarch64_ci_build.sh```

__NOTE:__ The CI build is currently __EXPERIMENTAL__.
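For orientation, the sketch below shows how the script might be driven from a host machine; the container image tag and host directory layout are illustrative assumptions, not something these scripts prescribe.

```bash
# Hypothetical invocation: the image tag and host paths are assumptions.
# The script itself only assumes the repo at /pytorch and writes wheels to /artifacts.
docker run --rm \
  -v "$PWD/pytorch:/pytorch" \
  -v "$PWD/artifacts:/artifacts" \
  -e DESIRED_PYTHON=3.11 \
  -e DESIRED_CUDA=cpu \
  quay.io/pypa/manylinux_2_28_aarch64 \
  /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh
```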
## build_aarch64_wheel.py

This script builds the wheels on AWS EC2 resources. It requires the AWS CLI and Boto3, together with AWS credentials, in order to launch the EC2 instances used for the wheel builds. It can be used from a CodeBuild CD pipeline or from a local system.

### Usage

```build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>```
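The flags shown above come from the script's own argument parser (see build_aarch64_wheel.py below), which also exposes helpers for managing the EC2 instances it creates; a few illustrative combinations:

```bash
# List or clean up instances of the type the script launches (defaults to t4g.2xlarge).
build_aarch64_wheel.py --list-instances --instance-type t4g.2xlarge
build_aarch64_wheel.py --terminate-instances --instance-type t4g.2xlarge

# Build only the torch wheel on an Ubuntu 22.04 AMI and keep the instance alive for debugging.
build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --os ubuntu22_04 \
  --python-version 3.10 --branch <RCtag> --pytorch-only --keep-running
```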
53  .ci/aarch64_linux/aarch64_ci_build.sh  Normal file
@@ -0,0 +1,53 @@
#!/bin/bash
set -eux -o pipefail

GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

# Set CUDA architecture lists to match x86 build_cuda.sh
if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
fi

# Compress the fatbin with -compress-mode=size for CUDA 13
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
    export TORCH_NVCC_FLAGS="-compress-mode=size"
    # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
    export BUILD_BUNDLE_PTXAS=1
fi

SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source $SCRIPTPATH/aarch64_ci_setup.sh

###############################################################################
# Run aarch64 builder python
###############################################################################
cd /
# Add the mounted pytorch repo as a git safe.directory, since its ownership and
# permissions differ from the user running inside the container
git config --global --add safe.directory /pytorch
pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
    echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
    echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
    export USE_SYSTEM_NCCL=1

    # Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic)
    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
        echo "Bundling CUDA libraries with wheel for aarch64."
    else
        echo "Using nvidia libs from pypi for aarch64."
        echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
        export USE_NVIDIA_PYPI_LIBS=1
    fi

    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi
21  .ci/aarch64_linux/aarch64_ci_setup.sh  Executable file
@@ -0,0 +1,21 @@
#!/bin/bash
set -eux -o pipefail

# This script is used to prepare the Docker container for the aarch64_wheel_ci_build.py python script
# by creating symlinks from the desired /opt/python version to /usr/local/bin/

NUMPY_VERSION=2.0.2
if [[ "$DESIRED_PYTHON" == "3.13" || "$DESIRED_PYTHON" == "3.13t" ]]; then
    NUMPY_VERSION=2.1.2
fi

SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
source $SCRIPTPATH/../manywheel/set_desired_python.sh

pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2

for tool in python python3 pip pip3 ninja scons patchelf; do
    ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
done

python --version
333  .ci/aarch64_linux/aarch64_wheel_ci_build.py  Executable file
@@ -0,0 +1,333 @@
|
||||
#!/usr/bin/env python3
|
||||
# encoding: UTF-8
|
||||
|
||||
import os
|
||||
import shutil
|
||||
from subprocess import check_call, check_output
|
||||
|
||||
|
||||
def list_dir(path: str) -> list[str]:
|
||||
"""'
|
||||
Helper for getting paths for Python
|
||||
"""
|
||||
return check_output(["ls", "-1", path]).decode().split("\n")
|
||||
|
||||
|
||||
def replace_tag(filename) -> None:
|
||||
with open(filename) as f:
|
||||
lines = f.readlines()
|
||||
for i, line in enumerate(lines):
|
||||
if line.startswith("Tag:"):
|
||||
lines[i] = line.replace("-linux_", "-manylinux_2_28_")
|
||||
print(f"Updated tag from {line} to {lines[i]}")
|
||||
break
|
||||
|
||||
with open(filename, "w") as f:
|
||||
f.writelines(lines)
|
||||
|
||||
|
||||
def patch_library_rpath(
|
||||
folder: str,
|
||||
lib_name: str,
|
||||
use_nvidia_pypi_libs: bool = False,
|
||||
desired_cuda: str = "",
|
||||
) -> None:
|
||||
"""Apply patchelf to set RPATH for a library in torch/lib"""
|
||||
lib_path = f"{folder}/tmp/torch/lib/{lib_name}"
|
||||
|
||||
if use_nvidia_pypi_libs:
|
||||
# For PyPI NVIDIA libraries, construct CUDA RPATH
|
||||
cuda_rpaths = [
|
||||
"$ORIGIN/../../nvidia/cudnn/lib",
|
||||
"$ORIGIN/../../nvidia/nvshmem/lib",
|
||||
"$ORIGIN/../../nvidia/nccl/lib",
|
||||
"$ORIGIN/../../nvidia/cusparselt/lib",
|
||||
]
|
||||
|
||||
if "130" in desired_cuda:
|
||||
cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib")
|
||||
else:
|
||||
cuda_rpaths.extend(
|
||||
[
|
||||
"$ORIGIN/../../nvidia/cublas/lib",
|
||||
"$ORIGIN/../../nvidia/cuda_cupti/lib",
|
||||
"$ORIGIN/../../nvidia/cuda_nvrtc/lib",
|
||||
"$ORIGIN/../../nvidia/cuda_runtime/lib",
|
||||
"$ORIGIN/../../nvidia/cufft/lib",
|
||||
"$ORIGIN/../../nvidia/curand/lib",
|
||||
"$ORIGIN/../../nvidia/cusolver/lib",
|
||||
"$ORIGIN/../../nvidia/cusparse/lib",
|
||||
"$ORIGIN/../../nvidia/nvtx/lib",
|
||||
"$ORIGIN/../../nvidia/cufile/lib",
|
||||
]
|
||||
)
|
||||
|
||||
# Add $ORIGIN for local torch libs
|
||||
rpath = ":".join(cuda_rpaths) + ":$ORIGIN"
|
||||
else:
|
||||
# For bundled libraries, just use $ORIGIN
|
||||
rpath = "$ORIGIN"
|
||||
|
||||
if os.path.exists(lib_path):
|
||||
os.system(
|
||||
f"cd {folder}/tmp/torch/lib/; "
|
||||
f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}"
|
||||
)
|
||||
|
||||
|
||||
def copy_and_patch_library(
|
||||
src_path: str,
|
||||
folder: str,
|
||||
use_nvidia_pypi_libs: bool = False,
|
||||
desired_cuda: str = "",
|
||||
) -> None:
|
||||
"""Copy a library to torch/lib and patch its RPATH"""
|
||||
if os.path.exists(src_path):
|
||||
lib_name = os.path.basename(src_path)
|
||||
shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}")
|
||||
patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
|
||||
|
||||
|
||||
def package_cuda_wheel(wheel_path, desired_cuda) -> None:
|
||||
"""
|
||||
Package the cuda wheel libraries
|
||||
"""
|
||||
folder = os.path.dirname(wheel_path)
|
||||
os.mkdir(f"{folder}/tmp")
|
||||
os.system(f"unzip {wheel_path} -d {folder}/tmp")
|
||||
# Delete original wheel since it will be repackaged
|
||||
os.system(f"rm {wheel_path}")
|
||||
|
||||
# Check if we should use PyPI NVIDIA libraries or bundle system libraries
|
||||
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
|
||||
|
||||
if use_nvidia_pypi_libs:
|
||||
print("Using nvidia libs from pypi - skipping CUDA library bundling")
|
||||
# For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages
|
||||
# We only need to bundle non-NVIDIA libraries
|
||||
minimal_libs_to_copy = [
|
||||
"/lib64/libgomp.so.1",
|
||||
"/usr/lib64/libgfortran.so.5",
|
||||
"/acl/build/libarm_compute.so",
|
||||
"/acl/build/libarm_compute_graph.so",
|
||||
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_lapack_core.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_core.so.0",
|
||||
]
|
||||
|
||||
# Copy minimal libraries to unzipped_folder/torch/lib
|
||||
for lib_path in minimal_libs_to_copy:
|
||||
copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
|
||||
|
||||
# Patch torch libraries used for searching libraries
|
||||
torch_libs_to_patch = [
|
||||
"libtorch.so",
|
||||
"libtorch_cpu.so",
|
||||
"libtorch_cuda.so",
|
||||
"libtorch_cuda_linalg.so",
|
||||
"libtorch_global_deps.so",
|
||||
"libtorch_python.so",
|
||||
"libtorch_nvshmem.so",
|
||||
"libc10.so",
|
||||
"libc10_cuda.so",
|
||||
"libcaffe2_nvrtc.so",
|
||||
"libshm.so",
|
||||
]
|
||||
for lib_name in torch_libs_to_patch:
|
||||
patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
|
||||
else:
|
||||
print("Bundling CUDA libraries with wheel")
|
||||
# Original logic for bundling system CUDA libraries
|
||||
# Common libraries for all CUDA versions
|
||||
common_libs = [
|
||||
# Non-NVIDIA system libraries
|
||||
"/lib64/libgomp.so.1",
|
||||
"/usr/lib64/libgfortran.so.5",
|
||||
"/acl/build/libarm_compute.so",
|
||||
"/acl/build/libarm_compute_graph.so",
|
||||
# Common CUDA libraries (same for all versions)
|
||||
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_lapack_core.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_core.so.0",
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
|
||||
"/usr/local/cuda/lib64/libcudnn.so.9",
|
||||
"/usr/local/cuda/lib64/libcusparseLt.so.0",
|
||||
"/usr/local/cuda/lib64/libcurand.so.10",
|
||||
"/usr/local/cuda/lib64/libnccl.so.2",
|
||||
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
|
||||
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_graph.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_ops.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
|
||||
"/usr/local/cuda/lib64/libcufile.so.0",
|
||||
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
|
||||
"/usr/local/cuda/lib64/libcusparse.so.12",
|
||||
]
|
||||
|
||||
# CUDA version-specific libraries
|
||||
if "13" in desired_cuda:
|
||||
minor_version = desired_cuda[-1]
|
||||
version_specific_libs = [
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
|
||||
"/usr/local/cuda/lib64/libcublas.so.13",
|
||||
"/usr/local/cuda/lib64/libcublasLt.so.13",
|
||||
"/usr/local/cuda/lib64/libcudart.so.13",
|
||||
"/usr/local/cuda/lib64/libcufft.so.12",
|
||||
"/usr/local/cuda/lib64/libcusolver.so.12",
|
||||
"/usr/local/cuda/lib64/libnvJitLink.so.13",
|
||||
"/usr/local/cuda/lib64/libnvrtc.so.13",
|
||||
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
|
||||
]
|
||||
elif "12" in desired_cuda:
|
||||
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
|
||||
minor_version = desired_cuda[-1]
|
||||
version_specific_libs = [
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
|
||||
"/usr/local/cuda/lib64/libcublas.so.12",
|
||||
"/usr/local/cuda/lib64/libcublasLt.so.12",
|
||||
"/usr/local/cuda/lib64/libcudart.so.12",
|
||||
"/usr/local/cuda/lib64/libcufft.so.11",
|
||||
"/usr/local/cuda/lib64/libcusolver.so.11",
|
||||
"/usr/local/cuda/lib64/libnvJitLink.so.12",
|
||||
"/usr/local/cuda/lib64/libnvrtc.so.12",
|
||||
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
|
||||
]
|
||||
else:
|
||||
raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")
|
||||
|
||||
# Combine all libraries
|
||||
libs_to_copy = common_libs + version_specific_libs
|
||||
|
||||
# Copy libraries to unzipped_folder/torch/lib
|
||||
for lib_path in libs_to_copy:
|
||||
copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
|
||||
|
||||
# Make sure the wheel is tagged with manylinux_2_28
|
||||
for f in os.scandir(f"{folder}/tmp/"):
|
||||
if f.is_dir() and f.name.endswith(".dist-info"):
|
||||
replace_tag(f"{f.path}/WHEEL")
|
||||
break
|
||||
|
||||
os.system(f"wheel pack {folder}/tmp/ -d {folder}")
|
||||
os.system(f"rm -rf {folder}/tmp/")
|
||||
|
||||
|
||||
def complete_wheel(folder: str) -> str:
|
||||
"""
|
||||
Complete wheel build and put in artifact location
|
||||
"""
|
||||
wheel_name = list_dir(f"/{folder}/dist")[0]
|
||||
|
||||
# Please note for cuda we don't run auditwheel since we use a custom script to package
# the cuda dependencies into the wheel file via the package_cuda_wheel() method above.
# However we need to make sure the filename reflects the correct manylinux platform.
|
||||
if "pytorch" in folder and not enable_cuda:
|
||||
print("Repairing Wheel with AuditWheel")
|
||||
check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
|
||||
repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0]
|
||||
|
||||
print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
|
||||
os.rename(
|
||||
f"/{folder}/wheelhouse/{repaired_wheel_name}",
|
||||
f"/{folder}/dist/{repaired_wheel_name}",
|
||||
)
|
||||
else:
|
||||
repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
|
||||
|
||||
print(f"Copying {repaired_wheel_name} to artifacts")
|
||||
shutil.copy2(
|
||||
f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}"
|
||||
)
|
||||
|
||||
return repaired_wheel_name
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
"""
|
||||
Parse inline arguments
|
||||
"""
|
||||
from argparse import ArgumentParser
|
||||
|
||||
parser = ArgumentParser("AARCH64 wheels python CD")
|
||||
parser.add_argument("--debug", action="store_true")
|
||||
parser.add_argument("--build-only", action="store_true")
|
||||
parser.add_argument("--test-only", type=str)
|
||||
parser.add_argument("--enable-mkldnn", action="store_true")
|
||||
parser.add_argument("--enable-cuda", action="store_true")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""
|
||||
Entry Point
|
||||
"""
|
||||
args = parse_arguments()
|
||||
enable_mkldnn = args.enable_mkldnn
|
||||
enable_cuda = args.enable_cuda
|
||||
branch = check_output(
|
||||
["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd="/pytorch"
|
||||
).decode()
|
||||
|
||||
print("Building PyTorch wheel")
|
||||
build_vars = ""
|
||||
# MAX_JOBS=5 is not required for CPU backend (see commit 465d98b)
|
||||
if enable_cuda:
|
||||
build_vars += "MAX_JOBS=5 "
|
||||
|
||||
# Handle PyPI NVIDIA libraries vs bundled libraries
|
||||
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
|
||||
if use_nvidia_pypi_libs:
|
||||
print("Configuring build for PyPI NVIDIA libraries")
|
||||
# Configure for dynamic linking (matching x86 logic)
|
||||
build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 "
|
||||
else:
|
||||
print("Configuring build for bundled NVIDIA libraries")
|
||||
# Keep existing static linking approach - already configured above
|
||||
|
||||
override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
|
||||
desired_cuda = os.getenv("DESIRED_CUDA")
|
||||
if override_package_version is not None:
|
||||
version = override_package_version
|
||||
build_vars += (
|
||||
f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
|
||||
)
|
||||
elif branch in ["nightly", "main"]:
|
||||
build_date = (
|
||||
check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
|
||||
.decode()
|
||||
.replace("-", "")
|
||||
)
|
||||
version = (
|
||||
check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
|
||||
)
|
||||
if enable_cuda:
|
||||
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
|
||||
else:
|
||||
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
|
||||
elif branch.startswith(("v1.", "v2.")):
|
||||
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
|
||||
|
||||
if enable_mkldnn:
|
||||
print("build pytorch with mkldnn+acl backend")
|
||||
build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
|
||||
build_vars += "ACL_ROOT_DIR=/acl "
|
||||
if enable_cuda:
|
||||
build_vars += "BLAS=NVPL "
|
||||
else:
|
||||
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
|
||||
else:
|
||||
print("build pytorch without mkldnn backend")
|
||||
|
||||
os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
|
||||
if enable_cuda:
|
||||
print("Updating Cuda Dependency")
|
||||
filename = os.listdir("/pytorch/dist/")
|
||||
wheel_path = f"/pytorch/dist/{filename[0]}"
|
||||
package_cuda_wheel(wheel_path, desired_cuda)
|
||||
pytorch_wheel_name = complete_wheel("/pytorch/")
|
||||
print(f"Build Complete. Created {pytorch_wheel_name}..")
|
||||
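As a reading aid, the script above is driven by environment variables plus the two --enable flags; below is a condensed sketch of how a CUDA build might be invoked. The variable names are the ones read via os.getenv() in the code; the values, and the exact format of DESIRED_CUDA, are illustrative assumptions based on the "129" -> "9" example in the comments above.

```bash
# Illustrative values only.
export DESIRED_CUDA=130                  # selects the CUDA 13 library list and cu13 RPATH entries
export OVERRIDE_PACKAGE_VERSION=2.6.0    # otherwise the version is derived from the git branch
export USE_NVIDIA_PYPI_LIBS=1            # rely on nvidia-* PyPI packages instead of bundling libs
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
```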
999  .ci/aarch64_linux/build_aarch64_wheel.py  Executable file
@@ -0,0 +1,999 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# This script is for building AARCH64 wheels using AWS EC2 instances.
|
||||
# To generate binaries for the release follow these steps:
|
||||
# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this:
|
||||
# "v1.11.0": ("0.11.0", "rc1"),
|
||||
# 2. Run script with following arguments for each of the supported python versions and required tag, for example:
|
||||
# build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch v1.11.0-rc3
|
||||
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from typing import Optional, Union
|
||||
|
||||
import boto3
|
||||
|
||||
|
||||
# AMI images for us-east-1, change the following based on your ~/.aws/config
|
||||
os_amis = {
|
||||
"ubuntu20_04": "ami-052eac90edaa9d08f", # login_name: ubuntu
|
||||
"ubuntu22_04": "ami-0c6c29c5125214c77", # login_name: ubuntu
|
||||
"redhat8": "ami-0698b90665a2ddcf1", # login_name: ec2-user
|
||||
}
|
||||
|
||||
ubuntu20_04_ami = os_amis["ubuntu20_04"]
|
||||
|
||||
|
||||
def compute_keyfile_path(key_name: Optional[str] = None) -> tuple[str, str]:
|
||||
if key_name is None:
|
||||
key_name = os.getenv("AWS_KEY_NAME")
|
||||
if key_name is None:
|
||||
return os.getenv("SSH_KEY_PATH", ""), ""
|
||||
|
||||
homedir_path = os.path.expanduser("~")
|
||||
default_path = os.path.join(homedir_path, ".ssh", f"{key_name}.pem")
|
||||
return os.getenv("SSH_KEY_PATH", default_path), key_name
|
||||
|
||||
|
||||
ec2 = boto3.resource("ec2")
|
||||
|
||||
|
||||
def ec2_get_instances(filter_name, filter_value):
|
||||
return ec2.instances.filter(
|
||||
Filters=[{"Name": filter_name, "Values": [filter_value]}]
|
||||
)
|
||||
|
||||
|
||||
def ec2_instances_of_type(instance_type="t4g.2xlarge"):
|
||||
return ec2_get_instances("instance-type", instance_type)
|
||||
|
||||
|
||||
def ec2_instances_by_id(instance_id):
|
||||
rc = list(ec2_get_instances("instance-id", instance_id))
|
||||
return rc[0] if len(rc) > 0 else None
|
||||
|
||||
|
||||
def start_instance(
|
||||
key_name, ami=ubuntu20_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50
|
||||
):
|
||||
inst = ec2.create_instances(
|
||||
ImageId=ami,
|
||||
InstanceType=instance_type,
|
||||
SecurityGroups=["ssh-allworld"],
|
||||
KeyName=key_name,
|
||||
MinCount=1,
|
||||
MaxCount=1,
|
||||
BlockDeviceMappings=[
|
||||
{
|
||||
"DeviceName": "/dev/sda1",
|
||||
"Ebs": {
|
||||
"DeleteOnTermination": True,
|
||||
"VolumeSize": ebs_size,
|
||||
"VolumeType": "standard",
|
||||
},
|
||||
}
|
||||
],
|
||||
)[0]
|
||||
print(f"Create instance {inst.id}")
|
||||
inst.wait_until_running()
|
||||
running_inst = ec2_instances_by_id(inst.id)
|
||||
print(f"Instance started at {running_inst.public_dns_name}")
|
||||
return running_inst
|
||||
|
||||
|
||||
class RemoteHost:
|
||||
addr: str
|
||||
keyfile_path: str
|
||||
login_name: str
|
||||
container_id: Optional[str] = None
|
||||
ami: Optional[str] = None
|
||||
|
||||
def __init__(self, addr: str, keyfile_path: str, login_name: str = "ubuntu"):
|
||||
self.addr = addr
|
||||
self.keyfile_path = keyfile_path
|
||||
self.login_name = login_name
|
||||
|
||||
def _gen_ssh_prefix(self) -> list[str]:
|
||||
return [
|
||||
"ssh",
|
||||
"-o",
|
||||
"StrictHostKeyChecking=no",
|
||||
"-i",
|
||||
self.keyfile_path,
|
||||
f"{self.login_name}@{self.addr}",
|
||||
"--",
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def _split_cmd(args: Union[str, list[str]]) -> list[str]:
|
||||
return args.split() if isinstance(args, str) else args
|
||||
|
||||
def run_ssh_cmd(self, args: Union[str, list[str]]) -> None:
|
||||
subprocess.check_call(self._gen_ssh_prefix() + self._split_cmd(args))
|
||||
|
||||
def check_ssh_output(self, args: Union[str, list[str]]) -> str:
|
||||
return subprocess.check_output(
|
||||
self._gen_ssh_prefix() + self._split_cmd(args)
|
||||
).decode("utf-8")
|
||||
|
||||
def scp_upload_file(self, local_file: str, remote_file: str) -> None:
|
||||
subprocess.check_call(
|
||||
[
|
||||
"scp",
|
||||
"-i",
|
||||
self.keyfile_path,
|
||||
local_file,
|
||||
f"{self.login_name}@{self.addr}:{remote_file}",
|
||||
]
|
||||
)
|
||||
|
||||
def scp_download_file(
|
||||
self, remote_file: str, local_file: Optional[str] = None
|
||||
) -> None:
|
||||
if local_file is None:
|
||||
local_file = "."
|
||||
subprocess.check_call(
|
||||
[
|
||||
"scp",
|
||||
"-i",
|
||||
self.keyfile_path,
|
||||
f"{self.login_name}@{self.addr}:{remote_file}",
|
||||
local_file,
|
||||
]
|
||||
)
|
||||
|
||||
def start_docker(self, image="quay.io/pypa/manylinux2014_aarch64:latest") -> None:
|
||||
self.run_ssh_cmd("sudo apt-get install -y docker.io")
|
||||
self.run_ssh_cmd(f"sudo usermod -a -G docker {self.login_name}")
|
||||
self.run_ssh_cmd("sudo service docker start")
|
||||
self.run_ssh_cmd(f"docker pull {image}")
|
||||
self.container_id = self.check_ssh_output(
|
||||
f"docker run -t -d -w /root {image}"
|
||||
).strip()
|
||||
|
||||
def using_docker(self) -> bool:
|
||||
return self.container_id is not None
|
||||
|
||||
def run_cmd(self, args: Union[str, list[str]]) -> None:
|
||||
if not self.using_docker():
|
||||
return self.run_ssh_cmd(args)
|
||||
assert self.container_id is not None
|
||||
docker_cmd = self._gen_ssh_prefix() + [
|
||||
"docker",
|
||||
"exec",
|
||||
"-i",
|
||||
self.container_id,
|
||||
"bash",
|
||||
]
|
||||
p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE)
|
||||
p.communicate(
|
||||
input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
|
||||
"utf-8"
|
||||
)
|
||||
)
|
||||
rc = p.wait()
|
||||
if rc != 0:
|
||||
raise subprocess.CalledProcessError(rc, docker_cmd)
|
||||
|
||||
def check_output(self, args: Union[str, list[str]]) -> str:
|
||||
if not self.using_docker():
|
||||
return self.check_ssh_output(args)
|
||||
assert self.container_id is not None
|
||||
docker_cmd = self._gen_ssh_prefix() + [
|
||||
"docker",
|
||||
"exec",
|
||||
"-i",
|
||||
self.container_id,
|
||||
"bash",
|
||||
]
|
||||
p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
(out, err) = p.communicate(
|
||||
input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
|
||||
"utf-8"
|
||||
)
|
||||
)
|
||||
rc = p.wait()
|
||||
if rc != 0:
|
||||
raise subprocess.CalledProcessError(rc, docker_cmd, output=out, stderr=err)
|
||||
return out.decode("utf-8")
|
||||
|
||||
def upload_file(self, local_file: str, remote_file: str) -> None:
|
||||
if not self.using_docker():
|
||||
return self.scp_upload_file(local_file, remote_file)
|
||||
tmp_file = os.path.join("/tmp", os.path.basename(local_file))
|
||||
self.scp_upload_file(local_file, tmp_file)
|
||||
self.run_ssh_cmd(
|
||||
["docker", "cp", tmp_file, f"{self.container_id}:/root/{remote_file}"]
|
||||
)
|
||||
self.run_ssh_cmd(["rm", tmp_file])
|
||||
|
||||
def download_file(self, remote_file: str, local_file: Optional[str] = None) -> None:
|
||||
if not self.using_docker():
|
||||
return self.scp_download_file(remote_file, local_file)
|
||||
tmp_file = os.path.join("/tmp", os.path.basename(remote_file))
|
||||
self.run_ssh_cmd(
|
||||
["docker", "cp", f"{self.container_id}:/root/{remote_file}", tmp_file]
|
||||
)
|
||||
self.scp_download_file(tmp_file, local_file)
|
||||
self.run_ssh_cmd(["rm", tmp_file])
|
||||
|
||||
def download_wheel(
|
||||
self, remote_file: str, local_file: Optional[str] = None
|
||||
) -> None:
|
||||
if self.using_docker() and local_file is None:
|
||||
basename = os.path.basename(remote_file)
|
||||
local_file = basename.replace(
|
||||
"-linux_aarch64.whl", "-manylinux2014_aarch64.whl"
|
||||
)
|
||||
self.download_file(remote_file, local_file)
|
||||
|
||||
def list_dir(self, path: str) -> list[str]:
|
||||
return self.check_output(["ls", "-1", path]).split("\n")
|
||||
|
||||
|
||||
def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
|
||||
import socket
|
||||
|
||||
for i in range(attempt_cnt):
|
||||
try:
|
||||
with socket.create_connection((addr, port), timeout=timeout):
|
||||
return
|
||||
except (ConnectionRefusedError, TimeoutError): # noqa: PERF203
|
||||
if i == attempt_cnt - 1:
|
||||
raise
|
||||
time.sleep(timeout)
|
||||
|
||||
|
||||
def update_apt_repo(host: RemoteHost) -> None:
|
||||
time.sleep(5)
|
||||
host.run_cmd("sudo systemctl stop apt-daily.service || true")
|
||||
host.run_cmd("sudo systemctl stop unattended-upgrades.service || true")
|
||||
host.run_cmd(
|
||||
"while systemctl is-active --quiet apt-daily.service; do sleep 1; done"
|
||||
)
|
||||
host.run_cmd(
|
||||
"while systemctl is-active --quiet unattended-upgrades.service; do sleep 1; done"
|
||||
)
|
||||
host.run_cmd("sudo apt-get update")
|
||||
time.sleep(3)
|
||||
host.run_cmd("sudo apt-get update")
|
||||
|
||||
|
||||
def install_condaforge(
|
||||
host: RemoteHost, suffix: str = "latest/download/Miniforge3-Linux-aarch64.sh"
|
||||
) -> None:
|
||||
print("Install conda-forge")
|
||||
host.run_cmd(f"curl -OL https://github.com/conda-forge/miniforge/releases/{suffix}")
|
||||
host.run_cmd(f"sh -f {os.path.basename(suffix)} -b")
|
||||
host.run_cmd(f"rm -f {os.path.basename(suffix)}")
|
||||
if host.using_docker():
|
||||
host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc")
|
||||
else:
|
||||
host.run_cmd(
|
||||
[
|
||||
"sed",
|
||||
"-i",
|
||||
"'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH'",
|
||||
".bashrc",
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
|
||||
if python_version == "3.6":
|
||||
# Python-3.6 EOLed and not compatible with conda-4.11
|
||||
install_condaforge(
|
||||
host, suffix="download/4.10.3-10/Miniforge3-4.10.3-10-Linux-aarch64.sh"
|
||||
)
|
||||
host.run_cmd(f"conda install -y python={python_version} numpy pyyaml")
|
||||
else:
|
||||
install_condaforge(
|
||||
host, suffix="download/4.11.0-4/Miniforge3-4.11.0-4-Linux-aarch64.sh"
|
||||
)
|
||||
# Pytorch-1.10 or older are not compatible with setuptools=59.6 or newer
|
||||
host.run_cmd(
|
||||
f"conda install -y python={python_version} numpy pyyaml setuptools>=59.5.0"
|
||||
)
|
||||
|
||||
|
||||
def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
|
||||
host.run_cmd("pip3 install auditwheel")
|
||||
host.run_cmd(
|
||||
"conda install -y patchelf" if use_conda else "sudo apt-get install -y patchelf"
|
||||
)
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
with NamedTemporaryFile() as tmp:
|
||||
tmp.write(embed_library_script.encode("utf-8"))
|
||||
tmp.flush()
|
||||
host.upload_file(tmp.name, "embed_library.py")
|
||||
|
||||
print("Embedding libgomp into wheel")
|
||||
if host.using_docker():
|
||||
host.run_cmd(f"python3 embed_library.py {wheel_name} --update-tag")
|
||||
else:
|
||||
host.run_cmd(f"python3 embed_library.py {wheel_name}")
|
||||
|
||||
|
||||
def checkout_repo(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
url: str,
|
||||
git_clone_flags: str,
|
||||
mapping: dict[str, tuple[str, str]],
|
||||
) -> Optional[str]:
|
||||
for prefix in mapping:
|
||||
if not branch.startswith(prefix):
|
||||
continue
|
||||
tag = f"v{mapping[prefix][0]}-{mapping[prefix][1]}"
|
||||
host.run_cmd(f"git clone {url} -b {tag} {git_clone_flags}")
|
||||
return mapping[prefix][0]
|
||||
|
||||
host.run_cmd(f"git clone {url} -b {branch} {git_clone_flags}")
|
||||
return None
|
||||
|
||||
|
||||
def build_torchvision(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
use_conda: bool = True,
|
||||
git_clone_flags: str,
|
||||
run_smoke_tests: bool = True,
|
||||
) -> str:
|
||||
print("Checking out TorchVision repo")
|
||||
build_version = checkout_repo(
|
||||
host,
|
||||
branch=branch,
|
||||
url="https://github.com/pytorch/vision",
|
||||
git_clone_flags=git_clone_flags,
|
||||
mapping={
|
||||
"v1.7.1": ("0.8.2", "rc2"),
|
||||
"v1.8.0": ("0.9.0", "rc3"),
|
||||
"v1.8.1": ("0.9.1", "rc1"),
|
||||
"v1.9.0": ("0.10.0", "rc1"),
|
||||
"v1.10.0": ("0.11.1", "rc1"),
|
||||
"v1.10.1": ("0.11.2", "rc1"),
|
||||
"v1.10.2": ("0.11.3", "rc1"),
|
||||
"v1.11.0": ("0.12.0", "rc1"),
|
||||
"v1.12.0": ("0.13.0", "rc4"),
|
||||
"v1.12.1": ("0.13.1", "rc6"),
|
||||
"v1.13.0": ("0.14.0", "rc4"),
|
||||
"v1.13.1": ("0.14.1", "rc2"),
|
||||
"v2.0.0": ("0.15.1", "rc2"),
|
||||
"v2.0.1": ("0.15.2", "rc2"),
|
||||
},
|
||||
)
|
||||
print("Building TorchVision wheel")
|
||||
|
||||
# Please note libpng and libjpeg are required to build the image.so extension
|
||||
if use_conda:
|
||||
host.run_cmd("conda install -y libpng jpeg")
|
||||
# Remove .so files to force static linking
|
||||
host.run_cmd(
|
||||
"rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so"
|
||||
)
|
||||
# And patch setup.py to include libz dependency for libpng
|
||||
host.run_cmd(
|
||||
[
|
||||
'sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py'
|
||||
]
|
||||
)
|
||||
|
||||
build_vars = ""
|
||||
if branch == "nightly":
|
||||
version = host.check_output(
|
||||
["if [ -f vision/version.txt ]; then cat vision/version.txt; fi"]
|
||||
).strip()
|
||||
if len(version) == 0:
|
||||
# In older revisions, version was embedded in setup.py
|
||||
version = (
|
||||
host.check_output(["grep", '"version = \'"', "vision/setup.py"])
|
||||
.strip()
|
||||
.split("'")[1][:-2]
|
||||
)
|
||||
build_date = (
|
||||
host.check_output("cd vision && git log --pretty=format:%s -1")
|
||||
.strip()
|
||||
.split()[0]
|
||||
.replace("-", "")
|
||||
)
|
||||
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
|
||||
elif build_version is not None:
|
||||
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
|
||||
if host.using_docker():
|
||||
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
|
||||
|
||||
host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
|
||||
vision_wheel_name = host.list_dir("vision/dist")[0]
|
||||
embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))
|
||||
|
||||
print("Copying TorchVision wheel")
|
||||
host.download_wheel(os.path.join("vision", "dist", vision_wheel_name))
|
||||
if run_smoke_tests:
|
||||
host.run_cmd(
|
||||
f"pip3 install {os.path.join('vision', 'dist', vision_wheel_name)}"
|
||||
)
|
||||
host.run_cmd("python3 vision/test/smoke_test.py")
|
||||
print("Delete vision checkout")
|
||||
host.run_cmd("rm -rf vision")
|
||||
|
||||
return vision_wheel_name
|
||||
|
||||
|
||||
def build_torchdata(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
use_conda: bool = True,
|
||||
git_clone_flags: str = "",
|
||||
) -> str:
|
||||
print("Checking out TorchData repo")
|
||||
git_clone_flags += " --recurse-submodules"
|
||||
build_version = checkout_repo(
|
||||
host,
|
||||
branch=branch,
|
||||
url="https://github.com/pytorch/data",
|
||||
git_clone_flags=git_clone_flags,
|
||||
mapping={
|
||||
"v1.13.1": ("0.5.1", ""),
|
||||
"v2.0.0": ("0.6.0", "rc5"),
|
||||
"v2.0.1": ("0.6.1", "rc1"),
|
||||
},
|
||||
)
|
||||
print("Building TorchData wheel")
|
||||
build_vars = ""
|
||||
if branch == "nightly":
|
||||
version = host.check_output(
|
||||
["if [ -f data/version.txt ]; then cat data/version.txt; fi"]
|
||||
).strip()
|
||||
build_date = (
|
||||
host.check_output("cd data && git log --pretty=format:%s -1")
|
||||
.strip()
|
||||
.split()[0]
|
||||
.replace("-", "")
|
||||
)
|
||||
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
|
||||
elif build_version is not None:
|
||||
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
|
||||
if host.using_docker():
|
||||
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
|
||||
|
||||
host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")
|
||||
wheel_name = host.list_dir("data/dist")[0]
|
||||
embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))
|
||||
|
||||
print("Copying TorchData wheel")
|
||||
host.download_wheel(os.path.join("data", "dist", wheel_name))
|
||||
|
||||
return wheel_name
|
||||
|
||||
|
||||
def build_torchtext(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
use_conda: bool = True,
|
||||
git_clone_flags: str = "",
|
||||
) -> str:
|
||||
print("Checking out TorchText repo")
|
||||
git_clone_flags += " --recurse-submodules"
|
||||
build_version = checkout_repo(
|
||||
host,
|
||||
branch=branch,
|
||||
url="https://github.com/pytorch/text",
|
||||
git_clone_flags=git_clone_flags,
|
||||
mapping={
|
||||
"v1.9.0": ("0.10.0", "rc1"),
|
||||
"v1.10.0": ("0.11.0", "rc2"),
|
||||
"v1.10.1": ("0.11.1", "rc1"),
|
||||
"v1.10.2": ("0.11.2", "rc1"),
|
||||
"v1.11.0": ("0.12.0", "rc1"),
|
||||
"v1.12.0": ("0.13.0", "rc2"),
|
||||
"v1.12.1": ("0.13.1", "rc5"),
|
||||
"v1.13.0": ("0.14.0", "rc3"),
|
||||
"v1.13.1": ("0.14.1", "rc1"),
|
||||
"v2.0.0": ("0.15.1", "rc2"),
|
||||
"v2.0.1": ("0.15.2", "rc2"),
|
||||
},
|
||||
)
|
||||
print("Building TorchText wheel")
|
||||
build_vars = ""
|
||||
if branch == "nightly":
|
||||
version = host.check_output(
|
||||
["if [ -f text/version.txt ]; then cat text/version.txt; fi"]
|
||||
).strip()
|
||||
build_date = (
|
||||
host.check_output("cd text && git log --pretty=format:%s -1")
|
||||
.strip()
|
||||
.split()[0]
|
||||
.replace("-", "")
|
||||
)
|
||||
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
|
||||
elif build_version is not None:
|
||||
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
|
||||
if host.using_docker():
|
||||
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
|
||||
|
||||
host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")
|
||||
wheel_name = host.list_dir("text/dist")[0]
|
||||
embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))
|
||||
|
||||
print("Copying TorchText wheel")
|
||||
host.download_wheel(os.path.join("text", "dist", wheel_name))
|
||||
|
||||
return wheel_name
|
||||
|
||||
|
||||
def build_torchaudio(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
use_conda: bool = True,
|
||||
git_clone_flags: str = "",
|
||||
) -> str:
|
||||
print("Checking out TorchAudio repo")
|
||||
git_clone_flags += " --recurse-submodules"
|
||||
build_version = checkout_repo(
|
||||
host,
|
||||
branch=branch,
|
||||
url="https://github.com/pytorch/audio",
|
||||
git_clone_flags=git_clone_flags,
|
||||
mapping={
|
||||
"v1.9.0": ("0.9.0", "rc2"),
|
||||
"v1.10.0": ("0.10.0", "rc5"),
|
||||
"v1.10.1": ("0.10.1", "rc1"),
|
||||
"v1.10.2": ("0.10.2", "rc1"),
|
||||
"v1.11.0": ("0.11.0", "rc1"),
|
||||
"v1.12.0": ("0.12.0", "rc3"),
|
||||
"v1.12.1": ("0.12.1", "rc5"),
|
||||
"v1.13.0": ("0.13.0", "rc4"),
|
||||
"v1.13.1": ("0.13.1", "rc2"),
|
||||
"v2.0.0": ("2.0.1", "rc3"),
|
||||
"v2.0.1": ("2.0.2", "rc2"),
|
||||
},
|
||||
)
|
||||
print("Building TorchAudio wheel")
|
||||
build_vars = ""
|
||||
if branch == "nightly":
|
||||
version = (
|
||||
host.check_output(["grep", '"version = \'"', "audio/setup.py"])
|
||||
.strip()
|
||||
.split("'")[1][:-2]
|
||||
)
|
||||
build_date = (
|
||||
host.check_output("cd audio && git log --pretty=format:%s -1")
|
||||
.strip()
|
||||
.split()[0]
|
||||
.replace("-", "")
|
||||
)
|
||||
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
|
||||
elif build_version is not None:
|
||||
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
|
||||
if host.using_docker():
|
||||
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
|
||||
|
||||
host.run_cmd(
|
||||
f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
|
||||
&& ./packaging/ffmpeg/build.sh \
|
||||
&& {build_vars} python3 -m build --wheel --no-isolation"
|
||||
)
|
||||
|
||||
wheel_name = host.list_dir("audio/dist")[0]
|
||||
embed_libgomp(host, use_conda, os.path.join("audio", "dist", wheel_name))
|
||||
|
||||
print("Copying TorchAudio wheel")
|
||||
host.download_wheel(os.path.join("audio", "dist", wheel_name))
|
||||
|
||||
return wheel_name
|
||||
|
||||
|
||||
def configure_system(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
compiler: str = "gcc-8",
|
||||
use_conda: bool = True,
|
||||
python_version: str = "3.8",
|
||||
) -> None:
|
||||
if use_conda:
|
||||
install_condaforge_python(host, python_version)
|
||||
|
||||
print("Configuring the system")
|
||||
if not host.using_docker():
|
||||
update_apt_repo(host)
|
||||
host.run_cmd("sudo apt-get install -y ninja-build g++ git cmake gfortran unzip")
|
||||
else:
|
||||
host.run_cmd("yum install -y sudo")
|
||||
host.run_cmd("conda install -y ninja scons")
|
||||
|
||||
if not use_conda:
|
||||
host.run_cmd(
|
||||
"sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
|
||||
)
|
||||
host.run_cmd("pip3 install dataclasses typing-extensions")
|
||||
if not use_conda:
|
||||
print("Installing Cython + numpy from PyPy")
|
||||
host.run_cmd("sudo pip3 install Cython")
|
||||
host.run_cmd("sudo pip3 install numpy")
|
||||
|
||||
|
||||
def build_domains(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
use_conda: bool = True,
|
||||
git_clone_flags: str = "",
|
||||
) -> tuple[str, str, str, str]:
|
||||
vision_wheel_name = build_torchvision(
|
||||
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
|
||||
)
|
||||
audio_wheel_name = build_torchaudio(
|
||||
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
|
||||
)
|
||||
data_wheel_name = build_torchdata(
|
||||
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
|
||||
)
|
||||
text_wheel_name = build_torchtext(
|
||||
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
|
||||
)
|
||||
return (vision_wheel_name, audio_wheel_name, data_wheel_name, text_wheel_name)
|
||||
|
||||
|
||||
def start_build(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
compiler: str = "gcc-8",
|
||||
use_conda: bool = True,
|
||||
python_version: str = "3.8",
|
||||
pytorch_only: bool = False,
|
||||
pytorch_build_number: Optional[str] = None,
|
||||
shallow_clone: bool = True,
|
||||
enable_mkldnn: bool = False,
|
||||
) -> tuple[str, str, str, str, str]:
|
||||
git_clone_flags = " --depth 1 --shallow-submodules" if shallow_clone else ""
|
||||
if host.using_docker() and not use_conda:
|
||||
print("Auto-selecting conda option for docker images")
|
||||
use_conda = True
|
||||
if not host.using_docker():
|
||||
print("Disable mkldnn for host builds")
|
||||
enable_mkldnn = False
|
||||
|
||||
configure_system(
|
||||
host, compiler=compiler, use_conda=use_conda, python_version=python_version
|
||||
)
|
||||
|
||||
if host.using_docker():
|
||||
print("Move libgfortant.a into a standard location")
|
||||
# HACK: pypa gfortran.a is compiled without PIC, which leads to the following error
|
||||
# libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17' # noqa: E501, B950
|
||||
# Workaround by copying gfortran library from the host
|
||||
host.run_ssh_cmd("sudo apt-get install -y gfortran-8")
|
||||
host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8")
|
||||
host.run_ssh_cmd(
|
||||
[
|
||||
"docker",
|
||||
"cp",
|
||||
"/usr/lib/gcc/aarch64-linux-gnu/8/libgfortran.a",
|
||||
f"{host.container_id}:/opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/",
|
||||
]
|
||||
)
|
||||
|
||||
print("Checking out PyTorch repo")
|
||||
host.run_cmd(
|
||||
f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
|
||||
)
|
||||
|
||||
host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")
|
||||
|
||||
print("Building PyTorch wheel")
|
||||
build_opts = ""
|
||||
if pytorch_build_number is not None:
|
||||
build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
|
||||
# Breakpad build fails on aarch64
|
||||
build_vars = "USE_BREAKPAD=0 "
|
||||
if branch == "nightly":
|
||||
build_date = (
|
||||
host.check_output("cd pytorch && git log --pretty=format:%s -1")
|
||||
.strip()
|
||||
.split()[0]
|
||||
.replace("-", "")
|
||||
)
|
||||
version = host.check_output("cat pytorch/version.txt").strip()[:-2]
|
||||
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
|
||||
if branch.startswith(("v1.", "v2.")):
|
||||
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
|
||||
if host.using_docker():
|
||||
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
|
||||
if enable_mkldnn:
|
||||
host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
|
||||
print("build pytorch with mkldnn+acl backend")
|
||||
build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
|
||||
build_vars += " BLAS=OpenBLAS"
|
||||
build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
|
||||
build_vars += " ACL_ROOT_DIR=/acl"
|
||||
host.run_cmd(
|
||||
f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
|
||||
)
|
||||
print("Repair the wheel")
|
||||
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
|
||||
ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
|
||||
host.run_cmd(
|
||||
f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
|
||||
)
|
||||
print("replace the original wheel with the repaired one")
|
||||
pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0]
|
||||
host.run_cmd(
|
||||
f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}"
|
||||
)
|
||||
else:
|
||||
print("build pytorch without mkldnn backend")
|
||||
host.run_cmd(
|
||||
f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
|
||||
)
|
||||
|
||||
print("Deleting build folder")
|
||||
host.run_cmd("cd pytorch && rm -rf build")
|
||||
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
|
||||
embed_libgomp(host, use_conda, os.path.join("pytorch", "dist", pytorch_wheel_name))
|
||||
print("Copying the wheel")
|
||||
host.download_wheel(os.path.join("pytorch", "dist", pytorch_wheel_name))
|
||||
|
||||
print("Installing PyTorch wheel")
|
||||
host.run_cmd(f"pip3 install pytorch/dist/{pytorch_wheel_name}")
|
||||
|
||||
if pytorch_only:
|
||||
return (pytorch_wheel_name, None, None, None, None)
|
||||
domain_wheels = build_domains(
|
||||
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
|
||||
)
|
||||
|
||||
return (pytorch_wheel_name, *domain_wheels)
|
||||
|
||||
|
||||
embed_library_script = """
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from auditwheel.patcher import Patchelf
|
||||
from auditwheel.wheeltools import InWheelCtx
|
||||
from auditwheel.elfutils import elf_file_filter
|
||||
from auditwheel.repair import copylib
|
||||
from auditwheel.lddtree import lddtree
|
||||
from subprocess import check_call
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
|
||||
def replace_tag(filename):
|
||||
with open(filename, 'r') as f:
|
||||
lines = f.read().split("\\n")
|
||||
for i,line in enumerate(lines):
|
||||
if not line.startswith("Tag: "):
|
||||
continue
|
||||
lines[i] = line.replace("-linux_", "-manylinux2014_")
|
||||
print(f'Updated tag from {line} to {lines[i]}')
|
||||
|
||||
with open(filename, 'w') as f:
|
||||
f.write("\\n".join(lines))
|
||||
|
||||
|
||||
class AlignedPatchelf(Patchelf):
|
||||
def set_soname(self, file_name: str, new_soname: str) -> None:
|
||||
check_call(['patchelf', '--page-size', '65536', '--set-soname', new_soname, file_name])
|
||||
|
||||
def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
|
||||
check_call(['patchelf', '--page-size', '65536', '--replace-needed', soname, new_soname, file_name])
|
||||
|
||||
|
||||
def embed_library(whl_path, lib_soname, update_tag=False):
|
||||
patcher = AlignedPatchelf()
|
||||
out_dir = TemporaryDirectory()
|
||||
whl_name = os.path.basename(whl_path)
|
||||
tmp_whl_name = os.path.join(out_dir.name, whl_name)
|
||||
with InWheelCtx(whl_path) as ctx:
|
||||
torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib')
|
||||
ctx.out_wheel=tmp_whl_name
|
||||
new_lib_path, new_lib_soname = None, None
|
||||
for filename, elf in elf_file_filter(ctx.iter_files()):
|
||||
if not filename.startswith('torch/lib'):
|
||||
continue
|
||||
libtree = lddtree(filename)
|
||||
if lib_soname not in libtree['needed']:
|
||||
continue
|
||||
lib_path = libtree['libs'][lib_soname]['path']
|
||||
if lib_path is None:
|
||||
print(f"Can't embed {lib_soname} as it could not be found")
|
||||
break
|
||||
if lib_path.startswith(torchlib_path):
|
||||
continue
|
||||
|
||||
if new_lib_path is None:
|
||||
new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
|
||||
patcher.replace_needed(filename, lib_soname, new_lib_soname)
|
||||
print(f'Replacing {lib_soname} with {new_lib_soname} for {filename}')
|
||||
if update_tag:
|
||||
# Add manylinux2014 tag
|
||||
for filename in ctx.iter_files():
|
||||
if os.path.basename(filename) != 'WHEEL':
|
||||
continue
|
||||
replace_tag(filename)
|
||||
shutil.move(tmp_whl_name, whl_path)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
embed_library(sys.argv[1], 'libgomp.so.1', len(sys.argv) > 2 and sys.argv[2] == '--update-tag')
|
||||
"""
|
||||
|
||||
|
||||
def run_tests(host: RemoteHost, whl: str, branch="main") -> None:
|
||||
print("Configuring the system")
|
||||
update_apt_repo(host)
|
||||
host.run_cmd("sudo apt-get install -y python3-pip git")
|
||||
host.run_cmd("sudo pip3 install Cython")
|
||||
host.run_cmd("sudo pip3 install numpy")
|
||||
host.upload_file(whl, ".")
|
||||
host.run_cmd(f"sudo pip3 install {whl}")
|
||||
host.run_cmd("python3 -c 'import torch;print(torch.rand((3,3))'")
|
||||
host.run_cmd(f"git clone -b {branch} https://github.com/pytorch/pytorch")
|
||||
host.run_cmd("cd pytorch/test; python3 test_torch.py -v")
|
||||
|
||||
|
||||
def get_instance_name(instance) -> Optional[str]:
|
||||
if instance.tags is None:
|
||||
return None
|
||||
for tag in instance.tags:
|
||||
if tag["Key"] == "Name":
|
||||
return tag["Value"]
|
||||
return None
|
||||
|
||||
|
||||
def list_instances(instance_type: str) -> None:
|
||||
print(f"All instances of type {instance_type}")
|
||||
for instance in ec2_instances_of_type(instance_type):
|
||||
ifaces = instance.network_interfaces
|
||||
az = ifaces[0].subnet.availability_zone if len(ifaces) > 0 else None
|
||||
print(
|
||||
f"{instance.id} {get_instance_name(instance)} {instance.public_dns_name} {instance.state['Name']} {az}"
|
||||
)
|
||||
|
||||
|
||||
def terminate_instances(instance_type: str) -> None:
|
||||
print(f"Terminating all instances of type {instance_type}")
|
||||
instances = list(ec2_instances_of_type(instance_type))
|
||||
for instance in instances:
|
||||
print(f"Terminating {instance.id}")
|
||||
instance.terminate()
|
||||
print("Waiting for termination to complete")
|
||||
for instance in instances:
|
||||
instance.wait_until_terminated()
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
from argparse import ArgumentParser
|
||||
|
||||
parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
|
||||
parser.add_argument("--key-name", type=str)
|
||||
parser.add_argument("--debug", action="store_true")
|
||||
parser.add_argument("--build-only", action="store_true")
|
||||
parser.add_argument("--test-only", type=str)
|
||||
group = parser.add_mutually_exclusive_group()
|
||||
group.add_argument("--os", type=str, choices=list(os_amis.keys()))
|
||||
group.add_argument("--ami", type=str)
|
||||
parser.add_argument(
|
||||
"--python-version",
|
||||
type=str,
|
||||
choices=[f"3.{d}" for d in range(6, 12)],
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument("--alloc-instance", action="store_true")
|
||||
parser.add_argument("--list-instances", action="store_true")
|
||||
parser.add_argument("--pytorch-only", action="store_true")
|
||||
parser.add_argument("--keep-running", action="store_true")
|
||||
parser.add_argument("--terminate-instances", action="store_true")
|
||||
parser.add_argument("--instance-type", type=str, default="t4g.2xlarge")
|
||||
parser.add_argument("--ebs-size", type=int, default=50)
|
||||
parser.add_argument("--branch", type=str, default="main")
|
||||
parser.add_argument("--use-docker", action="store_true")
|
||||
parser.add_argument(
|
||||
"--compiler",
|
||||
type=str,
|
||||
choices=["gcc-7", "gcc-8", "gcc-9", "clang"],
|
||||
default="gcc-8",
|
||||
)
|
||||
parser.add_argument("--use-torch-from-pypi", action="store_true")
|
||||
parser.add_argument("--pytorch-build-number", type=str, default=None)
|
||||
parser.add_argument("--disable-mkldnn", action="store_true")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_arguments()
|
||||
ami = (
|
||||
args.ami
|
||||
if args.ami is not None
|
||||
else os_amis[args.os]
|
||||
if args.os is not None
|
||||
else ubuntu20_04_ami
|
||||
)
|
||||
keyfile_path, key_name = compute_keyfile_path(args.key_name)
|
||||
|
||||
if args.list_instances:
|
||||
list_instances(args.instance_type)
|
||||
sys.exit(0)
|
||||
|
||||
if args.terminate_instances:
|
||||
terminate_instances(args.instance_type)
|
||||
sys.exit(0)
|
||||
|
||||
if len(key_name) == 0:
|
||||
raise RuntimeError("""
|
||||
Cannot start build without key_name, please specify
|
||||
--key-name argument or AWS_KEY_NAME environment variable.""")
|
||||
if len(keyfile_path) == 0 or not os.path.exists(keyfile_path):
|
||||
raise RuntimeError(f"""
|
||||
Cannot find keyfile with name: [{key_name}] in path: [{keyfile_path}], please
|
||||
check `~/.ssh/` folder or manually set SSH_KEY_PATH environment variable.""")
|
||||
|
||||
# Starting the instance
|
||||
inst = start_instance(
|
||||
key_name, ami=ami, instance_type=args.instance_type, ebs_size=args.ebs_size
|
||||
)
|
||||
instance_name = f"{args.key_name}-{args.os}"
|
||||
if args.python_version is not None:
|
||||
instance_name += f"-py{args.python_version}"
|
||||
inst.create_tags(
|
||||
DryRun=False,
|
||||
Tags=[
|
||||
{
|
||||
"Key": "Name",
|
||||
"Value": instance_name,
|
||||
}
|
||||
],
|
||||
)
|
||||
addr = inst.public_dns_name
|
||||
wait_for_connection(addr, 22)
|
||||
host = RemoteHost(addr, keyfile_path)
|
||||
host.ami = ami
|
||||
if args.use_docker:
|
||||
update_apt_repo(host)
|
||||
host.start_docker()
|
||||
|
||||
if args.test_only:
|
||||
run_tests(host, args.test_only)
|
||||
sys.exit(0)
|
||||
|
||||
if args.alloc_instance:
|
||||
if args.python_version is None:
|
||||
sys.exit(0)
|
||||
install_condaforge_python(host, args.python_version)
|
||||
sys.exit(0)
|
||||
|
||||
python_version = args.python_version if args.python_version is not None else "3.10"
|
||||
|
||||
if args.use_torch_from_pypi:
|
||||
configure_system(host, compiler=args.compiler, python_version=python_version)
|
||||
print("Installing PyTorch wheel")
|
||||
host.run_cmd("pip3 install torch")
|
||||
build_domains(
|
||||
host, branch=args.branch, git_clone_flags=" --depth 1 --shallow-submodules"
|
||||
)
|
||||
else:
|
||||
start_build(
|
||||
host,
|
||||
branch=args.branch,
|
||||
compiler=args.compiler,
|
||||
python_version=python_version,
|
||||
pytorch_only=args.pytorch_only,
|
||||
pytorch_build_number=args.pytorch_build_number,
|
||||
enable_mkldnn=not args.disable_mkldnn,
|
||||
)
|
||||
if not args.keep_running:
|
||||
print(f"Waiting for instance {inst.id} to terminate")
|
||||
inst.terminate()
|
||||
inst.wait_until_terminated()
|
||||
87  .ci/aarch64_linux/embed_library.py  Normal file
@@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env python3

import os
import shutil
import sys
from subprocess import check_call
from tempfile import TemporaryDirectory

from auditwheel.elfutils import elf_file_filter
from auditwheel.lddtree import lddtree
from auditwheel.patcher import Patchelf
from auditwheel.repair import copylib
from auditwheel.wheeltools import InWheelCtx


def replace_tag(filename):
    with open(filename) as f:
        lines = f.read().split("\n")
    for i, line in enumerate(lines):
        if not line.startswith("Tag: "):
            continue
        lines[i] = line.replace("-linux_", "-manylinux2014_")
        print(f"Updated tag from {line} to {lines[i]}")

    with open(filename, "w") as f:
        f.write("\n".join(lines))


class AlignedPatchelf(Patchelf):
    def set_soname(self, file_name: str, new_soname: str) -> None:
        check_call(
            ["patchelf", "--page-size", "65536", "--set-soname", new_soname, file_name]
        )

    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
        check_call(
            [
                "patchelf",
                "--page-size",
                "65536",
                "--replace-needed",
                soname,
                new_soname,
                file_name,
            ]
        )


def embed_library(whl_path, lib_soname, update_tag=False):
    patcher = AlignedPatchelf()
    out_dir = TemporaryDirectory()
    whl_name = os.path.basename(whl_path)
    tmp_whl_name = os.path.join(out_dir.name, whl_name)
    with InWheelCtx(whl_path) as ctx:
        torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib")
        ctx.out_wheel = tmp_whl_name
        new_lib_path, new_lib_soname = None, None
        for filename, _ in elf_file_filter(ctx.iter_files()):
            if not filename.startswith("torch/lib"):
                continue
            libtree = lddtree(filename)
            if lib_soname not in libtree["needed"]:
                continue
            lib_path = libtree["libs"][lib_soname]["path"]
            if lib_path is None:
                print(f"Can't embed {lib_soname} as it could not be found")
                break
            if lib_path.startswith(torchlib_path):
                continue

            if new_lib_path is None:
                new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
            patcher.replace_needed(filename, lib_soname, new_lib_soname)
            print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}")
        if update_tag:
            # Add manylinux2014 tag
            for filename in ctx.iter_files():
                if os.path.basename(filename) != "WHEEL":
                    continue
                replace_tag(filename)
    shutil.move(tmp_whl_name, whl_path)


if __name__ == "__main__":
    embed_library(
        sys.argv[1], "libgomp.so.1", len(sys.argv) > 2 and sys.argv[2] == "--update-tag"
    )
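For reference, a minimal invocation sketch for the script above (the wheel filename is hypothetical): it takes a wheel path, always embeds libgomp.so.1, and retags the wheel to manylinux2014 when --update-tag is passed.

```bash
# Hypothetical wheel name; embed libgomp.so.1 and rewrite the WHEEL tag in place.
python3 .ci/aarch64_linux/embed_library.py \
    torch-2.6.0-cp310-cp310-linux_aarch64.whl --update-tag
```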
@ -4,17 +4,14 @@ set -ex

SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

# Source the common build script for architecture-specific configurations (MKLDNN, ACL, etc.)
source "${SCRIPTPATH}/../pytorch/build.sh" || true

case "${GPU_ARCH_TYPE:-BLANK}" in
    cuda | cuda-aarch64)
    cuda)
        bash "${SCRIPTPATH}/build_cuda.sh"
        ;;
    rocm)
        bash "${SCRIPTPATH}/build_rocm.sh"
        ;;
    cpu | cpu-cxx11-abi | cpu-aarch64 | cpu-s390x)
    cpu | cpu-cxx11-abi | cpu-s390x)
        bash "${SCRIPTPATH}/build_cpu.sh"
        ;;
    xpu)
@ -18,31 +18,12 @@ retry () {
    $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}

# Detect architecture first
ARCH=$(uname -m)
echo "Detected architecture: $ARCH"

PLATFORM=""
# TODO move this into the Docker images
OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    retry yum install -q -y zip openssl
    # Set platform based on architecture
    case $ARCH in
        x86_64)
            PLATFORM="manylinux_2_28_x86_64"
            ;;
        aarch64)
            PLATFORM="manylinux_2_28_aarch64"
            ;;
        s390x)
            PLATFORM="manylinux_2_28_s390x"
            ;;
        *)
            echo "Unsupported architecture: $ARCH"
            exit 1
            ;;
    esac
    PLATFORM="manylinux_2_28_x86_64"
elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    retry dnf install -q -y zip openssl
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
@ -57,8 +38,6 @@ else
    exit 1
fi

echo "Platform set to: $PLATFORM"

# We use the package name to test the package by passing this to 'pip install'
# This is the env variable that setup.py uses to name the package. Note that
# pip 'normalizes' the name first by changing all - to _
@ -320,8 +299,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
        # ROCm workaround for roctracer dlopens
        if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
            patchedpath=$(fname_without_so_number $destpath)
        # Keep the so number for XPU dependencies, libgomp.so.1, ACL libraries, and NVPL libraries to avoid twice load
        elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" || "$filename" == libarm_compute* || "$filename" == libnvpl* || "$filename" == "libgfortran.so.5" ]]; then
        # Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load
        elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
            patchedpath=$destpath
        else
            patchedpath=$(fname_with_sha256 $destpath)
@ -367,22 +346,9 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
    done

    # create Manylinux 2_28 tag this needs to happen before regenerate the RECORD
    # Support all architectures (x86_64, aarch64, s390x)
    if [[ "$IS_MANYLINUX2_28" == "1" && $GPU_ARCH_TYPE != "xpu" ]]; then
    if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
        wheel_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/WHEEL/g')
        echo "Updating wheel tag for $ARCH architecture"
        # Replace linux_* with manylinux_2_28_* based on architecture
        case $ARCH in
            x86_64)
                sed -i -e 's#linux_x86_64#manylinux_2_28_x86_64#g' $wheel_file
                ;;
            aarch64)
                sed -i -e 's#linux_aarch64#manylinux_2_28_aarch64#g' $wheel_file
                ;;
            s390x)
                sed -i -e 's#linux_s390x#manylinux_2_28_s390x#g' $wheel_file
                ;;
        esac
        sed -i -e s#linux_x86_64#"${PLATFORM}"# $wheel_file;
    fi

    # regenerate the RECORD file with new hashes
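To make the retagging concrete, a small sketch of what the sed rewrite above does to the wheel metadata (the file path and tag values are illustrative):

```bash
# WHEEL metadata before: Tag: cp310-cp310-linux_x86_64
# WHEEL metadata after:  Tag: cp310-cp310-manylinux_2_28_x86_64
PLATFORM="manylinux_2_28_x86_64"
wheel_file="torch-2.6.0.dist-info/WHEEL"   # illustrative path inside the unpacked wheel
sed -i -e s#linux_x86_64#"${PLATFORM}"# "$wheel_file"
```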
@ -15,10 +15,6 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
|
||||
EXTRA_CAFFE2_CMAKE_FLAGS=()
|
||||
fi
|
||||
|
||||
# Detect architecture
|
||||
ARCH=$(uname -m)
|
||||
echo "Building CPU wheel for architecture: $ARCH"
|
||||
|
||||
WHEELHOUSE_DIR="wheelhousecpu"
|
||||
LIBTORCH_HOUSE_DIR="libtorch_housecpu"
|
||||
if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
|
||||
@ -38,10 +34,8 @@ elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
|
||||
elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
|
||||
LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
|
||||
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
|
||||
if [[ "$ARCH" == "s390x" ]]; then
|
||||
if [[ "$(uname -m)" == "s390x" ]]; then
|
||||
LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
|
||||
elif [[ "$ARCH" == "aarch64" ]]; then
|
||||
LIBGOMP_PATH="/usr/lib/aarch64-linux-gnu/libgomp.so.1"
|
||||
else
|
||||
LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
|
||||
fi
|
||||
@ -55,34 +49,6 @@ DEPS_SONAME=(
|
||||
"libgomp.so.1"
|
||||
)
|
||||
|
||||
# Add ARM-specific library dependencies for CPU builds
|
||||
if [[ "$ARCH" == "aarch64" ]]; then
|
||||
echo "Adding ARM-specific CPU library dependencies"
|
||||
|
||||
# ARM Compute Library (if available)
|
||||
if [[ -d "/acl/build" ]]; then
|
||||
echo "Adding ARM Compute Library for CPU"
|
||||
DEPS_LIST+=(
|
||||
"/acl/build/libarm_compute.so"
|
||||
"/acl/build/libarm_compute_graph.so"
|
||||
)
|
||||
DEPS_SONAME+=(
|
||||
"libarm_compute.so"
|
||||
"libarm_compute_graph.so"
|
||||
)
|
||||
fi
|
||||
|
||||
# ARM system libraries
|
||||
DEPS_LIST+=(
|
||||
"/usr/lib64/libgfortran.so.5"
|
||||
"/opt/OpenBLAS/lib/libopenblas.so.0"
|
||||
)
|
||||
DEPS_SONAME+=(
|
||||
"libgfortran.so.5"
|
||||
"libopenblas.so.0"
|
||||
)
|
||||
fi
|
||||
|
||||
rm -rf /usr/local/cuda*
|
||||
|
||||
SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
|
||||
|
||||
@ -29,10 +29,6 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    EXTRA_CAFFE2_CMAKE_FLAGS=()
fi

# Detect architecture
ARCH=$(uname -m)
echo "Building for architecture: $ARCH"

# Determine CUDA version and architectures to build for
#
# NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
@ -57,60 +53,34 @@ fi
cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")

# Function to remove architectures from a list
remove_archs() {
    local result="$1"
    shift
    for arch in "$@"; do
        result="${result//${arch};/}"
    done
    echo "$result"
}

# Function to filter CUDA architectures for aarch64
# aarch64 ARM GPUs only support certain compute capabilities
# Keep: 8.0 (A100), 9.0+ (Hopper, Grace Hopper, newer)
# Remove: < 8.0 (no ARM GPUs), 8.6 (x86_64 RTX 3090/A6000 only)
filter_aarch64_archs() {
    local arch_list="$1"
    # Explicitly remove architectures not needed on aarch64
    arch_list=$(remove_archs "$arch_list" "5.0" "6.0" "7.0" "7.5" "8.6")
    echo "$arch_list"
}

# Base: Common architectures across all modern CUDA versions
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0"

case ${CUDA_VERSION} in
    12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;${TORCH_CUDA_ARCH_LIST}" ;; # Only 12.6 includes Legacy Maxwell/Pascal that will be removed in future releases
    12.8) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0" ;; # +Hopper/Blackwell support
    12.9) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0+PTX" # +Hopper/Blackwell support + PTX for forward compatibility
    #removing sm_50-sm_60 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
    #however we would like to keep sm_70 architecture see: https://github.com/pytorch/pytorch/issues/157517
    12.8)
        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0"
        ;;
    12.9)
        TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX"
        # WAR to resolve the ld error in libtorch build with CUDA 12.9
        if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//7.0;/}" # Remove 7.0 to resolve the ld error
            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//8.6;/}" # Remove 8.6 for libtorch
            TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
        fi
        ;;
    13.0)
        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0+PTX"
        export TORCH_NVCC_FLAGS="-compress-mode=size"
        export BUILD_BUNDLE_PTXAS=1
        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
        ;;
    12.6)
        TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
        ;;
    *)
        echo "unknown cuda version $CUDA_VERSION"
        exit 1
        ;;
    *) echo "unknown cuda version $CUDA_VERSION"; exit 1 ;;
esac

# Filter for aarch64: Remove < 8.0 and 8.6
[[ "$ARCH" == "aarch64" ]] && TORCH_CUDA_ARCH_LIST=$(filter_aarch64_archs "$TORCH_CUDA_ARCH_LIST")

echo "TORCH_CUDA_ARCH_LIST set to: $TORCH_CUDA_ARCH_LIST"
export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
echo "${TORCH_CUDA_ARCH_LIST}"

# Disable MAGMA for aarch64 as pre-built libraries are x86-64 only
if [[ "$ARCH" == "aarch64" ]]; then
    echo "Disabling MAGMA for aarch64 architecture"
    export USE_MAGMA=0
fi

# Package directories
WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot"
LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot"
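A quick sanity check of the aarch64 filtering above, assuming the two functions are sourced as defined (the input mirrors the CUDA 12.8 list):

```bash
# remove_archs strips the "5.0;", "6.0;", "7.0;", "7.5;" and "8.6;" entries,
# leaving only the architectures relevant to ARM platforms.
filter_aarch64_archs "7.0;7.5;8.0;8.6;9.0;10.0;12.0"
# prints: 8.0;9.0;10.0;12.0
```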
@ -274,51 +244,6 @@ else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Add ARM-specific library dependencies
|
||||
if [[ "$ARCH" == "aarch64" ]]; then
|
||||
echo "Adding ARM-specific library dependencies"
|
||||
|
||||
# ARM Compute Library (if available)
|
||||
if [[ -d "/acl/build" ]]; then
|
||||
echo "Adding ARM Compute Library"
|
||||
DEPS_LIST+=(
|
||||
"/acl/build/libarm_compute.so"
|
||||
"/acl/build/libarm_compute_graph.so"
|
||||
)
|
||||
DEPS_SONAME+=(
|
||||
"libarm_compute.so"
|
||||
"libarm_compute_graph.so"
|
||||
)
|
||||
fi
|
||||
|
||||
# ARM system libraries
|
||||
DEPS_LIST+=(
|
||||
"/lib64/libgomp.so.1"
|
||||
"/usr/lib64/libgfortran.so.5"
|
||||
)
|
||||
DEPS_SONAME+=(
|
||||
"libgomp.so.1"
|
||||
"libgfortran.so.5"
|
||||
)
|
||||
|
||||
# NVPL libraries (ARM optimized BLAS/LAPACK)
|
||||
if [[ -d "/usr/local/lib" && -f "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0" ]]; then
|
||||
echo "Adding NVPL libraries for ARM"
|
||||
DEPS_LIST+=(
|
||||
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0"
|
||||
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0"
|
||||
"/usr/local/lib/libnvpl_lapack_core.so.0"
|
||||
"/usr/local/lib/libnvpl_blas_core.so.0"
|
||||
)
|
||||
DEPS_SONAME+=(
|
||||
"libnvpl_lapack_lp64_gomp.so.0"
|
||||
"libnvpl_blas_lp64_gomp.so.0"
|
||||
"libnvpl_lapack_core.so.0"
|
||||
"libnvpl_blas_core.so.0"
|
||||
)
|
||||
fi
|
||||
fi
|
||||
|
||||
# run_tests.sh requires DESIRED_CUDA to know what tests to exclude
|
||||
export DESIRED_CUDA="$cuda_version_nodot"
|
||||
|
||||
@ -326,11 +251,9 @@ export DESIRED_CUDA="$cuda_version_nodot"
|
||||
rm -rf /usr/local/cuda || true
|
||||
ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda
|
||||
|
||||
# Switch `/usr/local/magma` to the desired CUDA version (skip for aarch64)
|
||||
if [[ "$ARCH" != "aarch64" ]]; then
|
||||
rm -rf /usr/local/magma || true
|
||||
ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
|
||||
fi
|
||||
# Switch `/usr/local/magma` to the desired CUDA version
|
||||
rm -rf /usr/local/magma || true
|
||||
ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
|
||||
|
||||
export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
|
||||
export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0
|
||||
|
||||
@ -21,3 +21,87 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
|
||||
fi
|
||||
|
||||
mkdir -p "$pytest_reports_dir" || true
|
||||
|
||||
##########################################
|
||||
# copied from .ci/pytorch/common_utils.sh
|
||||
##########################################
|
||||
|
||||
function get_pinned_commit() {
|
||||
cat .github/ci_commit_pins/"${1}".txt
|
||||
}
|
||||
|
||||
function pip_install_whl() {
|
||||
# This is used to install PyTorch and other build artifacts wheel locally
|
||||
# without using any network connection
|
||||
|
||||
# Convert the input arguments into an array
|
||||
local args=("$@")
|
||||
|
||||
# Check if the first argument contains multiple paths separated by spaces
|
||||
if [[ "${args[0]}" == *" "* ]]; then
|
||||
# Split the string by spaces into an array
|
||||
IFS=' ' read -r -a paths <<< "${args[0]}"
|
||||
# Loop through each path and install individually
|
||||
for path in "${paths[@]}"; do
|
||||
echo "Installing $path"
|
||||
python3 -mpip install --no-index --no-deps "$path"
|
||||
done
|
||||
else
|
||||
# Loop through each argument and install individually
|
||||
for path in "${args[@]}"; do
|
||||
echo "Installing $path"
|
||||
python3 -mpip install --no-index --no-deps "$path"
|
||||
done
|
||||
fi
|
||||
}
|
||||
|
||||
function pip_build_and_install() {
|
||||
local build_target=$1
|
||||
local wheel_dir=$2
|
||||
|
||||
local found_whl=0
|
||||
for file in "${wheel_dir}"/*.whl
|
||||
do
|
||||
if [[ -f "${file}" ]]; then
|
||||
found_whl=1
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
# Build the wheel if it doesn't exist
|
||||
if [ "${found_whl}" == "0" ]; then
|
||||
python3 -m pip wheel \
|
||||
--no-build-isolation \
|
||||
--no-deps \
|
||||
-w "${wheel_dir}" \
|
||||
"${build_target}"
|
||||
fi
|
||||
|
||||
for file in "${wheel_dir}"/*.whl
|
||||
do
|
||||
pip_install_whl "${file}"
|
||||
done
|
||||
}
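As a usage sketch for the helper above (paths are hypothetical): the function builds a wheel into the given directory only when none is cached there, then installs every wheel it finds via pip_install_whl, i.e. offline with --no-index --no-deps.

```bash
# Build ./my_extension into dist/my_ext (if no wheel is cached there) and
# install it without hitting the network at install time.
pip_build_and_install ./my_extension dist/my_ext
```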
|
||||
|
||||
function install_torchvision() {
|
||||
local orig_preload
|
||||
local commit
|
||||
commit=$(get_pinned_commit vision)
|
||||
orig_preload=${LD_PRELOAD}
|
||||
if [ -n "${LD_PRELOAD}" ]; then
|
||||
# Silence dlerror to work-around glibc ASAN bug, see https://sourceware.org/bugzilla/show_bug.cgi?id=27653#c9
|
||||
echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c -
|
||||
LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so
|
||||
fi
|
||||
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
|
||||
# Not sure if both are needed, but why not
|
||||
export FORCE_CUDA=1
|
||||
export WITH_CUDA=1
|
||||
fi
|
||||
pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision
|
||||
|
||||
if [ -n "${LD_PRELOAD}" ]; then
|
||||
LD_PRELOAD=${orig_preload}
|
||||
fi
|
||||
}
|
||||
|
||||
@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
|
||||
# TODO: This can be removed later once vision is also part of the Docker image
|
||||
pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
|
||||
install_torchvision
|
||||
# JIT C++ extensions require ninja, so put it into PATH.
|
||||
export PATH="/var/lib/jenkins/.local/bin:$PATH"
|
||||
# NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we
|
||||
|
||||
@ -86,20 +86,14 @@ else
|
||||
fi
|
||||
fi
|
||||
|
||||
# Enable MKLDNN with ARM Compute Library for ARM builds
|
||||
if [[ "$BUILD_ENVIRONMENT" == *zen* ]]; then
|
||||
export USE_ZENDNN=1
|
||||
fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
|
||||
export USE_MKLDNN=1
|
||||
|
||||
# ACL is required for aarch64 builds
|
||||
if [[ ! -d "/acl" ]]; then
|
||||
echo "ERROR: ARM Compute Library not found at /acl"
|
||||
echo "ACL is required for aarch64 builds. Check Docker image setup."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
export USE_MKLDNN_ACL=1
|
||||
export ACL_ROOT_DIR=/acl
|
||||
echo "ARM Compute Library enabled for MKLDNN: ACL_ROOT_DIR=/acl"
|
||||
fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then
|
||||
|
||||
@ -1250,6 +1250,97 @@ test_custom_script_ops() {
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
test_libtorch_agnostic_targetting() {
|
||||
echo "Testing libtorch_agnostic runs correctly on TORCH_TARGET_VERSION"
|
||||
|
||||
REPO_DIR=$(pwd)
|
||||
WHEEL_DIR="${REPO_DIR}/test/cpp_extensions/.wheels"
|
||||
|
||||
# Build wheel with current PyTorch (this has TORCH_TARGET_VERSION 2_9_0)
|
||||
echo "Building 2.9 extension wheel with current PyTorch..."
|
||||
pushd test/cpp_extensions/libtorch_agnostic_2_9_extension
|
||||
time python setup.py bdist_wheel
|
||||
|
||||
# Save the wheel
|
||||
mkdir -p "$WHEEL_DIR"
|
||||
cp dist/*.whl "$WHEEL_DIR/"
|
||||
WHEEL_FILE=$(find "$WHEEL_DIR" -maxdepth 1 -name "*.whl" -type f | head -1)
|
||||
echo "Built wheel: $(basename "$WHEEL_FILE")"
|
||||
popd
|
||||
|
||||
# Create venv and install PyTorch 2.9
|
||||
python -m venv venv_pytorch_2_9
|
||||
# shellcheck disable=SC1091
|
||||
. venv_pytorch_2_9/bin/activate
|
||||
|
||||
# Clear PYTHONPATH to avoid using the development PyTorch
|
||||
echo "Clearing PYTHONPATH to use only venv packages..."
|
||||
unset PYTHONPATH
|
||||
|
||||
# Upgrade pip to latest version
|
||||
echo "Upgrading pip to latest version..."
|
||||
pip install --upgrade pip
|
||||
pip --version
|
||||
|
||||
echo "Installing PyTorch 2.9..."
|
||||
|
||||
# Install from release channel only
|
||||
PYTORCH_VERSION="2.9.0"
|
||||
|
||||
# Extract CUDA version from BUILD_ENVIRONMENT (e.g., "cuda12.1" -> "cu121")
|
||||
if [[ "$BUILD_ENVIRONMENT" =~ cuda([0-9]+)\.([0-9]+) ]]; then
|
||||
CUDA_MAJOR="${BASH_REMATCH[1]}"
|
||||
CUDA_MINOR="${BASH_REMATCH[2]}"
|
||||
CUDA_VERSION="cu${CUDA_MAJOR}${CUDA_MINOR}"
|
||||
echo " Detected CUDA ${CUDA_MAJOR}.${CUDA_MINOR} from BUILD_ENVIRONMENT, using ${CUDA_VERSION}"
|
||||
else
|
||||
# Default to CPU build
|
||||
CUDA_VERSION="cpu"
|
||||
echo " No CUDA detected in BUILD_ENVIRONMENT, using CPU build"
|
||||
fi
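The CUDA-version extraction above can be exercised on its own; a small sketch with a hypothetical BUILD_ENVIRONMENT value:

```bash
BUILD_ENVIRONMENT="linux-jammy-cuda12.1-py3.10-gcc11-build"   # hypothetical value
if [[ "$BUILD_ENVIRONMENT" =~ cuda([0-9]+)\.([0-9]+) ]]; then
    echo "cu${BASH_REMATCH[1]}${BASH_REMATCH[2]}"   # prints: cu121
fi
```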
|
||||
|
||||
if pip install torch=="${PYTORCH_VERSION}" --index-url https://download.pytorch.org/whl/${CUDA_VERSION}/; then
|
||||
echo "Installed PyTorch ${PYTORCH_VERSION} from release channel (${CUDA_VERSION})"
|
||||
else
|
||||
echo " FAILED to install PyTorch 2.9.0 from release channel"
|
||||
echo " URL: https://download.pytorch.org/whl/${CUDA_VERSION}/"
|
||||
deactivate
|
||||
rm -rf venv_pytorch_2_9
|
||||
return 1
|
||||
fi
|
||||
|
||||
INSTALLED_VERSION=$(python -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown")
|
||||
echo " Installed version: $INSTALLED_VERSION"
|
||||
|
||||
# Install test dependencies
|
||||
echo "Installing test dependencies..."
|
||||
pip install expecttest numpy unittest-xml-reporting
|
||||
|
||||
# Install the pre-built wheel
|
||||
echo ""
|
||||
echo "Installing pre-built 2.9 extension wheel (built with PyTorch 2.10)..."
|
||||
pip install "$WHEEL_FILE"
|
||||
echo "Installed $(basename "$WHEEL_FILE") into PyTorch 2.9 environment"
|
||||
|
||||
# Run tests with PyTorch 2.9 runtime (2.10 tests will be skipped automatically)
|
||||
echo ""
|
||||
echo "Running tests with PyTorch 2.9 runtime (using wheel built on PyTorch 2.10)..."
|
||||
if time python test/cpp_extensions/test_libtorch_agnostic.py -v; then
|
||||
echo ""
|
||||
echo " Wheel built with current torch and TORCH_TARGET_VERSION 2_9_0 works with PyTorch 2.9 runtime!"
|
||||
else
|
||||
echo "targeting test failed"
|
||||
deactivate
|
||||
rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
|
||||
return 1
|
||||
fi
|
||||
|
||||
deactivate
|
||||
rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
|
||||
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
test_jit_hooks() {
|
||||
echo "Testing jit hooks in cpp"
|
||||
HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
|
||||
@ -1722,6 +1813,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]];
|
||||
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
|
||||
test_forward_backward_compatibility
|
||||
# Do NOT add tests after bc check tests, see its comment.
|
||||
elif [[ "${TEST_CONFIG}" == *libtorch_agnostic_targetting* ]]; then
|
||||
test_libtorch_agnostic_targetting
|
||||
elif [[ "${TEST_CONFIG}" == *xla* ]]; then
|
||||
install_torchvision
|
||||
build_xla
|
||||
|
||||
7
.github/workflows/_binary-build-linux.yml
vendored
@ -260,8 +260,11 @@ jobs:
|
||||
"${DOCKER_IMAGE}"
|
||||
)
|
||||
docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
|
||||
# Unified build script for all architectures (x86_64, aarch64, s390x)
|
||||
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
|
||||
if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
|
||||
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
|
||||
else
|
||||
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
|
||||
fi
|
||||
|
||||
- name: Chown artifacts
|
||||
if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
|
||||
|
||||
@ -80,7 +80,7 @@ jobs:
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
build-environment: linux-jammy-zen-py3.10-gcc11-build
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -106,7 +106,7 @@ jobs:
|
||||
needs: inductor-build
|
||||
if: github.event.schedule == '0 7 * * *'
|
||||
with:
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
build-environment: linux-jammy-zen-py3.10-gcc11-build
|
||||
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
|
||||
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
|
||||
@ -122,7 +122,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: inductor-build
|
||||
with:
|
||||
build-environment: linux-jammy-py3.10-gcc11-build
|
||||
build-environment: linux-jammy-zen-py3.10-gcc11-build
|
||||
dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }}
|
||||
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
|
||||
|
||||
1
.github/workflows/pull.yml
vendored
@ -70,6 +70,7 @@ jobs:
|
||||
{ config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
1
.github/workflows/trunk.yml
vendored
@ -83,6 +83,7 @@ jobs:
|
||||
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
|
||||
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
|
||||
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
|
||||
{ config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
3
.gitmodules
vendored
@ -132,3 +132,6 @@
|
||||
[submodule "third_party/aiter"]
|
||||
path = third_party/aiter
|
||||
url = https://github.com/ROCm/aiter.git
|
||||
[submodule "third_party/ZenDNN"]
|
||||
path = third_party/ZenDNN
|
||||
url = https://github.com/amd/ZenDNN.git
|
||||
|
||||
@ -82,6 +82,7 @@ include_patterns = [
|
||||
'aten/src/ATen/native/mkldnn/xpu/**/*.cpp',
|
||||
'aten/src/ATen/native/Tensor*.h',
|
||||
'aten/src/ATen/native/Tensor*.cpp',
|
||||
'aten/src/ATen/native/zendnn/*.*',
|
||||
'c10/**/*.h',
|
||||
'c10/**/*.cpp',
|
||||
'torch/csrc/**/*.h',
|
||||
|
||||
@ -205,6 +205,11 @@ filegroup(
|
||||
srcs = glob(["aten/src/ATen/native/xnnpack/*.cpp"]),
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "aten_native_zendnn_cpp",
|
||||
srcs = glob(["aten/src/ATen/native/zendnn/*.cpp"]),
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "aten_base_vulkan",
|
||||
srcs = glob(["aten/src/ATen/vulkan/*.cpp"]),
|
||||
@ -285,6 +290,7 @@ header_template_rule(
|
||||
"@AT_BLAS_USE_CBLAS_DOT@": "1",
|
||||
"@AT_KLEIDIAI_ENABLED@": "0",
|
||||
"@AT_USE_EIGEN_SPARSE@": "0",
|
||||
"@AT_ZENDNN_ENABLED@": "0",
|
||||
},
|
||||
)
|
||||
|
||||
@ -365,6 +371,7 @@ cc_library(
|
||||
":aten_native_sparse_cpp",
|
||||
":aten_native_transformers_cpp",
|
||||
":aten_native_xnnpack",
|
||||
":aten_native_zendnn_cpp",
|
||||
":aten_src_ATen_config",
|
||||
] + generated_cpu_cpp + aten_ufunc_generated_cpu_sources("aten/src/ATen/{}"),
|
||||
copts = ATEN_COPTS,
|
||||
|
||||
@ -336,6 +336,21 @@ set(MKLDNN_ENABLE_CONCURRENT_EXEC ${USE_MKLDNN})
|
||||
cmake_dependent_option(USE_MKLDNN_CBLAS "Use CBLAS in MKLDNN" OFF "USE_MKLDNN"
|
||||
OFF)
|
||||
option(USE_STATIC_MKL "Prefer to link with MKL statically (Unix only)" OFF)
|
||||
|
||||
# currently ZenDNN is kept off and enabled only through user setting on X86_64/AMD64
|
||||
option(USE_ZENDNN
|
||||
"Build with ZENDNN support"
|
||||
OFF)
|
||||
if(USE_ZENDNN AND NOT CPU_INTEL)
|
||||
message(WARNING
|
||||
"USE_ZENDNN was requested, but the target processor "
|
||||
"(${CMAKE_SYSTEM_PROCESSOR}) is not AMD64/x86_64. "
|
||||
"ZENDNN support will be disabled.")
|
||||
|
||||
# Switch it off in the cache so the GUI / subsequent runs see the change
|
||||
set(USE_ZENDNN OFF CACHE BOOL "Build with ZENDNN support" FORCE)
|
||||
endif()
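Because the option defaults to OFF, a ZenDNN-enabled build has to opt in explicitly; a hedged sketch, assuming the usual convention that USE_* environment variables are forwarded to CMake by PyTorch's build scripts:

```bash
# Opt into ZenDNN on an AMD64/x86_64 host; on other processors the check above
# forces USE_ZENDNN back to OFF with a warning.
USE_ZENDNN=1 python setup.py develop
```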
|
||||
|
||||
cmake_dependent_option(
|
||||
USE_MPI "Use MPI for Caffe2. Only available if USE_DISTRIBUTED is on." ON
|
||||
"USE_DISTRIBUTED" OFF)
|
||||
@ -1385,6 +1400,7 @@ if(BUILD_SHARED_LIBS)
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/gflags.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/mkl.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/mkldnn.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/zendnn.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/protobuf.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/utils.cmake
|
||||
${PROJECT_SOURCE_DIR}/cmake/public/LoadHIP.cmake
|
||||
|
||||
@ -93,6 +93,7 @@ file(GLOB mkldnn_xpu_cpp "native/mkldnn/xpu/*.cpp" "native/mkldnn/xpu/detail/*.c
|
||||
file(GLOB native_cpp "native/*.cpp")
|
||||
file(GLOB native_mkl_cpp "native/mkl/*.cpp")
|
||||
file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp")
|
||||
file(GLOB native_zendnn_cpp "native/zendnn/*.cpp")
|
||||
file(GLOB vulkan_cpp "vulkan/*.cpp")
|
||||
file(GLOB native_vulkan_cpp "native/vulkan/*.cpp" "native/vulkan/api/*.cpp" "native/vulkan/impl/*.cpp" "native/vulkan/ops/*.cpp")
|
||||
|
||||
@ -378,7 +379,7 @@ if(BUILD_LITE_INTERPRETER)
|
||||
append_filelist("aten_native_source_non_codegen_list" all_cpu_cpp)
|
||||
else()
|
||||
set(
|
||||
all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp}
|
||||
all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_zendnn_cpp}
|
||||
${native_ao_sparse_cpp} ${native_sparse_cpp} ${native_nested_cpp}
|
||||
${native_quantized_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp}
|
||||
${native_transformers_cpp}
|
||||
|
||||
@ -21,3 +21,4 @@
|
||||
#define AT_BLAS_USE_CBLAS_DOT() @AT_BLAS_USE_CBLAS_DOT@
|
||||
#define AT_KLEIDIAI_ENABLED() @AT_KLEIDIAI_ENABLED@
|
||||
#define AT_USE_EIGEN_SPARSE() @AT_USE_EIGEN_SPARSE@
|
||||
#define AT_ZENDNN_ENABLED() @AT_ZENDNN_ENABLED@
|
||||
|
||||
@ -664,6 +664,14 @@ bool Context::hasEigenSparse() {
|
||||
#endif
|
||||
}
|
||||
|
||||
bool Context::hasZenDNN() {
|
||||
#if AT_ZENDNN_ENABLED()
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
at::QEngine Context::qEngine() const {
|
||||
static auto _quantized_engine = []() {
|
||||
at::QEngine qengine = at::kNoQEngine;
|
||||
|
||||
@ -150,6 +150,7 @@ class TORCH_API Context {
|
||||
static bool hasMKL();
|
||||
static bool hasKleidiAI();
|
||||
static bool hasLAPACK();
|
||||
static bool hasZenDNN();
|
||||
static bool hasMKLDNN();
|
||||
static bool ckSupported();
|
||||
static bool hasEigenSparse();
|
||||
@ -639,6 +640,10 @@ inline bool hasEigenSparse() {
|
||||
return globalContext().hasEigenSparse();
|
||||
}
|
||||
|
||||
inline bool hasZenDNN() {
|
||||
return globalContext().hasZenDNN();
|
||||
}
|
||||
|
||||
inline bool hasMAGMA() {
|
||||
return globalContext().hasMAGMA();
|
||||
}
|
||||
|
||||
@ -130,4 +130,29 @@ uint32_t L2_cache_size() {
|
||||
return get_cache_size(2);
|
||||
}
|
||||
|
||||
bool is_amd_cpu() {
|
||||
#if !defined(__s390x__) && !defined(__powerpc__)
|
||||
auto check_amd_vendor = []() -> bool {
|
||||
if (!cpuinfo_initialize()) {
|
||||
return false;
|
||||
}
|
||||
const uint32_t num_cores = cpuinfo_get_cores_count();
|
||||
if (num_cores <= 0) {
|
||||
return false;
|
||||
}
|
||||
// Get first core information
|
||||
const struct cpuinfo_core* core = cpuinfo_get_core(0);
|
||||
if (!core) {
|
||||
return false;
|
||||
}
|
||||
// Check AMD vendor support
|
||||
return (core->vendor == cpuinfo_vendor_amd);
|
||||
};
|
||||
static bool is_amd = check_amd_vendor();
|
||||
return is_amd;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace at::cpu
|
||||
|
||||
@ -9,6 +9,9 @@ namespace at::cpu {
|
||||
TORCH_API bool is_avx2_supported();
|
||||
TORCH_API bool is_avx512_supported();
|
||||
|
||||
// Detect if CPU is AMD Zen4 or newer.
|
||||
TORCH_API bool is_amd_cpu();
|
||||
|
||||
// Detect if CPU support Vector Neural Network Instruction.
|
||||
TORCH_API bool is_avx512_vnni_supported();
|
||||
|
||||
@ -30,4 +33,7 @@ TORCH_API uint32_t L1d_cache_size();
|
||||
// Get the L2 cache size per core in Byte
|
||||
TORCH_API uint32_t L2_cache_size();
|
||||
|
||||
// Detect if CPU is AMD.
|
||||
TORCH_API bool is_amd_cpu();
|
||||
|
||||
} // namespace at::cpu
|
||||
|
||||
@ -20,6 +20,7 @@
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <ATen/native/mkldnn/Matmul.h>
|
||||
#include <ATen/native/mkldnn/Utils.h>
|
||||
#include <ATen/native/zendnn/Matmul.h>
|
||||
#include <ATen/cpu/Utils.h>
|
||||
#include <c10/core/GradMode.h>
|
||||
#include <c10/util/accumulate.h>
|
||||
@ -1396,6 +1397,7 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) {
|
||||
return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void addmm_impl_cpu_(
|
||||
Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) {
|
||||
TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2);
|
||||
@ -1728,7 +1730,6 @@ static void baddbmm_with_gemm_(const Tensor &result, const Tensor &mat1, const T
|
||||
result.data_ptr<scalar_t>(), ldc, result_strides[0]);
|
||||
});
|
||||
}
|
||||
|
||||
// This tries to apply some optimizations to bmm/baddbmm:
|
||||
// - When the operand size is small, computation are parallelized over the batch
|
||||
// dimension using OMP and naive matrix multiplication is applied.
|
||||
@ -1751,6 +1752,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
|
||||
int64_t res_rows = batch1_sizes[1];
|
||||
int64_t res_cols = batch2_sizes[2];
|
||||
|
||||
|
||||
// handle pathological cases that blas may not like
|
||||
if (self_or_result.numel() == 0) {
|
||||
return;
|
||||
@ -1771,6 +1773,19 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
|
||||
return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) ||
|
||||
(strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1]));
|
||||
};
|
||||
|
||||
#if AT_ZENDNN_ENABLED()
|
||||
if(at::cpu::is_amd_cpu()
|
||||
&& at::cpu::is_avx512_supported()
|
||||
&& self_or_result.scalar_type() == kBFloat16
|
||||
&& self_or_result.is_contiguous()
|
||||
&& self_or_result.sizes()[0] > 1)
|
||||
{
|
||||
zendnn_baddbmm(self_or_result, batch1, batch2, beta.to<float>(), alpha.to<float>());
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
|
||||
// Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL
|
||||
bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]);
|
||||
|
||||
@ -17,6 +17,15 @@
|
||||
#else
|
||||
#include <ATen/ops/empty.h>
|
||||
#endif
|
||||
|
||||
#if AT_ZENDNN_ENABLED()
|
||||
#include <zendnnl.hpp>
|
||||
#include <ATen/cpu/Utils.h>
|
||||
#include <ATen/native/zendnn/ZenDNN_utils.hpp>
|
||||
|
||||
|
||||
using namespace zendnnl::lowoha;
|
||||
#endif
|
||||
namespace at::native {
|
||||
|
||||
namespace {
|
||||
@ -440,7 +449,29 @@ void cpu_flash_attention(
|
||||
accum_t* buf_data = buf.data_ptr<accum_t>();
|
||||
scalar_t* buf_reduced_data = is_reduced_type ? buf_reduced.data_ptr<scalar_t>() : nullptr;
|
||||
|
||||
// Buffer to store padding query and packing key/value
|
||||
bool enable_zen_matmul = false;
|
||||
|
||||
#if AT_ZENDNN_ENABLED()
|
||||
enable_zen_matmul = at::cpu::is_amd_cpu()
|
||||
&& at::cpu::is_avx512_supported()
|
||||
&& output.scalar_type() == kBFloat16;
|
||||
data_type_t out_type = get_zendnn_dtype(buf);
|
||||
data_type_t inp_dtype = get_zendnn_dtype(query);
|
||||
data_type_t wgt_dtype = get_zendnn_dtype(key);
|
||||
|
||||
data_types matmul_dtype;
|
||||
matmul_dtype.src = inp_dtype;
|
||||
matmul_dtype.wei = wgt_dtype;
|
||||
matmul_dtype.dst = out_type;
|
||||
matmul_dtype.bias = data_type_t::none;
|
||||
matmul_dtype.compute = data_type_t::none;
|
||||
|
||||
lowoha_params params;
|
||||
params.dtypes = matmul_dtype;
|
||||
params.lowoha_algo= matmul_algo_t::libxsmm;
|
||||
#endif
|
||||
|
||||
// Buffer to store padding query and packing key/value
|
||||
scalar_t* key_reorder_ptr = nullptr;
|
||||
scalar_t* value_reorder_ptr = nullptr;
|
||||
scalar_t* query_padding_ptr = nullptr;
|
||||
@ -575,22 +606,55 @@ void cpu_flash_attention(
|
||||
qk_data);
|
||||
}
|
||||
} else {
|
||||
cpublas::gemm(
|
||||
TransposeType::Transpose,
|
||||
TransposeType::NoTranspose,
|
||||
kvBlockSize,
|
||||
qBlockSize,
|
||||
headSize,
|
||||
static_cast<accum_t>(1),
|
||||
k_data + i * kStrideB + kv_j * kStrideH +
|
||||
n * kStrideN,
|
||||
kStrideN,
|
||||
q_data + i * qStrideB + j * qStrideH +
|
||||
|
||||
if(enable_zen_matmul)
|
||||
{
|
||||
#if AT_ZENDNN_ENABLED()
|
||||
// Limit OpenMP nesting to prevent over-subscription and
|
||||
// ensure optimal thread performance
|
||||
omp_set_max_active_levels(1);
|
||||
zendnnl::lowoha::matmul_direct(
|
||||
'r', // row major
|
||||
false, // transA
|
||||
true, // transB
|
||||
qBlockSize,
|
||||
kvBlockSize,
|
||||
headSize,
|
||||
static_cast<accum_t>(1), // alpha
|
||||
q_data + i * qStrideB + j * qStrideH +
|
||||
m * qStrideM,
|
||||
qStrideM,
|
||||
static_cast<accum_t>(0),
|
||||
qk_data,
|
||||
kvBlockSize);
|
||||
qStrideM,
|
||||
k_data + i * kStrideB + kv_j * kStrideH +
|
||||
n * kStrideN,
|
||||
kStrideN,
|
||||
nullptr,
|
||||
static_cast<accum_t>(0), // beta
|
||||
qk_data,
|
||||
kvBlockSize,
|
||||
params,
|
||||
1, // batch size 1
|
||||
1); // batch size 1
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
cpublas::gemm(
|
||||
TransposeType::Transpose,
|
||||
TransposeType::NoTranspose,
|
||||
kvBlockSize,
|
||||
qBlockSize,
|
||||
headSize,
|
||||
static_cast<accum_t>(1),
|
||||
k_data + i * kStrideB + kv_j * kStrideH +
|
||||
n * kStrideN,
|
||||
kStrideN,
|
||||
q_data + i * qStrideB + j * qStrideH +
|
||||
m * qStrideM,
|
||||
qStrideM,
|
||||
static_cast<accum_t>(0),
|
||||
qk_data,
|
||||
kvBlockSize);
|
||||
}
|
||||
}
|
||||
// Apply causal mask, fill unused with -inf
|
||||
if (is_causal && num_keys - n <= kvSplitSize) {
|
||||
@ -706,21 +770,52 @@ void cpu_flash_attention(
|
||||
dst_data);
|
||||
}
|
||||
} else {
|
||||
cpublas::gemm(
|
||||
TransposeType::NoTranspose,
|
||||
TransposeType::NoTranspose,
|
||||
headSize,
|
||||
qBlockSize,
|
||||
kvBlockSize,
|
||||
static_cast<accum_t>(1),
|
||||
v_data + i * vStrideB + kv_j * vStrideH +
|
||||
n * vStrideN,
|
||||
vStrideN,
|
||||
conditional_data_ptr(qk_data, qk_reduced_data),
|
||||
kvBlockSize,
|
||||
n == 0 ? static_cast<accum_t>(0) : static_cast<accum_t>(1),
|
||||
dst_data,
|
||||
headSize);
|
||||
if(enable_zen_matmul)
|
||||
{
|
||||
#if AT_ZENDNN_ENABLED()
|
||||
// Limit OpenMP nesting to prevent over-subscription and
|
||||
// ensure optimal thread performance
|
||||
omp_set_max_active_levels(1);
|
||||
zendnnl::lowoha::matmul_direct(
|
||||
'r', // row major
|
||||
false, // transA
|
||||
false, // transB
|
||||
qBlockSize,
|
||||
headSize,
|
||||
kvBlockSize,
|
||||
static_cast<accum_t>(1), // alpha
|
||||
conditional_data_ptr(qk_data, qk_reduced_data),
|
||||
kvBlockSize,
|
||||
v_data + i * vStrideB + kv_j * vStrideH +
|
||||
n * vStrideN,
|
||||
vStrideN,
|
||||
nullptr,
|
||||
n == 0 ? static_cast<accum_t>(0) : static_cast<accum_t>(1), // beta
|
||||
dst_data,
|
||||
headSize,
|
||||
params,
|
||||
1, // batch size 1
|
||||
1); // batch size 1
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
cpublas::gemm(
|
||||
TransposeType::NoTranspose,
|
||||
TransposeType::NoTranspose,
|
||||
headSize,
|
||||
qBlockSize,
|
||||
kvBlockSize,
|
||||
static_cast<accum_t>(1),
|
||||
v_data + i * vStrideB + kv_j * vStrideH +
|
||||
n * vStrideN,
|
||||
vStrideN,
|
||||
conditional_data_ptr(qk_data, qk_reduced_data),
|
||||
kvBlockSize,
|
||||
n == 0 ? static_cast<accum_t>(0) : static_cast<accum_t>(1),
|
||||
dst_data,
|
||||
headSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
|
||||
#include <ATen/native/xpu/Blas.h>
|
||||
#include <ATen/xpu/XPUScaledBlas.h>
|
||||
#include <torch/library.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
@ -339,4 +340,399 @@ Tensor _scaled_mm_xpu(
|
||||
out);
|
||||
}
|
||||
|
||||
using acceptance_fn = std::function<bool(
|
||||
c10::ScalarType,
|
||||
std::vector<ScalingType>&,
|
||||
ArrayRef<Tensor>&,
|
||||
c10::ScalarType,
|
||||
std::vector<ScalingType>&,
|
||||
ArrayRef<Tensor>&)>;
|
||||
using namespace std::placeholders;
|
||||
|
||||
namespace scaled_blas = at::native::onednn::scaled;
|
||||
using scaled_blas::convert_int_to_enum;
|
||||
using scaled_blas::ScaledGemmImplementation;
|
||||
|
||||
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2>
|
||||
scale_kernel_dispatch = {{
|
||||
{"tensorwise_tensorwise",
|
||||
scaled_blas::check_tensorwise_recipe,
|
||||
ScaledGemmImplementation::TENSORWISE_TENSORWISE},
|
||||
{"rowwise_rowwise",
|
||||
scaled_blas::check_rowwise_recipe,
|
||||
ScaledGemmImplementation::ROWWISE_ROWWISE},
|
||||
|
||||
}};
|
||||
|
||||
Tensor& _scaled_tensorwise_tensorwise(
|
||||
const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
const Tensor& scale_a,
|
||||
const Tensor& scale_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const c10::ScalarType out_dtype,
|
||||
bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
// Restrictions:
|
||||
// A, B are FP8, scales are fp32
|
||||
|
||||
TORCH_CHECK_VALUE(
|
||||
isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
|
||||
"mat_a and mat_b must be fp8 types, got: ",
|
||||
mat_a.scalar_type(),
|
||||
mat_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_a.numel() == 1 && scale_a.scalar_type() == kFloat,
|
||||
"scale_a must have 1 Float element")
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_b.numel() == 1 && scale_b.scalar_type() == kFloat,
|
||||
"scale_b must have 1 Float element")
|
||||
|
||||
auto scaling_choice_a = ScalingType::TensorWise;
|
||||
auto scaling_choice_b = ScalingType::TensorWise;
|
||||
|
||||
_scaled_gemm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a,
|
||||
scale_b,
|
||||
scaling_choice_a,
|
||||
scaling_choice_b,
|
||||
bias,
|
||||
use_fast_accum,
|
||||
out);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
Tensor& _scaled_rowwise_rowwise(
|
||||
const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
const Tensor& scale_a,
|
||||
const Tensor& scale_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const c10::ScalarType out_dtype,
|
||||
bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
// Restrictions:
|
||||
// A, B are FP8, scales are fp32, shape M/N for A/B
|
||||
TORCH_CHECK_VALUE(
|
||||
isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
|
||||
"mat_a and mat_b must be fp8 types, got: ",
|
||||
mat_a.scalar_type(),
|
||||
mat_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1,
|
||||
"scale_a must have shape [",
|
||||
mat_a.size(0),
|
||||
", 1], got [",
|
||||
scale_a.sizes(),
|
||||
"]");
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat,
|
||||
"scale_a must have ",
|
||||
mat_a.size(0),
|
||||
" Float elements, got ",
|
||||
scale_a.numel())
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat,
|
||||
"scale_b must have ",
|
||||
mat_b.size(1),
|
||||
" Float elements, got ",
|
||||
scale_b.numel())
|
||||
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_a.stride(1) == 1,
|
||||
"expected scale_a.stride(1) to be 1, but got ",
|
||||
scale_a.stride(1));
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_b.stride(1) == 1,
|
||||
"expected scale_b.stride(1) to be 1, but got ",
|
||||
scale_b.stride(1));
|
||||
|
||||
auto scaling_choice_a = ScalingType::RowWise;
|
||||
auto scaling_choice_b = ScalingType::RowWise;
|
||||
|
||||
_scaled_gemm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a,
|
||||
scale_b,
|
||||
scaling_choice_a,
|
||||
scaling_choice_b,
|
||||
bias,
|
||||
use_fast_accum,
|
||||
out);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
// V2: Computes matrix multiply + bias while applying scaling to input and
|
||||
// output matrices Scales are only applicable when matrices are of Float8 type
|
||||
// and assumed to be equal to 1.0 by default. If output matrix type is 16 or
|
||||
// 32-bit type, scale_result is not applied. Known limitations:
|
||||
// - Only works if mat1 is row-major and mat2 is column-major
|
||||
// - Only works if matrices sizes are divisible by 32
|
||||
// - If 1-dimensional tensors are used then scale_a should be size =
|
||||
// mat1.size(0)
|
||||
// and scale_b should have size = to mat2.size(1)
|
||||
// Arguments:
|
||||
// - `mat_a`: the first operand of the matrix multiply, can be type
|
||||
// `torch.float8_e4m3fn` or `torch.float8_e5m2`
|
||||
// - `mat_b`: the second operand of the matrix multiply, can be type
|
||||
// `torch.float8_e4m3fn` or `torch.float8_e5m2`
|
||||
// - `scale_a`: a tensor with the inverse scale of `mat1`, whose
|
||||
// shape/strides/dtype depend on the scaling scheme
|
||||
// - `scale_recipe_a`: An integer corresponding to an enum describing the
|
||||
// scaling scheme used for `scale_a`
|
||||
// - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing
|
||||
// the swizzling scheme for `scale_a`.
|
||||
// Not supported for XPU for now.
|
||||
// - `scale_b`: a tensor with the inverse scale of `mat2`, whose
|
||||
// shape/strides/dtype depend on the scaling scheme
|
||||
// - `scale_recipe_b`: An integer corresponding to an enum describing the
|
||||
// scaling scheme used for `scale_b`
|
||||
// - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing
|
||||
// the swizzling scheme for `scale_b`.
|
||||
// Not supported for XPU for now.
|
||||
// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
|
||||
// - `out_dtype`: the output dtype, can either be a float8 or a higher
|
||||
// precision floating point type
|
||||
// - `contraction_dim`: describe which dimensions are `K` in the matmul.
|
||||
// Not supported for XPU. Should always be empty.
|
||||
// - `use_fast_accum`: Not supported for XPU, should always be false.
|
||||
// - `out`: a reference to the output tensor
|
||||
Tensor& _scaled_mm_xpu_v2_out(
|
||||
const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
ArrayRef<Tensor> scale_a,
|
||||
IntArrayRef scale_recipe_a,
|
||||
IntArrayRef swizzle_a,
|
||||
ArrayRef<Tensor> scale_b,
|
||||
IntArrayRef scale_recipe_b,
|
||||
IntArrayRef swizzle_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const std::optional<c10::ScalarType> out_dtype,
|
||||
IntArrayRef contraction_dim,
|
||||
bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix");
|
||||
TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix");
|
||||
|
||||
// If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm
|
||||
// kernels do not support this case).
|
||||
if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) {
|
||||
// `out` was created with `at::empty`. In the case where we are multiplying
|
||||
// MxK by KxN and K is the zero dim, we need to initialize here to properly
|
||||
// return a tensor of zeros.
|
||||
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
|
||||
if (mat_a.size(1) == 0) {
|
||||
out.zero_();
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
// Note: The `contraction_dim` is not actually used for now. We will need to
|
||||
// align this code when upstreamed CUDA code is done. Currently, only keeps
|
||||
// the code here for check.
|
||||
|
||||
// Check if the input matrix sizes can be multiplied
|
||||
// - if optional contraction dims are provided, use those
|
||||
// -- mostly for < 1B formats (i.e. nvfp4x2) where cheap .t() is not
|
||||
// available.
|
||||
if (contraction_dim.size() > 0) {
|
||||
TORCH_CHECK_VALUE(
|
||||
contraction_dim.size() == 2,
|
||||
"contraction_dim must have exactly 2 elements");
|
||||
auto mat_a_dim = contraction_dim[0];
|
||||
auto mat_b_dim = contraction_dim[1];
|
||||
TORCH_CHECK_VALUE(
|
||||
mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim),
|
||||
"mat_a and mat_b shapes cannot be multiplied (",
|
||||
mat_a.size(0),
|
||||
"x",
|
||||
mat_a.size(1),
|
||||
" and ",
|
||||
mat_b.size(0),
|
||||
"x",
|
||||
mat_b.size(1),
|
||||
") ",
|
||||
"with contraction dims mat_a: ",
|
||||
mat_a_dim,
|
||||
", mat_b: ",
|
||||
mat_b_dim);
|
||||
} else {
|
||||
TORCH_CHECK_VALUE(
|
||||
mat_a.size(1) == mat_b.size(0),
|
||||
"mat_a and mat_b shapes cannot be multiplied (",
|
||||
mat_a.size(0),
|
||||
"x",
|
||||
mat_a.size(1),
|
||||
" and ",
|
||||
mat_b.size(0),
|
||||
"x",
|
||||
mat_b.size(1),
|
||||
")");
|
||||
}
|
||||
|
||||
TORCH_CHECK_VALUE(
|
||||
!bias || bias->numel() == mat_b.sizes()[1],
|
||||
"Bias must be size ",
|
||||
mat_b.sizes()[1],
|
||||
" but got ",
|
||||
bias->numel());
|
||||
|
||||
TORCH_CHECK_VALUE(
|
||||
!out_dtype || *out_dtype == out.scalar_type(),
|
||||
"out_dtype must match output matrix type");
|
||||
|
||||
if (bias) {
|
||||
TORCH_CHECK_VALUE(
|
||||
bias->scalar_type() == kFloat ||
|
||||
bias->scalar_type() == c10::ScalarType::BFloat16 ||
|
||||
bias->scalar_type() == c10::ScalarType::Half,
|
||||
"Bias must be Float32 or BFloat16 or Half, but got ",
|
||||
bias->scalar_type());
|
||||
}
|
||||
{
|
||||
auto bias_ = bias.value_or(Tensor());
|
||||
// NOLINTNEXTLINE(*c-array*)
|
||||
TensorArg targs[]{
|
||||
{out, "out", 0},
|
||||
{mat_a, "mat_a", 1},
|
||||
{mat_b, "mat_b", 2},
|
||||
{bias_, "bias", 3},
|
||||
{scale_a[0], "scale_a", 4},
|
||||
{scale_b[0], "scale_b", 5}};
|
||||
checkAllSameGPU(__func__, targs);
|
||||
}
|
||||
// Align with CUDA's default out to be bf16
|
||||
auto out_dtype_ = out_dtype.value_or(c10::ScalarType::BFloat16);
|
||||
|
||||
// Conversion of implicitly-defined enums to explicit
|
||||
auto scale_recipe_a_enum = convert_int_to_enum<ScalingType>(scale_recipe_a);
|
||||
auto swizzle_a_enum = convert_int_to_enum<SwizzleType>(swizzle_a);
|
||||
auto scale_recipe_b_enum = convert_int_to_enum<ScalingType>(scale_recipe_b);
|
||||
auto swizzle_b_enum = convert_int_to_enum<SwizzleType>(swizzle_b);
|
||||
|
||||
// XPU does not support swizzle for now. So directly return false.
|
||||
TORCH_CHECK_VALUE(
|
||||
swizzle_a_enum[0] == at::blas::SwizzleType::NO_SWIZZLE &&
|
||||
swizzle_b_enum[0] == at::blas::SwizzleType::NO_SWIZZLE,
|
||||
"XPU does not support swizzle yet.");
|
||||
|
||||
// at this point we can start working out what we want to be doing
|
||||
// Try to do as few steps as possible.
|
||||
// NOTE: support is deliberately sparse, can explicitly enumerate all
|
||||
// combinations allowed. Do this via a list of defined (name, acceptance,
|
||||
// concrete_impl) tuples.
|
||||
bool found_impl = false;
|
||||
ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE;
|
||||
|
||||
for (const auto& fn_entry : scale_kernel_dispatch) {
|
||||
const auto [name, accept_fn, scaled_gemm_impl] = fn_entry;
|
||||
bool ok = accept_fn(
|
||||
mat_a.scalar_type(),
|
||||
scale_recipe_a_enum,
|
||||
scale_a,
|
||||
mat_b.scalar_type(),
|
||||
scale_recipe_b_enum,
|
||||
scale_b);
|
||||
if (ok) {
|
||||
gemm_impl = scaled_gemm_impl;
|
||||
found_impl = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
TORCH_CHECK_VALUE(
|
||||
found_impl,
|
||||
"Invalid scaling configuration.\n"
|
||||
"- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n"
|
||||
"- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (",
|
||||
mat_a.size(0),
|
||||
", 1) and scale_b should be (1, ",
|
||||
mat_b.size(1),
|
||||
"), and both should be contiguous.\n"
|
||||
"Got mat_a.dtype()=",
|
||||
mat_a.scalar_type(),
|
||||
", scale_a[0].dtype()=",
|
||||
scale_a[0].scalar_type(),
|
||||
", scale_a[0].size()=",
|
||||
scale_a[0].sizes(),
|
||||
", scale_a[0].stride()=",
|
||||
scale_a[0].strides(),
|
||||
", ",
|
||||
"mat_b.dtype()=",
|
||||
mat_b.scalar_type(),
|
||||
", scale_b[0].dtype()=",
|
||||
scale_b[0].scalar_type(),
|
||||
", scale_b[0].size()=",
|
||||
scale_b[0].sizes(),
|
||||
" and scale_b[0].stride()=",
|
||||
scale_b[0].strides());
|
||||
|
||||
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
|
||||
|
||||
auto bias_ = bias.value_or(Tensor());
|
||||
|
||||
// dispatch to appropriate lower-level calls for error checking & execution
|
||||
if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) {
|
||||
return _scaled_tensorwise_tensorwise(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a[0],
|
||||
scale_b[0],
|
||||
bias,
|
||||
out_dtype_,
|
||||
use_fast_accum,
|
||||
out);
|
||||
} else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) {
|
||||
return _scaled_rowwise_rowwise(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a[0],
|
||||
scale_b[0],
|
||||
bias,
|
||||
out_dtype_,
|
||||
use_fast_accum,
|
||||
out);
|
||||
} else {
|
||||
TORCH_CHECK_VALUE(
|
||||
false, "Invalid state - found an implementation, but not really");
|
||||
}
|
||||
}
|
||||
|
||||
Tensor _scaled_mm_xpu_v2(
|
||||
const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
ArrayRef<Tensor> scale_a,
|
||||
IntArrayRef scale_recipe_a,
|
||||
IntArrayRef swizzle_a,
|
||||
ArrayRef<Tensor> scale_b,
|
||||
IntArrayRef scale_recipe_b,
|
||||
IntArrayRef swizzle_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const std::optional<c10::ScalarType> out_dtype,
|
||||
IntArrayRef contraction_dim,
|
||||
bool use_fast_accum) {
|
||||
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
|
||||
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
|
||||
|
||||
return _scaled_mm_xpu_v2_out(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a,
|
||||
scale_recipe_a,
|
||||
swizzle_a,
|
||||
scale_b,
|
||||
scale_recipe_b,
|
||||
swizzle_b,
|
||||
bias,
|
||||
out_dtype,
|
||||
contraction_dim,
|
||||
use_fast_accum,
|
||||
out);
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
@ -3403,6 +3403,14 @@
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: linear_out
|
||||
|
||||
- func: zendnn_linear_unary(Tensor input, Tensor weight, Tensor? bias=None, *, bool is_weight_prepacked=False, str post_op="none") -> Tensor
|
||||
dispatch:
|
||||
CPU: zendnn_linear_unary
|
||||
|
||||
- func: zendnn_weight_prepack_for_linear(Tensor weight) -> Tensor
|
||||
dispatch:
|
||||
CPU: zendnn_weight_prepack_for_linear
|
||||
|
||||
- func: mkldnn_linear(Tensor self, Tensor weight, Tensor? bias=None) -> Tensor
|
||||
python_module: nn
|
||||
dispatch:
|
||||
|
||||
96
aten/src/ATen/native/zendnn/Linear.cpp
Normal file
@ -0,0 +1,96 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/native/zendnn/Linear_utils.hpp>
|
||||
#include <string_view>
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/zendnn_linear_unary_native.h>
|
||||
#endif
|
||||
|
||||
#if !AT_ZENDNN_ENABLED()
|
||||
namespace at::native {
|
||||
at::Tensor zendnn_linear_unary(
|
||||
const at::Tensor& input,
|
||||
const at::Tensor& weight,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
bool is_weight_prepacked,
|
||||
std::string_view post_op) {
|
||||
TORCH_CHECK(
|
||||
false, "zendnn_linear_unary: ATen is not compiled with ZenDNN support");
|
||||
}
|
||||
} // namespace at::native
|
||||
|
||||
#else // !AT_ZENDNN_ENABLED()
|
||||
|
||||
namespace at::native {
|
||||
using namespace zendnnl::interface;
|
||||
|
||||
inline void zendnn_linear_impl(
|
||||
const at::Tensor& input,
|
||||
const at::Tensor& weight,
|
||||
const at::Tensor& bias,
|
||||
at::Tensor& result,
|
||||
bool is_weight_prepacked) {
|
||||
// Get appropriately processed tensors (2D input, transposed weight, 2D
|
||||
// result)
|
||||
check_args_for_linear(input, weight);
|
||||
data_type_t datatype = get_zendnn_dtype(input);
|
||||
auto input_2d = get_2d_view(input);
|
||||
auto weight_transposed = weight.t();
|
||||
auto result_2d = result.view(get_2d_size_for_tensor(result));
|
||||
check_tensor_dtypes_for_linear(input_2d, weight_transposed, bias, result_2d);
|
||||
check_tensor_sizes_for_linear(input_2d, weight_transposed, bias, result_2d);
|
||||
// declare linear tensors
|
||||
matmul_context_t matmul_context;
|
||||
tensor_t input_tensor, weight_tensor, output_tensor, bias_tensor;
|
||||
create_zendnn_tensor(input_2d, input_tensor, "matmul_input", datatype);
|
||||
create_zendnn_tensor(
|
||||
weight_transposed,
|
||||
weight_tensor,
|
||||
"weights",
|
||||
datatype,
|
||||
is_weight_prepacked);
|
||||
create_zendnn_tensor(result_2d, output_tensor, "matmul_output", datatype);
|
||||
if (bias.defined()) {
|
||||
// adds dimension at dim=0 -> [1, n]
|
||||
auto bias_unsqueezed = bias.unsqueeze(0);
|
||||
create_zendnn_tensor(bias_unsqueezed, bias_tensor, "bias", datatype);
|
||||
set_linear_context_attributes(matmul_context, weight_tensor, bias_tensor);
|
||||
} else {
|
||||
set_linear_context_attributes(matmul_context, weight_tensor);
|
||||
}
|
||||
matmul_context.create();
|
||||
// define matmul operator
|
||||
matmul_operator_t matmul_operator;
|
||||
matmul_operator.set_name("matmul_operator")
|
||||
.set_context(matmul_context)
|
||||
.create();
|
||||
TORCH_CHECK(
|
||||
matmul_operator.check(),
|
||||
"operator ",
|
||||
matmul_operator.get_name(),
|
||||
" creation failed.");
|
||||
matmul_operator.set_input("matmul_input", input_tensor)
|
||||
.set_output("matmul_output", output_tensor);
|
||||
matmul_operator.execute();
|
||||
}
|
||||
|
||||
at::Tensor zendnn_linear_unary(
|
||||
const at::Tensor& input,
|
||||
const at::Tensor& weight,
|
||||
const std::optional<at::Tensor>& bias,
|
||||
bool is_weight_prepacked,
|
||||
std::string_view post_op) {
|
||||
c10::MaybeOwned<at::Tensor> bias_maybe_owned =
|
||||
at::borrow_from_optional_tensor(bias);
|
||||
const at::Tensor& bias_t = *bias_maybe_owned;
|
||||
// Create output tensor with appropriate size and strides
|
||||
at::Tensor result = create_linear_output_tensor(input, weight);
|
||||
// Perform ZENDNN linear operation
|
||||
zendnn_linear_impl(input, weight, bias_t, result, is_weight_prepacked);
|
||||
return result;
|
||||
}
|
||||
} // namespace at::native
|
||||
|
||||
#endif // !AT_ZENDNN_ENABLED()
|
||||
aten/src/ATen/native/zendnn/Linear_utils.hpp (new file, 136 lines)
@@ -0,0 +1,136 @@
#pragma once
|
||||
#include <ATen/native/zendnn/ZenDNN_utils.hpp>
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/empty.h>
|
||||
#endif
|
||||
#include <c10/util/Logging.h>
|
||||
#include <cstdint>
|
||||
#include <functional> // For std::reference_wrapper, std::ref, std::cref
|
||||
#include <iostream>
|
||||
#include <optional> // For std::optional, std::nullopt
|
||||
#include <unordered_map>
|
||||
|
||||
#if AT_ZENDNN_ENABLED()
|
||||
namespace at::native {
|
||||
using namespace zendnnl::interface;
|
||||
|
||||
inline std::vector<int64_t> get_2d_size_for_tensor(
|
||||
const at::Tensor& inp_tensor) {
|
||||
const int64_t dim = inp_tensor.dim();
|
||||
std::vector<int64_t> output_size(2);
|
||||
output_size[0] = inp_tensor.numel() / inp_tensor.size(dim - 1);
|
||||
output_size[1] = inp_tensor.size(dim - 1);
|
||||
return output_size;
|
||||
}
|
||||
|
||||
inline at::Tensor get_2d_view(const at::Tensor& tensor) {
|
||||
auto stride = tensor.strides();
|
||||
if (!std::is_sorted(stride.begin(), stride.end(), std::greater<int64_t>())) {
|
||||
auto new_tensor = tensor.clone(at::MemoryFormat::Contiguous)
|
||||
.view(get_2d_size_for_tensor(tensor));
|
||||
return new_tensor;
|
||||
}
|
||||
return tensor.view(get_2d_size_for_tensor(tensor));
|
||||
}
|
||||
|
||||
inline std::vector<int64_t> compute_linear_output_sizes(
|
||||
const at::Tensor& input,
|
||||
const at::Tensor& weights) {
|
||||
auto input_size = input.sizes();
|
||||
std::vector<int64_t> output_size(input_size.begin(), input_size.end() - 1);
|
||||
auto weights_last_dim_size = weights.size(weights.dim() - 1);
|
||||
output_size.emplace_back(weights_last_dim_size);
|
||||
return output_size;
|
||||
}
|
||||
// Returns output strides for linear (input @ weights) and linear operations
|
||||
inline std::vector<int64_t> compute_linear_output_strides(
|
||||
const std::vector<int64_t>& output_size) {
|
||||
std::vector<int64_t> output_strides(output_size.size(), 1);
|
||||
for (int i = output_size.size() - 2; i >= 0; --i) {
|
||||
output_strides[i] = output_strides[i + 1] * output_size[i + 1];
|
||||
}
|
||||
return output_strides;
|
||||
}
|
||||
|
||||
inline at::Tensor create_linear_output_tensor(
|
||||
const at::Tensor input,
|
||||
const at::Tensor weight) {
|
||||
auto output_size = compute_linear_output_sizes(input, weight.t());
|
||||
auto output_strides = compute_linear_output_strides(output_size);
|
||||
at::Tensor result = at::detail::empty_strided_cpu(
|
||||
output_size, output_strides, input.options());
|
||||
return result.is_contiguous() ? result : result.contiguous();
|
||||
}
|
||||
|
||||
inline void check_args_for_linear(
|
||||
const at::Tensor& input,
|
||||
const at::Tensor& weights) {
|
||||
TORCH_CHECK(
|
||||
(input.dim() != 1 && weights.dim() != 1),
|
||||
"1d dims are not supported yet.");
|
||||
get_zendnn_dtype(input);
|
||||
}
|
||||
|
||||
inline void check_tensor_sizes_for_linear(
|
||||
const at::Tensor& input,
|
||||
const at::Tensor& weights,
|
||||
const at::Tensor& bias,
|
||||
const at::Tensor& result) {
|
||||
const int input_dim = input.dim();
|
||||
const int weights_dim = weights.dim();
|
||||
TORCH_CHECK(
|
||||
(input_dim == 2 && weights_dim == 2),
|
||||
"unsupported dims for input and weights");
|
||||
const auto input_sizes = input.sizes();
|
||||
const auto weights_sizes = weights.sizes();
|
||||
TORCH_CHECK(
|
||||
input_sizes[input_dim - 1] == weights_sizes[input_dim - 2],
|
||||
"Tensor shapes incompatible for linear");
|
||||
if (bias.defined()) {
|
||||
TORCH_CHECK(
|
||||
bias.dim() == 1 && bias.size(0) == weights_sizes[1],
|
||||
"bias shape incompatible with linear");
|
||||
}
|
||||
}
|
||||
|
||||
inline void check_tensor_dtypes_for_linear(
|
||||
const at::Tensor& input,
|
||||
const at::Tensor& weights,
|
||||
const at::Tensor& bias,
|
||||
const at::Tensor& result) {
|
||||
auto is_fp32 = [](const at::Tensor& t) {
|
||||
return t.scalar_type() == c10::ScalarType::Float;
|
||||
};
|
||||
auto is_bf16 = [](const at::Tensor& t) {
|
||||
return t.scalar_type() == c10::ScalarType::BFloat16;
|
||||
};
|
||||
bool all_fp32 = is_fp32(input) && is_fp32(weights) && is_fp32(result) &&
|
||||
(!bias.defined() || is_fp32(bias));
|
||||
bool all_bf16 = is_bf16(input) && is_bf16(weights) && is_bf16(result) &&
|
||||
(!bias.defined() || is_bf16(bias));
|
||||
TORCH_CHECK(
|
||||
all_fp32 ^ all_bf16,
|
||||
"All tensors must have consistent dtype and zendnn linear only supports Float and BFloat16");
|
||||
if (all_bf16) {
|
||||
TORCH_CHECK(
|
||||
zendnn_bf16_device_check(),
|
||||
"zendnn linear bf16 path needs cpu support avx512bf16");
|
||||
}
|
||||
}
|
||||
|
||||
inline void set_linear_context_attributes(
|
||||
matmul_context_t& matmul_context,
|
||||
tensor_t& weights,
|
||||
std::optional<std::reference_wrapper<tensor_t>> bias_opt_ref =
|
||||
std::nullopt) {
|
||||
matmul_context.set_param("weights", weights);
|
||||
if (bias_opt_ref.has_value()) {
|
||||
tensor_t& bias = bias_opt_ref->get();
|
||||
matmul_context.set_param("bias", bias);
|
||||
}
|
||||
}
|
||||
} // namespace at::native
|
||||
#endif // AT_ZENDNN_ENABLED()
|
||||
aten/src/ATen/native/zendnn/Matmul.cpp (new file, 104 lines)
@@ -0,0 +1,104 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/Config.h>
|
||||
#include <ATen/Context.h>
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/native/zendnn/Matmul.h>
|
||||
#include <ATen/native/zendnn/ZenDNN_utils.hpp>
|
||||
#include <ATen/record_function.h>
|
||||
|
||||
#if AT_ZENDNN_ENABLED()
|
||||
#include <zendnnl.hpp>
|
||||
namespace at::native {
|
||||
|
||||
using namespace zendnnl::lowoha;
|
||||
void zendnn_baddbmm(
|
||||
const Tensor& self,
|
||||
const Tensor& batch1,
|
||||
const Tensor& batch2,
|
||||
float beta,
|
||||
float alpha) {
|
||||
RECORD_FUNCTION(
|
||||
"zendnn::zendnn_baddbmm",
|
||||
std::vector<c10::IValue>({batch1, batch2, self}));
|
||||
|
||||
Tensor b1 = batch1;
|
||||
Tensor b2 = batch2;
|
||||
// Infer matrix dimensions from 3D inputs:
|
||||
// [B, M, K] x [B, K, N] -> [B, M, N]
|
||||
const int64_t M = b1.size(1);
|
||||
const int64_t N = b2.size(2);
|
||||
const int64_t K = b1.size(2);
|
||||
|
||||
// Check if a 3D tensor is transposed (transposed version of a contiguous
|
||||
// tensor) in the last two dimensions.
|
||||
// For a transposed tensor
|
||||
// [B, M, K] -> [B, K, M]:
|
||||
// - stride[0] should be M*K (batch stride unchanged)
|
||||
// - stride[1] should be 1 (innermost dimension after transpose)
|
||||
// - stride[2] should be M (step size for original rows, now columns)
|
||||
auto is_transposed = [](const Tensor& t) {
|
||||
const auto sizes = t.sizes();
|
||||
const auto strides = t.strides();
|
||||
return strides[0] == sizes[1] * sizes[2] && strides[1] == 1 &&
|
||||
strides[2] == sizes[1];
|
||||
};
|
||||
|
||||
// check if tensor is transposed
|
||||
bool transa = is_transposed(b1);
|
||||
bool transb = is_transposed(b2);
|
||||
|
||||
// make a copy of tensor when tensor is neither contiguous nor transposed
|
||||
b1 = (transa || b1.is_contiguous()) ? b1 : b1.contiguous();
|
||||
b2 = (transb || b2.is_contiguous()) ? b2 : b2.contiguous();
|
||||
|
||||
auto strideA = b1.strides();
|
||||
auto strideB = b2.strides();
|
||||
auto strideC = self.strides();
|
||||
|
||||
const int64_t lda = transa ? strideA[2] : strideA[1];
|
||||
const int64_t ldb = transb ? strideB[2] : strideB[1];
|
||||
const int64_t ldc = strideC[1];
|
||||
|
||||
data_type_t out_type = get_zendnn_dtype(self);
|
||||
data_type_t inp_dtype = get_zendnn_dtype(b1);
|
||||
data_type_t wgt_dtype = get_zendnn_dtype(b2);
|
||||
|
||||
TORCH_CHECK(
|
||||
(b1.scalar_type() == b2.scalar_type()),
|
||||
"zendnn_baddbmm: batch1 and batch2 data types should be same");
|
||||
|
||||
data_types matmul_dtype;
|
||||
matmul_dtype.src = inp_dtype;
|
||||
matmul_dtype.wei = wgt_dtype;
|
||||
matmul_dtype.dst = out_type;
|
||||
matmul_dtype.bias = data_type_t::none;
|
||||
matmul_dtype.compute = data_type_t::none;
|
||||
|
||||
lowoha_params params;
|
||||
params.dtypes = matmul_dtype;
|
||||
|
||||
// Execute batched matmul directly for LoA path
|
||||
matmul_direct(
|
||||
'r',
|
||||
transa,
|
||||
transb,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
alpha,
|
||||
b1.data_ptr(),
|
||||
lda,
|
||||
b2.data_ptr(),
|
||||
ldb,
|
||||
nullptr,
|
||||
beta,
|
||||
self.data_ptr(),
|
||||
ldc,
|
||||
params,
|
||||
b1.size(0),
|
||||
b2.size(0));
|
||||
return;
|
||||
}
|
||||
} // namespace at::native
|
||||
|
||||
#endif // AT_ZENDNN_ENABLED()
|
||||
aten/src/ATen/native/zendnn/Matmul.h (new file, 18 lines)
@@ -0,0 +1,18 @@
#pragma once

#include <ATen/Config.h>
#include <ATen/core/Tensor.h>

#if AT_ZENDNN_ENABLED()
namespace at::native {

TORCH_API void zendnn_baddbmm(
    const Tensor& self,
    const Tensor& batch1,
    const Tensor& batch2,
    float beta,
    float alpha);

} // namespace at::native

#endif // AT_ZENDNN_ENABLED()
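Editor's note: judging by the beta/alpha arguments and the batched `matmul_direct` call in Matmul.cpp, the declaration above mirrors `baddbmm` semantics on 3D tensors. A hedged eager-mode reference, for comparison only (the exact accumulation contract of the ZenDNN path is an assumption):

```python
import torch

def baddbmm_reference(self_t, batch1, batch2, beta=1.0, alpha=1.0):
    # Same contract the ZenDNN path appears to implement:
    # [B, M, K] x [B, K, N] scaled by alpha, accumulated into beta * self.
    return beta * self_t + alpha * torch.bmm(batch1, batch2)
```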
aten/src/ATen/native/zendnn/WeightPrepack.cpp (new file, 82 lines)
@@ -0,0 +1,82 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/native/zendnn/ZenDNN_utils.hpp>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/as_strided.h>
|
||||
#include <ATen/ops/empty.h>
|
||||
#include <ATen/ops/zendnn_weight_prepack_for_linear_native.h>
|
||||
#endif
|
||||
|
||||
#if !AT_ZENDNN_ENABLED()
|
||||
namespace at::native {
|
||||
at::Tensor zendnn_weight_prepack_for_linear(const at::Tensor& weight) {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"zendnn_weight_prepack_for_linear: ATen is not compiled with ZenDNN support");
|
||||
}
|
||||
} // namespace at::native
|
||||
#else // !AT_ZENDNN_ENABLED()
|
||||
namespace at::native {
|
||||
using namespace zendnnl::interface;
|
||||
at::Tensor zendnn_weight_prepack_for_linear(const at::Tensor& weight) {
|
||||
TORCH_CHECK(
|
||||
weight.dim() == 2,
|
||||
"Weight tensor must be 2D for linear layer prepacking, got ",
|
||||
weight.dim(),
|
||||
"D tensor.");
|
||||
TORCH_CHECK(
|
||||
weight.scalar_type() == c10::ScalarType::Float ||
|
||||
weight.scalar_type() == c10::ScalarType::BFloat16,
|
||||
"Currently weight prepacking only supports float32 or bfloat16 dtype for weight tensor");
|
||||
data_type_t datatype = get_zendnn_dtype(weight);
|
||||
// Linear op internally works on transposed weight tensor, so to
|
||||
// prepack the weight we need to use transposed weight.
|
||||
auto reorder_input = weight.t();
|
||||
tensor_t zen_reorder_input;
|
||||
create_zendnn_tensor(
|
||||
reorder_input, zen_reorder_input, "reorder_input", datatype);
|
||||
// Currently, ZenDNN only supports blocked layout with AOCL kernels.
|
||||
auto context = reorder_context_t().set_algo_format("aocl").create();
|
||||
auto reorder_op =
|
||||
reorder_operator_t().set_name("reorder_op").set_context(context).create();
|
||||
// Check if reorder operation creation is successful.
|
||||
TORCH_CHECK(
|
||||
reorder_op.check(),
|
||||
"operator ",
|
||||
reorder_op.get_name(),
|
||||
" creation failed.");
|
||||
reorder_op.set_input("reorder_input", zen_reorder_input);
|
||||
size_t reorder_bytes = reorder_op.get_reorder_size();
|
||||
int64_t num_elements = reorder_bytes / weight.element_size();
|
||||
// Create 1d tensor to hold the reordered weights with
|
||||
// a stride of 1 to ensure contiguous memory layout.
|
||||
at::Tensor reorder_output = at::detail::empty_strided_cpu(
|
||||
/*size*/ {num_elements}, /*stride*/ {1}, weight.options());
|
||||
tensor_t zen_reorder_output;
|
||||
std::vector<long unsigned int> reorder_output_sizes(
|
||||
reorder_input.sizes().begin(), reorder_input.sizes().end());
|
||||
void* reorder_output_ptr = reorder_output.data_ptr();
|
||||
zen_reorder_output.set_name("reorder_output")
|
||||
.set_size(reorder_output_sizes)
|
||||
.set_data_type(datatype)
|
||||
.set_storage(reorder_output_ptr, reorder_output.nbytes());
|
||||
if (is_tensor_2d_and_transposed(reorder_input)) {
|
||||
zen_reorder_output.set_order("ba");
|
||||
}
|
||||
zen_reorder_output.set_layout(tensor_layout_t::blocked);
|
||||
zen_reorder_output.create();
|
||||
// Check if reorder output tensor creation is successful.
|
||||
TORCH_CHECK(
|
||||
zen_reorder_output.check(),
|
||||
"tensor creation of ",
|
||||
zen_reorder_output.get_name(),
|
||||
" failed.");
|
||||
reorder_op.set_output("reorder_output", zen_reorder_output);
|
||||
reorder_op.execute();
|
||||
return at::as_strided(reorder_output, weight.sizes(), weight.strides());
|
||||
}
|
||||
} // namespace at::native
|
||||
#endif // !AT_ZENDNN_ENABLED()
|
||||
aten/src/ATen/native/zendnn/ZenDNN_utils.hpp (new file, 69 lines)
@@ -0,0 +1,69 @@
#pragma once
#include <ATen/Config.h>
#include <ATen/core/Tensor.h>
#include <cpuinfo.h>

#if AT_ZENDNN_ENABLED()
#include <zendnnl.hpp>

namespace at::native {
using namespace zendnnl::interface;
inline bool zendnn_bf16_device_check() {
  return cpuinfo_initialize() && cpuinfo_has_x86_avx512bf16();
}

inline data_type_t get_zendnn_dtype(const at::Tensor& tensor) {
  if (tensor.scalar_type() == c10::ScalarType::Float) {
    return data_type_t::f32;
  } else if (tensor.scalar_type() == c10::ScalarType::BFloat16) {
    return data_type_t::bf16;
  }
  TORCH_CHECK(false, "ZenDNN only supports Float32 and BFloat16.");
}

inline bool is_tensor_2d_and_transposed(const at::Tensor& t) {
  if (t.dim() == 2) {
    return t.strides()[0] == 1 && t.strides()[1] == t.sizes()[0];
  }
  return false;
}

inline void set_zendnn_tensor_attributes(
    const at::Tensor& at_tensor,
    tensor_t& zendnn_tensor,
    const std::string& tensor_name,
    const data_type_t& tensor_datatype,
    const bool is_tensor_prepacked = false) {
  std::vector<long unsigned int> at_tensor_sizes_vec(
      at_tensor.sizes().begin(), at_tensor.sizes().end());
  void* at_tensor_ptr = at_tensor.data_ptr();
  zendnn_tensor.set_name(tensor_name)
      .set_size(at_tensor_sizes_vec)
      .set_data_type(tensor_datatype)
      .set_storage(at_tensor_ptr, at_tensor.nbytes());
  if (is_tensor_2d_and_transposed(at_tensor)) {
    zendnn_tensor.set_order("ba");
  }
  if (is_tensor_prepacked && tensor_name == "weights") {
    zendnn_tensor.set_layout(tensor_layout_t::blocked);
  }
}

inline void create_zendnn_tensor(
    const at::Tensor& source_tensor,
    tensor_t& target_tensor,
    const std::string& tensor_name,
    const data_type_t datatype,
    const bool is_tensor_prepacked = false) {
  set_zendnn_tensor_attributes(
      source_tensor, target_tensor, tensor_name, datatype, is_tensor_prepacked);
  target_tensor.create();
  TORCH_CHECK(
      target_tensor.check(),
      "tensor creation of ",
      target_tensor.get_name(),
      " failed.");
}

} // namespace at::native
#endif // AT_ZENDNN_ENABLED()
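Editor's note: `is_tensor_2d_and_transposed` above treats a 2D tensor as transposed when its strides are swapped relative to a contiguous row-major layout. A small illustration of the same predicate in Python, under the assumption that it mirrors the C++ check exactly:

```python
import torch

def is_2d_and_transposed(t: torch.Tensor) -> bool:
    # stride(0) == 1 and stride(1) == size(0) is the stride signature of a
    # contiguous matrix viewed through .t()
    return t.dim() == 2 and t.stride(0) == 1 and t.stride(1) == t.size(0)

a = torch.randn(3, 5)
assert not is_2d_and_transposed(a)
assert is_2d_and_transposed(a.t())
```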
aten/src/ATen/xpu/XPUScaledBlas.cpp (new file, 122 lines)
@@ -0,0 +1,122 @@
#include <c10/core/Scalar.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/SmallVector.h>
|
||||
#include <c10/util/typeid.h>
|
||||
#include <cstdint>
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/BlasBackend.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/ExpandUtils.h>
|
||||
#include <ATen/OpMathType.h>
|
||||
#include <ATen/TensorUtils.h>
|
||||
#include <ATen/core/NamedTensor.h>
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/native/GroupedMMUtils.h>
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <c10/util/MaybeOwned.h>
|
||||
|
||||
#include <ATen/ceil_div.h>
|
||||
#include <ATen/xpu/XPUScaledBlas.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/_addmm_activation_native.h>
|
||||
#include <ATen/ops/_efficientzerotensor.h>
|
||||
#include <ATen/ops/_scaled_mm_native.h>
|
||||
#include <ATen/ops/_unsafe_view_native.h>
|
||||
#include <ATen/ops/abs.h>
|
||||
#include <ATen/ops/addmm_native.h>
|
||||
#include <ATen/ops/addmv_native.h>
|
||||
#include <ATen/ops/baddbmm_native.h>
|
||||
#include <ATen/ops/bmm_native.h>
|
||||
#include <ATen/ops/copy_native.h>
|
||||
#include <ATen/ops/dot_native.h>
|
||||
#include <ATen/ops/empty.h>
|
||||
#include <ATen/ops/empty_strided.h>
|
||||
#include <ATen/ops/gelu.h>
|
||||
#include <ATen/ops/max.h>
|
||||
#include <ATen/ops/mm_native.h>
|
||||
#include <ATen/ops/mul.h>
|
||||
#include <ATen/ops/ones.h>
|
||||
#include <ATen/ops/relu.h>
|
||||
#include <ATen/ops/scalar_tensor_native.h>
|
||||
#include <ATen/ops/vdot_native.h>
|
||||
#endif
|
||||
|
||||
using at::blas::ScalingType;
|
||||
|
||||
namespace at::native::onednn::scaled {
|
||||
|
||||
/**
|
||||
* Both inputs must be fp8,
|
||||
* Each needs a single scale, {Tensorwise (float)}
|
||||
*/
|
||||
bool check_tensorwise_recipe(
|
||||
c10::ScalarType type_a,
|
||||
std::vector<ScalingType>& recipe_a,
|
||||
ArrayRef<Tensor>& scales_a,
|
||||
c10::ScalarType type_b,
|
||||
std::vector<ScalingType>& recipe_b,
|
||||
ArrayRef<Tensor>& scales_b) {
|
||||
// both types must be fp8
|
||||
if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 1 scale each, {Tensorwise, float}
|
||||
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 ||
|
||||
recipe_b.size() != 1) {
|
||||
return false;
|
||||
}
|
||||
// Need {TensorWise, float} for A & B
|
||||
if (recipe_a[0] != ScalingType::TensorWise)
|
||||
return false;
|
||||
if (scales_a[0].scalar_type() != ScalarType::Float)
|
||||
return false;
|
||||
if (recipe_b[0] != ScalingType::TensorWise)
|
||||
return false;
|
||||
if (scales_b[0].scalar_type() != ScalarType::Float)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Both inputs must be fp8,
|
||||
* Each needs scales, {Rowwise (float)}
|
||||
*/
|
||||
bool check_rowwise_recipe(
|
||||
c10::ScalarType type_a,
|
||||
std::vector<ScalingType>& recipe_a,
|
||||
ArrayRef<Tensor>& scales_a,
|
||||
c10::ScalarType type_b,
|
||||
std::vector<ScalingType>& recipe_b,
|
||||
ArrayRef<Tensor>& scales_b) {
|
||||
// both types must be fp8
|
||||
if (!isFloat8Type(type_a) || !isFloat8Type(type_b)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 1 scale each, {RowWise, float}
|
||||
if (scales_a.size() != 1 || recipe_a.size() != 1 || scales_b.size() != 1 ||
|
||||
recipe_b.size() != 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Need {RowWise, fp32} for A & B
|
||||
if (recipe_a[0] != ScalingType::RowWise)
|
||||
return false;
|
||||
if (scales_a[0].scalar_type() != ScalarType::Float)
|
||||
return false;
|
||||
if (recipe_b[0] != ScalingType::RowWise)
|
||||
return false;
|
||||
if (scales_b[0].scalar_type() != ScalarType::Float)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace at::native::onednn::scaled
|
||||
aten/src/ATen/xpu/XPUScaledBlas.h (new file, 95 lines)
@@ -0,0 +1,95 @@
#include <c10/core/Scalar.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/SmallVector.h>
|
||||
#include <c10/util/typeid.h>
|
||||
#include <cstdint>
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/ExpandUtils.h>
|
||||
#include <ATen/OpMathType.h>
|
||||
#include <ATen/TensorUtils.h>
|
||||
#include <ATen/core/NamedTensor.h>
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <c10/util/MaybeOwned.h>
|
||||
|
||||
#include <ATen/BlasBackend.h>
|
||||
#include <ATen/ceil_div.h>
|
||||
|
||||
#ifdef USE_FBGEMM_GENAI
|
||||
#include <fbgemm_gpu/torch_ops.h>
|
||||
#endif
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Functions.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/_addmm_activation_native.h>
|
||||
#include <ATen/ops/_efficientzerotensor.h>
|
||||
#include <ATen/ops/_scaled_mm_native.h>
|
||||
#include <ATen/ops/_unsafe_view_native.h>
|
||||
#include <ATen/ops/abs.h>
|
||||
#include <ATen/ops/addmm_native.h>
|
||||
#include <ATen/ops/addmv_native.h>
|
||||
#include <ATen/ops/baddbmm_native.h>
|
||||
#include <ATen/ops/bmm_native.h>
|
||||
#include <ATen/ops/copy_native.h>
|
||||
#include <ATen/ops/dot_native.h>
|
||||
#include <ATen/ops/empty.h>
|
||||
#include <ATen/ops/empty_strided.h>
|
||||
#include <ATen/ops/gelu.h>
|
||||
#include <ATen/ops/max.h>
|
||||
#include <ATen/ops/mm_native.h>
|
||||
#include <ATen/ops/mul.h>
|
||||
#include <ATen/ops/ones.h>
|
||||
#include <ATen/ops/relu.h>
|
||||
#include <ATen/ops/scalar_tensor_native.h>
|
||||
#include <ATen/ops/vdot_native.h>
|
||||
#endif
|
||||
|
||||
using at::blas::ScalingType;
|
||||
|
||||
namespace at::native::onednn::scaled {
|
||||
|
||||
/**
|
||||
* Track concrete implementations available
|
||||
*/
|
||||
enum class ScaledGemmImplementation {
|
||||
NONE = 0,
|
||||
TENSORWISE_TENSORWISE = 1,
|
||||
ROWWISE_ROWWISE = 2,
|
||||
};
|
||||
|
||||
/**
|
||||
* Convert passed int (enum) from python back into a
|
||||
* strictly-typed enum
|
||||
*/
|
||||
template <class EnumType, class ArrayType>
|
||||
std::vector<EnumType> convert_int_to_enum(ArrayType& v) {
|
||||
std::vector<EnumType> converted;
|
||||
converted.reserve(v.size());
|
||||
|
||||
for (auto vi : v) {
|
||||
converted.push_back(static_cast<EnumType>(vi));
|
||||
}
|
||||
return converted;
|
||||
}
|
||||
|
||||
bool check_tensorwise_recipe(
|
||||
c10::ScalarType,
|
||||
std::vector<ScalingType>&,
|
||||
ArrayRef<Tensor>&,
|
||||
c10::ScalarType,
|
||||
std::vector<ScalingType>&,
|
||||
ArrayRef<Tensor>&);
|
||||
|
||||
bool check_rowwise_recipe(
|
||||
c10::ScalarType,
|
||||
std::vector<ScalingType>&,
|
||||
ArrayRef<Tensor>&,
|
||||
c10::ScalarType,
|
||||
std::vector<ScalingType>&,
|
||||
ArrayRef<Tensor>&);
|
||||
|
||||
} // namespace at::native::onednn::scaled
|
||||
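Editor's note: per the checks above, the XPU scaled-GEMM v2 path currently accepts exactly two recipes: tensorwise/tensorwise and rowwise/rowwise, each with fp8 inputs and one fp32 scale tensor per operand. A hedged sketch of what the scale operands look like; the rowwise shapes are an assumption for illustration, since the shown code validates dtypes and counts but not shapes:

```python
import torch

M, K, N = 64, 128, 32
a = torch.randn(M, K).to(torch.float8_e4m3fn)  # fp8 operand A (assumed dtype)
b = torch.randn(K, N).to(torch.float8_e4m3fn)  # fp8 operand B

# Tensorwise recipe: one scalar fp32 scale per operand.
scale_a_tensorwise = torch.tensor(1.0)
scale_b_tensorwise = torch.tensor(1.0)

# Rowwise recipe: one fp32 scale per row of A / per column of B (assumed layout).
scale_a_rowwise = torch.ones(M, 1)
scale_b_rowwise = torch.ones(1, N)
```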
@@ -1169,6 +1169,9 @@ def define_buck_targets(
            "--replace",
            "@AT_USE_EIGEN_SPARSE@",
            "0",
            "--replace",
            "@AT_ZENDNN_ENABLED@",
            "0",
        ]),
        outs = {
            "Config.h": ["Config.h"],

@@ -1184,6 +1184,9 @@ aten_cpu_source_non_codegen_list = [
    "aten/src/ATen/native/ComparisonUtils.cpp",
    "aten/src/ATen/native/DispatchStub.cpp",
    "aten/src/ATen/native/UpSample.cpp",
    "aten/src/ATen/native/zendnn/Matmul.cpp",
    "aten/src/ATen/native/zendnn/Linear.cpp",
    "aten/src/ATen/native/zendnn/WeightPrepack.cpp",
    "aten/src/ATen/native/mkldnn/BinaryOps.cpp",
    "aten/src/ATen/native/mkldnn/Conv.cpp",
    "aten/src/ATen/native/mkldnn/ConvPrepack.cpp",

@@ -20,6 +20,22 @@
  } \
} while (0)

#define C10_CUDA_DRIVER_CHECK_GOTO(EXPR, NEXT)                            \
  do {                                                                     \
    CUresult __err = EXPR;                                                 \
    if (__err != CUDA_SUCCESS) {                                           \
      const char* err_str;                                                 \
      CUresult get_error_str_err [[maybe_unused]] =                        \
          c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
      if (get_error_str_err != CUDA_SUCCESS) {                             \
        TORCH_WARN("CUDA driver error: unknown error");                    \
      } else {                                                             \
        TORCH_WARN("CUDA driver error: ", err_str);                        \
      }                                                                    \
      goto NEXT;                                                           \
    }                                                                      \
  } while (0)

// The integer in the second column specifies the requested CUDA Driver API
// version. The dynamic loader will accept a driver with a newer version, but it
// ensures that the requested symbol exists in *at least* the specified version

@@ -67,4 +67,5 @@
  {"USE_CUSPARSELT", "${USE_CUSPARSELT}"}, \
  {"USE_XPU", "${USE_XPU}"}, \
  {"USE_XCCL", "${USE_XCCL}"}, \
  {"USE_ZENDNN", "${USE_ZENDNN}"} \
}

@@ -117,6 +117,10 @@ if(@USE_MKLDNN@)
  include("${CMAKE_CURRENT_LIST_DIR}/public/mkldnn.cmake")
endif()

if(@USE_ZENDNN@)
  include("${CMAKE_CURRENT_LIST_DIR}/public/zendnn.cmake")
endif()

# import targets
include ("${CMAKE_CURRENT_LIST_DIR}/Caffe2Targets.cmake")

@@ -118,6 +118,12 @@ if(INTERN_BUILD_ATEN_OPS)
        list(APPEND _file_compile_flags "-gencode;arch=compute_120a,code=sm_120a")
      endif()
    endif()
    # We will need to gate against CUDA version, sm_121a was introduced in CUDA 12.9
    if("${_arch}" STREQUAL "121a" AND CUDA_VERSION VERSION_GREATER_EQUAL 12.9)
      if(_existing_arch_flags MATCHES ".*compute_120.*")
        list(APPEND _file_compile_flags "-gencode;arch=compute_121a,code=sm_121a")
      endif()
    endif()
  endforeach()
  list(JOIN _file_compile_flags " " _file_compile_flags)

@@ -126,7 +132,7 @@ if(INTERN_BUILD_ATEN_OPS)

  _BUILD_FOR_ADDITIONAL_ARCHS(
    "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/RowwiseScaledMM.cu"
-    "89;90a;100a;103a;120a")
+    "89;90a;100a;103a;120a;121a")
  _BUILD_FOR_ADDITIONAL_ARCHS(
    "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/native/cuda/ScaledGroupMM.cu"
    "90a")

@@ -162,6 +162,7 @@ set(AT_MKLDNN_ENABLED 0)
set(AT_MKL_ENABLED 0)
set(AT_KLEIDIAI_ENABLED 0)
set(AT_USE_EIGEN_SPARSE 0)
set(AT_ZENDNN_ENABLED 0)
# setting default preferred BLAS options if not already present.
if(NOT INTERN_BUILD_MOBILE)
  set(BLAS "MKL" CACHE STRING "Selected BLAS library")

@@ -1509,6 +1510,32 @@ if(NOT INTERN_BUILD_MOBILE)
    message("disabling MKLDNN because USE_MKLDNN is not set")
  endif()

  if(USE_ZENDNN)
    if(NOT (CMAKE_SYSTEM_NAME MATCHES "Linux"))
      message(WARNING
        "USE_ZENDNN is currently only supported on Linux. Detected platform: ${CMAKE_SYSTEM_NAME}. Disabling ZenDNN support.")
      set(USE_ZENDNN OFF)
    elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
      message(WARNING
        "x64 operating system is required for ZenDNN. "
        "ZenDNN codebase will not be compiled. "
        "Turn this warning off by USE_ZENDNN=OFF.")
      set(USE_ZENDNN OFF)
    else()
      include(${CMAKE_CURRENT_LIST_DIR}/public/zendnn.cmake)
      if(ZENDNN_FOUND)
        set(AT_ZENDNN_ENABLED 1)
        # Add to Caffe2 private dependencies
        list(APPEND Caffe2_DEPENDENCY_LIBS zendnnl::zendnnl_archive)
      else()
        message(WARNING "ZENDNN could not be found.")
        caffe2_update_option(USE_ZENDNN OFF)
      endif()
    endif()
  else()
    message(STATUS "disabling ZENDNN because USE_ZENDNN is not set")
  endif()

  if(USE_KLEIDIAI)
    set(TEMP_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
    set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libs" FORCE)

cmake/Modules/FindZENDNN.cmake (new file, 402 lines)
@@ -0,0 +1,402 @@
include_guard(GLOBAL)
|
||||
include(ExternalProject)
|
||||
|
||||
# declare a zendnnl dependency
|
||||
macro(zendnnl_add_dependency )
|
||||
set(options INCLUDE_ONLY)
|
||||
set(oneValueArgs NAME PATH LIB_SUFFIX INCLUDE_SUFFIX ARCHIVE_FILE ALIAS)
|
||||
set(multiValueArgs DEPENDS)
|
||||
cmake_parse_arguments(_zad "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
|
||||
string(TOUPPER ${_zad_NAME} _ZAD_UNAME)
|
||||
|
||||
if(DEFINED _zad_INCLUDE_SUFFIX)
|
||||
set(ZENDNNL_${_ZAD_UNAME}_INC_DIR "${_zad_PATH}/${_zad_INCLUDE_SUFFIX}")
|
||||
else()
|
||||
set(ZENDNNL_${_ZAD_UNAME}_INC_DIR "${_zad_PATH}/include")
|
||||
endif()
|
||||
|
||||
if(DEFINED _zad_LIB_SUFFIX)
|
||||
set(ZENDNNL_${_ZAD_UNAME}_LIB_DIR "${_zad_PATH}/${_zad_LIB_SUFFIX}")
|
||||
else()
|
||||
set(ZENDNNL_${_ZAD_UNAME}_LIB_DIR "${_zad_PATH}/lib")
|
||||
endif()
|
||||
|
||||
if(NOT EXISTS ${ZENDNNL_${_ZAD_UNAME}_INC_DIR})
|
||||
file(MAKE_DIRECTORY ${ZENDNNL_${_ZAD_UNAME}_INC_DIR})
|
||||
endif()
|
||||
|
||||
if(${_zad_INCLUDE_ONLY})
|
||||
add_library(zendnnl_${_zad_NAME}_deps INTERFACE IMPORTED GLOBAL)
|
||||
#add_dependencies(zendnnl_${_zad_NAME}_deps ${_zad_DEPENDS})
|
||||
|
||||
set_target_properties(zendnnl_${_zad_NAME}_deps
|
||||
PROPERTIES
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${ZENDNNL_${_ZAD_UNAME}_INC_DIR}")
|
||||
else()
|
||||
|
||||
add_library(zendnnl_${_zad_NAME}_deps STATIC IMPORTED GLOBAL)
|
||||
#add_dependencies(zendnnl_${_zad_NAME}_deps ${_zad_DEPENDS})
|
||||
|
||||
set_target_properties(zendnnl_${_zad_NAME}_deps
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${ZENDNNL_${_ZAD_UNAME}_LIB_DIR}/${_zad_ARCHIVE_FILE}"
|
||||
INCLUDE_DIRECTORIES "${ZENDNNL_${_ZAD_UNAME}_INC_DIR}"
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${ZENDNNL_${_ZAD_UNAME}_INC_DIR}")
|
||||
endif()
|
||||
|
||||
add_library(${_zad_ALIAS} ALIAS zendnnl_${_zad_NAME}_deps)
|
||||
|
||||
list(APPEND ZNL_BYPRODUCTS "${ZENDNNL_${_ZAD_UNAME}_LIB_DIR}/${_zad_ARCHIVE_FILE}")
|
||||
endmacro()
|
||||
|
||||
macro(zendnnl_add_option )
|
||||
set(options EXECLUDE_FROM_COMMAND_LIST FORCE)
|
||||
set(oneValueArgs NAME VALUE TYPE CACHE_STRING COMMAND_LIST)
|
||||
set(multiValueArgs "")
|
||||
cmake_parse_arguments(_zao "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
|
||||
if(${_zao_FORCE})
|
||||
set(${_zao_NAME} ${_zao_VALUE} CACHE ${_zao_TYPE} ${_zao_CACHE_STRING} FORCE)
|
||||
else()
|
||||
set(${_zao_NAME} ${_zao_VALUE} CACHE ${_zao_TYPE} ${_zao_CACHE_STRING})
|
||||
endif()
|
||||
|
||||
if (NOT ${_zao_EXECLUDE_FROM_COMMAND_LIST})
|
||||
list(APPEND ${_zao_COMMAND_LIST} "-D${_zao_NAME}:${_zao_TYPE}=${_zao_VALUE}")
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
message(AUTHOR_WARNING "(ZENDNNL) please ensure all zendnnl variables are set properly.")
|
||||
|
||||
if(NOT ZENDNN_FOUND)
|
||||
# find openmp
|
||||
find_package(OpenMP REQUIRED QUIET)
|
||||
|
||||
# set zendnnl source dir, where zendnnl has been downloaded.
|
||||
zendnnl_add_option(NAME ZENDNNL_SOURCE_DIR
|
||||
VALUE ${PROJECT_SOURCE_DIR}/third_party/ZenDNN
|
||||
TYPE PATH
|
||||
CACHE_STRING "zendnnl_source_dir"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set zendnnl binary dir, if unsure set ${CMAKE_CURRENT_BINARY_DIR}/zendnnl.
|
||||
zendnnl_add_option(NAME ZENDNNL_BINARY_DIR
|
||||
VALUE ${CMAKE_BINARY_DIR}/third_party/ZenDNN
|
||||
TYPE PATH
|
||||
CACHE_STRING "zendnnl_binary_dir"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set zendnnl install dir, if unsure set ${CMAKE_INSTALL_PREFIX}/zendnnl.
|
||||
zendnnl_add_option(NAME ZENDNNL_INSTALL_PREFIX
|
||||
VALUE ${ZENDNNL_BINARY_DIR}/install
|
||||
TYPE PATH
|
||||
CACHE_STRING "zendnnl_install_dir"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
## general zendnnl options
|
||||
# set ZenDNNL framework build, this should be ON to avoid a standalone build.
|
||||
zendnnl_add_option(NAME ZENDNNL_FWK_BUILD
|
||||
VALUE ON
|
||||
TYPE BOOL
|
||||
CACHE_STRING "zendnnl framework build"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set zendnnl build option, default is Release.
|
||||
zendnnl_add_option(NAME ZENDNNL_BUILD_TYPE
|
||||
VALUE "Release"
|
||||
TYPE STRING
|
||||
CACHE_STRING "zendnnl build type"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set zendnnl log level.
|
||||
zendnnl_add_option(NAME ZENDNNL_MESSAGE_LOG_LEVEL
|
||||
VALUE "DEBUG"
|
||||
TYPE STRING
|
||||
CACHE_STRING "zendnnl message log level"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set zendnnl verbose makefile option.
|
||||
zendnnl_add_option(NAME ZENDNNL_VERBOSE_MAKEFILE
|
||||
VALUE ON
|
||||
TYPE BOOL
|
||||
CACHE_STRING "zendnnl verbose makefile"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
## components options
|
||||
# set building zendnnl examples, default is OFF.
|
||||
zendnnl_add_option(NAME ZENDNNL_BUILD_EXAMPLES
|
||||
VALUE OFF
|
||||
TYPE BOOL
|
||||
CACHE_STRING "build zendnnl examples"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set building zendnnl gtests, default is OFF.
|
||||
zendnnl_add_option(NAME ZENDNNL_BUILD_GTEST
|
||||
VALUE OFF
|
||||
TYPE BOOL
|
||||
CACHE_STRING "build zendnnl gtests"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set building zendnnl doxygen documentation, default is OFF.
|
||||
zendnnl_add_option(NAME ZENDNNL_BUILD_DOXYGEN
|
||||
VALUE OFF
|
||||
TYPE BOOL
|
||||
CACHE_STRING "build zendnnl doxygen documentation"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set building zendnnl benchmarking tool, default is OFF.
|
||||
zendnnl_add_option(NAME ZENDNNL_BUILD_BENCHDNN
|
||||
VALUE OFF
|
||||
TYPE BOOL
|
||||
CACHE_STRING "build zendnnl benchdnn"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set zendnnl code coverage option, default is OFF.
|
||||
zendnnl_add_option(NAME ZENDNNL_CODE_COVERAGE
|
||||
VALUE OFF
|
||||
TYPE BOOL
|
||||
CACHE_STRING "build zendnnl code coverage"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
## dependencies
|
||||
# set if zendnnl depends on amdblis. this should be OFF only if
|
||||
# aocldlp dependency is ON.
|
||||
zendnnl_add_option(NAME ZENDNNL_DEPENDS_AMDBLIS
|
||||
VALUE OFF
|
||||
TYPE BOOL
|
||||
CACHE_STRING "zendnnl amdblis dependency"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set if zendnnl depends on aocldlp. this should be ON only if
|
||||
# amdblis dependency is OFF.
|
||||
zendnnl_add_option(NAME ZENDNNL_DEPENDS_AOCLDLP
|
||||
VALUE ON
|
||||
TYPE BOOL
|
||||
CACHE_STRING "zendnnl aocldlp dependency"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set if zendnnl depends on onednn, default is OFF.
|
||||
zendnnl_add_option(NAME ZENDNNL_DEPENDS_ONEDNN
|
||||
VALUE OFF
|
||||
TYPE BOOL
|
||||
CACHE_STRING "zendnnl onednn dependency"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set if zendnnl depends on libxsmm, default is OFF.
|
||||
zendnnl_add_option(NAME ZENDNNL_DEPENDS_LIBXSMM
|
||||
VALUE ON
|
||||
TYPE BOOL
|
||||
CACHE_STRING "zendnnl libxsmm dependency"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set path of amdblis if amdblis is injected. if the framework
|
||||
# does not inject it, set it to "" (empty string).
|
||||
zendnnl_add_option(NAME ZENDNNL_AMDBLIS_FWK_DIR
|
||||
VALUE ""
|
||||
TYPE PATH
|
||||
CACHE_STRING "zendnnl amdblis framework path"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set path of aocldlp if aocldlp is injected. if the framework
|
||||
# does not inject it, set it to "" (empty string).
|
||||
zendnnl_add_option(NAME ZENDNNL_AOCLDLP_FWK_DIR
|
||||
VALUE ""
|
||||
TYPE PATH
|
||||
CACHE_STRING "zendnnl aocldlp framework path"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set path of onednn if onednn is injected. if the framework
|
||||
# does not inject it, set it to "" (empty string).
|
||||
zendnnl_add_option(NAME ZENDNNL_ONEDNN_FWK_DIR
|
||||
VALUE ""
|
||||
TYPE PATH
|
||||
CACHE_STRING "zendnnl onednnn framework path"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# set path of libxsmm if libxsmm is injected. if the framework
|
||||
# does not inject it, set it to "" (empty string).
|
||||
zendnnl_add_option(NAME ZENDNNL_LIBXSMM_FWK_DIR
|
||||
VALUE ""
|
||||
TYPE PATH
|
||||
CACHE_STRING "zendnnl libxsmm framework path"
|
||||
COMMAND_LIST ZNL_CMAKE_ARGS)
|
||||
|
||||
# try to find pre-built package
|
||||
set(zendnnl_ROOT "${ZENDNNL_INSTALL_PREFIX}/zendnnl")
|
||||
set(zendnnl_DIR "${zendnnl_ROOT}/lib/cmake")
|
||||
find_package(zendnnl QUIET)
|
||||
if(zendnnl_FOUND)
|
||||
message(STATUS "(ZENDNNL) ZENDNNL FOUND AT ${zendnnl_ROOT}")
|
||||
message(STATUS "(ZENDNNL) if zendnnl options are changed from previous build,")
|
||||
message(STATUS "(ZENDNNL) they will not be reflected")
|
||||
message(STATUS "(ZENDNNL) If options are changed, please do a clean build.")
|
||||
if(TARGET zendnnl::zendnnl_archive)
|
||||
set_target_properties(zendnnl::zendnnl_archive
|
||||
PROPERTIES IMPORTED_GLOBAL ON)
|
||||
else()
|
||||
message(FATAL_ERROR "(ZENDNNL) zendnnl installation does not have imported target zendnnl::zendnnl_archive")
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "(ZENDNNL) ZENDNNL NOT FOUND, will be built as an external project.")
|
||||
|
||||
# declare zendnnl library
|
||||
set(ZENDNNL_LIBRARY_INC_DIR "${ZENDNNL_INSTALL_PREFIX}/zendnnl/include")
|
||||
set(ZENDNNL_LIBRARY_LIB_DIR "${ZENDNNL_INSTALL_PREFIX}/zendnnl/lib")
|
||||
|
||||
if(NOT EXISTS ${ZENDNNL_LIBRARY_INC_DIR})
|
||||
file(MAKE_DIRECTORY ${ZENDNNL_LIBRARY_INC_DIR})
|
||||
endif()
|
||||
|
||||
add_library(zendnnl_library STATIC IMPORTED GLOBAL)
|
||||
add_dependencies(zendnnl_library fwk_zendnnl)
|
||||
set_target_properties(zendnnl_library
|
||||
PROPERTIES
|
||||
IMPORTED_LOCATION "${ZENDNNL_LIBRARY_LIB_DIR}/libzendnnl_archive.a"
|
||||
INCLUDE_DIRECTORIES "${ZENDNNL_LIBRARY_INC_DIR}"
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${ZENDNNL_LIBRARY_INC_DIR}")
|
||||
|
||||
target_link_options(zendnnl_library INTERFACE "-fopenmp")
|
||||
target_link_libraries(zendnnl_library
|
||||
INTERFACE OpenMP::OpenMP_CXX
|
||||
INTERFACE ${CMAKE_DL_LIBS})
|
||||
|
||||
add_library(zendnnl::zendnnl_archive ALIAS zendnnl_library)
|
||||
|
||||
list(APPEND ZNL_BYPRODUCTS "${ZENDNNL_LIBRARY_LIB_DIR}/libzendnnl_archive.a")
|
||||
|
||||
# declare all dependencies
|
||||
|
||||
# json dependency
|
||||
zendnnl_add_dependency(NAME json
|
||||
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/json"
|
||||
ALIAS "nlohmann_json::nlohmann_json"
|
||||
INCLUDE_ONLY)
|
||||
|
||||
target_link_libraries(zendnnl_library INTERFACE nlohmann_json::nlohmann_json)
|
||||
|
||||
# aoclutils dependency
|
||||
if (DEFINED ENV{ZENDNNL_MANYLINUX_BUILD})
|
||||
|
||||
zendnnl_add_dependency(NAME aoclutils
|
||||
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/aoclutils"
|
||||
LIB_SUFFIX lib64
|
||||
ARCHIVE_FILE "libaoclutils.a"
|
||||
ALIAS "au::aoclutils")
|
||||
|
||||
target_link_libraries(zendnnl_library INTERFACE au::aoclutils)
|
||||
|
||||
zendnnl_add_dependency(NAME aucpuid
|
||||
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/aoclutils"
|
||||
LIB_SUFFIX lib64
|
||||
ARCHIVE_FILE "libau_cpuid.a"
|
||||
ALIAS "au::au_cpuid")
|
||||
|
||||
target_link_libraries(zendnnl_library INTERFACE au::au_cpuid)
|
||||
|
||||
else()
|
||||
zendnnl_add_dependency(NAME aoclutils
|
||||
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/aoclutils"
|
||||
ARCHIVE_FILE "libaoclutils.a"
|
||||
ALIAS "au::aoclutils")
|
||||
|
||||
target_link_libraries(zendnnl_library INTERFACE au::aoclutils)
|
||||
|
||||
zendnnl_add_dependency(NAME aucpuid
|
||||
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/aoclutils"
|
||||
ARCHIVE_FILE "libau_cpuid.a"
|
||||
ALIAS "au::au_cpuid")
|
||||
|
||||
target_link_libraries(zendnnl_library INTERFACE au::au_cpuid)
|
||||
|
||||
endif()
|
||||
|
||||
# amdblis dependency
|
||||
if (ZENDNNL_DEPENDS_AMDBLIS)
|
||||
zendnnl_add_dependency(NAME amdblis
|
||||
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/amdblis"
|
||||
ARCHIVE_FILE "libblis-mt.a"
|
||||
ALIAS "amdblis::amdblis_archive")
|
||||
|
||||
target_link_libraries(zendnnl_library INTERFACE amdblis::amdblis_archive)
|
||||
endif()
|
||||
|
||||
if (ZENDNNL_DEPENDS_AOCLDLP)
|
||||
zendnnl_add_dependency(NAME aocldlp
|
||||
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/aocldlp"
|
||||
ARCHIVE_FILE "libaocl-dlp.a"
|
||||
ALIAS "aocldlp::aocl_dlp_static")
|
||||
|
||||
target_link_libraries(zendnnl_library INTERFACE aocldlp::aocl_dlp_static)
|
||||
endif()
|
||||
|
||||
if (ZENDNNL_DEPENDS_ONEDNN)
|
||||
zendnnl_add_dependency(NAME onednn
|
||||
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/onednn"
|
||||
ARCHIVE_FILE "libdnnl.a"
|
||||
ALIAS "DNNL::dnnl")
|
||||
|
||||
target_link_libraries(zendnnl_library INTERFACE DNNL::dnnl)
|
||||
endif()
|
||||
|
||||
# libxsmm dependency
|
||||
if (ZENDNNL_DEPENDS_LIBXSMM)
|
||||
zendnnl_add_dependency(NAME libxsmm
|
||||
PATH "${ZENDNNL_INSTALL_PREFIX}/deps/libxsmm"
|
||||
ARCHIVE_FILE "libxsmm.a"
|
||||
ALIAS "libxsmm::libxsmm_archive")
|
||||
|
||||
target_link_libraries(zendnnl_library INTERFACE libxsmm::libxsmm_archive)
|
||||
endif()
|
||||
|
||||
message(STATUS "(ZENDNNL) ZNL_BYPRODUCTS=${ZNL_BYPRODUCTS}")
|
||||
message(STATUS "(ZENDNNL) ZNL_CMAKE_ARGS=${ZNL_CMAKE_ARGS}")
|
||||
|
||||
ExternalProject_ADD(fwk_zendnnl
|
||||
SOURCE_DIR "${ZENDNNL_SOURCE_DIR}"
|
||||
BINARY_DIR "${ZENDNNL_BINARY_DIR}"
|
||||
CMAKE_ARGS "${ZNL_CMAKE_ARGS}"
|
||||
BUILD_COMMAND cmake --build . --target all -j
|
||||
INSTALL_COMMAND ""
|
||||
BUILD_BYPRODUCTS ${ZNL_BYPRODUCTS})
|
||||
|
||||
list(APPEND ZENDNNL_CLEAN_FILES "${ZENDNNL_BINARY_DIR}")
|
||||
list(APPEND ZENDNNL_CLEAN_FILES "${ZENDNNL_INSTALL_PREFIX}")
|
||||
set_target_properties(fwk_zendnnl
|
||||
PROPERTIES
|
||||
ADDITIONAL_CLEAN_FILES "${ZENDNNL_CLEAN_FILES}")
|
||||
|
||||
# framework dependencies
|
||||
# add_dependencies(fwk_zendnnl <injected dependency targets>)
|
||||
get_target_property(FWK_ZENDNNL_DEPENDS fwk_zendnnl MANUALLY_ADDED_DEPENDENCIES)
|
||||
if(${FWK_ZENDNNL_DEPENDS} STREQUAL "FWK_ZENDNNL_DEPENDS-NOTFOUND")
|
||||
message(AUTHOR_WARNING "(ZENDNNL) please ensure fwk_zendnnl depends on injected dependencies targets")
|
||||
else()
|
||||
message(STATUS "fwk_zendnnl dependencies : ${FWK_ZENDNNL_DEPENDS}")
|
||||
endif()
|
||||
|
||||
# make library and its dependencies depend on fwk_zendnnl
|
||||
add_dependencies(zendnnl_library fwk_zendnnl)
|
||||
add_dependencies(zendnnl_json_deps fwk_zendnnl)
|
||||
add_dependencies(zendnnl_aoclutils_deps fwk_zendnnl)
|
||||
add_dependencies(zendnnl_aucpuid_deps fwk_zendnnl)
|
||||
|
||||
if(ZENDNNL_DEPENDS_AMDBLIS)
|
||||
add_dependencies(zendnnl_amdblis_deps fwk_zendnnl)
|
||||
endif()
|
||||
|
||||
if(ZENDNNL_DEPENDS_AOCLDLP)
|
||||
add_dependencies(zendnnl_aocldlp_deps fwk_zendnnl)
|
||||
endif()
|
||||
|
||||
if(ZENDNNL_DEPENDS_ONEDNN)
|
||||
add_dependencies(zendnnl_onednn_deps fwk_zendnnl)
|
||||
endif()
|
||||
|
||||
if(ZENDNNL_DEPENDS_LIBXSMM)
|
||||
add_dependencies(zendnnl_libxsmm_deps fwk_zendnnl)
|
||||
endif()
|
||||
endif()
|
||||
set(ZENDNN_FOUND TRUE)
|
||||
|
||||
endif(NOT ZENDNN_FOUND)
|
||||
@@ -148,6 +148,7 @@ function(caffe2_print_configuration_summary)
  message(STATUS "    USE_PYTORCH_METAL_EXPORT : ${USE_PYTORCH_METAL_EXPORT}")
  message(STATUS "    USE_MPS               : ${USE_MPS}")
  message(STATUS "    CAN_COMPILE_METAL     : ${CAN_COMPILE_METAL}")
  message(STATUS "    USE_ZENDNN            : ${USE_ZENDNN}")
  message(STATUS "    USE_MKL               : ${CAFFE2_USE_MKL}")
  if(${CAFFE2_USE_MKL})
    message(STATUS "      USE_STATIC_MKL      : ${USE_STATIC_MKL}")

cmake/public/zendnn.cmake (new file, 8 lines)
@@ -0,0 +1,8 @@
if(NOT EXISTS ${PROJECT_SOURCE_DIR}/third_party/ZenDNN)
  message(WARNING "(ZENDNNL) Library not found at ${PROJECT_SOURCE_DIR}/third_party/ZenDNN")
else()
  find_package(ZENDNN QUIET)
  if(ZENDNN_FOUND)
    message(STATUS "(ZENDNN) ZenDNN library was built successfully.")
  endif(ZENDNN_FOUND)
endif()
setup.py (7 changed lines)
@@ -67,6 +67,9 @@
#   USE_NUMPY=0
#     disables the NumPy build
#
#   USE_ZENDNN=0
#     disables the ZenDNN build
#
#   BUILD_TEST=0
#     disables the test build
#

@@ -1221,6 +1224,10 @@ class build_ext(setuptools.command.build_ext.build_ext):
                report("-- Not using CBLAS in MKLDNN")
        else:
            report("-- Not using MKLDNN")
        if cmake_cache_vars["USE_ZENDNN"]:
            report("-- Using ZENDNN")
        else:
            report("-- Not using ZENDNN")
        if cmake_cache_vars["USE_NCCL"] and cmake_cache_vars["USE_SYSTEM_NCCL"]:
            report(
                "-- Using system provided NCCL library at "

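Editor's note: a quick, hedged way to check whether a finished build actually enabled ZenDNN. It relies on USE_ZENDNN being part of the build-settings table added in the macros diff above, which `torch.__config__.show()` prints; if that assumption does not hold for a particular build, inspect the CMake cache (`USE_ZENDNN` / `AT_ZENDNN_ENABLED`) instead.

```python
import torch

# show() returns the build configuration summary as a string.
cfg = torch.__config__.show()
print("ZenDNN enabled:", "USE_ZENDNN=ON" in cfg or "USE_ZENDNN=1" in cfg)
```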
@ -0,0 +1,20 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
uint64_t get_any_data_ptr(Tensor t, bool mutable_) {
|
||||
if (mutable_) {
|
||||
return reinterpret_cast<uint64_t>(t.mutable_data_ptr());
|
||||
} else {
|
||||
return reinterpret_cast<uint64_t>(t.const_data_ptr());
|
||||
}
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("get_any_data_ptr(Tensor t, bool mutable_) -> int");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("get_any_data_ptr", TORCH_BOX(&get_any_data_ptr));
|
||||
}
|
||||
@ -0,0 +1,34 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/headeronly/core/ScalarType.h>
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
uint64_t get_template_any_data_ptr(Tensor t, torch::headeronly::ScalarType dtype, bool mutable_) {
|
||||
#define DEFINE_CASE(T, name) \
|
||||
case torch::headeronly::ScalarType::name: { \
|
||||
if (mutable_) { \
|
||||
return reinterpret_cast<uint64_t>(t.mutable_data_ptr<T>()); \
|
||||
} else { \
|
||||
return reinterpret_cast<uint64_t>(t.const_data_ptr<T>()); \
|
||||
} \
|
||||
}
|
||||
switch (dtype) {
|
||||
// per aten/src/ATen/templates/TensorMethods.cpp:
|
||||
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE)
|
||||
DEFINE_CASE(uint16_t, UInt16)
|
||||
DEFINE_CASE(uint32_t, UInt32)
|
||||
DEFINE_CASE(uint64_t, UInt64)
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
#undef DEFINE_CASE
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("get_template_any_data_ptr(Tensor t, ScalarType dtype, bool mutable_) -> int");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("get_template_any_data_ptr", TORCH_BOX(&get_template_any_data_ptr));
|
||||
}
|
||||
@ -0,0 +1,41 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
// Declare my__foreach_mul (defined in my__foreach_mul.cpp)
|
||||
extern std::vector<Tensor> my__foreach_mul(
|
||||
torch::headeronly::HeaderOnlyArrayRef<Tensor> self,
|
||||
torch::headeronly::HeaderOnlyArrayRef<Tensor> other);
|
||||
|
||||
// Helper function for cloning
|
||||
Tensor my_clone(Tensor t) {
|
||||
return clone(t);
|
||||
}
|
||||
|
||||
std::vector<Tensor> make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) {
|
||||
// This function tests that my__foreach_mul can take in std::initializer_lists
|
||||
// in addition to std::vectors.
|
||||
Tensor t1_1 = my_clone(t1);
|
||||
Tensor t1_2 = my_clone(t1);
|
||||
Tensor t2_1 = my_clone(t2);
|
||||
Tensor t2_2 = my_clone(t2);
|
||||
return my__foreach_mul({t1_1, t2_1}, {t1_2, t2_2});
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def(
|
||||
"make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(
|
||||
libtorch_agnostic_2_10,
|
||||
CompositeExplicitAutograd,
|
||||
m) {
|
||||
m.impl(
|
||||
"make_tensor_clones_and_call_foreach",
|
||||
TORCH_BOX(&make_tensor_clones_and_call_foreach));
|
||||
}
|
||||
@ -0,0 +1,40 @@
|
||||
// This is duplicated from the libtorch_agnostic_2_9_extension
|
||||
// as a negative test for test_version_compatibility.py
|
||||
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
#include <torch/headeronly/util/Exception.h>
|
||||
#include <torch/headeronly/core/ScalarType.h>
|
||||
#include <torch/headeronly/core/Dispatch_v2.h>
|
||||
#include <torch/headeronly/core/TensorAccessor.h>
|
||||
|
||||
#include "tensor_accessor_kernel.h"
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
|
||||
STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
|
||||
STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
|
||||
STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
|
||||
STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
|
||||
STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device");
|
||||
Tensor res = new_empty(m, {m.size(0)});
|
||||
THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cpu",
|
||||
AT_WRAP(([&]() {
|
||||
auto resa = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(res.data_ptr()), res.sizes().data(), res.strides().data());
|
||||
auto ma = Accessor_cpu<scalar_t, 2>(reinterpret_cast<scalar_t*>(m.data_ptr()), m.sizes().data(), m.strides().data());
|
||||
auto va = Accessor_cpu<scalar_t, 1>(reinterpret_cast<scalar_t*>(v.data_ptr()), v.sizes().data(), v.strides().data());
|
||||
mv_tensor_accessor_kernel<Accessor_cpu, scalar_t>(resa, ma, va);
|
||||
})),
|
||||
AT_FLOATING_TYPES);
|
||||
return res;
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("mv_tensor_accessor_cpu(Tensor res, Tensor m, Tensor v) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("mv_tensor_accessor_cpu", TORCH_BOX(&mv_tensor_accessor_cpu));
|
||||
}
|
||||
@ -0,0 +1,47 @@
|
||||
// This is duplicated from the libtorch_agnostic_2_9_extension
|
||||
// as a negative test for test_version_compatibility.py
|
||||
|
||||
#include "tensor_accessor_kernel.h"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
|
||||
STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
|
||||
STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
|
||||
STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
|
||||
STD_TORCH_CHECK(
|
||||
m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
|
||||
STD_TORCH_CHECK(
|
||||
m.device() == v.device(), "m and v must be on the same device");
|
||||
Tensor res = new_empty(m, {m.size(0)});
|
||||
THO_DISPATCH_V2(
|
||||
m.scalar_type(),
|
||||
"mv_tensor_accessor_cuda",
|
||||
AT_WRAP(([&]() {
|
||||
auto resa = Accessor_cuda<scalar_t, 1>(
|
||||
reinterpret_cast<scalar_t*>(res.data_ptr()),
|
||||
res.sizes().data(),
|
||||
res.strides().data());
|
||||
auto ma = Accessor_cuda<scalar_t, 2>(
|
||||
reinterpret_cast<scalar_t*>(m.data_ptr()),
|
||||
m.sizes().data(),
|
||||
m.strides().data());
|
||||
auto va = Accessor_cuda<scalar_t, 1>(
|
||||
reinterpret_cast<scalar_t*>(v.data_ptr()),
|
||||
v.sizes().data(),
|
||||
v.strides().data());
|
||||
mv_tensor_accessor_kernel<Accessor_cuda, scalar_t>
|
||||
<<<1, 1, 0, 0>>>(resa, ma, va);
|
||||
})),
|
||||
AT_FLOATING_TYPES);
|
||||
return res;
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CUDA, m) {
|
||||
m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
|
||||
}
|
||||
@ -0,0 +1,20 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
|
||||
#include <vector>
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
std::vector<Tensor> my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
|
||||
std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
|
||||
aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack.data());
|
||||
return torch::stable::detail::to<std::vector<Tensor>>(stack[0]);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("my__foreach_mul", TORCH_BOX(&my__foreach_mul));
|
||||
}
|
||||
@ -0,0 +1,19 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/stable/stableivalue_conversions.h>
|
||||
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
void my__foreach_mul_(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
|
||||
std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
|
||||
aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack.data());
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("my__foreach_mul_", TORCH_BOX(&my__foreach_mul_));
|
||||
}
|
||||
@ -0,0 +1,25 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/stable/device.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
|
||||
#include <optional>
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
Tensor my_empty(
|
||||
torch::headeronly::HeaderOnlyArrayRef<int64_t> size,
|
||||
std::optional<torch::headeronly::ScalarType> dtype,
|
||||
std::optional<torch::stable::Device> device,
|
||||
std::optional<bool> pin_memory) {
|
||||
return empty(size, dtype, device, pin_memory);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def(
|
||||
"my_empty(int[] size, ScalarType? dtype=None, Device? device=None, bool? pin_memory=None) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("my_empty", TORCH_BOX(&my_empty));
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
Tensor my_reshape(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> shape) {
|
||||
return reshape(t, shape);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("my_reshape(Tensor t, int[] shape) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("my_reshape", TORCH_BOX(&my_reshape));
|
||||
}
|
||||
@ -0,0 +1,20 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
Tensor my_view(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> size) {
|
||||
return view(t, size);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("my_view(Tensor t, int[] size) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(
|
||||
libtorch_agnostic_2_10,
|
||||
CompositeExplicitAutograd,
|
||||
m) {
|
||||
m.impl("my_view", TORCH_BOX(&my_view));
|
||||
}
|
||||
@ -0,0 +1,31 @@
|
||||
// This is duplicated from the libtorch_agnostic_2_9_extension
|
||||
// as a negative test for test_version_compatibility.py
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <torch/headeronly/core/Dispatch_v2.h>
|
||||
#include <torch/headeronly/core/TensorAccessor.h>
|
||||
|
||||
template <typename T, size_t N>
|
||||
using Accessor_cpu = torch::headeronly::HeaderOnlyTensorAccessor<T, N>;
|
||||
|
||||
#if defined(__CUDACC__) || defined(__HIPCC__)
|
||||
#define MAYBE_GLOBAL __global__
|
||||
|
||||
template <typename T, size_t N>
|
||||
using Accessor_cuda = torch::headeronly::HeaderOnlyGenericPackedTensorAccessor<T, N, torch::headeronly::RestrictPtrTraits>;
|
||||
|
||||
#else
|
||||
#define MAYBE_GLOBAL
|
||||
#endif
|
||||
|
||||
template <template <typename, size_t> class Accessor, typename scalar_t>
|
||||
MAYBE_GLOBAL void mv_tensor_accessor_kernel(Accessor<scalar_t, 1> resa, Accessor<scalar_t, 2> ma, Accessor<scalar_t, 1> va) {
|
||||
for (int64_t i = 0; i < resa.size(0); i++) {
|
||||
scalar_t val = 0;
|
||||
for (int64_t j = 0; j < ma.size(1); j++) {
|
||||
val += ma[i][j] * va[j];
|
||||
}
|
||||
resa[i] = val;
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,37 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/device.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
torch::stable::Device test_device_constructor(
|
||||
bool is_cuda,
|
||||
torch::stable::DeviceIndex index,
|
||||
bool use_str) {
|
||||
using torch::stable::Device;
|
||||
using torch::stable::DeviceType;
|
||||
|
||||
if (use_str) {
|
||||
std::string device_str;
|
||||
if (is_cuda) {
|
||||
device_str = "cuda:" + std::to_string(index);
|
||||
} else {
|
||||
device_str = "cpu";
|
||||
}
|
||||
return Device(device_str);
|
||||
} else {
|
||||
if (is_cuda) {
|
||||
return Device(DeviceType::CUDA, index);
|
||||
} else {
|
||||
return Device(DeviceType::CPU);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def(
|
||||
"test_device_constructor(bool is_cuda, DeviceIndex index, bool use_str) -> Device");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_device_constructor", TORCH_BOX(&test_device_constructor));
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/device.h>
|
||||
|
||||
bool test_device_equality(torch::stable::Device d1, torch::stable::Device d2) {
|
||||
return d1 == d2;
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("test_device_equality(Device d1, Device d2) -> bool");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_device_equality", TORCH_BOX(&test_device_equality));
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/device.h>
|
||||
|
||||
torch::stable::DeviceIndex test_device_index(torch::stable::Device device) {
|
||||
return device.index();
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("test_device_index(Device device) -> DeviceIndex");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_device_index", TORCH_BOX(&test_device_index));
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/device.h>
|
||||
|
||||
bool test_device_is_cpu(torch::stable::Device device) {
|
||||
return device.is_cpu();
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("test_device_is_cpu(Device device) -> bool");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_device_is_cpu", TORCH_BOX(&test_device_is_cpu));
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/device.h>
|
||||
|
||||
bool test_device_is_cuda(torch::stable::Device device) {
|
||||
return device.is_cuda();
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("test_device_is_cuda(Device device) -> bool");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_device_is_cuda", TORCH_BOX(&test_device_is_cuda));
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/device.h>
|
||||
|
||||
torch::stable::Device test_device_set_index(
|
||||
torch::stable::Device device,
|
||||
torch::stable::DeviceIndex index) {
|
||||
device.set_index(index);
|
||||
return device;
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("test_device_set_index(Device device, DeviceIndex index) -> Device");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_device_set_index", TORCH_BOX(&test_device_set_index));
|
||||
}
|
||||
@ -0,0 +1,14 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
|
||||
uint32_t test_get_num_threads() {
|
||||
return torch::stable::get_num_threads();
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("test_get_num_threads() -> int");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_get_num_threads", TORCH_BOX(&test_get_num_threads));
|
||||
}
|
||||
@ -0,0 +1,49 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
#include <torch/csrc/stable/device.h>
|
||||
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
|
||||
#include <torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h>
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
Tensor test_parallel_for(int64_t size, int64_t grain_size) {
|
||||
AtenTensorHandle tensor_handle;
|
||||
int64_t stride = 1;
|
||||
|
||||
aoti_torch_empty_strided(
|
||||
1,
|
||||
&size,
|
||||
&stride,
|
||||
aoti_torch_dtype_int64(),
|
||||
aoti_torch_device_type_cpu(),
|
||||
0,
|
||||
&tensor_handle);
|
||||
|
||||
Tensor tensor(tensor_handle);
|
||||
int64_t* data_ptr = reinterpret_cast<int64_t*>(tensor.data_ptr());
|
||||
|
||||
torch::stable::zero_(tensor);
|
||||
|
||||
// Use parallel_for to fill each element with its index
|
||||
// If using a parallel path, the thread id is encoded in the upper 32 bits
|
||||
torch::stable::parallel_for(
|
||||
0, size, grain_size, [data_ptr](int64_t begin, int64_t end) {
|
||||
for (auto i = begin; i < end; i++) {
|
||||
STD_TORCH_CHECK(i <= UINT32_MAX);
|
||||
uint32_t thread_id;
|
||||
torch_get_thread_idx(&thread_id);
|
||||
data_ptr[i] = i | (static_cast<int64_t>(thread_id) << 32);
|
||||
}
|
||||
});
|
||||
|
||||
return tensor;
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("test_parallel_for(int size, int grain_size) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_parallel_for", TORCH_BOX(&test_parallel_for));
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/stable/device.h>
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
torch::stable::Device test_tensor_device(torch::stable::Tensor tensor) {
|
||||
return tensor.device();
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
|
||||
m.def("test_tensor_device(Tensor t) -> Device");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_10, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_tensor_device", TORCH_BOX(&test_tensor_device));
|
||||
}
|
||||
@ -0,0 +1,225 @@
|
||||
import torch
|
||||
from torch import Tensor
|
||||
|
||||
|
||||
def my__foreach_mul_(tensors, others) -> ():
|
||||
"""
|
||||
Updates tensors to be the result of pointwise multiplying with others.
|
||||
|
||||
Args:
|
||||
tensors: list of tensors
|
||||
others: list of tensors (with the same corresponding shapes as tensors)
|
||||
|
||||
Returns: nothing, tensors is updated in place.
|
||||
"""
|
||||
torch.ops.libtorch_agnostic_2_10.my__foreach_mul_.default(tensors, others)
|
||||
|
||||
|
||||
def my__foreach_mul(tensors, others) -> list[Tensor]:
|
||||
"""
|
||||
Returns a list of tensors that are the results of pointwise multiplying
|
||||
tensors and others.
|
||||
|
||||
Args:
|
||||
tensors: list of tensors
|
||||
others: list of tensors (with the same corresponding shapes as tensors)
|
||||
|
||||
Returns: list of multiplied tensors
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.my__foreach_mul.default(tensors, others)
|
||||
|
||||
|
||||
def make_tensor_clones_and_call_foreach(t1, t2) -> list[Tensor]:
|
||||
"""
|
||||
Returns a list of 2 tensors corresponding to the square of the inputs.
|
||||
|
||||
Args:
|
||||
t1: Tensor
|
||||
t2: Tensor
|
||||
|
||||
Returns: list of [t1^2, t2^2]
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.make_tensor_clones_and_call_foreach.default(
|
||||
t1, t2
|
||||
)
|
||||
|
||||
|
||||
def test_tensor_device(t):
|
||||
"""
|
||||
Tests Tensor device() method.
|
||||
|
||||
Args:
|
||||
t: Tensor - tensor to get device from
|
||||
|
||||
Returns: Device - device of the tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.test_tensor_device.default(t)
|
||||
|
||||
|
||||
def test_device_constructor(is_cuda, index, use_str):
|
||||
"""
|
||||
Tests creating a Device from DeviceType and index, or from a string.
|
||||
|
||||
Args:
|
||||
is_cuda: bool - if True, creates CUDA device; if False, creates CPU device
|
||||
index: int - device index
|
||||
use_str: bool - if True, constructs from string; if False, constructs from DeviceType
|
||||
|
||||
Returns: Device - A device with the specified type and index
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.test_device_constructor.default(
|
||||
is_cuda, index, use_str
|
||||
)
|
||||
|
||||
|
||||
def test_device_equality(d1, d2) -> bool:
|
||||
"""
|
||||
Tests Device equality operator.
|
||||
|
||||
Args:
|
||||
d1: Device - first device
|
||||
d2: Device - second device
|
||||
|
||||
Returns: bool - True if devices are equal
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.test_device_equality.default(d1, d2)
|
||||
|
||||
|
||||
def test_device_set_index(device, index):
|
||||
"""
|
||||
Tests Device set_index() method.
|
||||
|
||||
Args:
|
||||
device: Device - device to modify
|
||||
index: int - new device index
|
||||
|
||||
Returns: Device - device with updated index
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.test_device_set_index.default(device, index)
|
||||
|
||||
|
||||
def test_device_index(device) -> int:
|
||||
"""
|
||||
Tests Device index() method.
|
||||
|
||||
Args:
|
||||
device: Device - device to query
|
||||
|
||||
Returns: int - device index
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.test_device_index.default(device)
|
||||
|
||||
|
||||
def test_device_is_cuda(device) -> bool:
|
||||
"""
|
||||
Tests Device is_cuda() method.
|
||||
|
||||
Args:
|
||||
device: Device - device to check
|
||||
|
||||
Returns: bool - True if device is CUDA
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.test_device_is_cuda.default(device)
|
||||
|
||||
|
||||
def test_device_is_cpu(device) -> bool:
|
||||
"""
|
||||
Tests Device is_cpu() method.
|
||||
|
||||
Args:
|
||||
device: Device - device to check
|
||||
|
||||
Returns: bool - True if device is CPU
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.test_device_is_cpu.default(device)
|
||||
|
||||
|
||||
def test_parallel_for(size, grain_size) -> Tensor:
|
||||
"""
|
||||
Tests the parallel_for functionality by using it to fill a tensor with indices.
|
||||
Args:
|
||||
size: int - size of the tensor to create
|
||||
grain_size: int - grain size for parallel_for
|
||||
Returns: Tensor - a 1D int64 tensor where each element contains its index
|
||||
(if multiple threads are used the threadid will be encoded in the upper 32 bits)
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.test_parallel_for.default(size, grain_size)
|
||||
|
||||
|
||||
def test_get_num_threads() -> int:
|
||||
"""
|
||||
Tests the get_num_threads functionality by returning the number of threads
|
||||
for the parallel backend.
|
||||
|
||||
Returns: int - the number of threads for the parallel backend
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.test_get_num_threads.default()
|
||||
|
||||
|
||||
def my_empty(size, dtype=None, device=None, pin_memory=None) -> Tensor:
|
||||
"""
|
||||
Creates an empty tensor with the specified size, dtype, device, and pin_memory.
|
||||
|
||||
Args:
|
||||
size: list[int] - size of the tensor to create
|
||||
dtype: ScalarType or None - data type of the tensor
|
||||
device: Device or None - device on which to create the tensor
|
||||
pin_memory: bool or None - whether to use pinned memory
|
||||
|
||||
Returns: Tensor - an uninitialized tensor with the specified properties
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.my_empty.default(
|
||||
size, dtype, device, pin_memory
|
||||
)
|
||||
|
||||
|
||||
def my_reshape(t, shape) -> Tensor:
|
||||
"""
|
||||
Returns a tensor with the same data but different shape.
|
||||
|
||||
Args:
|
||||
t: Tensor - tensor to reshape
|
||||
shape: list[int] - new shape for the tensor
|
||||
|
||||
Returns: Tensor - reshaped tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.my_reshape.default(t, shape)
|
||||
|
||||
|
||||
def my_view(t, size) -> Tensor:
|
||||
"""
|
||||
Returns a new tensor with the same data as the input tensor but of a different shape.
|
||||
|
||||
Args:
|
||||
t: Tensor - tensor to view
|
||||
size: list[int] - new size for the tensor
|
||||
|
||||
Returns: Tensor - tensor with new view
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.my_view.default(t, size)
|
||||
|
||||
|
||||
def get_any_data_ptr(t, mutable) -> int:
|
||||
"""
|
||||
Return data pointer value of the tensor.
|
||||
Args:
|
||||
t: Input tensor
|
||||
mutable: whether data pointer qualifier is mutable or const
|
||||
Returns: int - pointer value
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.get_any_data_ptr.default(t, mutable)
|
||||
|
||||
|
||||
def get_template_any_data_ptr(t, dtype, mutable) -> int:
|
||||
"""
|
||||
Return data pointer value of the tensor iff it has dtype.
|
||||
Args:
|
||||
t: Input tensor
|
||||
dtype: Input dtype
|
||||
mutable: whether data pointer qualifier is mutable or const
|
||||
Returns: int - pointer value
|
||||
Raises RuntimeError when t.dtype() != dtype.
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic_2_10.get_template_any_data_ptr.default(
|
||||
t, dtype, mutable
|
||||
)
|
||||
@ -9,7 +9,7 @@ from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtensio
|
||||
|
||||
|
||||
ROOT_DIR = Path(__file__).parent
|
||||
CSRC_DIR = ROOT_DIR / "libtorch_agnostic" / "csrc"
|
||||
CSRC_DIR = ROOT_DIR / "libtorch_agnostic_2_10" / "csrc"
|
||||
|
||||
|
||||
class clean(distutils.command.clean.clean):
|
||||
@ -18,13 +18,13 @@ class clean(distutils.command.clean.clean):
|
||||
distutils.command.clean.clean.run(self)
|
||||
|
||||
# Remove extension
|
||||
for path in (ROOT_DIR / "libtorch_agnostic").glob("**/*.so"):
|
||||
for path in (ROOT_DIR / "libtorch_agnostic_2_10").glob("**/*.so"):
|
||||
path.unlink()
|
||||
# Remove build and dist and egg-info directories
|
||||
dirs = [
|
||||
ROOT_DIR / "build",
|
||||
ROOT_DIR / "dist",
|
||||
ROOT_DIR / "libtorch_agnostic.egg-info",
|
||||
ROOT_DIR / "libtorch_agnostic_2_10.egg-info",
|
||||
]
|
||||
for path in dirs:
|
||||
if path.exists():
|
||||
@ -33,7 +33,11 @@ class clean(distutils.command.clean.clean):
|
||||
|
||||
def get_extension():
|
||||
extra_compile_args = {
|
||||
"cxx": ["-fdiagnostics-color=always"],
|
||||
"cxx": [
|
||||
"-fdiagnostics-color=always",
|
||||
"-DTORCH_STABLE_ONLY",
|
||||
"-DTORCH_TARGET_VERSION=0x020a000000000000",
|
||||
],
|
||||
}
|
||||
sources = list(CSRC_DIR.glob("**/*.cpp"))
|
||||
|
||||
@ -47,7 +51,7 @@ def get_extension():
|
||||
|
||||
return [
|
||||
extension(
|
||||
"libtorch_agnostic._C",
|
||||
"libtorch_agnostic_2_10._C",
|
||||
sources=sorted(str(s) for s in sources),
|
||||
py_limited_api=True,
|
||||
extra_compile_args=extra_compile_args,
|
||||
@ -57,12 +61,12 @@ def get_extension():
|
||||
|
||||
|
||||
setup(
|
||||
name="libtorch_agnostic",
|
||||
name="libtorch_agnostic_2_10",
|
||||
version="0.0",
|
||||
author="PyTorch Core Team",
|
||||
description="Example of libtorch agnostic extension",
|
||||
description="Example of libtorch agnostic extension for PyTorch 2.10+",
|
||||
packages=find_packages(exclude=("test",)),
|
||||
package_data={"libtorch_agnostic": ["*.dll", "*.dylib", "*.so"]},
|
||||
package_data={"libtorch_agnostic_2_10": ["*.dll", "*.dylib", "*.so"]},
|
||||
install_requires=[
|
||||
"torch",
|
||||
],
|
||||
@ -0,0 +1,308 @@
|
||||
# Owner(s): ["module: cpp"]
|
||||
|
||||
"""
|
||||
Unit tests to verify that each function file requires PyTorch 2.10+.
|
||||
|
||||
This test suite compiles each .cpp file in the csrc directory with
|
||||
TORCH_TARGET_VERSION=2.9.0 and expects compilation to fail.
|
||||
If compilation succeeds, it means that either
|
||||
|
||||
(1) The test function works with 2.9.0 and should not be in this directory.
|
||||
(2) The test function tests APIs that do not have proper TORCH_FEATURE_VERSION
|
||||
guards. If this is the case, and you incorrectly move the test function into
|
||||
libtorch_agnostic_2_9_extension the libtorch_agnostic_targetting CI workflow
|
||||
will catch this.
|
||||
|
||||
Run this script with VERSION_COMPAT_DEBUG=1 to see compilation errors.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from torch.testing._internal.common_utils import IS_WINDOWS, run_tests, TestCase
|
||||
from torch.utils.cpp_extension import CUDA_HOME, include_paths as torch_include_paths
|
||||
|
||||
|
||||
# TODO: Fix this error in Windows:
|
||||
# numba.cuda.cudadrv.driver:driver.py:384 Call to cuInit results in CUDA_ERROR_NO_DEVICE
|
||||
if not IS_WINDOWS:
|
||||
|
||||
class FunctionVersionCompatibilityTest(TestCase):
|
||||
"""Test that all function files require PyTorch 2.10+."""
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
"""Set up test environment once for all tests."""
|
||||
cls.csrc_dir = Path(__file__).parent / "libtorch_agnostic_2_10" / "csrc"
|
||||
cls.build_dir = Path(tempfile.mkdtemp(prefix="version_check_"))
|
||||
|
||||
cls.pytorch_includes = [
|
||||
f"-I{path}" for path in torch_include_paths(device_type="cpu")
|
||||
]
|
||||
cls.cuda_includes = []
|
||||
if CUDA_HOME:
|
||||
cuda_include_path = os.path.join(CUDA_HOME, "include")
|
||||
if os.path.exists(cuda_include_path):
|
||||
cls.cuda_includes = [f"-I{cuda_include_path}"]
|
||||
|
||||
cls.cuda_available = cls._check_cuda_available()
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
"""Clean up build directory."""
|
||||
import shutil
|
||||
|
||||
if cls.build_dir.exists():
|
||||
shutil.rmtree(cls.build_dir)
|
||||
|
||||
@staticmethod
|
||||
def _check_cuda_available() -> bool:
|
||||
"""Check if CUDA is available."""
|
||||
try:
|
||||
import torch
|
||||
|
||||
return torch.cuda.is_available()
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
def _compile_cpp_file(
|
||||
self, source_file: Path, output_file: Path
|
||||
) -> tuple[bool, str]:
|
||||
"""
|
||||
Compile a C++ file with TORCH_TARGET_VERSION=2.9.0.
|
||||
Returns (success, error_message).
|
||||
"""
|
||||
torch_version_2_9 = "0x0209000000000000"
|
||||
|
||||
cmd = [
|
||||
"g++",
|
||||
"-c",
|
||||
"-std=c++17",
|
||||
f"-DTORCH_TARGET_VERSION={torch_version_2_9}",
|
||||
f"-I{source_file.parent}", # For includes in same directory
|
||||
*self.pytorch_includes,
|
||||
]
|
||||
|
||||
# Add CUDA flags if available
|
||||
if self.cuda_available:
|
||||
cmd.extend(self.cuda_includes)
|
||||
|
||||
cmd.extend([str(source_file), "-o", str(output_file)])
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
|
||||
|
||||
if result.returncode == 0:
|
||||
return True, ""
|
||||
else:
|
||||
return False, result.stderr
|
||||
|
||||
def _compile_cu_file(
|
||||
self, source_file: Path, output_file: Path
|
||||
) -> tuple[bool, str]:
|
||||
"""
|
||||
Compile a CUDA file with TORCH_TARGET_VERSION=2.9.0.
|
||||
Returns (success, error_message).
|
||||
"""
|
||||
if not CUDA_HOME:
|
||||
return False, "CUDA_HOME not set"
|
||||
|
||||
torch_version_2_9 = "0x0209000000000000"
|
||||
|
||||
cmd = [
|
||||
os.path.join(CUDA_HOME, "bin", "nvcc"),
|
||||
"-c",
|
||||
"-std=c++17",
|
||||
f"-DTORCH_TARGET_VERSION={torch_version_2_9}",
|
||||
f"-I{source_file.parent}", # For includes in same directory
|
||||
*self.pytorch_includes,
|
||||
*self.cuda_includes,
|
||||
]
|
||||
|
||||
cmd.extend([str(source_file), "-o", str(output_file)])
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
|
||||
|
||||
if result.returncode == 0:
|
||||
return True, ""
|
||||
else:
|
||||
return False, result.stderr
|
||||
|
||||
def _test_function_file(self, source_file: Path):
|
||||
"""Test that a function file fails to compile with TORCH_TARGET_VERSION=2.9.0."""
|
||||
func_name = source_file.stem
|
||||
obj_file = self.build_dir / f"{func_name}.o"
|
||||
|
||||
# Choose the appropriate compiler based on file extension
|
||||
if source_file.suffix == ".cu":
|
||||
if not self.cuda_available:
|
||||
self.skipTest(f"CUDA not available, skipping {source_file.name}")
|
||||
success, error_msg = self._compile_cu_file(source_file, obj_file)
|
||||
else:
|
||||
success, error_msg = self._compile_cpp_file(source_file, obj_file)
|
||||
|
||||
obj_file.unlink(missing_ok=True)
|
||||
|
||||
# Print error details for debugging
|
||||
if not success:
|
||||
relevant_errors = self._extract_relevant_errors(error_msg)
|
||||
if relevant_errors:
|
||||
print(f"\n Compilation errors for {func_name} (requires 2.10+):")
|
||||
for err in relevant_errors:
|
||||
print(f" {err}")
|
||||
|
||||
self.assertFalse(
|
||||
success,
|
||||
f"Function {func_name} compiled successfully with TORCH_TARGET_VERSION=2.9.0. "
|
||||
f"This could mean two things.\n\t1. It should run with 2.9.0 and should be "
|
||||
"moved to libtorch_agnostic_2_9_extension\n\t2. The function(s) it tests do not use the "
|
||||
"proper TORCH_FEATURE_VERSION guards\n\nThe libtorch_agnostic_targetting CI workflow will "
|
||||
"verify if you incorrectly move this to the 2_9 extension instead of adding "
|
||||
"the appropriate version guards.",
|
||||
)
|
||||
|
||||
def test_mv_tensor_accessor_cpu_works_with_2_9(self):
|
||||
"""Test that mv_tensor_accessor_cpu.cpp compiles successfully with 2.9.0.
|
||||
|
||||
This is a negative test - it ensures that a file we expect to work with 2.9.0
|
||||
actually does compile. This validates that our test infrastructure correctly
|
||||
distinguishes between files that require 2.10+ and those that don't.
|
||||
"""
|
||||
cpp_file = self.csrc_dir / "mv_tensor_accessor_cpu.cpp"
|
||||
|
||||
if not cpp_file.exists():
|
||||
self.skipTest(f"{cpp_file} not found - this is a test file only")
|
||||
|
||||
obj_file = self.build_dir / "mv_tensor_accessor_cpu.o"
|
||||
success, error_msg = self._compile_cpp_file(cpp_file, obj_file)
|
||||
|
||||
# Clean up
|
||||
obj_file.unlink(missing_ok=True)
|
||||
|
||||
if not success:
|
||||
relevant_errors = self._extract_relevant_errors(error_msg)
|
||||
if relevant_errors:
|
||||
print(
|
||||
"\n Unexpected compilation errors for mv_tensor_accessor_cpu:"
|
||||
)
|
||||
for err in relevant_errors:
|
||||
print(f"{err}")
|
||||
|
||||
self.assertTrue(
|
||||
success,
|
||||
f"mv_tensor_accessor_cpu.cpp failed to compile with TORCH_TARGET_VERSION=2.9.0. "
|
||||
f"This file is expected to work with 2.9.0 since it doesn't use 2.10+ features. "
|
||||
f"Error: {error_msg}",
|
||||
)
|
||||
|
||||
def test_mv_tensor_accessor_cuda_works_with_2_9(self):
|
||||
"""Test that mv_tensor_accessor_cuda.cu compiles successfully with 2.9.0.
|
||||
|
||||
This is a negative test - it ensures that a .cu file we expect to work with 2.9.0
|
||||
actually does compile. This validates that our test infrastructure correctly
|
||||
compiles CUDA files and distinguishes between files that require 2.10+ and those
|
||||
that don't.
|
||||
"""
|
||||
if not self.cuda_available:
|
||||
self.skipTest(
|
||||
"CUDA not available, skipping mv_tensor_accessor_cuda.cu test"
|
||||
)
|
||||
|
||||
cu_file = self.csrc_dir / "mv_tensor_accessor_cuda.cu"
|
||||
|
||||
if not cu_file.exists():
|
||||
self.skipTest(f"{cu_file} not found - this is a test file only")
|
||||
|
||||
obj_file = self.build_dir / "cuda_kernel.o"
|
||||
success, error_msg = self._compile_cu_file(cu_file, obj_file)
|
||||
|
||||
# Clean up
|
||||
obj_file.unlink(missing_ok=True)
|
||||
|
||||
if not success:
|
||||
relevant_errors = self._extract_relevant_errors(error_msg)
|
||||
if relevant_errors:
|
||||
print(
|
||||
"\n Unexpected compilation errors for mv_tensor_accessor_cuda.cu:"
|
||||
)
|
||||
for err in relevant_errors:
|
||||
print(f"{err}")
|
||||
|
||||
self.assertTrue(
|
||||
success,
|
||||
f"mv_tensor_accessor_cuda.cu failed to compile with TORCH_TARGET_VERSION=2.9.0. "
|
||||
f"This file is expected to work with 2.9.0 since it doesn't use 2.10+ features. "
|
||||
f"Error: {error_msg}",
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _extract_relevant_errors(error_msg: str) -> list[str]:
|
||||
"""Extract the most relevant error messages."""
|
||||
error_lines = error_msg.strip().split("\n")
|
||||
relevant_errors = []
|
||||
|
||||
for line in error_lines:
|
||||
line_lower = line.lower()
|
||||
if (
|
||||
"error:" in line_lower
|
||||
or "undefined" in line_lower
|
||||
or "undeclared" in line_lower
|
||||
or "no member named" in line_lower
|
||||
):
|
||||
relevant_errors.append(line.strip())
|
||||
|
||||
return relevant_errors
|
||||
|
||||
# Dynamically create test methods for each .cpp and .cu file
|
||||
|
||||
def _create_test_method_for_file(source_file: Path):
|
||||
"""Create a test method for a specific source file."""
|
||||
|
||||
def test_method_impl(self):
|
||||
self._test_function_file(source_file)
|
||||
|
||||
# Set a descriptive name and docstring
|
||||
func_name = source_file.stem
|
||||
file_ext = source_file.suffix
|
||||
test_method_impl.__name__ = f"test_{func_name}_requires_2_10"
|
||||
test_method_impl.__doc__ = (
|
||||
f"Test that {func_name}{file_ext} requires PyTorch 2.10+"
|
||||
)
|
||||
|
||||
return test_method_impl
|
||||
|
||||
# Test discovery: generate a test for each .cpp and .cu file
|
||||
_csrc_dir = Path(__file__).parent / "libtorch_agnostic_2_10" / "csrc"
|
||||
if _csrc_dir.exists():
|
||||
# Collect both .cpp and .cu files, excluding those used for negative test
|
||||
# already defined above
|
||||
_source_files = sorted(
|
||||
[
|
||||
f
|
||||
for f in _csrc_dir.rglob("*.cpp")
|
||||
if f.name not in ("mv_tensor_accessor_cpu.cpp",)
|
||||
]
|
||||
+ [
|
||||
f
|
||||
for f in _csrc_dir.rglob("*.cu")
|
||||
if f.name not in ("mv_tensor_accessor_cuda.cu",)
|
||||
]
|
||||
)
|
||||
|
||||
for _source_file in _source_files:
|
||||
_test_method = _create_test_method_for_file(_source_file)
|
||||
setattr(
|
||||
FunctionVersionCompatibilityTest, _test_method.__name__, _test_method
|
||||
)
|
||||
|
||||
del (
|
||||
_create_test_method_for_file,
|
||||
_csrc_dir,
|
||||
_source_files,
|
||||
_source_file,
|
||||
_test_method,
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
@ -0,0 +1,21 @@
|
||||
import ctypes
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
so_files = list(Path(__file__).parent.glob("_C*.so"))
|
||||
assert len(so_files) == 1, f"Expected one _C*.so file, found {len(so_files)}"
|
||||
|
||||
# use ctypes.CDLL instead of load_library to be able to test the unload logic
|
||||
# below code is reduced from the load_library code
|
||||
with torch._ops.dl_open_guard():
|
||||
loaded_lib = ctypes.CDLL(so_files[0])
|
||||
|
||||
from . import ops
|
||||
|
||||
|
||||
__all__ = [
|
||||
"loaded_lib",
|
||||
"ops",
|
||||
]
|
||||
@ -0,0 +1,44 @@
|
||||
#include "kernel.h"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
|
||||
using torch::stable::Tensor;
|
||||
|
||||
Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
|
||||
STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
|
||||
STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
|
||||
STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
|
||||
STD_TORCH_CHECK(
|
||||
m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
|
||||
STD_TORCH_CHECK(
|
||||
m.device() == v.device(), "m and v must be on the same device");
|
||||
Tensor res = new_empty(m, {m.size(0)});
|
||||
THO_DISPATCH_V2(
|
||||
m.scalar_type(),
|
||||
"mv_tensor_accessor_cuda",
|
||||
AT_WRAP(([&]() {
|
||||
auto resa = Accessor_cuda<scalar_t, 1>(
|
||||
reinterpret_cast<scalar_t*>(res.data_ptr()),
|
||||
res.sizes().data(),
|
||||
res.strides().data());
|
||||
auto ma = Accessor_cuda<scalar_t, 2>(
|
||||
reinterpret_cast<scalar_t*>(m.data_ptr()),
|
||||
m.sizes().data(),
|
||||
m.strides().data());
|
||||
auto va = Accessor_cuda<scalar_t, 1>(
|
||||
reinterpret_cast<scalar_t*>(v.data_ptr()),
|
||||
v.sizes().data(),
|
||||
v.strides().data());
|
||||
mv_tensor_accessor_kernel<Accessor_cuda, scalar_t>
|
||||
<<<1, 1, 0, 0>>>(resa, ma, va);
|
||||
})),
|
||||
AT_FLOATING_TYPES);
|
||||
return res;
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CUDA, m) {
|
||||
m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
|
||||
}
|
||||
@ -1,8 +1,6 @@
|
||||
#include "kernel.h"
|
||||
|
||||
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
|
||||
#include <torch/csrc/stable/accelerator.h>
|
||||
#include <torch/csrc/stable/device.h>
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
@ -11,6 +9,7 @@
|
||||
|
||||
#ifdef LAE_USE_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <torch/csrc/stable/accelerator.h>
|
||||
#endif
|
||||
|
||||
#include <optional>
|
||||
@ -68,11 +67,11 @@ Tensor sgd_out_of_place(
|
||||
return out;
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY(libtorch_agnostic, m) {
|
||||
STABLE_TORCH_LIBRARY(libtorch_agnostic_2_9, m) {
|
||||
m.def("sgd_out_of_place(Tensor param, Tensor grad, float weight_decay, float lr, bool maximize) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CPU, m) {
|
||||
m.impl("sgd_out_of_place", TORCH_BOX(&sgd_out_of_place));
|
||||
}
|
||||
|
||||
@ -81,15 +80,15 @@ Tensor identity(Tensor t) {
|
||||
}
|
||||
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
|
||||
m.def("identity(Tensor t) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CUDA, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CUDA, m) {
|
||||
m.impl("identity", TORCH_BOX(&identity));
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CPU, m) {
|
||||
m.impl("identity", TORCH_BOX(&identity));
|
||||
}
|
||||
|
||||
@ -101,11 +100,11 @@ Tensor my_abs(Tensor t) {
|
||||
return torch::stable::detail::to<Tensor>(stack[0]);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
|
||||
m.def("my_abs(Tensor t) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
|
||||
m.impl("my_abs", TORCH_BOX(&my_abs));
|
||||
}
|
||||
|
||||
@ -127,11 +126,11 @@ Tensor my_ones_like(Tensor t, StableIValue device) {
|
||||
return torch::stable::detail::to<Tensor>(stack[0]);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
|
||||
m.def("my_ones_like(Tensor t, Device d) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
|
||||
m.impl("my_ones_like", TORCH_BOX(&my_ones_like));
|
||||
}
|
||||
|
||||
@ -154,11 +153,11 @@ std::tuple<Tensor, Tensor, bool> exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3
|
||||
torch::stable::detail::to<bool>(stack_is_leaf[0]));
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
|
||||
m.def("exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3) -> (Tensor, Tensor, bool)");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
|
||||
m.impl("exp_neg_is_leaf", TORCH_BOX(&exp_neg_is_leaf));
|
||||
}
|
||||
|
||||
@ -170,11 +169,11 @@ Tensor neg_exp(Tensor t) {
|
||||
return torch::stable::detail::to<Tensor>(stack[0]);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
|
||||
m.def("neg_exp(Tensor t) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
|
||||
m.impl("neg_exp", TORCH_BOX(&neg_exp));
|
||||
}
|
||||
|
||||
@ -194,11 +193,11 @@ Tensor divide_neg_exp(Tensor t) {
|
||||
return torch::stable::detail::to<Tensor>(stack_div[0]);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
|
||||
m.def("divide_neg_exp(Tensor t) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
|
||||
m.impl("divide_neg_exp", TORCH_BOX(÷_neg_exp));
|
||||
}
|
||||
|
||||
@ -206,11 +205,11 @@ bool is_contiguous(Tensor t) {
|
||||
return t.is_contiguous();
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
|
||||
m.def("is_contiguous(Tensor t) -> bool");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
|
||||
m.impl("is_contiguous", TORCH_BOX(&is_contiguous));
|
||||
}
|
||||
|
||||
@ -265,7 +264,7 @@ Tensor my_clone(Tensor t) {
|
||||
return clone(t);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
|
||||
m.def("my_transpose(Tensor t, int dim0, int dim1) -> Tensor");
|
||||
m.def("my_empty_like(Tensor t) -> Tensor");
|
||||
m.def("fill_infinity(Tensor(a!) t) -> Tensor(a!)");
|
||||
@ -277,7 +276,7 @@ STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def("my_clone(Tensor t) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
|
||||
m.impl("my_transpose", TORCH_BOX(&my_transpose));
|
||||
m.impl("my_empty_like", TORCH_BOX(&my_empty_like));
|
||||
m.impl("fill_infinity", TORCH_BOX(&fill_infinity));
|
||||
@ -288,7 +287,7 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("my_clone", TORCH_BOX(&my_clone));
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeImplicitAutograd, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeImplicitAutograd, m) {
|
||||
m.impl("my_pad", TORCH_BOX(&my_pad));
|
||||
m.impl("my_narrow", TORCH_BOX(&my_narrow));
|
||||
}
|
||||
@ -305,7 +304,7 @@ Tensor my_amax_vec(Tensor t) {
|
||||
return amax(t, {0,1}, false);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
|
||||
m.def("my_zero_(Tensor(a!) t) -> Tensor(a!)");
|
||||
m.def("my_amax(Tensor a) -> Tensor");
|
||||
m.def("my_amax_vec(Tensor a) -> Tensor");
|
||||
@ -332,223 +331,11 @@ bool test_default_constructor(bool defined) {
|
||||
return out.defined();
|
||||
}
|
||||
|
||||
uint64_t get_any_data_ptr(Tensor t, bool mutable_) {
|
||||
if (mutable_) {
|
||||
return reinterpret_cast<uint64_t>(t.mutable_data_ptr());
|
||||
} else {
|
||||
return reinterpret_cast<uint64_t>(t.const_data_ptr());
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t get_template_any_data_ptr(Tensor t, c10::ScalarType dtype, bool mutable_) {
|
||||
#define DEFINE_CASE(T, name) \
|
||||
case torch::headeronly::ScalarType::name: { \
|
||||
if (mutable_) { \
|
||||
return reinterpret_cast<uint64_t>(t.mutable_data_ptr<T>()); \
|
||||
} else { \
|
||||
return reinterpret_cast<uint64_t>(t.const_data_ptr<T>()); \
|
||||
} \
|
||||
}
|
||||
switch (dtype) {
|
||||
// per aten/src/ATen/templates/TensorMethods.cpp:
|
||||
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE)
|
||||
DEFINE_CASE(uint16_t, UInt16)
|
||||
DEFINE_CASE(uint32_t, UInt32)
|
||||
DEFINE_CASE(uint64_t, UInt64)
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
#undef DEFINE_CASE
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def("get_any_data_ptr(Tensor t, bool mutable_) -> int");
|
||||
m.def("get_template_any_data_ptr(Tensor t, ScalarType dtype, bool mutable_) -> int");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
|
||||
m.impl("my_zero_", TORCH_BOX(&my_zero_));
|
||||
m.impl("my_amax", TORCH_BOX(&my_amax));
|
||||
m.impl("my_amax_vec", TORCH_BOX(&my_amax_vec));
|
||||
m.impl("test_default_constructor", TORCH_BOX(&test_default_constructor));
|
||||
m.impl("get_any_data_ptr", TORCH_BOX(&get_any_data_ptr));
|
||||
m.impl("get_template_any_data_ptr", TORCH_BOX(&get_template_any_data_ptr));
|
||||
}
|
||||
|
||||
std::vector<Tensor> my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
|
||||
std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
|
||||
aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack.data());
|
||||
return torch::stable::detail::to<std::vector<Tensor>>(stack[0]);
|
||||
}
|
||||
|
||||
void my__foreach_mul_(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
|
||||
std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
|
||||
aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack.data());
|
||||
}
|
||||
|
||||
std::vector<Tensor> make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) {
|
||||
// This function tests that my__foreach_mul can take in std::initializer_lists
|
||||
// in addition to std::vectors.
|
||||
Tensor t1_1 = my_clone(t1);
|
||||
Tensor t1_2 = my_clone(t1);
|
||||
Tensor t2_1 = my_clone(t2);
|
||||
Tensor t2_2 = my_clone(t2);
|
||||
return my__foreach_mul({t1_1, t2_1}, {t1_2, t2_2});
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
|
||||
m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
|
||||
m.def("make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("my__foreach_mul", TORCH_BOX(&my__foreach_mul));
|
||||
m.impl("my__foreach_mul_", TORCH_BOX(&my__foreach_mul_));
|
||||
m.impl("make_tensor_clones_and_call_foreach", TORCH_BOX(&make_tensor_clones_and_call_foreach));
|
||||
}
|
||||
|
||||
// Test functions for torch::stable::Tensor device method
|
||||
|
||||
torch::stable::Device test_tensor_device(torch::stable::Tensor tensor) {
|
||||
return tensor.device();
|
||||
}
|
||||
|
||||
void boxed_test_tensor_device(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
torch::stable::Device res = test_tensor_device(
|
||||
torch::stable::detail::to<torch::stable::Tensor>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
// Test functions for torch::stable::Device
|
||||
|
||||
torch::stable::Device test_device_constructor(
|
||||
bool is_cuda,
|
||||
torch::stable::DeviceIndex index,
|
||||
bool use_str) {
|
||||
using torch::stable::Device;
|
||||
using torch::stable::DeviceType;
|
||||
|
||||
if (use_str) {
|
||||
std::string device_str;
|
||||
if (is_cuda) {
|
||||
device_str = "cuda:" + std::to_string(index);
|
||||
} else {
|
||||
device_str = "cpu";
|
||||
}
|
||||
return Device(device_str);
|
||||
} else {
|
||||
if (is_cuda) {
|
||||
return Device(DeviceType::CUDA, index);
|
||||
} else {
|
||||
return Device(DeviceType::CPU);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void boxed_test_device_constructor(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
torch::stable::Device res = test_device_constructor(
|
||||
torch::stable::detail::to<bool>(stack[0]),
|
||||
torch::stable::detail::to<torch::stable::DeviceIndex>(stack[1]),
|
||||
torch::stable::detail::to<bool>(stack[2]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
bool test_device_equality(torch::stable::Device d1, torch::stable::Device d2) {
|
||||
return d1 == d2;
|
||||
}
|
||||
|
||||
void boxed_test_device_equality(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
bool res = test_device_equality(
|
||||
torch::stable::detail::to<torch::stable::Device>(stack[0]),
|
||||
torch::stable::detail::to<torch::stable::Device>(stack[1]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
torch::stable::Device test_device_set_index(
|
||||
torch::stable::Device device,
|
||||
torch::stable::DeviceIndex index) {
|
||||
device.set_index(index);
|
||||
return device;
|
||||
}
|
||||
|
||||
void boxed_test_device_set_index(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
torch::stable::Device res = test_device_set_index(
|
||||
torch::stable::detail::to<torch::stable::Device>(stack[0]),
|
||||
torch::stable::detail::to<torch::stable::DeviceIndex>(stack[1]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
torch::stable::DeviceIndex test_device_index(torch::stable::Device device) {
|
||||
return device.index();
|
||||
}
|
||||
|
||||
void boxed_test_device_index(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
torch::stable::DeviceIndex res = test_device_index(
|
||||
torch::stable::detail::to<torch::stable::Device>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
bool test_device_is_cuda(torch::stable::Device device) {
|
||||
return device.is_cuda();
|
||||
}
|
||||
|
||||
void boxed_test_device_is_cuda(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
bool res = test_device_is_cuda(
|
||||
torch::stable::detail::to<torch::stable::Device>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
bool test_device_is_cpu(torch::stable::Device device) {
|
||||
return device.is_cpu();
|
||||
}
|
||||
|
||||
void boxed_test_device_is_cpu(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
bool res = test_device_is_cpu(
|
||||
torch::stable::detail::to<torch::stable::Device>(stack[0]));
|
||||
stack[0] = torch::stable::detail::from(res);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def("test_tensor_device(Tensor t) -> Device");
|
||||
m.def(
|
||||
"test_device_constructor(bool is_cuda, DeviceIndex index, bool use_str) -> Device");
|
||||
m.def("test_device_equality(Device d1, Device d2) -> bool");
|
||||
m.def("test_device_set_index(Device device, DeviceIndex index) -> Device");
|
||||
m.def("test_device_index(Device device) -> DeviceIndex");
|
||||
m.def("test_device_is_cuda(Device device) -> bool");
|
||||
m.def("test_device_is_cpu(Device device) -> bool");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_tensor_device", &boxed_test_tensor_device);
|
||||
m.impl("test_device_constructor", &boxed_test_device_constructor);
|
||||
m.impl("test_device_equality", &boxed_test_device_equality);
|
||||
m.impl("test_device_set_index", &boxed_test_device_set_index);
|
||||
m.impl("test_device_index", &boxed_test_device_index);
|
||||
m.impl("test_device_is_cuda", &boxed_test_device_is_cuda);
|
||||
m.impl("test_device_is_cpu", &boxed_test_device_is_cpu);
|
||||
}
|
||||
|
||||
Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
|
||||
@ -569,11 +356,11 @@ Tensor mv_tensor_accessor_cpu(Tensor m, Tensor v) {
|
||||
return res;
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
|
||||
m.def("mv_tensor_accessor(Tensor m, Tensor v) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CPU, m) {
|
||||
m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cpu));
|
||||
}
|
||||
|
||||
@ -619,14 +406,14 @@ int64_t test_get_current_device_index() {
|
||||
return torch::stable::accelerator::getCurrentDeviceIndex();
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
|
||||
m.def("test_device_guard(int device_index) -> int");
|
||||
m.def("test_device_guard_set_index() -> int");
|
||||
m.def("test_stream(int device_index) -> int");
|
||||
m.def("test_get_current_device_index() -> int");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_device_guard", TORCH_BOX(&test_device_guard));
|
||||
m.impl("test_device_guard_set_index", TORCH_BOX(&test_device_guard_set_index));
|
||||
m.impl("test_stream", TORCH_BOX(&test_stream));
|
||||
@ -635,100 +422,14 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
|
||||
#endif // LAE_USE_CUDA
|
||||
|
||||
Tensor test_parallel_for(int64_t size, int64_t grain_size) {
|
||||
AtenTensorHandle tensor_handle;
|
||||
int64_t stride = 1;
|
||||
|
||||
aoti_torch_empty_strided(
|
||||
1,
|
||||
&size,
|
||||
&stride,
|
||||
aoti_torch_dtype_int64(),
|
||||
aoti_torch_device_type_cpu(),
|
||||
0,
|
||||
&tensor_handle);
|
||||
|
||||
Tensor tensor(tensor_handle);
|
||||
int64_t* data_ptr = reinterpret_cast<int64_t*>(tensor.data_ptr());
|
||||
|
||||
torch::stable::zero_(tensor);
|
||||
|
||||
// Use parallel_for to fill each element with its index
|
||||
// If using a parallel path, the thread id is encoded in the upper 32 bits
|
||||
torch::stable::parallel_for(
|
||||
0, size, grain_size, [data_ptr](int64_t begin, int64_t end) {
|
||||
for (auto i = begin; i < end; i++) {
|
||||
STD_TORCH_CHECK(i <= UINT32_MAX);
|
||||
uint32_t thread_id;
|
||||
torch_get_thread_idx(&thread_id);
|
||||
data_ptr[i] = i | (static_cast<int64_t>(thread_id) << 32);
|
||||
}
|
||||
});
|
||||
|
||||
return tensor;
|
||||
}
|
||||
|
||||
void boxed_test_parallel_for(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
Tensor res = test_parallel_for(to<int64_t>(stack[0]), to<int64_t>(stack[1]));
|
||||
stack[0] = from(res);
|
||||
}
|
||||
|
||||
uint32_t test_get_num_threads() {
|
||||
return torch::stable::get_num_threads();
|
||||
}
|
||||
|
||||
void boxed_test_get_num_threads(
|
||||
StableIValue* stack,
|
||||
uint64_t num_args,
|
||||
uint64_t num_outputs) {
|
||||
uint32_t res = test_get_num_threads();
|
||||
stack[0] = from(res);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def("test_parallel_for(int size, int grain_size) -> Tensor");
|
||||
m.def("test_get_num_threads() -> int");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("test_parallel_for", &boxed_test_parallel_for);
|
||||
m.impl("test_get_num_threads", &boxed_test_get_num_threads);
|
||||
}
|
||||
|
||||
Tensor my_empty(
|
||||
torch::headeronly::HeaderOnlyArrayRef<int64_t> size,
|
||||
std::optional<torch::headeronly::ScalarType> dtype,
|
||||
std::optional<torch::stable::Device> device,
|
||||
std::optional<bool> pin_memory) {
|
||||
return empty(size, dtype, device, pin_memory);
|
||||
}
|
||||
|
||||
Tensor my_flatten(Tensor t, int64_t start_dim, int64_t end_dim) {
|
||||
return flatten(t, start_dim, end_dim);
|
||||
}
|
||||
|
||||
Tensor my_reshape(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> shape) {
|
||||
return reshape(t, shape);
|
||||
}
|
||||
|
||||
Tensor my_view(Tensor t, torch::headeronly::HeaderOnlyArrayRef<int64_t> size) {
|
||||
return view(t, size);
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
|
||||
m.def(
|
||||
"my_empty(int[] size, ScalarType? dtype=None, Device? device=None, bool? pin_memory=None) -> Tensor");
|
||||
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_9, m) {
|
||||
m.def("my_flatten(Tensor t, int start_dim=0, int end_dim=-1) -> Tensor");
|
||||
m.def("my_reshape(Tensor t, int[] shape) -> Tensor");
|
||||
m.def("my_view(Tensor t, int[] size) -> Tensor");
|
||||
}
|
||||
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
|
||||
m.impl("my_empty", TORCH_BOX(&my_empty));
|
||||
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic_2_9, CompositeExplicitAutograd, m) {
|
||||
m.impl("my_flatten", TORCH_BOX(&my_flatten));
|
||||
m.impl("my_reshape", TORCH_BOX(&my_reshape));
|
||||
m.impl("my_view", TORCH_BOX(&my_view));
|
||||
}
|
||||
@ -0,0 +1,363 @@
|
||||
import torch
|
||||
from torch import Tensor
|
||||
|
||||
|
||||
def sgd_out_of_place(param, grad, weight_decay, lr, maximize) -> Tensor:
|
||||
"""
|
||||
Computes a single step of SGD on a single parameter Tensor with grad.
|
||||
|
||||
Assumes:
|
||||
- param and grad are the same shape and are 1D.
|
    - param and grad are float and on CPU

    Args:
        param: a 1D tensor of floats
        grad: a 1D tensor of floats
        weight_decay: a python double between 0 and 1
        lr: a python double

    Returns:
        a 1D float Tensor the same shape as param
    """
    return torch.ops.libtorch_agnostic_2_9.sgd_out_of_place.default(
        param, grad, weight_decay, lr, maximize
    )


def identity(t) -> Tensor:
    """
    Returns the input tensor

    Args:
        t: any Tensor

    Returns:
        a Tensor, the same as input.
    """
    return torch.ops.libtorch_agnostic_2_9.identity.default(t)


def my_abs(t) -> Tensor:
    """
    Returns abs on the input tensor, outputs a new Tensor

    Args:
        t: any Tensor

    Returns:
        a Tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_abs.default(t)


def my_is_cpu(t) -> bool:
    """
    Returns is_cpu on the input tensor.

    Args:
        t: any Tensor

    Returns:
        a bool
    """
    return torch.ops.libtorch_agnostic_2_9.my_is_cpu.default(t)


def my_ones_like(tensor, device) -> Tensor:
    """
    Returns a new Tensor like the input tensor, but with all ones

    Args:
        tensor: any Tensor
        device: a device string

    Returns:
        a ones Tensor with the same dtype and shape and other attributes
        like the input tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_ones_like.default(tensor, device)


def exp_neg_is_leaf(t1, t2, t3) -> tuple[Tensor, Tensor, bool]:
    """
    Returns a Tensor, Tensor, bool tuple corresponding to the respective inputs
    t1, t2, and t3.

    Args:
        t1: Tensor
        t2: Tensor
        t3: Tensor

    Returns:
        (exp(t1), neg(t2), is_leaf(t3))
    """
    return torch.ops.libtorch_agnostic_2_9.exp_neg_is_leaf.default(t1, t2, t3)


def neg_exp(t) -> Tensor:
    """
    Returns a Tensor composing neg of exp

    Args:
        t: Tensor

    Returns: neg(exp(t))
    """
    return torch.ops.libtorch_agnostic_2_9.neg_exp.default(t)


def divide_neg_exp(t) -> Tensor:
    """
    Returns a Tensor division of neg and exp

    Args:
        t: Tensor

    Returns: divide(neg(t), exp(t))
    """
    return torch.ops.libtorch_agnostic_2_9.divide_neg_exp.default(t)


def is_contiguous(t) -> bool:
    """
    Returns a bool indicating if the input tensor is contiguous

    Args:
        t: Tensor

    Returns: is_contiguous(t)
    """
    return torch.ops.libtorch_agnostic_2_9.is_contiguous.default(t)


def my_transpose(t, dim0, dim1) -> Tensor:
    """
    Returns t.transpose(dim0, dim1)

    Args:
        t: Tensor

    Returns: my_transpose(t, dim0, dim1)
    """
    return torch.ops.libtorch_agnostic_2_9.my_transpose.default(t, dim0, dim1)


def my_empty_like(t) -> Tensor:
    """
    Returns t.empty_like()

    Args:
        t: Tensor

    Returns: my_empty_like(t)
    """
    return torch.ops.libtorch_agnostic_2_9.my_empty_like.default(t)


def my_zero_(t) -> Tensor:
    """
    Returns t.zero_()

    Args:
        t: Tensor

    Returns: my_zero_(t)
    """
    return torch.ops.libtorch_agnostic_2_9.my_zero_.default(t)


def my_amax(t) -> Tensor:
    """
    Returns t.amax()

    Args:
        t: Tensor

    Returns: amax(t)
    """
    return torch.ops.libtorch_agnostic_2_9.my_amax.default(t)


def my_amax_vec(t) -> Tensor:
    """
    Returns t.amax()

    Args:
        t: Tensor

    Returns: amax(t)
    """
    return torch.ops.libtorch_agnostic_2_9.my_amax_vec.default(t)


def fill_infinity(t) -> Tensor:
    """
    Fills the tensor with inf.

    Args:
        t: Tensor to fill

    Returns: The modified tensor (same as input)
    """
    return torch.ops.libtorch_agnostic_2_9.fill_infinity.default(t)


def test_default_constructor(defined) -> bool:
    """
    Tests the default constructor for torch::stable::Tensor.

    Args:
        defined: bool - if True, tests defined tensor; if False, tests undefined tensor

    Returns: bool - result of calling .defined() on the tensor
    """
    return torch.ops.libtorch_agnostic_2_9.test_default_constructor.default(defined)


def mv_tensor_accessor(m, v) -> Tensor:
    """
    Returns matrix-vector product.

    Args:
        m: any 2-D Tensor with shape (N, M)
        v: any 1-D Tensor with shape (M,)

    Returns:
        a 1-D Tensor with shape (N,)
    """
    return torch.ops.libtorch_agnostic_2_9.mv_tensor_accessor.default(m, v)


def my_pad(t) -> Tensor:
    """
    Pads the input tensor with hardcoded padding parameters.

    Args:
        t: Input tensor

    Returns: Padded tensor with padding [1, 2, 2, 1], mode "constant", value 0.0
    """
    return torch.ops.libtorch_agnostic_2_9.my_pad.default(t)


def my_narrow(t, dim, start, length) -> Tensor:
    """
    Returns a new tensor that is a narrowed version of the input tensor.

    Args:
        t: Input tensor
        dim: Dimension along which to narrow
        start: Starting position
        length: Length of the narrowed section

    Returns: Narrowed tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_narrow.default(t, dim, start, length)


def my_copy_(dst, src, non_blocking) -> Tensor:
    """
    Returns tensor dst that is updated with src elements.

    Args:
        dst: Destination tensor
        src: Source tensor
        non_blocking: bool

    Returns: Updated tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_copy_.default(dst, src, non_blocking)


def my_clone(t) -> Tensor:
    """
    Returns a clone of input tensor.

    Args:
        t: Input tensor

    Returns: Cloned tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_clone.default(t)


def test_device_guard(device_index) -> int:
    """
    Tests the DeviceGuard functionality by creating a device guard and returning an empty tensor.

    Args:
        device_index: Device index to set the guard to

    Returns: result of cudaGetDevice() as an integer after using the guard
    """
    return torch.ops.libtorch_agnostic_2_9.test_device_guard.default(device_index)


def test_device_guard_set_index() -> int:
    """
    Tests the DeviceGuard set_index functionality by creating a device guard with index 1,
    then setting it to index 0, and returning the current device.

    Returns: result of cudaGetDevice() as an integer after using set_index
    """
    return torch.ops.libtorch_agnostic_2_9.test_device_guard_set_index.default()


def test_stream(device_index) -> int:
    """
    Tests the Stream functionality by getting the current stream ID for the specified device.

    Args:
        device_index: Device index to get the stream for

    Returns: Stream ID as an integer
    """
    return torch.ops.libtorch_agnostic_2_9.test_stream.default(device_index)


def test_get_current_device_index() -> int:
    """
    Tests the getCurrentDeviceIndex functionality by getting the current device index.

    Returns: Current device index as an integer
    """
    return torch.ops.libtorch_agnostic_2_9.test_get_current_device_index.default()


def my_new_empty_dtype_variant(t) -> Tensor:
    """
    Returns a new empty tensor with shape [2, 5] and dtype bfloat16

    Args:
        t: Input tensor used as a reference for device and other properties

    Returns: New empty tensor with shape [2, 5] and dtype bfloat16
    """
    return torch.ops.libtorch_agnostic_2_9.my_new_empty_dtype_variant.default(t)


def my_new_zeros_dtype_variant(t) -> Tensor:
    """
    Returns a new tensor filled with 0s with shape [2, 5] and dtype Float

    Args:
        t: Input tensor used as a reference for device and other properties

    Returns: New zeros tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_new_zeros_dtype_variant.default(t)


def my_flatten(t, start_dim=0, end_dim=-1) -> Tensor:
    """
    Flattens the input tensor from start_dim to end_dim into a single dimension.

    Args:
        t: Tensor - tensor to flatten
        start_dim: int - first dimension to flatten (default: 0)
        end_dim: int - last dimension to flatten (default: -1)

    Returns: Tensor - flattened tensor
    """
    return torch.ops.libtorch_agnostic_2_9.my_flatten.default(t, start_dim, end_dim)
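A minimal smoke test for the wrappers above might look like the sketch below. It assumes the 2.9-targeted extension has already been built and installed (for example with the setup.py that follows), so that importing libtorch_agnostic_2_9 registers the ops; the tensors are made up.

# Illustrative usage sketch, not part of the diff
import torch
import libtorch_agnostic_2_9  # importing the built extension registers torch.ops.libtorch_agnostic_2_9.*

t = torch.rand(3, 4) - 0.5
# the custom abs/flatten wrappers should agree with their eager counterparts
assert torch.equal(libtorch_agnostic_2_9.ops.my_abs(t), t.abs())
assert torch.equal(libtorch_agnostic_2_9.ops.my_flatten(t), t.flatten())

m, v = torch.rand(3, 5), torch.rand(5)
torch.testing.assert_close(libtorch_agnostic_2_9.ops.mv_tensor_accessor(m, v), torch.mv(m, v))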
test/cpp_extensions/libtorch_agnostic_2_9_extension/setup.py (new file, 82 lines)
@@ -0,0 +1,82 @@
import distutils.command.clean
import shutil
from pathlib import Path

from setuptools import find_packages, setup

import torch
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension


ROOT_DIR = Path(__file__).parent
CSRC_DIR = ROOT_DIR / "libtorch_agnostic_2_9" / "csrc"


class clean(distutils.command.clean.clean):
    def run(self):
        # Run default behavior first
        distutils.command.clean.clean.run(self)

        # Remove extension
        for path in (ROOT_DIR / "libtorch_agnostic_2_9").glob("**/*.so"):
            path.unlink()
        # Remove build and dist and egg-info directories
        dirs = [
            ROOT_DIR / "build",
            ROOT_DIR / "dist",
            ROOT_DIR / "libtorch_agnostic_2_9.egg-info",
        ]
        for path in dirs:
            if path.exists():
                shutil.rmtree(str(path), ignore_errors=True)


def get_extension():
    extra_compile_args = {
        "cxx": [
            "-fdiagnostics-color=always",
            "-DTORCH_STABLE_ONLY",
            "-DTORCH_TARGET_VERSION=0x0209000000000000",
        ],
    }
    sources = list(CSRC_DIR.glob("**/*.cpp"))

    extension = CppExtension
    # allow including <cuda_runtime.h>
    if torch.cuda.is_available():
        extra_compile_args["cxx"].append("-DLAE_USE_CUDA")
        extra_compile_args["nvcc"] = [
            "-O2",
            "-DTORCH_TARGET_VERSION=0x0209000000000000",
        ]
        extension = CUDAExtension
        sources.extend(CSRC_DIR.glob("**/*.cu"))

    return [
        extension(
            "libtorch_agnostic_2_9._C",
            sources=sorted(str(s) for s in sources),
            py_limited_api=True,
            extra_compile_args=extra_compile_args,
            extra_link_args=[],
        )
    ]


setup(
    name="libtorch_agnostic_2_9",
    version="0.0",
    author="PyTorch Core Team",
    description="Example of libtorch agnostic extension for PyTorch 2.9",
    packages=find_packages(exclude=("test",)),
    package_data={"libtorch_agnostic_2_9": ["*.dll", "*.dylib", "*.so"]},
    install_requires=[
        "torch",
    ],
    ext_modules=get_extension(),
    cmdclass={
        "build_ext": BuildExtension.with_options(no_python_abi_suffix=True),
        "clean": clean,
    },
    options={"bdist_wheel": {"py_limited_api": "cp39"}},
)
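The -DTORCH_TARGET_VERSION=0x0209000000000000 flag above encodes the targeted release; the small sketch below (assuming the convention that the major and minor version numbers occupy the two most-significant bytes of the constant) shows how it maps to 2.9.

# Illustrative sketch, not part of the diff: decode the TORCH_TARGET_VERSION constant
TORCH_TARGET_VERSION = 0x0209000000000000
major = (TORCH_TARGET_VERSION >> 56) & 0xFF  # -> 2
minor = (TORCH_TARGET_VERSION >> 48) & 0xFF  # -> 9
print(f"extension targets the PyTorch {major}.{minor} stable ABI")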
(deleted file: CUDA implementation of mv_tensor_accessor for the unversioned libtorch_agnostic extension)
@@ -1,30 +0,0 @@
#include "kernel.h"

#include <torch/csrc/stable/library.h>
#include <torch/csrc/stable/tensor.h>
#include <torch/csrc/stable/ops.h>
#include <cuda_runtime.h>

using torch::stable::Tensor;

Tensor mv_tensor_accessor_cuda(Tensor m, Tensor v) {
  STD_TORCH_CHECK(m.dim() == 2, "m must be 2D");
  STD_TORCH_CHECK(v.dim() == 1, "v must be 1D");
  STD_TORCH_CHECK(m.size(1) == v.size(0), "m.shape[1] == v.shape[0] must hold");
  STD_TORCH_CHECK(m.scalar_type() == v.scalar_type(), "m and v must have the same dtype");
  STD_TORCH_CHECK(m.device() == v.device(), "m and v must be on the same device");
  Tensor res = new_empty(m, {m.size(0)});
  THO_DISPATCH_V2(m.scalar_type(), "mv_tensor_accessor_cuda",
    AT_WRAP(([&]() {
      auto resa = Accessor_cuda<scalar_t, 1>(reinterpret_cast<scalar_t*>(res.data_ptr()), res.sizes().data(), res.strides().data());
      auto ma = Accessor_cuda<scalar_t, 2>(reinterpret_cast<scalar_t*>(m.data_ptr()), m.sizes().data(), m.strides().data());
      auto va = Accessor_cuda<scalar_t, 1>(reinterpret_cast<scalar_t*>(v.data_ptr()), v.sizes().data(), v.strides().data());
      mv_tensor_accessor_kernel<Accessor_cuda, scalar_t><<<1, 1, 0, 0>>>(resa, ma, va);
    })),
    AT_FLOATING_TYPES);
  return res;
}

STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CUDA, m) {
  m.impl("mv_tensor_accessor", TORCH_BOX(&mv_tensor_accessor_cuda));
}
(deleted file: ops.py of the unversioned libtorch_agnostic extension)
@@ -1,589 +0,0 @@
import torch
|
||||
from torch import Tensor
|
||||
|
||||
|
||||
def sgd_out_of_place(param, grad, weight_decay, lr, maximize) -> Tensor:
|
||||
"""
|
||||
Computes a single step of SGD on a single parameter Tensor with grad.
|
||||
|
||||
Assumes:
|
||||
- param and grad are the same shape and are 1D.
|
||||
- param and grad are float and on CPU
|
||||
|
||||
Args:
|
||||
param: a 1D tensor of floats
|
||||
grad: a 1D tensor of floats
|
||||
weight_decay: a python double between 0 and 1
|
||||
lr: a python double
|
||||
|
||||
Returns:
|
||||
a 1D float Tensor the same shape as param
|
||||
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.sgd_out_of_place.default(
|
||||
param, grad, weight_decay, lr, maximize
|
||||
)
|
||||
|
||||
|
||||
def identity(t) -> Tensor:
|
||||
"""
|
||||
Returns the input tensor
|
||||
|
||||
Args:
|
||||
t: any Tensor
|
||||
|
||||
Returns:
|
||||
a Tensor, the same as input.
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.identity.default(t)
|
||||
|
||||
|
||||
def my_abs(t) -> Tensor:
|
||||
"""
|
||||
Returns abs on the input tensor, outputs a new Tensor
|
||||
|
||||
Args:
|
||||
t: any Tensor
|
||||
|
||||
Returns:
|
||||
a Tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_abs.default(t)
|
||||
|
||||
|
||||
def my_is_cpu(t) -> bool:
|
||||
"""
|
||||
Returns is_cpu on the input tensor.
|
||||
|
||||
Args:
|
||||
t: any Tensor
|
||||
|
||||
Returns:
|
||||
a bool
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_is_cpu.default(t)
|
||||
|
||||
|
||||
def my_ones_like(tensor, device) -> Tensor:
|
||||
"""
|
||||
Returns a new Tensor like the input tensor, but with all ones
|
||||
|
||||
Args:
|
||||
tensor: any Tensor
|
||||
device: a device string
|
||||
|
||||
Returns:
|
||||
a ones Tensor with the same dtype and shape and other attributes
|
||||
like the input tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_ones_like.default(tensor, device)
|
||||
|
||||
|
||||
def exp_neg_is_leaf(t1, t2, t3) -> tuple[Tensor, Tensor, bool]:
|
||||
"""
|
||||
Returns a Tensor, Tensor, bool tuple corresponding to the respective inputs
|
||||
t1, t2, and t3.
|
||||
|
||||
Args:
|
||||
t1: Tensor
|
||||
t2: Tensor
|
||||
t3: Tensor
|
||||
|
||||
Returns:
|
||||
(exp(t1), neg(t2), is_leaf(t3))
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.exp_neg_is_leaf.default(t1, t2, t3)
|
||||
|
||||
|
||||
def neg_exp(t) -> Tensor:
|
||||
"""
|
||||
Returns a Tensor composing neg of exp
|
||||
|
||||
Args:
|
||||
t: Tensor
|
||||
|
||||
Returns: neg(exp(t))
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.neg_exp.default(t)
|
||||
|
||||
|
||||
def divide_neg_exp(t) -> Tensor:
|
||||
"""
|
||||
Returns a Tensor division of neg and exp
|
||||
|
||||
Args:
|
||||
t: Tensor
|
||||
|
||||
Returns: divide(neg(t), exp(t))
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.divide_neg_exp.default(t)
|
||||
|
||||
|
||||
def is_contiguous(t) -> bool:
|
||||
"""
|
||||
Returns a bool indicating if the input tensor is contiguous
|
||||
|
||||
Args:
|
||||
t: Tensor
|
||||
|
||||
Returns: is_contiguous(t)
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.is_contiguous.default(t)
|
||||
|
||||
|
||||
def my_transpose(t, dim0, dim1) -> Tensor:
|
||||
"""
|
||||
Returns t.transpose(dim0, dim1)
|
||||
|
||||
Args:
|
||||
t: Tensor
|
||||
|
||||
Returns: my_transpose(t, dim0, dim1)
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_transpose.default(t, dim0, dim1)
|
||||
|
||||
|
||||
def my_empty_like(t) -> Tensor:
|
||||
"""
|
||||
Returns t.empty_like()
|
||||
|
||||
Args:
|
||||
t: Tensor
|
||||
|
||||
Returns: my_empty_like(t)
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_empty_like.default(t)
|
||||
|
||||
|
||||
def my_zero_(t) -> Tensor:
|
||||
"""
|
||||
Returns t.zero_()
|
||||
|
||||
Args:
|
||||
t: Tensor
|
||||
|
||||
Returns: my_zero_(t)
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_zero_.default(t)
|
||||
|
||||
|
||||
def my_amax(t) -> Tensor:
|
||||
"""
|
||||
Returns t.amax()
|
||||
|
||||
Args:
|
||||
t: Tensor
|
||||
|
||||
Returns: amax(t)
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_amax.default(t)
|
||||
|
||||
|
||||
def my_amax_vec(t) -> Tensor:
|
||||
"""
|
||||
Returns t.amax()
|
||||
|
||||
Args:
|
||||
t: Tensor
|
||||
|
||||
Returns: amax(t)
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_amax_vec.default(t)
|
||||
|
||||
|
||||
def fill_infinity(t) -> Tensor:
|
||||
"""
|
||||
Fills the tensor with inf.
|
||||
|
||||
Args:
|
||||
t: Tensor to fill
|
||||
|
||||
Returns: The modified tensor (same as input)
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.fill_infinity.default(t)
|
||||
|
||||
|
||||
def test_default_constructor(defined) -> bool:
|
||||
"""
|
||||
Tests the default constructor for torch::stable::Tensor.
|
||||
|
||||
Args:
|
||||
defined: bool - if True, tests defined tensor; if False, tests undefined tensor
|
||||
|
||||
Returns: bool - result of calling .defined() on the tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_default_constructor.default(defined)
|
||||
|
||||
|
||||
def test_tensor_device(t):
|
||||
"""
|
||||
Tests Tensor device() method.
|
||||
|
||||
Args:
|
||||
t: Tensor - tensor to get device from
|
||||
|
||||
Returns: Device - device of the tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_tensor_device.default(t)
|
||||
|
||||
|
||||
def get_any_data_ptr(t, mutable) -> int:
|
||||
"""
|
||||
Return data pointer value of the tensor.
|
||||
|
||||
Args:
|
||||
t: Input tensor
|
||||
mutable: whether data pointer qualifier is mutable or const
|
||||
|
||||
Returns: int - pointer value
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.get_any_data_ptr.default(t, mutable)
|
||||
|
||||
|
||||
def get_template_any_data_ptr(t, dtype, mutable) -> int:
|
||||
"""
|
||||
Return data pointer value of the tensor iff it has dtype.
|
||||
|
||||
Args:
|
||||
t: Input tensor
|
||||
dtype: Input dtype
|
||||
mutable: whether data pointer qualifier is mutable or const
|
||||
|
||||
Returns: int - pointer value
|
||||
|
||||
Raises RuntimeError when t.dtype() != dtype.
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.get_template_any_data_ptr.default(
|
||||
t, dtype, mutable
|
||||
)
|
||||
|
||||
|
||||
def my_pad(t) -> Tensor:
|
||||
"""
|
||||
Pads the input tensor with hardcoded padding parameters.
|
||||
|
||||
Args:
|
||||
t: Input tensor
|
||||
|
||||
Returns: Padded tensor with padding [1, 2, 2, 1], mode "constant", value 0.0
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_pad.default(t)
|
||||
|
||||
|
||||
def my_narrow(t, dim, start, length) -> Tensor:
|
||||
"""
|
||||
Returns a new tensor that is a narrowed version of the input tensor.
|
||||
|
||||
Args:
|
||||
t: Input tensor
|
||||
dim: Dimension along which to narrow
|
||||
start: Starting position
|
||||
length: Length of the narrowed section
|
||||
|
||||
Returns: Narrowed tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_narrow.default(t, dim, start, length)
|
||||
|
||||
|
||||
def my_copy_(dst, src, non_blocking) -> Tensor:
|
||||
"""
|
||||
Returns tensor dst that is updated with src elements.
|
||||
|
||||
Args:
|
||||
dst: Destination tensor
|
||||
src: Source tensor
|
||||
non_blocking: bool
|
||||
|
||||
Returns: Updated tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_copy_.default(dst, src, non_blocking)
|
||||
|
||||
|
||||
def my_clone(t) -> Tensor:
|
||||
"""
|
||||
Returns a clone of input tensor.
|
||||
|
||||
Args:
|
||||
t: Input tensor
|
||||
|
||||
Returns: Cloned tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_clone.default(t)
|
||||
|
||||
|
||||
def test_device_guard(device_index) -> int:
|
||||
"""
|
||||
Tests the DeviceGuard functionality by creating a device guard and returning an empty tensor.
|
||||
|
||||
Args:
|
||||
device_index: Device index to set the guard to
|
||||
|
||||
Returns: result of cudaGetDevice() as an integer after using the guard
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_guard.default(device_index)
|
||||
|
||||
|
||||
def test_device_guard_set_index() -> int:
|
||||
"""
|
||||
Tests the DeviceGuard set_index functionality by creating a device guard with index 1,
|
||||
then setting it to index 0, and returning the current device.
|
||||
|
||||
Returns: result of cudaGetDevice() as an integer after using set_index
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_guard_set_index.default()
|
||||
|
||||
|
||||
def test_stream(device_index) -> int:
|
||||
"""
|
||||
Tests the Stream functionality by getting the current stream ID for the specified device.
|
||||
|
||||
Args:
|
||||
device_index: Device index to get the stream for
|
||||
|
||||
Returns: Stream ID as an integer
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_stream.default(device_index)
|
||||
|
||||
|
||||
def test_get_current_device_index() -> int:
|
||||
"""
|
||||
Tests the getCurrentDeviceIndex functionality by getting the current device index.
|
||||
|
||||
Returns: Current device index as an integer
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_get_current_device_index.default()
|
||||
|
||||
|
||||
def my_new_empty_dtype_variant(t) -> Tensor:
|
||||
"""
|
||||
Returns a new empty tensor with shape [2, 5] and dtype bfloat16
|
||||
|
||||
Args:
|
||||
t: Input tensor used as a reference for device and other properties
|
||||
|
||||
Returns: New empty tensor with shape [2, 5] and dtype bfloat16
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_new_empty_dtype_variant.default(t)
|
||||
|
||||
|
||||
def my_new_zeros_dtype_variant(t) -> Tensor:
|
||||
"""
|
||||
Returns a new tensor filled with 0s with shape [2, 5] and dtype Float
|
||||
|
||||
Args:
|
||||
t: Input tensor used as a reference for device and other properties
|
||||
|
||||
Returns: New zeros tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_new_zeros_dtype_variant.default(t)
|
||||
|
||||
|
||||
def my__foreach_mul_(tensors, others) -> ():
|
||||
"""
|
||||
Updates tensors to be the result of pointwise multiplying with others.
|
||||
|
||||
Args:
|
||||
tensors: list of tensors
|
||||
others: list of tensors (with the same corresponding shapes as tensors)
|
||||
|
||||
Returns: nothing, tensors is updated in place.
|
||||
"""
|
||||
torch.ops.libtorch_agnostic.my__foreach_mul_.default(tensors, others)
|
||||
|
||||
|
||||
def my__foreach_mul(tensors, others) -> list[Tensor]:
|
||||
"""
|
||||
Returns a list of tensors that are the results of pointwise multiplying
|
||||
tensors and others.
|
||||
|
||||
Args:
|
||||
tensors: list of tensors
|
||||
others: list of tensors (with the same corresponding shapes as tensors)
|
||||
|
||||
Returns: list of multiplied tensors
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my__foreach_mul.default(tensors, others)
|
||||
|
||||
|
||||
def make_tensor_clones_and_call_foreach(t1, t2) -> list[Tensor]:
|
||||
"""
|
||||
Returns a list of 2 tensors corresponding to the square of the inputs.
|
||||
|
||||
Args:
|
||||
t1: Tensor
|
||||
t2: Tensor
|
||||
|
||||
Returns: list of [t1^2, t2^2]
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.make_tensor_clones_and_call_foreach.default(
|
||||
t1, t2
|
||||
)
|
||||
|
||||
|
||||
def test_device_constructor(is_cuda, index, use_str):
|
||||
"""
|
||||
Tests creating a Device from DeviceType and index, or from a string.
|
||||
|
||||
Args:
|
||||
is_cuda: bool - if True, creates CUDA device; if False, creates CPU device
|
||||
index: int - device index
|
||||
use_str: bool - if True, constructs from string; if False, constructs from DeviceType
|
||||
|
||||
Returns: Device - A device with the specified type and index
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_constructor.default(
|
||||
is_cuda, index, use_str
|
||||
)
|
||||
|
||||
|
||||
def test_device_equality(d1, d2) -> bool:
|
||||
"""
|
||||
Tests Device equality operator.
|
||||
|
||||
Args:
|
||||
d1: Device - first device
|
||||
d2: Device - second device
|
||||
|
||||
Returns: bool - True if devices are equal
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_equality.default(d1, d2)
|
||||
|
||||
|
||||
def test_device_set_index(device, index):
|
||||
"""
|
||||
Tests Device set_index() method.
|
||||
|
||||
Args:
|
||||
device: Device - device to modify
|
||||
index: int - new device index
|
||||
|
||||
Returns: Device - device with updated index
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_set_index.default(device, index)
|
||||
|
||||
|
||||
def test_device_index(device) -> int:
|
||||
"""
|
||||
Tests Device index() method.
|
||||
|
||||
Args:
|
||||
device: Device - device to query
|
||||
|
||||
Returns: int - device index
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_index.default(device)
|
||||
|
||||
|
||||
def test_device_is_cuda(device) -> bool:
|
||||
"""
|
||||
Tests Device is_cuda() method.
|
||||
|
||||
Args:
|
||||
device: Device - device to check
|
||||
|
||||
Returns: bool - True if device is CUDA
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_is_cuda.default(device)
|
||||
|
||||
|
||||
def test_device_is_cpu(device) -> bool:
|
||||
"""
|
||||
Tests Device is_cpu() method.
|
||||
|
||||
Args:
|
||||
device: Device - device to check
|
||||
|
||||
Returns: bool - True if device is CPU
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_device_is_cpu.default(device)
|
||||
|
||||
|
||||
def test_parallel_for(size, grain_size) -> Tensor:
|
||||
"""
|
||||
Tests the parallel_for functionality by using it to fill a tensor with indices.
|
||||
Args:
|
||||
size: int - size of the tensor to create
|
||||
grain_size: int - grain size for parallel_for
|
||||
Returns: Tensor - a 1D int64 tensor where each element contains its index
|
||||
(if multiple threads are used the threadid will be encoded in the upper 32 bits)
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_parallel_for.default(size, grain_size)
|
||||
|
||||
|
||||
def test_get_num_threads() -> int:
|
||||
"""
|
||||
Tests the get_num_threads functionality by returning the number of threads
|
||||
for the parallel backend.
|
||||
|
||||
Returns: int - the number of threads for the parallel backend
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.test_get_num_threads.default()
|
||||
|
||||
|
||||
def my_empty(size, dtype=None, device=None, pin_memory=None) -> Tensor:
|
||||
"""
|
||||
Creates an empty tensor with the specified size, dtype, device, and pin_memory.
|
||||
|
||||
Args:
|
||||
size: list[int] - size of the tensor to create
|
||||
dtype: ScalarType or None - data type of the tensor
|
||||
device: Device or None - device on which to create the tensor
|
||||
pin_memory: bool or None - whether to use pinned memory
|
||||
|
||||
Returns: Tensor - an uninitialized tensor with the specified properties
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_empty.default(size, dtype, device, pin_memory)
|
||||
|
||||
|
||||
def my_flatten(t, start_dim=0, end_dim=-1) -> Tensor:
|
||||
"""
|
||||
Flattens the input tensor from start_dim to end_dim into a single dimension.
|
||||
|
||||
Args:
|
||||
t: Tensor - tensor to flatten
|
||||
start_dim: int - first dimension to flatten (default: 0)
|
||||
end_dim: int - last dimension to flatten (default: -1)
|
||||
|
||||
Returns: Tensor - flattened tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_flatten.default(t, start_dim, end_dim)
|
||||
|
||||
|
||||
def my_reshape(t, shape) -> Tensor:
|
||||
"""
|
||||
Returns a tensor with the same data but different shape.
|
||||
|
||||
Args:
|
||||
t: Tensor - tensor to reshape
|
||||
shape: list[int] - new shape for the tensor
|
||||
|
||||
Returns: Tensor - reshaped tensor
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_reshape.default(t, shape)
|
||||
|
||||
|
||||
def my_view(t, size) -> Tensor:
|
||||
"""
|
||||
Returns a new tensor with the same data as the input tensor but of a different shape.
|
||||
|
||||
Args:
|
||||
t: Tensor - tensor to view
|
||||
size: list[int] - new size for the tensor
|
||||
|
||||
Returns: Tensor - tensor with new view
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.my_view.default(t, size)
|
||||
|
||||
|
||||
def mv_tensor_accessor(m, v) -> Tensor:
|
||||
"""
|
||||
Returns matrix-vector product.
|
||||
|
||||
Args:
|
||||
m: any 2-D Tensor with shape (N, M)
|
||||
v: any 1-D Tensor with shape (M,)
|
||||
|
||||
Returns:
|
||||
a 1-D Tensor with shape (N,)
|
||||
"""
|
||||
return torch.ops.libtorch_agnostic.mv_tensor_accessor.default(m, v)
|
||||
@@ -1,6 +1,7 @@
# Owner(s): ["module: cpp"]

import math
import unittest
from pathlib import Path

import torch

@@ -46,21 +47,78 @@ def get_supported_dtypes():
    ]


def skipIfTorchVersionLessThan(major, minor):
    """Skip test if PyTorch version is less than specified version."""

    def decorator(func):
        version_parts = torch.__version__.split(".")
        current_major = int(version_parts[0])
        current_minor = int(
            version_parts[1].split("+")[0].split("a")[0].split("b")[0].split("rc")[0]
        )

        should_skip = (current_major < major) or (
            current_major == major and current_minor < minor
        )
        reason = f"Test requires PyTorch >= {major}.{minor}, current version is {torch.__version__}"

        return unittest.skipIf(should_skip, reason)(func)

    return decorator


# TODO: Fix this error in Windows:
# LINK : error LNK2001: unresolved external symbol PyInit__C
if not IS_WINDOWS:

    class TestLibtorchAgnostic(TestCase):
        """
        Tests for versioned libtorch_agnostic extensions.

        This test class supports testing both:

        - libtorch_agnostic_2_9: Extension built with TORCH_TARGET_VERSION=2.9.0
        - libtorch_agnostic_2_10: Extension built with TORCH_TARGET_VERSION=2.10.0

        Tests should be decorated with @skipIfTorchVersionLessThan to indicate the
        version that they target.
        """

        @classmethod
        def setUpClass(cls):
            # Build both 2.9 and 2.10 extensions
            base_dir = Path(__file__).parent

            try:
                import libtorch_agnostic  # noqa: F401
                import libtorch_agnostic_2_9  # noqa: F401
            except Exception:
                install_cpp_extension(extension_root=Path(__file__).parent.parent)
                install_cpp_extension(
                    extension_root=base_dir / "libtorch_agnostic_2_9_extension"
                )

            # Only build 2.10 extension if running on PyTorch 2.10+
            import re

            version_parts = torch.__version__.split(".")
            current_major = int(version_parts[0])
            # Extract just the numeric part of the minor version (handles "10+git", "10a1", etc.)
            current_minor = int(re.match(r"\d+", version_parts[1]).group())

            if (current_major > 2) or (current_major == 2 and current_minor >= 10):
                try:
                    import libtorch_agnostic_2_10  # noqa: F401
                except Exception:
                    install_cpp_extension(
                        extension_root=base_dir / "libtorch_agnostic_2_10_extension"
                    )
            else:
                print(
                    f"Skipping 2.10 extension (running on PyTorch {torch.__version__})"
                )
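As a quick reference, the standalone sketch below (with made-up version strings) mirrors how the decorator's parsing reduces a full version string to a (major, minor) pair before comparing.

# Illustrative sketch, not part of the diff
def parse_major_minor(version):
    parts = version.split(".")
    minor = parts[1].split("+")[0].split("a")[0].split("b")[0].split("rc")[0]
    return int(parts[0]), int(minor)

for v in ["2.9.1", "2.10.0a0+git1234", "2.10.0rc1"]:
    print(v, "->", parse_major_minor(v))  # (2, 9), (2, 10), (2, 10)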
@onlyCPU
|
||||
def test_slow_sgd(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
param = torch.rand(5, device=device)
|
||||
grad = torch.rand_like(param)
|
||||
@ -87,7 +145,7 @@ if not IS_WINDOWS:
|
||||
|
||||
@onlyCUDA
|
||||
def test_identity_does_not_hog_memory(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
def _run_identity(prior_mem):
|
||||
t = torch.rand(32, 32, device=device)
|
||||
@ -103,7 +161,7 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(curr_mem, init_mem)
|
||||
|
||||
def test_exp_neg_is_leaf(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t1 = torch.rand(2, 3, device=device)
|
||||
t2 = torch.rand(3, 2, device=device)
|
||||
@ -115,7 +173,7 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(is_leaf, t3.is_leaf)
|
||||
|
||||
def test_my_abs(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.rand(32, 16, device=device) - 0.5
|
||||
res = libtorch_agnostic.ops.my_abs(t)
|
||||
@ -134,7 +192,7 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(curr_mem, init_mem)
|
||||
|
||||
def test_neg_exp(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.rand(32, 16, device=device) - 0.5
|
||||
res = libtorch_agnostic.ops.neg_exp(t)
|
||||
@ -153,7 +211,7 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(curr_mem, init_mem)
|
||||
|
||||
def test_divide_neg_exp(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.zeros(2, 3, device=device) - 0.5
|
||||
res = libtorch_agnostic.ops.divide_neg_exp(t)
|
||||
@ -172,7 +230,7 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(curr_mem, init_mem)
|
||||
|
||||
def test_is_contiguous(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.rand(2, 7, device=device)
|
||||
self.assertTrue(libtorch_agnostic.ops.is_contiguous(t))
|
||||
@ -184,7 +242,7 @@ if not IS_WINDOWS:
|
||||
# **{}): got AssertionError("tensor's device must be `meta`, got cpu instead")
|
||||
@xfailIfTorchDynamo
|
||||
def test_my_ones_like(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.rand(3, 1, device=device) - 0.5
|
||||
cpu_t = libtorch_agnostic.ops.my_ones_like(t, "cpu")
|
||||
@ -203,7 +261,7 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(curr_mem, init_mem)
|
||||
|
||||
def test_my_transpose(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.rand(2, 7, device=device)
|
||||
out = libtorch_agnostic.ops.my_transpose(t, 0, 1)
|
||||
@ -213,7 +271,7 @@ if not IS_WINDOWS:
|
||||
libtorch_agnostic.ops.my_transpose(t, 1, 2)
|
||||
|
||||
def test_my_empty_like(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
deterministic = torch.are_deterministic_algorithms_enabled()
|
||||
try:
|
||||
@ -229,7 +287,7 @@ if not IS_WINDOWS:
|
||||
|
||||
@onlyCPU
|
||||
def test_my_zero_(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.rand(2, 7, device=device)
|
||||
out = libtorch_agnostic.ops.my_zero_(t)
|
||||
@ -237,28 +295,28 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(out, torch.zeros_like(t))
|
||||
|
||||
def test_my_amax(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.rand(2, 7, device=device)
|
||||
out = libtorch_agnostic.ops.my_amax(t)
|
||||
self.assertEqual(out, torch.amax(t, 0))
|
||||
|
||||
def test_my_amax_vec(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.rand(2, 7, 5, device=device)
|
||||
out = libtorch_agnostic.ops.my_amax_vec(t)
|
||||
self.assertEqual(out, torch.amax(t, (0, 1)))
|
||||
|
||||
def test_my_is_cpu(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.rand(2, 7, device=device)
|
||||
out = libtorch_agnostic.ops.my_is_cpu(t)
|
||||
self.assertEqual(out, t.is_cpu)
|
||||
|
||||
def test_fill_infinity(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.rand(3, 4, device=device)
|
||||
out = libtorch_agnostic.ops.fill_infinity(t)
|
||||
@ -269,7 +327,7 @@ if not IS_WINDOWS:
|
||||
|
||||
@onlyCPU
|
||||
def test_default_constructor(self):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
defined_tensor_is_defined = libtorch_agnostic.ops.test_default_constructor(
|
||||
True
|
||||
@ -282,7 +340,7 @@ if not IS_WINDOWS:
|
||||
self.assertFalse(undefined_tensor_is_defined)
|
||||
|
||||
def test_my_pad(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.rand(2, 3, device=device)
|
||||
out = libtorch_agnostic.ops.my_pad(t)
|
||||
@ -290,7 +348,7 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(out, expected)
|
||||
|
||||
def test_my_narrow(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.randn(2, 5, device=device)
|
||||
|
||||
@ -301,47 +359,10 @@ if not IS_WINDOWS:
|
||||
expected0 = torch.narrow(t, dim0, start0, length0)
|
||||
self.assertEqual(out0, expected0)
|
||||
|
||||
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
|
||||
def test_get_any_data_ptr(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
t = torch.empty(2, 5, device=device, dtype=torch.float32)
|
||||
expected_p = t.data_ptr()
|
||||
|
||||
for mutable in [True, False]:
|
||||
p = libtorch_agnostic.ops.get_any_data_ptr(t, mutable)
|
||||
self.assertEqual(p, expected_p)
|
||||
|
||||
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
|
||||
def test_get_template_any_data_ptr(self, device):
|
||||
import libtorch_agnostic
|
||||
|
||||
supported_dtypes = get_supported_dtypes()
|
||||
|
||||
for dtype in supported_dtypes:
|
||||
t = torch.empty(2, 5, device=device, dtype=dtype)
|
||||
expected_p = t.data_ptr()
|
||||
|
||||
for rdtype in supported_dtypes:
|
||||
if dtype == rdtype:
|
||||
for mutable in [True, False]:
|
||||
p = libtorch_agnostic.ops.get_template_any_data_ptr(
|
||||
t, rdtype, mutable
|
||||
)
|
||||
self.assertEqual(p, expected_p)
|
||||
else:
|
||||
for mutable in [True, False]:
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, "expected scalar type.* but found"
|
||||
):
|
||||
libtorch_agnostic.ops.get_template_any_data_ptr(
|
||||
t, rdtype, mutable
|
||||
)
|
||||
|
||||
@onlyCUDA
|
||||
@deviceCountAtLeast(2)
|
||||
def test_device_guard(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
device_index = 1
|
||||
out = libtorch_agnostic.ops.test_device_guard(device_index)
|
||||
@ -350,7 +371,7 @@ if not IS_WINDOWS:
|
||||
@onlyCUDA
|
||||
@deviceCountAtLeast(2)
|
||||
def test_device_guard_set_index(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
# This test creates a DeviceGuard with index 1, then sets it to index 0
|
||||
# and returns the current device (should be 0)
|
||||
@ -359,7 +380,7 @@ if not IS_WINDOWS:
|
||||
|
||||
@onlyCUDA
|
||||
def test_stream(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
stream = torch.cuda.Stream()
|
||||
device = torch.cuda.current_device()
|
||||
@ -373,7 +394,7 @@ if not IS_WINDOWS:
|
||||
@onlyCUDA
|
||||
@deviceCountAtLeast(2)
|
||||
def test_get_current_device_index(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
prev_device = torch.cuda.current_device()
|
||||
|
||||
@ -387,7 +408,7 @@ if not IS_WINDOWS:
|
||||
torch.cuda.set_device(prev_device)
|
||||
|
||||
def test_my_new_empty_dtype_variant(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
deterministic = torch.are_deterministic_algorithms_enabled()
|
||||
try:
|
||||
@ -402,7 +423,7 @@ if not IS_WINDOWS:
|
||||
torch.use_deterministic_algorithms(deterministic)
|
||||
|
||||
def test_my_new_zeros_dtype_variant(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.randn(3, 4, device=device)
|
||||
out = libtorch_agnostic.ops.my_new_zeros_dtype_variant(t)
|
||||
@ -410,7 +431,7 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(out, ref_out, exact_device=True)
|
||||
|
||||
def test_my_copy_(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
dst = torch.empty(2, 5, device=device)
|
||||
src = torch.randn(2, 5, device=device)
|
||||
@ -421,7 +442,7 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(result.data_ptr(), dst.data_ptr())
|
||||
|
||||
def test_my_clone(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.randn(2, 5, device=device)
|
||||
|
||||
@ -431,8 +452,9 @@ if not IS_WINDOWS:
|
||||
self.assertNotEqual(result.data_ptr(), expected.data_ptr())
|
||||
self.assertEqual(result.stride(), expected.stride())
|
||||
|
||||
@skipIfTorchVersionLessThan(2, 10)
|
||||
def test_my__foreach_mul_(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_10 as libtorch_agnostic
|
||||
|
||||
N = 5
|
||||
tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
|
||||
@ -445,8 +467,9 @@ if not IS_WINDOWS:
|
||||
for tensor_t, expected_t in zip(tensors, expected_values):
|
||||
self.assertEqual(tensor_t, expected_t)
|
||||
|
||||
@skipIfTorchVersionLessThan(2, 10)
|
||||
def test_my__foreach_mul(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_10 as libtorch_agnostic
|
||||
|
||||
N = 5
|
||||
tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
|
||||
@ -473,8 +496,9 @@ if not IS_WINDOWS:
|
||||
curr_mem = torch.cuda.memory_allocated(device)
|
||||
self.assertEqual(curr_mem, init_mem)
|
||||
|
||||
@skipIfTorchVersionLessThan(2, 10)
|
||||
def test_make_tensor_clones_and_call_foreach(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_10 as libtorch_agnostic
|
||||
|
||||
t1 = torch.rand(2, 5, device=device)
|
||||
t2 = torch.rand(3, 4, device=device)
|
||||
@ -482,9 +506,10 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(result[0], t1 * t1)
|
||||
self.assertEqual(result[1], t2 * t2)
|
||||
|
||||
@skipIfTorchVersionLessThan(2, 10)
|
||||
@onlyCUDA
|
||||
def test_device(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_10 as libtorch_agnostic
|
||||
|
||||
cuda_device = libtorch_agnostic.ops.test_device_constructor(
|
||||
is_cuda=True, index=1, use_str=False
|
||||
@ -537,10 +562,11 @@ if not IS_WINDOWS:
|
||||
):
|
||||
libtorch_agnostic.ops.test_device_set_index(cuda_device, 129)
|
||||
|
||||
@skipIfTorchVersionLessThan(2, 10)
|
||||
@onlyCUDA
|
||||
@deviceCountAtLeast(2)
|
||||
def test_tensor_device(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_10 as libtorch_agnostic
|
||||
|
||||
t = torch.randn(2, 3)
|
||||
self.assertEqual(libtorch_agnostic.ops.test_tensor_device(t), t.device)
|
||||
@ -555,6 +581,7 @@ if not IS_WINDOWS:
|
||||
libtorch_agnostic.ops.test_tensor_device(t_cuda_1), t_cuda_1.device
|
||||
)
|
||||
|
||||
@skipIfTorchVersionLessThan(2, 10)
|
||||
@onlyCPU
|
||||
# TODO: Debug this:
|
||||
# Dynamo failed to run FX node with fake tensors:
|
||||
@ -564,7 +591,7 @@ if not IS_WINDOWS:
|
||||
# Declaration: libtorch_agnostic::test_parallel_for(int size, int grain_size) -> Tensor')
|
||||
@xfailIfTorchDynamo
|
||||
def test_parallel_for(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_10 as libtorch_agnostic
|
||||
|
||||
num_threads = torch.get_num_threads()
|
||||
size = 100
|
||||
@ -581,16 +608,18 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(result_values, expected)
|
||||
self.assertEqual(result_thread_ids, torch.arange(expected_num_threads_used))
|
||||
|
||||
@skipIfTorchVersionLessThan(2, 10)
|
||||
@onlyCPU
|
||||
def test_get_num_threads(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_10 as libtorch_agnostic
|
||||
|
||||
num_threads = libtorch_agnostic.ops.test_get_num_threads()
|
||||
expected_num_threads = torch.get_num_threads()
|
||||
self.assertEqual(num_threads, expected_num_threads)
|
||||
|
||||
@skipIfTorchVersionLessThan(2, 10)
|
||||
def test_my_empty(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_10 as libtorch_agnostic
|
||||
|
||||
deterministic = torch.are_deterministic_algorithms_enabled()
|
||||
try:
|
||||
@ -631,7 +660,7 @@ if not IS_WINDOWS:
|
||||
torch.use_deterministic_algorithms(deterministic)
|
||||
|
||||
def test_my_flatten(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
t = torch.randn(2, 3, 4, device=device)
|
||||
result = libtorch_agnostic.ops.my_flatten(t)
|
||||
@ -646,8 +675,9 @@ if not IS_WINDOWS:
|
||||
expected_range = torch.flatten(t, 2, -1)
|
||||
self.assertEqual(result_range, expected_range)
|
||||
|
||||
@skipIfTorchVersionLessThan(2, 10)
|
||||
def test_my_reshape(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_10 as libtorch_agnostic
|
||||
|
||||
t = torch.randn(2, 3, 4, device=device)
|
||||
|
||||
@ -663,8 +693,9 @@ if not IS_WINDOWS:
|
||||
expected_flat = torch.reshape(t, [-1])
|
||||
self.assertEqual(result_flat, expected_flat)
|
||||
|
||||
@skipIfTorchVersionLessThan(2, 10)
|
||||
def test_my_view(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_10 as libtorch_agnostic
|
||||
|
||||
t = torch.randn(2, 3, 4, device=device)
|
||||
|
||||
@ -681,7 +712,7 @@ if not IS_WINDOWS:
|
||||
self.assertEqual(result_flat, expected_flat)
|
||||
|
||||
def test_mv_tensor_accessor(self, device):
|
||||
import libtorch_agnostic
|
||||
import libtorch_agnostic_2_9 as libtorch_agnostic
|
||||
|
||||
m = torch.rand(3, 5, device=device)
|
||||
v = torch.rand(5, device=device)
|
||||
@ -696,6 +727,45 @@ if not IS_WINDOWS:
|
||||
expected = torch.mv(m, v)
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
@skipIfTorchVersionLessThan(2, 10)
|
||||
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
|
||||
def test_get_any_data_ptr(self, device):
|
||||
import libtorch_agnostic_2_10 as libtorch_agnostic
|
||||
|
||||
t = torch.empty(2, 5, device=device, dtype=torch.float32)
|
||||
expected_p = t.data_ptr()
|
||||
|
||||
for mutable in [True, False]:
|
||||
p = libtorch_agnostic.ops.get_any_data_ptr(t, mutable)
|
||||
self.assertEqual(p, expected_p)
|
||||
|
||||
@skipIfTorchVersionLessThan(2, 10)
|
||||
@skipIfTorchDynamo("no data pointer defined for FakeTensor, FunctionalTensor")
|
||||
def test_get_template_any_data_ptr(self, device):
|
||||
import libtorch_agnostic_2_10 as libtorch_agnostic
|
||||
|
||||
supported_dtypes = get_supported_dtypes()
|
||||
|
||||
for dtype in supported_dtypes:
|
||||
t = torch.empty(2, 5, device=device, dtype=dtype)
|
||||
expected_p = t.data_ptr()
|
||||
|
||||
for rdtype in supported_dtypes:
|
||||
if dtype == rdtype:
|
||||
for mutable in [True, False]:
|
||||
p = libtorch_agnostic.ops.get_template_any_data_ptr(
|
||||
t, rdtype, mutable
|
||||
)
|
||||
self.assertEqual(p, expected_p)
|
||||
else:
|
||||
for mutable in [True, False]:
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, "expected scalar type.* but found"
|
||||
):
|
||||
libtorch_agnostic.ops.get_template_any_data_ptr(
|
||||
t, rdtype, mutable
|
||||
)
|
||||
|
||||
instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -230,6 +230,98 @@ class DistConvolutionOpsTest(DTensorTestBase):
|
||||
out_dt, out = self._run_single_arg_fwd(model, x, [Shard(0)])
|
||||
self.assertEqual(out_dt, out)
|
||||
|
||||
@with_comms
|
||||
def test_conv2d_no_bias_compile(self):
|
||||
"""Test Conv2d with bias=False in compile mode (Issue #167091)
|
||||
|
||||
Regression test: Previously this would fail during torch.compile
|
||||
tracing with AssertionError when bias_spec was None.
|
||||
"""
|
||||
device_mesh = self.build_device_mesh()
|
||||
|
||||
def conv_fn(x, w):
|
||||
return F.conv2d(x, w, bias=None, padding=1)
|
||||
|
||||
compiled_fn = torch.compile(conv_fn)
|
||||
|
||||
# Create tensors
|
||||
x = torch.randn(1, 4, 5, 5, device=self.device_type)
|
||||
w = torch.randn(8, 4, 3, 3, device=self.device_type)
|
||||
|
||||
# Distribute tensors
|
||||
x_dt = distribute_tensor(x, device_mesh, [Replicate()])
|
||||
w_dt = distribute_tensor(w, device_mesh, [Replicate()])
|
||||
|
||||
# Test eager mode for comparison
|
||||
result_eager = conv_fn(x_dt, w_dt)
|
||||
|
||||
# Test compiled mode - this should not crash
|
||||
result_compiled = compiled_fn(x_dt, w_dt)
|
||||
|
||||
# Verify shape is correct (the key regression test)
|
||||
self.assertEqual(result_compiled.shape, torch.Size([1, 8, 5, 5]))
|
||||
|
||||
# Verify numerical correctness
|
||||
torch.testing.assert_close(result_compiled.to_local(), result_eager.to_local())
|
||||
|
||||
@with_comms
|
||||
def test_conv2d_no_bias_backward(self):
|
||||
"""Test Conv2d backward pass with bias=False (Issue #167091)
|
||||
|
||||
Regression test: Previously backward pass would fail when
|
||||
grad_bias_spec was None.
|
||||
"""
|
||||
device_mesh = self.build_device_mesh()
|
||||
|
||||
# Create tensors with requires_grad
|
||||
x = torch.randn(1, 4, 5, 5, device=self.device_type)
|
||||
w = torch.randn(8, 4, 3, 3, device=self.device_type, requires_grad=True)
|
||||
|
||||
# Distribute tensors
|
||||
x_dt = distribute_tensor(x, device_mesh, [Replicate()])
|
||||
w_dt = torch.nn.Parameter(distribute_tensor(w, device_mesh, [Replicate()]))
|
||||
|
||||
# Forward pass
|
||||
result = F.conv2d(x_dt, w_dt, bias=None, padding=1)
|
||||
|
||||
# Backward pass - this should not crash
|
||||
grad_output = torch.randn_like(result)
|
||||
result.backward(grad_output)
|
||||
|
||||
# Check weight gradient exists (the key regression test)
|
||||
self.assertIsNotNone(w_dt.grad)
|
||||
self.assertEqual(w_dt.grad.shape, torch.Size([8, 4, 3, 3]))
|
||||
|
||||
@with_comms
|
||||
def test_conv2d_module_no_bias(self):
|
||||
"""Test nn.Conv2d module with bias=False (Issue #167091)
|
||||
|
||||
Regression test: Ensures nn.Conv2d with bias=False works with DTensor.
|
||||
"""
|
||||
device_mesh = self.build_device_mesh()
|
||||
|
||||
# Create model with bias=False
|
||||
model = nn.Conv2d(4, 8, kernel_size=3, padding=1, bias=False).to(
|
||||
self.device_type
|
||||
)
|
||||
nn.init.ones_(model.weight)
|
||||
|
||||
# Distribute model
|
||||
model_dt = distribute_module(model, device_mesh, _conv_fn)
|
||||
|
||||
# Create input
|
||||
x = torch.randn(1, 4, 5, 5, device=self.device_type)
|
||||
x_dt = distribute_tensor(x, device_mesh, [Replicate()])
|
||||
|
||||
# Forward pass - this should not crash
|
||||
output_dt = model_dt(x_dt)
|
||||
|
||||
# Check outputs shape is correct
|
||||
self.assertEqual(output_dt.shape, torch.Size([1, 8, 5, 5]))
|
||||
|
||||
# Check that model.bias is None
|
||||
self.assertIsNone(model.bias)
|
||||
|
||||
|
||||
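The three regression tests above all exercise the same bias=None pattern; a condensed sketch of it is below (assuming an already-built device mesh named mesh, and noting that DTensor import paths may differ slightly across releases).

# Illustrative sketch, not part of the diff (Issue #167091 pattern)
import torch
import torch.nn.functional as F
from torch.distributed.tensor import Replicate, distribute_tensor

x_dt = distribute_tensor(torch.randn(1, 4, 5, 5), mesh, [Replicate()])
w_dt = distribute_tensor(torch.randn(8, 4, 3, 3), mesh, [Replicate()])

# bias=None previously tripped an AssertionError during torch.compile tracing
out = torch.compile(lambda x, w: F.conv2d(x, w, bias=None, padding=1))(x_dt, w_dt)
assert out.shape == torch.Size([1, 8, 5, 5])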
DistConvolutionOpsTestWithLocalTensor = create_local_tensor_test_class(
|
||||
DistConvolutionOpsTest,
|
||||
@ -238,6 +330,10 @@ DistConvolutionOpsTestWithLocalTensor = create_local_tensor_test_class(
|
||||
"test_conv_backward_none_grad_inp",
|
||||
"test_depthwise_convolution",
|
||||
"test_downsampling_convolution",
|
||||
# New tests for Issue #167091 - use send/recv via tp_convolution
|
||||
"test_conv2d_no_bias_compile",
|
||||
"test_conv2d_no_bias_backward",
|
||||
"test_conv2d_module_no_bias",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
@ -10,6 +10,7 @@ import torch._dynamo.test_case
|
||||
|
||||
# for some reason importing functional collectives after dynamo breaks collectives handling!
|
||||
import torch.distributed._functional_collectives as _functional_collectives
|
||||
import torch.fx as fx
|
||||
from torch._C import FileCheck
|
||||
from torch._dynamo.utils import counters, same
|
||||
from torch._inductor.utils import run_and_get_code, run_and_get_triton_code
|
||||
@ -238,6 +239,49 @@ graph():
|
||||
self.assertTrue(same(out, correct))
|
||||
self.assertEqual(counters["inductor"]["overlap_scheduling_exposed"], 0)
|
||||
|
||||
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
|
||||
@torch._inductor.config.patch(get_patches())
|
||||
def test_schedulable_wait(self):
|
||||
"""Test that if a wait node is scheduable or not."""
|
||||
from torch._inductor.fx_passes.bucketing import _schedulable_wait_node
|
||||
|
||||
def test_graph():
|
||||
graph = fx.Graph()
|
||||
|
||||
inp = graph.placeholder("inp")
|
||||
group_size = graph.placeholder("group_size")
|
||||
group_name = graph.placeholder("group_name")
|
||||
|
||||
ag_0_out = graph.call_function(
|
||||
torch.ops._c10d_functional.all_gather_into_tensor.default,
|
||||
args=(inp, group_size, group_name),
|
||||
)
|
||||
ag_0_wait = graph.call_function(
|
||||
torch.ops._c10d_functional.wait_tensor.default,
|
||||
args=(ag_0_out,),
|
||||
)
|
||||
ag_1_out = graph.call_function(
|
||||
torch.ops._c10d_functional.all_gather_into_tensor.default,
|
||||
args=(ag_0_wait, group_size, group_name),
|
||||
)
|
||||
ag_1_wait = graph.call_function(
|
||||
torch.ops._c10d_functional.wait_tensor.default,
|
||||
args=(ag_1_out,),
|
||||
)
|
||||
ag_2_wait = graph.call_function(
|
||||
torch.ops._c10d_functional.wait_tensor.default,
|
||||
args=(ag_1_wait,),
|
||||
)
|
||||
|
||||
graph.output(ag_2_wait)
|
||||
return graph
|
||||
|
||||
graph = test_graph()
|
||||
schedulable = {"wait_tensor_default", "wait_tensor_default_1"}
|
||||
for node in list(graph.nodes):
|
||||
expected = node.name in schedulable
|
||||
assert _schedulable_wait_node(node) is expected
|
||||
|
||||
@torch._inductor.config.patch(get_patches())
|
||||
def test_reorder_compute_for_overlap_mul(self):
|
||||
def func(a, *, tag, ranks, group_size):
|
||||
@ -1061,6 +1105,63 @@ class TestComputeCommReorderingBucketing(TestComputeCommReorderingMultiProc):
|
||||
correct = func(a, b, c)
|
||||
self.assertTrue(same(out, correct))
|
||||
|
||||
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
|
||||
@torch._inductor.config.patch(get_bucket_patches())
|
||||
def test_multiple_hiding_nodes_bucketing(self):
|
||||
"""Test that collectives hidden by multiple compute ops can bucket together."""
|
||||
|
||||
# Use 0.5 compute multiplier so each collective needs 2 matmuls to be fully hidden
|
||||
def estimate_with_half_compute(fx_node, override_size=None):
|
||||
return estimate_aten_runtime(fx_node, compute_multiplier=0.5)
|
||||
|
||||
def func(a, b, *, ranks):
|
||||
# Two all_gathers that will be hidden by multiple compute operations
|
||||
ag1 = _functional_collectives.all_gather_tensor(a, 0, ranks)
|
||||
ag2 = _functional_collectives.all_gather_tensor(b, 0, ranks)
|
||||
|
||||
# Multiple compute operations that can hide the collectives
|
||||
# With 0.5 multiplier: mm1 and mm2 together hide ag1, mm2 and mm3 together hide ag2
|
||||
mm1 = torch.matmul(a, a.T)
|
||||
mm2 = torch.matmul(b, b.T)
|
||||
mm3 = torch.matmul(a + b, (a + b).T)
|
||||
|
||||
return ag1.sum() + ag2.sum() + mm1.sum() + mm2.sum() + mm3.sum()
|
||||
|
||||
with _dynamo_dist_per_rank_init(
|
||||
self.rank,
|
||||
self.world_size,
|
||||
self.backend(device_type),
|
||||
fake_pg=not at_least_x_gpu(2),
|
||||
):
|
||||
a = torch.ones(8, 8, dtype=torch.float, device=device_type)
|
||||
b = torch.ones(8, 8, dtype=torch.float, device=device_type) * 2
|
||||
ranks = list(range(self.world_size))
|
||||
|
||||
func_c = functools.partial(func, ranks=ranks)
|
||||
|
||||
# Patch with custom estimation that uses 0.5 multiplier
|
||||
with torch._inductor.config.patch(
|
||||
{
|
||||
"aten_distributed_optimizations.custom_runtime_estimation": estimate_with_half_compute
|
||||
}
|
||||
):
|
||||
compiled = torch.compile(func_c)
|
||||
out, aten_graph_str = run_and_get_aten_graph(compiled, a, b)
|
||||
|
||||
# Should have 1 bucketed all_gather (both ag1 and ag2 bucketed together)
|
||||
FileCheck().check_count(
|
||||
"torch.ops._c10d_functional.wait_tensor.default", 1, exactly=True
|
||||
).run(aten_graph_str)
|
||||
|
||||
# Verify bucketed collective is scheduled before all matmuls
|
||||
FileCheck().check("functional.all_gather_into_tensor").check(
|
||||
"aten.mm"
|
||||
).check("aten.mm").check("aten.mm").check("wait_tensor").run(aten_graph_str)
|
||||
|
||||
# Verify correctness
|
||||
correct = func(a, b, ranks=ranks)
|
||||
self.assertTrue(same(out, correct))
|
||||
|
||||
|
||||
def get_toy_model(device_type: str):
|
||||
"""
|
||||
|
||||
@ -23,7 +23,12 @@ from torch._inductor.comms import (
|
||||
sink_waits_iterative,
|
||||
)
|
||||
from torch._inductor.compile_fx import compile_fx as inductor_compile_fx
|
||||
from torch._inductor.fx_passes.bucketing import is_all_gather_into_tensor
|
||||
from torch._inductor.fx_passes.bucketing import (
|
||||
is_all_gather_into_tensor,
|
||||
is_all_reduce_tensor,
|
||||
is_all_to_all_tensor,
|
||||
is_reduce_scatter_tensor,
|
||||
)
|
||||
from torch._inductor.scheduler import (
|
||||
_get_mm_like_fn,
|
||||
BaseSchedulerNode,
|
||||
@@ -2188,7 +2193,7 @@ class TestSyncDecisionCrossRanks(MultiProcessTestCase):
self.assertEqual(saved_values, [wt1])

@skip_if_lt_x_gpu(2)
def test_comm_analysis(self):
def test_all_gather_comm_analysis(self):
store = c10d.FileStore(self.file_name, self.world_size)
torch.cuda.set_device(self.rank)
c10d.init_process_group(
@@ -2229,6 +2234,140 @@ class TestSyncDecisionCrossRanks(MultiProcessTestCase):
)
assert est_ms_nccl > 0

@skip_if_lt_x_gpu(2)
def test_reduce_scatter_comm_analysis(self):
store = c10d.FileStore(self.file_name, self.world_size)
torch.cuda.set_device(self.rank)
c10d.init_process_group(
backend="nccl", store=store, rank=self.rank, world_size=self.world_size
)
group = c10d.distributed_c10d._get_default_group()
group_name = "default"
torch._C._distributed_c10d._register_process_group(
group_name, torch.distributed.group.WORLD
)
group_size = group.size()

def func(inp, group_size, group_name):
rs_0_out = torch.ops._c10d_functional.reduce_scatter_tensor(
inp, "sum", group_size, group_name
)
rs_0_wait = torch.ops.c10d_functional.wait_tensor(rs_0_out)
rs_1_out = torch.ops._c10d_functional.reduce_scatter_tensor(
rs_0_wait, "sum", group_size, group_name
)
rs_1_wait = torch.ops.c10d_functional.wait_tensor(rs_1_out)
return rs_1_wait

gm = make_fx(func)(torch.ones(4, 4, device=self.device), group_size, group_name)
g = gm.graph
for n in g.nodes:
if is_reduce_scatter_tensor(n):
from torch._inductor.comm_analysis import (
estimate_nccl_collective_runtime_from_fx_node,
)

est_ms = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=False
)
assert est_ms > 0
est_ms_nccl = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=True
)
assert est_ms_nccl > 0

@skip_if_lt_x_gpu(2)
def test_all_reduce_comm_analysis(self):
store = c10d.FileStore(self.file_name, self.world_size)
torch.cuda.set_device(self.rank)
c10d.init_process_group(
backend="nccl", store=store, rank=self.rank, world_size=self.world_size
)
group = c10d.distributed_c10d._get_default_group()
group_name = "default"
torch._C._distributed_c10d._register_process_group(
group_name, torch.distributed.group.WORLD
)
group_size = group.size()

def func(inp, group_size, group_name):
ar_0_out = torch.ops._c10d_functional.all_reduce(inp, "sum", group_name)
ar_0_wait = torch.ops.c10d_functional.wait_tensor(ar_0_out)
ar_1_out = torch.ops._c10d_functional.all_reduce(
ar_0_wait, "sum", group_name
)
ar_1_wait = torch.ops.c10d_functional.wait_tensor(ar_1_out)
return ar_1_wait

gm = make_fx(func)(torch.ones(4, 4, device=self.device), group_size, group_name)
g = gm.graph
for n in g.nodes:
if is_all_reduce_tensor(n):
from torch._inductor.comm_analysis import (
estimate_nccl_collective_runtime_from_fx_node,
)

est_ms = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=False
)
assert est_ms > 0
est_ms_nccl = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=True
)
assert est_ms_nccl > 0

@skip_if_lt_x_gpu(2)
def test_all_to_all_comm_analysis(self):
store = c10d.FileStore(self.file_name, self.world_size)
torch.cuda.set_device(self.rank)
c10d.init_process_group(
backend="nccl", store=store, rank=self.rank, world_size=self.world_size
)
group = c10d.distributed_c10d._get_default_group()
group_name = "default"
torch._C._distributed_c10d._register_process_group(
group_name, torch.distributed.group.WORLD
)
group_size = group.size()

def func(inp, group_size, group_name):
chunk = inp.numel() // self.world_size
split_sizes = [chunk] * self.world_size
a2a_0_out = torch.ops._c10d_functional.all_to_all_single(
inp,
split_sizes,
split_sizes,
group_name,
)
a2a_0_wait = torch.ops.c10d_functional.wait_tensor(a2a_0_out)
a2a_1_out = torch.ops._c10d_functional.all_to_all_single(
a2a_0_wait,
split_sizes,
split_sizes,
group_name,
)
a2a_1_wait = torch.ops.c10d_functional.wait_tensor(a2a_1_out)
return a2a_1_wait

gm = make_fx(func)(
torch.ones(group_size * 4, 1, device=self.device), group_size, group_name
)
g = gm.graph
for n in g.nodes:
if is_all_to_all_tensor(n):
from torch._inductor.comm_analysis import (
estimate_nccl_collective_runtime_from_fx_node,
)

est_ms = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=False
)
assert est_ms > 0
est_ms_nccl = estimate_nccl_collective_runtime_from_fx_node(
n, use_nccl_estimator=True
)
assert est_ms_nccl > 0

@skip_if_lt_x_gpu(2)
@requires_gloo()
def test_regression_use_nccl_estimate_with_gloo(self):

@@ -49,7 +49,8 @@ def build_collective_info(graph, hiding_annotations):
"""
Build CollectiveInfo dict from manual hiding annotations.

hiding_annotations: dict mapping collective_start -> hiding_compute_node
hiding_annotations: dict mapping collective_start -> hiding_compute_node(s)
Can be a single node or a list/OrderedSet of nodes
"""
from torch._inductor.fx_passes.overlap_scheduling import CollectiveInfo

@@ -65,12 +66,20 @@ def build_collective_info(graph, hiding_annotations):

# Build CollectiveInfo for each collective
for start_node, wait_node in start_to_wait.items():
hiding_node = hiding_annotations.get(start_node)
hiding_annotation = hiding_annotations.get(start_node)

# Convert to OrderedSet
hiding_nodes = OrderedSet()
if hiding_annotation is not None:
if isinstance(hiding_annotation, list | OrderedSet):
hiding_nodes = OrderedSet(hiding_annotation)
else:
hiding_nodes = OrderedSet([hiding_annotation])

# Estimate size and time
size_bytes = 16 * 4  # 4x4 tensor of floats
estimated_time_ms = 1.0  # Dummy time
exposed_time_ms = 0.0 if hiding_node else 1.0  # Hidden if has hiding_node
exposed_time_ms = 0.0 if hiding_nodes else 1.0  # Hidden if has hiding_nodes

collective_info[start_node] = CollectiveInfo(
start_node=start_node,
@@ -78,7 +87,7 @@ def build_collective_info(graph, hiding_annotations):
size_bytes=size_bytes,
estimated_time_ms=estimated_time_ms,
exposed_time_ms=exposed_time_ms,
hiding_node=hiding_node,
hiding_nodes=hiding_nodes,
)

return collective_info
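Aside (not part of the diff above): a minimal sketch of the normalization step the updated build_collective_info performs, assuming OrderedSet is importable from torch.utils._ordered_set; the helper name _normalize_hiding_nodes is hypothetical.

from torch.utils._ordered_set import OrderedSet  # assumption: OrderedSet lives here

def _normalize_hiding_nodes(annotation):
    # Hypothetical helper mirroring the branch added above: accept None, a single
    # fx.Node, or a list/OrderedSet of nodes, and always return an OrderedSet.
    if annotation is None:
        return OrderedSet()
    if isinstance(annotation, (list, OrderedSet)):
        return OrderedSet(annotation)
    return OrderedSet([annotation])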
@@ -567,6 +576,97 @@ class TestOverlapPreservingBucketing(InductorTestCase):
graph_str
)

def test_can_bucket_with_multiple_hiding_nodes(self):
"""
Test that collectives with multiple hiding nodes CAN bucket.

Graph structure:
ag1_start -> ag2_start -> mm1 -> mm2 -> mm3 -> ag1_wait -> ag2_wait

Where:
- ag1 is hidden by mm1 and mm2
- ag2 is hidden by mm2 and mm3
- Both collectives share mm2 as a hiding node
"""

def func(a, b):
group_name = "0"
group_size = 1

# Start both collectives
ag1 = torch.ops._c10d_functional.all_gather_into_tensor(
a, group_size, group_name
)
ag2 = torch.ops._c10d_functional.all_gather_into_tensor(
b, group_size, group_name
)

# Three compute operations that hide the collectives
mm1 = torch.mm(a, a)
mm2 = torch.mm(b, b)
mm3 = torch.mm(a + b, a + b)

# Wait for both
ag1_out = torch.ops._c10d_functional.wait_tensor(ag1)
ag2_out = torch.ops._c10d_functional.wait_tensor(ag2)

return ag1_out.sum() + ag2_out.sum() + mm1.sum() + mm2.sum() + mm3.sum()

# Use fake mode to trace without executing
with FakeTensorMode():
a = torch.ones(4, 4, device=self.device)
b = torch.ones(4, 4, device=self.device) * 2

# Trace with make_fx
traced = make_fx(func)(a, b)

# Find nodes using find_nodes
ag1, ag2 = traced.graph.find_nodes(
op="call_function",
target=torch.ops._c10d_functional.all_gather_into_tensor.default,
)
mm1, mm2, mm3 = traced.graph.find_nodes(
op="call_function", target=torch.ops.aten.mm.default
)

# Manually annotate hiding relationships with multiple hiding nodes
hiding_annotations = {
ag1: [mm1, mm2],  # ag1 is hidden by mm1 and mm2
ag2: [mm2, mm3],  # ag2 is hidden by mm2 and mm3
}

# Build collective info and ancestors
collective_info = build_collective_info(traced.graph, hiding_annotations)
node_ancestors = compute_ancestors(traced.graph)
scheduled = OrderedSet(traced.graph.nodes)

# Verify hiding_nodes are correctly set
self.assertEqual(len(collective_info[ag1].hiding_nodes), 2)
self.assertIn(mm1, collective_info[ag1].hiding_nodes)
self.assertIn(mm2, collective_info[ag1].hiding_nodes)
self.assertEqual(len(collective_info[ag2].hiding_nodes), 2)
self.assertIn(mm2, collective_info[ag2].hiding_nodes)
self.assertIn(mm3, collective_info[ag2].hiding_nodes)

# Run bucketing
from torch._inductor.fx_passes.overlap_preserving_bucketer import (
OverlapPreservingBucketer,
)

bucketer = OverlapPreservingBucketer(
traced.graph,
collective_info,
node_ancestors,
scheduled,
)
bucketer.bucket_collectives()

FileCheck().check_count(
"all_gather_into_tensor_out", 1, exactly=False
).check_count("torch.ops.aten.mm.default", 3, exactly=True).run(
str(traced.graph)
)


if __name__ == "__main__":
run_tests()

@@ -253,6 +253,14 @@ class StoreTestBase:
a.set("foo", "bar")
self.assertEqual(b.get("foo"), b"bar")

def test_list_keys(self):
a = self._create_store()
a.set("foo", "bar")
a.set("baz", "qux")
keys = a.list_keys()
self.assertIn("foo", keys)
self.assertIn("baz", keys)

# This is the number of keys used in test_set_get. Adding this as a class
# property instead of hardcoding in the test since some Store
# implementations will have differing number of keys. In the base case,
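Aside (not part of the diff): a minimal usage sketch of the list_keys() call this test exercises, assuming the method is exposed on the Python Store bindings such as FileStore.

import tempfile

import torch.distributed as dist

# Create a single-rank FileStore backed by a temporary file.
with tempfile.NamedTemporaryFile(delete=False) as f:
    store = dist.FileStore(f.name, 1)
store.set("foo", "bar")
store.set("baz", "qux")
keys = store.list_keys()  # assumed new API under test; expected to include "foo" and "baz"
print(keys)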
@@ -39,7 +39,10 @@ from torch.testing._internal.common_utils import (
)
from torch.testing._internal.hop_db import hop_db
from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
from torch.testing._internal.triton_utils import requires_cuda_and_triton
from torch.testing._internal.triton_utils import (
requires_cuda_and_triton,
requires_gpu_and_triton,
)


def count_ops(gm, args, freq, op):
@@ -3395,6 +3398,91 @@ class GraphModule(torch.nn.Module):
with self.assertRaisesRegex(RuntimeError, msg):
fn_with_hints(x, y)

@requires_cuda_and_triton
def test_wrap_inductor_compiled_regions_option(self):
"""
Test that wrap_inductor_compiled_regions option wraps compiled regions
in inductor_compiled_code HOP, making them visible to DebugMode.
"""
from torch.utils._debug_mode import DebugMode

# Test with wrapping enabled
@torch.compile(
backend="inductor",
options={"wrap_inductor_compiled_regions": True},
fullgraph=True,
)
def fn_wrapped(x, y):
return torch.matmul(x, y)

# Test with wrapping disabled (default)
@torch.compile(backend="inductor", fullgraph=True)
def fn_not_wrapped(x, y):
return torch.matmul(x, y)

x = torch.randn(4, 4, device="cuda")
y = torch.randn(4, 4, device="cuda")

# Test wrapped version - HOP should be visible in DebugMode
with DebugMode() as debug_mode_wrapped:
result_wrapped = fn_wrapped(x, y)

debug_string_wrapped = debug_mode_wrapped.debug_string()
self.assertIn("inductor_compiled_code", debug_string_wrapped)

# Test non-wrapped version - HOP should NOT be visible
with DebugMode() as debug_mode_not_wrapped:
result_not_wrapped = fn_not_wrapped(x, y)

debug_string_not_wrapped = debug_mode_not_wrapped.debug_string()
self.assertNotIn("inductor_compiled_code", debug_string_not_wrapped)

# Both should produce correct results
expected = torch.matmul(x, y)
self.assertEqual(result_wrapped, expected)
self.assertEqual(result_not_wrapped, expected)

@requires_cuda_and_triton
def test_wrap_inductor_compiled_regions_with_backward(self):
"""
Test that wrap_inductor_compiled_regions works correctly with autograd.
"""
from torch.utils._debug_mode import DebugMode

@torch.compile(
backend="inductor",
options={"wrap_inductor_compiled_regions": True},
fullgraph=True,
)
def fn(x, y):
return torch.matmul(x, y)

x = torch.randn(4, 4, device="cuda", requires_grad=True)
y = torch.randn(4, 4, device="cuda", requires_grad=True)

# Clone for eager comparison
x_eager = x.detach().clone().requires_grad_(True)
y_eager = y.detach().clone().requires_grad_(True)

# Compiled forward and backward
with DebugMode() as debug_mode:
result = fn(x, y)
loss = result.sum()
loss.backward()

# HOP should be visible in forward pass
self.assertIn("inductor_compiled_code", debug_mode.debug_string())

# Eager forward and backward for comparison
expected = torch.matmul(x_eager, y_eager)
expected_loss = expected.sum()
expected_loss.backward()

# Check correctness
self.assertEqual(result, expected)
self.assertEqual(x.grad, x_eager.grad)
self.assertEqual(y.grad, y_eager.grad)


class HigherOrderOpVmapGuardTests(
torch._dynamo.test_case.TestCaseWithNestedGraphBreaks, LoggingTestCase
@@ -6895,7 +6983,7 @@ class ActivationCheckpointingTests(
fn, backend, x, y, skip_check=True
)  # dropout decomp is known to diverge with eager

@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._functorch.config.patch(functionalize_rng_ops=True)
def test_fallback(self):
def gn(x, y):

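Aside (not part of the diff): a minimal standalone sketch of the option the two tests above exercise, assuming the wrap_inductor_compiled_regions inductor option and DebugMode behave as asserted there.

import torch
from torch.utils._debug_mode import DebugMode

@torch.compile(
    backend="inductor",
    options={"wrap_inductor_compiled_regions": True},  # option under test above
    fullgraph=True,
)
def f(x, y):
    return torch.matmul(x, y)

x = torch.randn(4, 4, device="cuda")
y = torch.randn(4, 4, device="cuda")
with DebugMode() as dm:
    f(x, y)
# Per the test above, the compiled region should appear as an
# inductor_compiled_code HOP in the debug string.
print("inductor_compiled_code" in dm.debug_string())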
@ -470,7 +470,7 @@ class <lambda>(torch.nn.Module):
|
||||
)
|
||||
|
||||
@requires_cuda
|
||||
def test_stream_backward(self) -> None:
|
||||
def test_stream_backward_simple(self) -> None:
|
||||
def fn(x, y):
|
||||
s2 = torch.Stream()
|
||||
s0 = torch.Stream()
|
||||
@ -524,7 +524,68 @@ class GraphModule(torch.nn.Module):
|
||||
# Annotation: {'stream': 1}
|
||||
mul_3: "f32[2, 2]" = torch.ops.aten.mul.Tensor(tangents_1, 2); tangents_1 = None
|
||||
|
||||
# Annotation: {'stream': 0}
|
||||
add_3: "f32[2, 2]" = torch.ops.aten.add.Tensor(mul_2, mul_3); mul_2 = mul_3 = None
|
||||
return (add_3, add_2)
|
||||
""",
|
||||
)
|
||||
|
||||
@requires_cuda
|
||||
def test_stream_backward_sync(self) -> None:
|
||||
def fn(x, y):
|
||||
s2 = torch.Stream()
|
||||
s0 = torch.Stream()
|
||||
with s0:
|
||||
y0 = 2 * x + y
|
||||
with s2:
|
||||
z = 2 * x + y
|
||||
|
||||
return y0, z
|
||||
|
||||
inp = (
|
||||
torch.ones(2, 2, device="cuda:0", requires_grad=True) + 1,
|
||||
torch.ones(2, 2, device="cuda:0", requires_grad=True),
|
||||
)
|
||||
expected = fn(*inp)
|
||||
(
|
||||
actual,
|
||||
_,
|
||||
fw_graphs,
|
||||
bw_graphs,
|
||||
) = extract_graph(fn, *inp)
|
||||
self.assertEqual(len(fw_graphs), 1)
|
||||
self.assertEqual(expected, actual)
|
||||
self.assertExpectedInline(
|
||||
print_graph(fw_graphs[0]),
|
||||
"""\
|
||||
class GraphModule(torch.nn.Module):
|
||||
def forward(self, primals_1: "f32[2, 2]", primals_2: "f32[2, 2]"):
|
||||
# Annotation: {'stream': 1}
|
||||
mul: "f32[2, 2]" = torch.ops.aten.mul.Tensor(primals_1, 2); primals_1 = None
|
||||
add: "f32[2, 2]" = torch.ops.aten.add.Tensor(mul, primals_2)
|
||||
|
||||
# Annotation: {'stream': 0}
|
||||
add_1: "f32[2, 2]" = torch.ops.aten.add.Tensor(mul, primals_2); mul = primals_2 = None
|
||||
return (add, add_1)
|
||||
""",
|
||||
)
|
||||
|
||||
actual[1].sum().backward()
|
||||
self.assertExpectedInline(
|
||||
print_graph(bw_graphs[0]),
|
||||
"""\
|
||||
class GraphModule(torch.nn.Module):
|
||||
def forward(self, tangents_1: "f32[2, 2]", tangents_2: "f32[2, 2]"):
|
||||
# Annotation: {'stream': 0}
|
||||
mul_2: "f32[2, 2]" = torch.ops.aten.mul.Tensor(tangents_2, 2)
|
||||
|
||||
#
|
||||
add_2: "f32[2, 2]" = torch.ops.aten.add.Tensor(tangents_2, tangents_1); tangents_2 = None
|
||||
|
||||
# Annotation: {'stream': 1}
|
||||
mul_3: "f32[2, 2]" = torch.ops.aten.mul.Tensor(tangents_1, 2); tangents_1 = None
|
||||
|
||||
# Annotation: {'stream': 0}
|
||||
add_3: "f32[2, 2]" = torch.ops.aten.add.Tensor(mul_2, mul_3); mul_2 = mul_3 = None
|
||||
return (add_3, add_2)
|
||||
""",
|
||||
|
||||
test/dynamo/test_wrap_inductor_compiled_regions.py (new file, 1075 lines): diff suppressed because it is too large.
@@ -1372,6 +1372,8 @@ aten::view_as_complex_copy.out
aten::view_as_real
aten::view_as_real_copy
aten::view_as_real_copy.out
aten::zendnn_linear_unary
aten::zendnn_weight_prepack_for_linear
aten::zeros.names
aten::zeros.names_out
aten::zeros.out

@@ -456,6 +456,31 @@ def forward(self, x):
test_inputs = make_inputs()
self.assertEqual(gm(*test_inputs), foo(*test_inputs))

def test_dynamo_graph_capture_with_call_override(self):
class _InterestingModule(torch.nn.Module):
def __init__(self, module):
super().__init__()
self._module = module

def __call__(self, *args, **kwargs):
return self._module(*args, **kwargs)

class MyModel(torch.nn.Module):
def forward(self, x):
return x + 1

foo = _InterestingModule(MyModel())

def make_inputs():
return (torch.randn(2, 3),)

trace_inputs = make_inputs()
gm = dynamo_graph_capture_for_export(foo)(*trace_inputs)
test_inputs = make_inputs()
self.assertEqual(gm(*test_inputs), foo(*test_inputs))
self.assertEqual(len(list(gm.buffers())), len(list(foo.buffers())))
self.assertEqual(len(list(gm.parameters())), len(list(foo.parameters())))

def test_dynamo_graph_capture_custom_pytree_type(self):
import torch.utils._pytree as pytree


@ -3,12 +3,17 @@ import io
|
||||
from unittest.mock import patch
|
||||
|
||||
import torch
|
||||
from torch._dynamo.utils import counters
|
||||
from torch._functorch.aot_autograd import aot_export_module
|
||||
from torch.fx.experimental.proxy_tensor import make_fx
|
||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
from torch.testing._internal.common_utils import (
|
||||
instantiate_parametrized_tests,
|
||||
parametrize,
|
||||
run_tests,
|
||||
TestCase,
|
||||
)
|
||||
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
class TestHopPrint(TestCase):
|
||||
def test_base_print(self):
|
||||
def f(x):
|
||||
@ -18,7 +23,6 @@ class TestHopPrint(TestCase):
|
||||
torch._higher_order_ops.print("moo")
|
||||
return x
|
||||
|
||||
counters.clear()
|
||||
x = torch.randn(3, 3)
|
||||
with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
|
||||
f(x)
|
||||
@ -33,7 +37,6 @@ class TestHopPrint(TestCase):
|
||||
x = x * x
|
||||
return x
|
||||
|
||||
counters.clear()
|
||||
x = torch.randn(3, 3)
|
||||
with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
|
||||
f(x)
|
||||
@ -184,6 +187,62 @@ x = add_1, y = add_2); getitem = None
|
||||
"""print(str format_str) -> ()""",
|
||||
)
|
||||
|
||||
@parametrize("backend", ["eager", "aot_eager"])
|
||||
def test_reorder_print_no_graph_break(self, backend):
|
||||
def f(x):
|
||||
x1 = x + x
|
||||
torch._higher_order_ops.print("moo {x}", x=x1)
|
||||
x2 = x1 * x1
|
||||
torch._higher_order_ops.print("moo {x}", x=x2)
|
||||
x3 = x2 + x2
|
||||
return (x1, x3)
|
||||
|
||||
# Eager and aot_eager backend for dynamo tracing testing
|
||||
x = torch.randn(3, 3)
|
||||
opt_f = torch.compile(backend=backend, fullgraph=True)(f)
|
||||
with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
|
||||
opt_out = opt_f(x)
|
||||
printed_output = mock_stdout.getvalue().strip()
|
||||
orig_out = f(x)
|
||||
|
||||
self.assertEqual(
|
||||
printed_output,
|
||||
f"moo {x * 2}\nmoo {x * 2 * x * 2}",
|
||||
)
|
||||
self.assertEqual(orig_out, opt_out)
|
||||
|
||||
x_new = torch.randn(2, 2)
|
||||
with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
|
||||
opt_out = opt_f(x_new)
|
||||
printed_output = mock_stdout.getvalue().strip()
|
||||
|
||||
self.assertEqual(
|
||||
printed_output,
|
||||
f"moo {x_new * 2}\nmoo {x_new * 2 * x_new * 2}",
|
||||
)
|
||||
|
||||
@parametrize("backend", ["eager", "aot_eager"])
|
||||
def test_constant_mutation(self, backend):
|
||||
def f(x):
|
||||
alist = [x]
|
||||
alist.append(x + 1)
|
||||
torch._higher_order_ops.print("moo {x}", x=alist[-1])
|
||||
alist[0].sum().item() # graph break
|
||||
res = alist.pop()
|
||||
torch._higher_order_ops.print("moo {x}", x=alist[-1])
|
||||
res.sum().item() # graph break
|
||||
return res
|
||||
|
||||
inputs = (torch.tensor([1]),)
|
||||
opt_f = torch.compile(backend=backend, fullgraph=True)(f)
|
||||
with patch("sys.stdout", new_callable=io.StringIO) as mock_stdout:
|
||||
opt_out = opt_f(*inputs)
|
||||
printed_output = mock_stdout.getvalue().strip()
|
||||
orig_out = f(*inputs)
|
||||
|
||||
self.assertEqual(printed_output, "moo tensor([2])\nmoo tensor([1])")
|
||||
self.assertEqual(orig_out, opt_out)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
||||
@ -1554,7 +1554,8 @@ class AOTInductorTestsTemplate:
|
||||
|
||||
# scaled_dot_product_flash_attention
|
||||
@unittest.skipIf(
|
||||
not HAS_XPU_AND_TRITON and not SM80OrLater, "bfloat16 only supported in sm80+"
|
||||
not SM80OrLater and not HAS_XPU_AND_TRITON,
|
||||
"bfloat16 only supported in sm80+ or XPU",
|
||||
)
|
||||
def test_sdpa(self):
|
||||
class Model(torch.nn.Module):
|
||||
@ -1571,7 +1572,10 @@ class AOTInductorTestsTemplate:
|
||||
)
|
||||
self.check_model(Model(), example_inputs)
|
||||
|
||||
@unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+")
|
||||
@unittest.skipIf(
|
||||
not SM80OrLater and not HAS_XPU_AND_TRITON,
|
||||
"bfloat16 only supported in sm80+ or XPU",
|
||||
)
|
||||
@unittest.skipIf(
|
||||
# for archs where this isn't lowered to flash attention, the math
|
||||
# backend will be used and it doesn't work for bfloat16
|
||||
@ -5926,8 +5930,8 @@ class AOTInductorTestsTemplate:
|
||||
@requires_gpu
|
||||
def test_d2h_copy(self):
|
||||
# device to copy host should always have the same stride
|
||||
if "cuda" not in self.device:
|
||||
raise unittest.SkipTest("This test is only for CUDA")
|
||||
if self.device not in ["cuda", "xpu"]:
|
||||
raise unittest.SkipTest("This test is only for CUDA or XPU")
|
||||
|
||||
class ToCpuModel(nn.Module):
|
||||
def forward(self, x):
|
||||
|
||||
@ -28,7 +28,7 @@ from torch.export.pt2_archive._package import (
|
||||
load_weights_to_pt2_contents,
|
||||
)
|
||||
from torch.testing._internal.common_cuda import _get_torch_cuda_version
|
||||
from torch.testing._internal.common_utils import IS_FBCODE, skipIfXpu
|
||||
from torch.testing._internal.common_utils import IS_FBCODE, skipIfXpu, TEST_CUDA
|
||||
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
|
||||
|
||||
|
||||
@ -267,9 +267,9 @@ class TestAOTInductorPackage(TestCase):
|
||||
|
||||
@unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode")
|
||||
@unittest.skipIf(
|
||||
_get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+"
|
||||
TEST_CUDA and _get_torch_cuda_version() < (12, 6),
|
||||
"Test is only supported on CUDA 12.6+",
|
||||
)
|
||||
@skipIfXpu # build system may be different
|
||||
def test_compile_after_package(self):
|
||||
self.check_package_cpp_only()
|
||||
|
||||
|
||||
@ -11,19 +11,19 @@ from torch.testing._internal.common_utils import (
|
||||
instantiate_parametrized_tests,
|
||||
TestCase,
|
||||
)
|
||||
from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA_AND_TRITON
|
||||
from torch.testing._internal.triton_utils import requires_cuda_and_triton
|
||||
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU_AND_TRITON
|
||||
from torch.testing._internal.triton_utils import requires_gpu_and_triton
|
||||
|
||||
|
||||
aten = torch.ops.aten
|
||||
|
||||
try:
|
||||
try:
|
||||
from .test_torchinductor import check_model, check_model_cuda
|
||||
from .test_torchinductor import check_model, check_model_gpu
|
||||
except ImportError:
|
||||
from test_torchinductor import ( # @manual=fbcode//caffe2/test/inductor:test_inductor-library
|
||||
check_model,
|
||||
check_model_cuda,
|
||||
check_model_gpu,
|
||||
)
|
||||
except (unittest.SkipTest, ImportError) as e:
|
||||
sys.stderr.write(f"{type(e)}: {e}\n")
|
||||
@ -34,7 +34,7 @@ except (unittest.SkipTest, ImportError) as e:
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
class ComboKernelTests(TestCase):
|
||||
check_model_cuda = check_model_cuda
|
||||
check_model_gpu = check_model_gpu
|
||||
check_model_cpu = check_model
|
||||
check_kernel_count = True
|
||||
|
||||
@ -56,7 +56,7 @@ class ComboKernelTests(TestCase):
|
||||
torch._inductor.metrics.reset()
|
||||
super().tearDown()
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_activation_functions(self):
|
||||
def test_activations(a, b, c):
|
||||
a1 = torch.nn.functional.relu(a)
|
||||
@ -65,9 +65,9 @@ class ComboKernelTests(TestCase):
|
||||
return a1, b1, c1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_activations(*inps)
|
||||
@ -76,7 +76,7 @@ class ComboKernelTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_reduce_functions(self):
|
||||
def test_reduce(a, b, c, d):
|
||||
a1 = torch.sum(a, dim=0)
|
||||
@ -87,10 +87,10 @@ class ComboKernelTests(TestCase):
|
||||
return a1, b1, c1, d1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_reduce(*inps)
|
||||
@ -99,7 +99,7 @@ class ComboKernelTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(torch._inductor.metrics.generated_kernel_count <= 2)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_mutated_args(self):
|
||||
def test_mutated(a, b, c, d):
|
||||
a.add_(1)
|
||||
@ -110,10 +110,10 @@ class ComboKernelTests(TestCase):
|
||||
return a, b, c, d
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_mutated(*inps)
|
||||
@ -122,7 +122,7 @@ class ComboKernelTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_reduce_split(self):
|
||||
def fn(a, b):
|
||||
a1 = torch.linalg.vector_norm(a)
|
||||
@ -130,15 +130,15 @@ class ComboKernelTests(TestCase):
|
||||
return a1, b1
|
||||
|
||||
inps = [
|
||||
torch.rand(2048, 512, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(2048, 512, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
]
|
||||
out_eager = fn(*inps)
|
||||
out_compiled = torch.compile(fn)(*inps)
|
||||
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_2d_blocking_partitioning(self):
|
||||
def fn(a0, a1, a2, b0, b1, b2):
|
||||
c0 = torch.add(a0, b0)
|
||||
@ -146,15 +146,15 @@ class ComboKernelTests(TestCase):
|
||||
c2 = torch.add(a2, b2)
|
||||
return c0, c1, c2
|
||||
|
||||
self.check_model_cuda(
|
||||
self.check_model_gpu(
|
||||
fn,
|
||||
(
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(40, 30, device="cuda"),
|
||||
torch.rand(36, 40, device="cuda"),
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(30, 40, device="cuda").t(),
|
||||
torch.rand(40, 36, device="cuda").t(),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(40, 30, device=GPU_TYPE),
|
||||
torch.rand(36, 40, device=GPU_TYPE),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(30, 40, device=GPU_TYPE).t(),
|
||||
torch.rand(40, 36, device=GPU_TYPE).t(),
|
||||
),
|
||||
)
|
||||
|
||||
@ -163,7 +163,7 @@ class ComboKernelTests(TestCase):
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
class ComboKernelBenchmarkTests(TestCase):
|
||||
check_model_cuda = check_model_cuda
|
||||
check_model_gpu = check_model_gpu
|
||||
check_model_cpu = check_model
|
||||
check_kernel_count = True
|
||||
|
||||
@ -185,7 +185,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
torch._inductor.metrics.reset()
|
||||
super().tearDown()
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_activation_benchmark(self):
|
||||
def test_activations(a, b, c):
|
||||
a1 = torch.nn.functional.relu(a)
|
||||
@ -194,9 +194,9 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
return a1, b1, c1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_activations(*inps)
|
||||
@ -205,7 +205,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_reduce_benchmark(self):
|
||||
def test_reduce(a, b, c, d):
|
||||
a1 = torch.sum(a, dim=0)
|
||||
@ -216,10 +216,10 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
return a1, b1, c1, d1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_reduce(*inps)
|
||||
@ -228,7 +228,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(4 < torch._inductor.metrics.generated_kernel_count <= 10)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_mutated_benchmark(self):
|
||||
def test_mutated(a, b, c, d):
|
||||
a.add_(1)
|
||||
@ -239,10 +239,10 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
return a, b, c, d
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_mutated(*inps)
|
||||
@ -251,7 +251,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(torch._inductor.metrics.generated_kernel_count in [6, 9])
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_round_robin_dispatch(self):
|
||||
# combo kernel dispatch strategy: round robin
|
||||
def test_mutated(a, b, c, d):
|
||||
@ -263,10 +263,10 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
return a, b, c, d
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 5, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(5, 18, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 5, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(5, 18, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_mutated(*inps)
|
||||
@ -275,7 +275,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 6)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_2d_blocking_benchmark(self):
|
||||
def fn(a0, a1, a2, b0, b1, b2):
|
||||
c0 = torch.add(a0, b0)
|
||||
@ -283,28 +283,28 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
c2 = torch.add(a2, b2)
|
||||
return c0, c1, c2
|
||||
|
||||
self.check_model_cuda(
|
||||
self.check_model_gpu(
|
||||
fn,
|
||||
(
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(40, 30, device="cuda"),
|
||||
torch.rand(36, 40, device="cuda"),
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(30, 40, device="cuda").t(),
|
||||
torch.rand(40, 36, device="cuda").t(),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(40, 30, device=GPU_TYPE),
|
||||
torch.rand(36, 40, device=GPU_TYPE),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(30, 40, device=GPU_TYPE).t(),
|
||||
torch.rand(40, 36, device=GPU_TYPE).t(),
|
||||
),
|
||||
)
|
||||
|
||||
self.assertTrue(7 <= torch._inductor.metrics.generated_kernel_count <= 8)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_persistent_reduction_no_x_dim(self):
|
||||
def fn(x, y):
|
||||
return x.sum(1), y.sum(1)
|
||||
|
||||
inps = (
|
||||
torch.rand(16, 256, device="cuda"),
|
||||
torch.rand(32, 256, device="cuda"),
|
||||
torch.rand(16, 256, device=GPU_TYPE),
|
||||
torch.rand(32, 256, device=GPU_TYPE),
|
||||
)
|
||||
torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256)
|
||||
torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256)
|
||||
@ -317,7 +317,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
class ComboKernelDynamicShapesTests(TestCase):
|
||||
check_model_cuda = check_model_cuda
|
||||
check_model_gpu = check_model_gpu
|
||||
check_model_cpu = check_model
|
||||
check_kernel_count = True
|
||||
|
||||
@ -347,7 +347,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
torch._inductor.metrics.reset()
|
||||
super().tearDown()
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_dynamic_shapes_activations(self):
|
||||
def test_activations(a, b, c):
|
||||
a1 = torch.nn.functional.relu(a)
|
||||
@ -356,9 +356,9 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return a1, b1, c1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_activations(*inps)
|
||||
@ -367,7 +367,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_dynamic_shapes_2d_blocking(self):
|
||||
def fn(a0, a1, a2, b0, b1, b2):
|
||||
c0 = torch.add(a0, b0)
|
||||
@ -375,21 +375,21 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
c2 = torch.add(a2, b2)
|
||||
return c0, c1, c2
|
||||
|
||||
self.check_model_cuda(
|
||||
self.check_model_gpu(
|
||||
fn,
|
||||
(
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(40, 30, device="cuda"),
|
||||
torch.rand(36, 40, device="cuda"),
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(30, 40, device="cuda").t(),
|
||||
torch.rand(40, 36, device="cuda").t(),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(40, 30, device=GPU_TYPE),
|
||||
torch.rand(36, 40, device=GPU_TYPE),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(30, 40, device=GPU_TYPE).t(),
|
||||
torch.rand(40, 36, device=GPU_TYPE).t(),
|
||||
),
|
||||
)
|
||||
|
||||
self.assertTrue(7 <= torch._inductor.metrics.generated_kernel_count <= 8)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_dynamic_shapes_reduce(self):
|
||||
def test_reduce(a, b, c, d):
|
||||
a1 = torch.sum(a, dim=0)
|
||||
@ -400,10 +400,10 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return a1, b1, c1, d1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_reduce(*inps)
|
||||
@ -412,7 +412,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(4 < torch._inductor.metrics.generated_kernel_count <= 10)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_dynamic_shapes_mutated(self):
|
||||
# combo kernel dispatch strategy: round robin
|
||||
def test_mutated(a, b, c, d):
|
||||
@ -424,10 +424,10 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return a, b, c, d
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 5, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(5, 18, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 5, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(5, 18, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_mutated(*inps)
|
||||
@ -436,7 +436,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 6)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._inductor.config.patch("combo_kernels_autotune", 0)
|
||||
def test_dynamic_shapes_activations_no_autotune(self):
|
||||
def test_activations(a, b, c):
|
||||
@ -446,9 +446,9 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return a1, b1, c1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_activations(*inps)
|
||||
@ -457,7 +457,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
|
||||
@torch._dynamo.config.patch("assume_static_by_default", True)
|
||||
def test_dynamic_shapes_persistent_reduction_no_x_dim(self):
|
||||
@ -465,8 +465,8 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return x.sum(1), y.sum(1)
|
||||
|
||||
inps = (
|
||||
torch.rand(16, 256, device="cuda"),
|
||||
torch.rand(32, 256, device="cuda"),
|
||||
torch.rand(16, 256, device=GPU_TYPE),
|
||||
torch.rand(32, 256, device=GPU_TYPE),
|
||||
)
|
||||
torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256)
|
||||
torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256)
|
||||
@ -476,7 +476,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
|
||||
@torch._dynamo.config.patch("assume_static_by_default", True)
|
||||
def test_dynamic_shapes_persistent_reduction_no_x_dim_2(self):
|
||||
@ -484,8 +484,8 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return x.sum(2), y.sum(2)
|
||||
|
||||
inps = (
|
||||
torch.rand(8, 16, 256, device="cuda"),
|
||||
torch.rand(8, 32, 256, device="cuda"),
|
||||
torch.rand(8, 16, 256, device=GPU_TYPE),
|
||||
torch.rand(8, 32, 256, device=GPU_TYPE),
|
||||
)
|
||||
torch._dynamo.mark_dynamic(inps[0], (0, 1), min=1, max=256)
|
||||
torch._dynamo.mark_dynamic(inps[1], (0, 1), min=1, max=256)
|
||||
@ -495,7 +495,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
|
||||
@torch._dynamo.config.patch("assume_static_by_default", True)
|
||||
def test_dynamic_shapes_2d_blocking_round_robin(self):
|
||||
@ -506,12 +506,12 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return c0, c1, c2
|
||||
|
||||
inps = (
|
||||
torch.rand(20, 30, device="cuda"),
|
||||
torch.rand(30, 30, device="cuda"),
|
||||
torch.rand(40, 32, device="cuda"),
|
||||
torch.rand(30, 20, device="cuda").t(),
|
||||
torch.rand(30, 30, device="cuda").t(),
|
||||
torch.rand(32, 40, device="cuda").t(),
|
||||
torch.rand(20, 30, device=GPU_TYPE),
|
||||
torch.rand(30, 30, device=GPU_TYPE),
|
||||
torch.rand(40, 32, device=GPU_TYPE),
|
||||
torch.rand(30, 20, device=GPU_TYPE).t(),
|
||||
torch.rand(30, 30, device=GPU_TYPE).t(),
|
||||
torch.rand(32, 40, device=GPU_TYPE).t(),
|
||||
)
|
||||
|
||||
out_eager = fn(*inps)
|
||||
@ -522,19 +522,19 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
torch._inductor.metrics.reset()
|
||||
|
||||
inps = (
|
||||
torch.rand(24, 30, device="cuda"),
|
||||
torch.rand(32, 30, device="cuda"),
|
||||
torch.rand(48, 32, device="cuda"),
|
||||
torch.rand(30, 24, device="cuda").t(),
|
||||
torch.rand(30, 32, device="cuda").t(),
|
||||
torch.rand(32, 48, device="cuda").t(),
|
||||
torch.rand(24, 30, device=GPU_TYPE),
|
||||
torch.rand(32, 30, device=GPU_TYPE),
|
||||
torch.rand(48, 32, device=GPU_TYPE),
|
||||
torch.rand(30, 24, device=GPU_TYPE).t(),
|
||||
torch.rand(30, 32, device=GPU_TYPE).t(),
|
||||
torch.rand(32, 48, device=GPU_TYPE).t(),
|
||||
)
|
||||
out_compiled = compiled(*inps)
|
||||
out_eager = fn(*inps)
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(5 <= torch._inductor.metrics.generated_kernel_count <= 6)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
|
||||
@torch._dynamo.config.patch("assume_static_by_default", True)
|
||||
@torch._inductor.config.patch("triton.autotune_at_compile_time", True)
|
||||
@ -543,9 +543,9 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return x.sum(1), y.mean(1), z.max(1)
|
||||
|
||||
inps = (
|
||||
torch.rand(16, 128, device="cuda"),
|
||||
torch.rand(32, 128, device="cuda"),
|
||||
torch.rand(32, 256, device="cuda"),
|
||||
torch.rand(16, 128, device=GPU_TYPE),
|
||||
torch.rand(32, 128, device=GPU_TYPE),
|
||||
torch.rand(32, 256, device=GPU_TYPE),
|
||||
)
|
||||
torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256)
|
||||
torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256)
|
||||
@ -555,15 +555,15 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_helper_fn_defined(self):
|
||||
def fn(x, y, z):
|
||||
return x.sum(1), y.mean(1), z.cumsum(1)
|
||||
|
||||
inps = (
|
||||
torch.rand(16, 128, device="cuda"),
|
||||
torch.rand(32, 128, device="cuda"),
|
||||
torch.rand(32, 256, device="cuda"),
|
||||
torch.rand(16, 128, device=GPU_TYPE),
|
||||
torch.rand(32, 128, device=GPU_TYPE),
|
||||
torch.rand(32, 256, device=GPU_TYPE),
|
||||
)
|
||||
|
||||
out_eager = fn(*inps)
|
||||
@ -577,5 +577,5 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
if __name__ == "__main__":
|
||||
from torch._dynamo.test_case import run_tests
|
||||
|
||||
if HAS_CPU or HAS_CUDA_AND_TRITON:
|
||||
if HAS_CPU or HAS_GPU_AND_TRITON:
|
||||
run_tests(needs="filelock")
|
||||
|
||||
@ -45,6 +45,7 @@ from torch.testing._internal.common_utils import (
|
||||
parametrize,
|
||||
scoped_load_inline,
|
||||
skipIfWindows,
|
||||
skipIfXpu,
|
||||
)
|
||||
from torch.testing._internal.hop_db import hop_db
|
||||
from torch.testing._internal.inductor_utils import (
|
||||
@ -52,9 +53,13 @@ from torch.testing._internal.inductor_utils import (
|
||||
HAS_CPU,
|
||||
HAS_CUDA_AND_TRITON,
|
||||
HAS_GPU,
|
||||
HAS_XPU_AND_TRITON,
|
||||
)
|
||||
from torch.testing._internal.logging_utils import logs_to_string
|
||||
from torch.testing._internal.triton_utils import requires_cuda_and_triton
|
||||
from torch.testing._internal.triton_utils import (
|
||||
requires_cuda_and_triton,
|
||||
requires_gpu_and_triton,
|
||||
)
|
||||
from torch.utils._python_dispatch import TorchDispatchMode
|
||||
|
||||
|
||||
@ -3049,13 +3054,14 @@ main()
|
||||
|
||||
self.assertEqual(counters["inductor"]["cudagraph_skips"], 1)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@skipIfXpu(msg="cudagraphs not supported on xpu for now!")
|
||||
@requires_gpu_and_triton
|
||||
def test_cudagraphs_sdpa(self):
|
||||
query = torch.rand(
|
||||
32, 8, 128, 64, dtype=torch.float16, device="cuda", requires_grad=True
|
||||
32, 8, 128, 64, dtype=torch.float16, device=GPU_TYPE, requires_grad=True
|
||||
)
|
||||
key = torch.rand(32, 8, 128, 64, dtype=torch.float16, device="cuda")
|
||||
value = torch.rand(32, 8, 128, 64, dtype=torch.float16, device="cuda")
|
||||
key = torch.rand(32, 8, 128, 64, dtype=torch.float16, device=GPU_TYPE)
|
||||
value = torch.rand(32, 8, 128, 64, dtype=torch.float16, device=GPU_TYPE)
|
||||
out = torch.nn.functional.scaled_dot_product_attention(query, key, value)
|
||||
|
||||
with (
|
||||
@ -3747,7 +3753,7 @@ class CompiledAutograd0(torch.nn.Module):
|
||||
self.assertTrue(isinstance(view_nodes[0].args[1][0], torch.fx.Node))
|
||||
self.assertTrue(isinstance(view_nodes[1].args[1][0], torch.fx.Node))
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_flex_attention(self):
|
||||
def _squared(score, b, h, m, n):
|
||||
"""Joint graph needed for correctness"""
|
||||
@ -3765,7 +3771,7 @@ class CompiledAutograd0(torch.nn.Module):
|
||||
a * b,
|
||||
b,
|
||||
dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
device=GPU_TYPE,
|
||||
requires_grad=True,
|
||||
)
|
||||
fwd_bwd(v)
|
||||
@ -5333,12 +5339,13 @@ if IS_S390X:
|
||||
test_autograd = load_test_module("test_autograd")
|
||||
test_custom_ops = load_test_module("test_custom_ops")
|
||||
test_higher_order_ops = load_test_module("dynamo/test_higher_order_ops")
|
||||
|
||||
TestAutogradWithCompiledAutograd = wrap_test_class(test_autograd.TestAutograd)
|
||||
if not HAS_XPU_AND_TRITON:
|
||||
TestAutogradWithCompiledAutograd = wrap_test_class(test_autograd.TestAutograd)
|
||||
TestNestedCheckpointWithCompiledAutograd = wrap_test_class(
|
||||
test_autograd.TestNestedCheckpoint
|
||||
)
|
||||
TestCustomOpWithCompiledAutograd = wrap_test_class(test_custom_ops.TestCustomOp)
|
||||
if not HAS_XPU_AND_TRITON:
|
||||
TestCustomOpWithCompiledAutograd = wrap_test_class(test_custom_ops.TestCustomOp)
|
||||
HigherOrderOpTestsWithCompiledAutograd = wrap_test_class(
|
||||
test_higher_order_ops.HigherOrderOpTests
|
||||
)
|
||||
@ -5367,6 +5374,7 @@ class TestCompiledAutogradOpInfo(TestCase):
|
||||
super(TestCase, self).tearDown()
|
||||
reset()
|
||||
|
||||
@skipIfXpu(msg="NotImplementedError: The operator 'testlib::mutating_custom_op'")
|
||||
@ops(
|
||||
list(filter(lambda op: op.name not in xfail_hops, hop_db)),
|
||||
allowed_dtypes=(torch.float,),
|
||||
@ -5419,7 +5427,7 @@ class TestCompiledAutogradOpInfo(TestCase):
|
||||
self.assertEqual(expected, actual)
|
||||
|
||||
|
||||
instantiate_device_type_tests(TestCompiledAutogradOpInfo, globals())
|
||||
instantiate_device_type_tests(TestCompiledAutogradOpInfo, globals(), allow_xpu=True)
|
||||
instantiate_parametrized_tests(TestCompiledAutograd)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -65,7 +65,11 @@ from torch.testing._internal.inductor_utils import (
|
||||
HAS_GPU,
|
||||
has_triton,
|
||||
)
|
||||
from torch.testing._internal.triton_utils import requires_cuda_and_triton, requires_gpu
|
||||
from torch.testing._internal.triton_utils import (
|
||||
requires_cuda_and_triton,
|
||||
requires_gpu,
|
||||
requires_gpu_and_triton,
|
||||
)
|
||||
|
||||
|
||||
def get_inputs(optim):
|
||||
@ -946,7 +950,7 @@ class CompiledOptimizerTests(TestCase):
|
||||
kwargs = aot_graph_input_parser(forward)
|
||||
torch.compile(forward)(**kwargs)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_foreach_map_adam(self):
|
||||
params = [
|
||||
torch.rand(
|
||||
|
||||
Some files were not shown because too many files have changed in this diff.