Mirror of https://github.com/pytorch/pytorch.git, synced 2025-11-19 01:54:54 +08:00.

Comparing 2 commits (main ... ciflow/tru): 6e16bfcd3e, d1f572a901
@ -1,19 +0,0 @@
# Aarch64 (ARM/Graviton) Support Scripts

Scripts for building aarch64 PyTorch pip wheels. These scripts build the following wheels:

* torch
* torchvision
* torchaudio
* torchtext
* torchdata

## aarch64_ci_build.sh

This script is designed to support CD operations inside the PyPA manylinux aarch64 container and is meant to be executed within that container. It prepares the container and then runs __aarch64_wheel_ci_build.py__ to build the wheels. The script assumes the PyTorch repo is located at ```/pytorch``` and puts the built wheels into ```/artifacts```. A sketch of a typical invocation is shown below.

### Usage

```DESIRED_PYTHON=<PythonVersion> aarch64_ci_build.sh```
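For reference, a minimal sketch of driving this build from outside the container, assuming a local PyTorch checkout and an artifacts directory; the image tag and mount paths are illustrative assumptions, not values taken from CI:

```bash
# Hypothetical wrapper: run the CD build inside a manylinux aarch64 container.
# /pytorch and /artifacts are the paths the script expects; DESIRED_CUDA=cpu
# selects a CPU-only wheel (see aarch64_ci_build.sh below).
docker run --rm \
    -v "$(pwd)/pytorch:/pytorch" \
    -v "$(pwd)/artifacts:/artifacts" \
    -e DESIRED_PYTHON=3.11 \
    -e DESIRED_CUDA=cpu \
    quay.io/pypa/manylinux_2_28_aarch64:latest \
    bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh
```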

__NOTE:__ The CI build is currently __EXPERIMENTAL__.

## build_aarch64_wheel.py

This script builds the wheels on AWS EC2 resources. It requires the AWS CLI and Boto3, configured with AWS credentials, in order to launch the EC2 instances used for the wheel builds. It can be run from a CodeBuild CD pipeline or from a local system; a sketch of the prerequisites follows the usage line below.

### Usage

```build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>```
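A minimal sketch of the local setup the script assumes (key names, region, and Python/branch values below are illustrative, not prescribed by this repo):

```bash
# Install the AWS SDK used by the script and configure AWS CLI credentials.
pip install boto3
aws configure   # or export AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_DEFAULT_REGION

# compute_keyfile_path() falls back to these variables when --key-name is omitted
# or when the .pem file lives outside the default ~/.ssh/<key-name>.pem location.
export AWS_KEY_NAME=my-ec2-keypair
export SSH_KEY_PATH="$HOME/.ssh/my-ec2-keypair.pem"

./build_aarch64_wheel.py --key-name my-ec2-keypair --use-docker --python 3.10 --branch main
```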
@ -1,53 +0,0 @@
#!/bin/bash
set -eux -o pipefail

GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

# Set CUDA architecture lists to match x86 build_cuda.sh
if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
fi

# Compress the fatbin with -compress-mode=size for CUDA 13
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
    export TORCH_NVCC_FLAGS="-compress-mode=size"
    # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
    export BUILD_BUNDLE_PTXAS=1
fi

SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source "${SCRIPTPATH}/aarch64_ci_setup.sh"

###############################################################################
# Run aarch64 builder python
###############################################################################
cd /
# Add a git safe.directory entry, since the mounted pytorch repo is owned by a
# different user than the one running inside the container.
git config --global --add safe.directory /pytorch
pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
    echo "DESIRED_CUDA is set to cpu. Building cpu wheel."
    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
    echo "DESIRED_CUDA is set to: $DESIRED_CUDA"
    export USE_SYSTEM_NCCL=1

    # Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic)
    if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
        echo "Bundling CUDA libraries with wheel for aarch64."
    else
        echo "Using nvidia libs from pypi for aarch64."
        echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
        export USE_NVIDIA_PYPI_LIBS=1
    fi

    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi
@ -1,21 +0,0 @@
#!/bin/bash
set -eux -o pipefail

# This script prepares the Docker container for the aarch64_wheel_ci_build.py python script
# by creating symlinks from the desired /opt/python version to /usr/local/bin/.

NUMPY_VERSION=2.0.2
if [[ "$DESIRED_PYTHON" == "3.13" || "$DESIRED_PYTHON" == "3.13t" ]]; then
    NUMPY_VERSION=2.1.2
fi

SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
source "${SCRIPTPATH}/../manywheel/set_desired_python.sh"

pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2

# Expose the selected Python toolchain and build tools on the default PATH.
for tool in python python3 pip pip3 ninja scons patchelf; do
    ln -sf "${DESIRED_PYTHON_BIN_DIR}/${tool}" /usr/local/bin
done

python --version
@ -1,333 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# encoding: UTF-8
|
||||
|
||||
import os
|
||||
import shutil
|
||||
from subprocess import check_call, check_output
|
||||
|
||||
|
||||
def list_dir(path: str) -> list[str]:
|
||||
"""'
|
||||
Helper for getting paths for Python
|
||||
"""
|
||||
return check_output(["ls", "-1", path]).decode().split("\n")
|
||||
|
||||
|
||||
def replace_tag(filename) -> None:
|
||||
with open(filename) as f:
|
||||
lines = f.readlines()
|
||||
for i, line in enumerate(lines):
|
||||
if line.startswith("Tag:"):
|
||||
lines[i] = line.replace("-linux_", "-manylinux_2_28_")
|
||||
print(f"Updated tag from {line} to {lines[i]}")
|
||||
break
|
||||
|
||||
with open(filename, "w") as f:
|
||||
f.writelines(lines)
|
||||
|
||||
|
||||
def patch_library_rpath(
|
||||
folder: str,
|
||||
lib_name: str,
|
||||
use_nvidia_pypi_libs: bool = False,
|
||||
desired_cuda: str = "",
|
||||
) -> None:
|
||||
"""Apply patchelf to set RPATH for a library in torch/lib"""
|
||||
lib_path = f"{folder}/tmp/torch/lib/{lib_name}"
|
||||
|
||||
if use_nvidia_pypi_libs:
|
||||
# For PyPI NVIDIA libraries, construct CUDA RPATH
|
||||
cuda_rpaths = [
|
||||
"$ORIGIN/../../nvidia/cudnn/lib",
|
||||
"$ORIGIN/../../nvidia/nvshmem/lib",
|
||||
"$ORIGIN/../../nvidia/nccl/lib",
|
||||
"$ORIGIN/../../nvidia/cusparselt/lib",
|
||||
]
|
||||
|
||||
if "130" in desired_cuda:
|
||||
cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib")
|
||||
else:
|
||||
cuda_rpaths.extend(
|
||||
[
|
||||
"$ORIGIN/../../nvidia/cublas/lib",
|
||||
"$ORIGIN/../../nvidia/cuda_cupti/lib",
|
||||
"$ORIGIN/../../nvidia/cuda_nvrtc/lib",
|
||||
"$ORIGIN/../../nvidia/cuda_runtime/lib",
|
||||
"$ORIGIN/../../nvidia/cufft/lib",
|
||||
"$ORIGIN/../../nvidia/curand/lib",
|
||||
"$ORIGIN/../../nvidia/cusolver/lib",
|
||||
"$ORIGIN/../../nvidia/cusparse/lib",
|
||||
"$ORIGIN/../../nvidia/nvtx/lib",
|
||||
"$ORIGIN/../../nvidia/cufile/lib",
|
||||
]
|
||||
)
|
||||
|
||||
# Add $ORIGIN for local torch libs
|
||||
rpath = ":".join(cuda_rpaths) + ":$ORIGIN"
|
||||
else:
|
||||
# For bundled libraries, just use $ORIGIN
|
||||
rpath = "$ORIGIN"
|
||||
|
||||
if os.path.exists(lib_path):
|
||||
os.system(
|
||||
f"cd {folder}/tmp/torch/lib/; "
|
||||
f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}"
|
||||
)
|
||||
|
||||
|
||||
def copy_and_patch_library(
|
||||
src_path: str,
|
||||
folder: str,
|
||||
use_nvidia_pypi_libs: bool = False,
|
||||
desired_cuda: str = "",
|
||||
) -> None:
|
||||
"""Copy a library to torch/lib and patch its RPATH"""
|
||||
if os.path.exists(src_path):
|
||||
lib_name = os.path.basename(src_path)
|
||||
shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}")
|
||||
patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
|
||||
|
||||
|
||||
def package_cuda_wheel(wheel_path, desired_cuda) -> None:
|
||||
"""
|
||||
Package the cuda wheel libraries
|
||||
"""
|
||||
folder = os.path.dirname(wheel_path)
|
||||
os.mkdir(f"{folder}/tmp")
|
||||
os.system(f"unzip {wheel_path} -d {folder}/tmp")
|
||||
# Delete original wheel since it will be repackaged
|
||||
os.system(f"rm {wheel_path}")
|
||||
|
||||
# Check if we should use PyPI NVIDIA libraries or bundle system libraries
|
||||
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
|
||||
|
||||
if use_nvidia_pypi_libs:
|
||||
print("Using nvidia libs from pypi - skipping CUDA library bundling")
|
||||
# For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages
|
||||
# We only need to bundle non-NVIDIA libraries
|
||||
minimal_libs_to_copy = [
|
||||
"/lib64/libgomp.so.1",
|
||||
"/usr/lib64/libgfortran.so.5",
|
||||
"/acl/build/libarm_compute.so",
|
||||
"/acl/build/libarm_compute_graph.so",
|
||||
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_lapack_core.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_core.so.0",
|
||||
]
|
||||
|
||||
# Copy minimal libraries to unzipped_folder/torch/lib
|
||||
for lib_path in minimal_libs_to_copy:
|
||||
copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
|
||||
|
||||
# Patch torch libraries used for searching libraries
|
||||
torch_libs_to_patch = [
|
||||
"libtorch.so",
|
||||
"libtorch_cpu.so",
|
||||
"libtorch_cuda.so",
|
||||
"libtorch_cuda_linalg.so",
|
||||
"libtorch_global_deps.so",
|
||||
"libtorch_python.so",
|
||||
"libtorch_nvshmem.so",
|
||||
"libc10.so",
|
||||
"libc10_cuda.so",
|
||||
"libcaffe2_nvrtc.so",
|
||||
"libshm.so",
|
||||
]
|
||||
for lib_name in torch_libs_to_patch:
|
||||
patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
|
||||
else:
|
||||
print("Bundling CUDA libraries with wheel")
|
||||
# Original logic for bundling system CUDA libraries
|
||||
# Common libraries for all CUDA versions
|
||||
common_libs = [
|
||||
# Non-NVIDIA system libraries
|
||||
"/lib64/libgomp.so.1",
|
||||
"/usr/lib64/libgfortran.so.5",
|
||||
"/acl/build/libarm_compute.so",
|
||||
"/acl/build/libarm_compute_graph.so",
|
||||
# Common CUDA libraries (same for all versions)
|
||||
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_lapack_core.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_core.so.0",
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
|
||||
"/usr/local/cuda/lib64/libcudnn.so.9",
|
||||
"/usr/local/cuda/lib64/libcusparseLt.so.0",
|
||||
"/usr/local/cuda/lib64/libcurand.so.10",
|
||||
"/usr/local/cuda/lib64/libnccl.so.2",
|
||||
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
|
||||
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_graph.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_ops.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
|
||||
"/usr/local/cuda/lib64/libcufile.so.0",
|
||||
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
|
||||
"/usr/local/cuda/lib64/libcusparse.so.12",
|
||||
]
|
||||
|
||||
# CUDA version-specific libraries
|
||||
if "13" in desired_cuda:
|
||||
minor_version = desired_cuda[-1]
|
||||
version_specific_libs = [
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
|
||||
"/usr/local/cuda/lib64/libcublas.so.13",
|
||||
"/usr/local/cuda/lib64/libcublasLt.so.13",
|
||||
"/usr/local/cuda/lib64/libcudart.so.13",
|
||||
"/usr/local/cuda/lib64/libcufft.so.12",
|
||||
"/usr/local/cuda/lib64/libcusolver.so.12",
|
||||
"/usr/local/cuda/lib64/libnvJitLink.so.13",
|
||||
"/usr/local/cuda/lib64/libnvrtc.so.13",
|
||||
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
|
||||
]
|
||||
elif "12" in desired_cuda:
|
||||
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
|
||||
minor_version = desired_cuda[-1]
|
||||
version_specific_libs = [
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
|
||||
"/usr/local/cuda/lib64/libcublas.so.12",
|
||||
"/usr/local/cuda/lib64/libcublasLt.so.12",
|
||||
"/usr/local/cuda/lib64/libcudart.so.12",
|
||||
"/usr/local/cuda/lib64/libcufft.so.11",
|
||||
"/usr/local/cuda/lib64/libcusolver.so.11",
|
||||
"/usr/local/cuda/lib64/libnvJitLink.so.12",
|
||||
"/usr/local/cuda/lib64/libnvrtc.so.12",
|
||||
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
|
||||
]
|
||||
else:
|
||||
raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")
|
||||
|
||||
# Combine all libraries
|
||||
libs_to_copy = common_libs + version_specific_libs
|
||||
|
||||
# Copy libraries to unzipped_folder/torch/lib
|
||||
for lib_path in libs_to_copy:
|
||||
copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
|
||||
|
||||
# Make sure the wheel is tagged with manylinux_2_28
|
||||
for f in os.scandir(f"{folder}/tmp/"):
|
||||
if f.is_dir() and f.name.endswith(".dist-info"):
|
||||
replace_tag(f"{f.path}/WHEEL")
|
||||
break
|
||||
|
||||
os.system(f"wheel pack {folder}/tmp/ -d {folder}")
|
||||
os.system(f"rm -rf {folder}/tmp/")
|
||||
|
||||
|
||||
def complete_wheel(folder: str) -> str:
|
||||
"""
|
||||
Complete wheel build and put in artifact location
|
||||
"""
|
||||
wheel_name = list_dir(f"/{folder}/dist")[0]
|
||||
|
||||
    # Note: for CUDA builds we don't run auditwheel, since package_cuda_wheel()
    # already bundles the CUDA dependencies into the wheel file.
    # However, we need to make sure the filename reflects the correct manylinux platform.
|
||||
if "pytorch" in folder and not enable_cuda:
|
||||
print("Repairing Wheel with AuditWheel")
|
||||
check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
|
||||
repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0]
|
||||
|
||||
print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
|
||||
os.rename(
|
||||
f"/{folder}/wheelhouse/{repaired_wheel_name}",
|
||||
f"/{folder}/dist/{repaired_wheel_name}",
|
||||
)
|
||||
else:
|
||||
repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
|
||||
|
||||
print(f"Copying {repaired_wheel_name} to artifacts")
|
||||
shutil.copy2(
|
||||
f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}"
|
||||
)
|
||||
|
||||
return repaired_wheel_name
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
"""
|
||||
Parse inline arguments
|
||||
"""
|
||||
from argparse import ArgumentParser
|
||||
|
||||
parser = ArgumentParser("AARCH64 wheels python CD")
|
||||
parser.add_argument("--debug", action="store_true")
|
||||
parser.add_argument("--build-only", action="store_true")
|
||||
parser.add_argument("--test-only", type=str)
|
||||
parser.add_argument("--enable-mkldnn", action="store_true")
|
||||
parser.add_argument("--enable-cuda", action="store_true")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""
|
||||
Entry Point
|
||||
"""
|
||||
args = parse_arguments()
|
||||
enable_mkldnn = args.enable_mkldnn
|
||||
enable_cuda = args.enable_cuda
|
||||
branch = check_output(
|
||||
["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd="/pytorch"
|
||||
).decode()
|
||||
|
||||
print("Building PyTorch wheel")
|
||||
build_vars = ""
|
||||
    # MAX_JOBS=5 is not required for the CPU backend (see commit 465d98b)
|
||||
if enable_cuda:
|
||||
build_vars += "MAX_JOBS=5 "
|
||||
|
||||
# Handle PyPI NVIDIA libraries vs bundled libraries
|
||||
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
|
||||
if use_nvidia_pypi_libs:
|
||||
print("Configuring build for PyPI NVIDIA libraries")
|
||||
# Configure for dynamic linking (matching x86 logic)
|
||||
build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 "
|
||||
else:
|
||||
print("Configuring build for bundled NVIDIA libraries")
|
||||
# Keep existing static linking approach - already configured above
|
||||
|
||||
override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
|
||||
desired_cuda = os.getenv("DESIRED_CUDA")
|
||||
if override_package_version is not None:
|
||||
version = override_package_version
|
||||
build_vars += (
|
||||
f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
|
||||
)
|
||||
elif branch in ["nightly", "main"]:
|
||||
build_date = (
|
||||
check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
|
||||
.decode()
|
||||
.replace("-", "")
|
||||
)
|
||||
version = (
|
||||
check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
|
||||
)
|
||||
if enable_cuda:
|
||||
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
|
||||
else:
|
||||
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
|
||||
elif branch.startswith(("v1.", "v2.")):
|
||||
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
|
||||
|
||||
if enable_mkldnn:
|
||||
print("build pytorch with mkldnn+acl backend")
|
||||
build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
|
||||
build_vars += "ACL_ROOT_DIR=/acl "
|
||||
if enable_cuda:
|
||||
build_vars += "BLAS=NVPL "
|
||||
else:
|
||||
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
|
||||
else:
|
||||
print("build pytorch without mkldnn backend")
|
||||
|
||||
os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
|
||||
if enable_cuda:
|
||||
print("Updating Cuda Dependency")
|
||||
filename = os.listdir("/pytorch/dist/")
|
||||
wheel_path = f"/pytorch/dist/{filename[0]}"
|
||||
package_cuda_wheel(wheel_path, desired_cuda)
|
||||
pytorch_wheel_name = complete_wheel("/pytorch/")
|
||||
print(f"Build Complete. Created {pytorch_wheel_name}..")
|
||||
@ -1,999 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# This script is for building AARCH64 wheels using AWS EC2 instances.
|
||||
# To generate binaries for the release follow these steps:
|
||||
# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this:
|
||||
# "v1.11.0": ("0.11.0", "rc1"),
|
||||
# 2. Run script with following arguments for each of the supported python versions and required tag, for example:
|
||||
# build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch v1.11.0-rc3
|
||||
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from typing import Optional, Union
|
||||
|
||||
import boto3
|
||||
|
||||
|
||||
# AMI images for us-east-1, change the following based on your ~/.aws/config
|
||||
os_amis = {
|
||||
"ubuntu20_04": "ami-052eac90edaa9d08f", # login_name: ubuntu
|
||||
"ubuntu22_04": "ami-0c6c29c5125214c77", # login_name: ubuntu
|
||||
"redhat8": "ami-0698b90665a2ddcf1", # login_name: ec2-user
|
||||
}
|
||||
|
||||
ubuntu20_04_ami = os_amis["ubuntu20_04"]
|
||||
|
||||
|
||||
def compute_keyfile_path(key_name: Optional[str] = None) -> tuple[str, str]:
|
||||
if key_name is None:
|
||||
key_name = os.getenv("AWS_KEY_NAME")
|
||||
if key_name is None:
|
||||
return os.getenv("SSH_KEY_PATH", ""), ""
|
||||
|
||||
homedir_path = os.path.expanduser("~")
|
||||
default_path = os.path.join(homedir_path, ".ssh", f"{key_name}.pem")
|
||||
return os.getenv("SSH_KEY_PATH", default_path), key_name
|
||||
|
||||
|
||||
ec2 = boto3.resource("ec2")
|
||||
|
||||
|
||||
def ec2_get_instances(filter_name, filter_value):
|
||||
return ec2.instances.filter(
|
||||
Filters=[{"Name": filter_name, "Values": [filter_value]}]
|
||||
)
|
||||
|
||||
|
||||
def ec2_instances_of_type(instance_type="t4g.2xlarge"):
|
||||
return ec2_get_instances("instance-type", instance_type)
|
||||
|
||||
|
||||
def ec2_instances_by_id(instance_id):
|
||||
rc = list(ec2_get_instances("instance-id", instance_id))
|
||||
return rc[0] if len(rc) > 0 else None
|
||||
|
||||
|
||||
def start_instance(
|
||||
key_name, ami=ubuntu20_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50
|
||||
):
|
||||
inst = ec2.create_instances(
|
||||
ImageId=ami,
|
||||
InstanceType=instance_type,
|
||||
SecurityGroups=["ssh-allworld"],
|
||||
KeyName=key_name,
|
||||
MinCount=1,
|
||||
MaxCount=1,
|
||||
BlockDeviceMappings=[
|
||||
{
|
||||
"DeviceName": "/dev/sda1",
|
||||
"Ebs": {
|
||||
"DeleteOnTermination": True,
|
||||
"VolumeSize": ebs_size,
|
||||
"VolumeType": "standard",
|
||||
},
|
||||
}
|
||||
],
|
||||
)[0]
|
||||
print(f"Create instance {inst.id}")
|
||||
inst.wait_until_running()
|
||||
running_inst = ec2_instances_by_id(inst.id)
|
||||
print(f"Instance started at {running_inst.public_dns_name}")
|
||||
return running_inst
|
||||
|
||||
|
||||
class RemoteHost:
|
||||
addr: str
|
||||
keyfile_path: str
|
||||
login_name: str
|
||||
container_id: Optional[str] = None
|
||||
ami: Optional[str] = None
|
||||
|
||||
def __init__(self, addr: str, keyfile_path: str, login_name: str = "ubuntu"):
|
||||
self.addr = addr
|
||||
self.keyfile_path = keyfile_path
|
||||
self.login_name = login_name
|
||||
|
||||
def _gen_ssh_prefix(self) -> list[str]:
|
||||
return [
|
||||
"ssh",
|
||||
"-o",
|
||||
"StrictHostKeyChecking=no",
|
||||
"-i",
|
||||
self.keyfile_path,
|
||||
f"{self.login_name}@{self.addr}",
|
||||
"--",
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def _split_cmd(args: Union[str, list[str]]) -> list[str]:
|
||||
return args.split() if isinstance(args, str) else args
|
||||
|
||||
def run_ssh_cmd(self, args: Union[str, list[str]]) -> None:
|
||||
subprocess.check_call(self._gen_ssh_prefix() + self._split_cmd(args))
|
||||
|
||||
def check_ssh_output(self, args: Union[str, list[str]]) -> str:
|
||||
return subprocess.check_output(
|
||||
self._gen_ssh_prefix() + self._split_cmd(args)
|
||||
).decode("utf-8")
|
||||
|
||||
def scp_upload_file(self, local_file: str, remote_file: str) -> None:
|
||||
subprocess.check_call(
|
||||
[
|
||||
"scp",
|
||||
"-i",
|
||||
self.keyfile_path,
|
||||
local_file,
|
||||
f"{self.login_name}@{self.addr}:{remote_file}",
|
||||
]
|
||||
)
|
||||
|
||||
def scp_download_file(
|
||||
self, remote_file: str, local_file: Optional[str] = None
|
||||
) -> None:
|
||||
if local_file is None:
|
||||
local_file = "."
|
||||
subprocess.check_call(
|
||||
[
|
||||
"scp",
|
||||
"-i",
|
||||
self.keyfile_path,
|
||||
f"{self.login_name}@{self.addr}:{remote_file}",
|
||||
local_file,
|
||||
]
|
||||
)
|
||||
|
||||
def start_docker(self, image="quay.io/pypa/manylinux2014_aarch64:latest") -> None:
|
||||
self.run_ssh_cmd("sudo apt-get install -y docker.io")
|
||||
self.run_ssh_cmd(f"sudo usermod -a -G docker {self.login_name}")
|
||||
self.run_ssh_cmd("sudo service docker start")
|
||||
self.run_ssh_cmd(f"docker pull {image}")
|
||||
self.container_id = self.check_ssh_output(
|
||||
f"docker run -t -d -w /root {image}"
|
||||
).strip()
|
||||
|
||||
def using_docker(self) -> bool:
|
||||
return self.container_id is not None
|
||||
|
||||
def run_cmd(self, args: Union[str, list[str]]) -> None:
|
||||
if not self.using_docker():
|
||||
return self.run_ssh_cmd(args)
|
||||
assert self.container_id is not None
|
||||
docker_cmd = self._gen_ssh_prefix() + [
|
||||
"docker",
|
||||
"exec",
|
||||
"-i",
|
||||
self.container_id,
|
||||
"bash",
|
||||
]
|
||||
p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE)
|
||||
p.communicate(
|
||||
input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
|
||||
"utf-8"
|
||||
)
|
||||
)
|
||||
rc = p.wait()
|
||||
if rc != 0:
|
||||
raise subprocess.CalledProcessError(rc, docker_cmd)
|
||||
|
||||
def check_output(self, args: Union[str, list[str]]) -> str:
|
||||
if not self.using_docker():
|
||||
return self.check_ssh_output(args)
|
||||
assert self.container_id is not None
|
||||
docker_cmd = self._gen_ssh_prefix() + [
|
||||
"docker",
|
||||
"exec",
|
||||
"-i",
|
||||
self.container_id,
|
||||
"bash",
|
||||
]
|
||||
p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
(out, err) = p.communicate(
|
||||
input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
|
||||
"utf-8"
|
||||
)
|
||||
)
|
||||
rc = p.wait()
|
||||
if rc != 0:
|
||||
raise subprocess.CalledProcessError(rc, docker_cmd, output=out, stderr=err)
|
||||
return out.decode("utf-8")
|
||||
|
||||
def upload_file(self, local_file: str, remote_file: str) -> None:
|
||||
if not self.using_docker():
|
||||
return self.scp_upload_file(local_file, remote_file)
|
||||
tmp_file = os.path.join("/tmp", os.path.basename(local_file))
|
||||
self.scp_upload_file(local_file, tmp_file)
|
||||
self.run_ssh_cmd(
|
||||
["docker", "cp", tmp_file, f"{self.container_id}:/root/{remote_file}"]
|
||||
)
|
||||
self.run_ssh_cmd(["rm", tmp_file])
|
||||
|
||||
def download_file(self, remote_file: str, local_file: Optional[str] = None) -> None:
|
||||
if not self.using_docker():
|
||||
return self.scp_download_file(remote_file, local_file)
|
||||
tmp_file = os.path.join("/tmp", os.path.basename(remote_file))
|
||||
self.run_ssh_cmd(
|
||||
["docker", "cp", f"{self.container_id}:/root/{remote_file}", tmp_file]
|
||||
)
|
||||
self.scp_download_file(tmp_file, local_file)
|
||||
self.run_ssh_cmd(["rm", tmp_file])
|
||||
|
||||
def download_wheel(
|
||||
self, remote_file: str, local_file: Optional[str] = None
|
||||
) -> None:
|
||||
if self.using_docker() and local_file is None:
|
||||
basename = os.path.basename(remote_file)
|
||||
local_file = basename.replace(
|
||||
"-linux_aarch64.whl", "-manylinux2014_aarch64.whl"
|
||||
)
|
||||
self.download_file(remote_file, local_file)
|
||||
|
||||
def list_dir(self, path: str) -> list[str]:
|
||||
return self.check_output(["ls", "-1", path]).split("\n")
|
||||
|
||||
|
||||
def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
|
||||
import socket
|
||||
|
||||
for i in range(attempt_cnt):
|
||||
try:
|
||||
with socket.create_connection((addr, port), timeout=timeout):
|
||||
return
|
||||
except (ConnectionRefusedError, TimeoutError): # noqa: PERF203
|
||||
if i == attempt_cnt - 1:
|
||||
raise
|
||||
time.sleep(timeout)
|
||||
|
||||
|
||||
def update_apt_repo(host: RemoteHost) -> None:
|
||||
time.sleep(5)
|
||||
host.run_cmd("sudo systemctl stop apt-daily.service || true")
|
||||
host.run_cmd("sudo systemctl stop unattended-upgrades.service || true")
|
||||
host.run_cmd(
|
||||
"while systemctl is-active --quiet apt-daily.service; do sleep 1; done"
|
||||
)
|
||||
host.run_cmd(
|
||||
"while systemctl is-active --quiet unattended-upgrades.service; do sleep 1; done"
|
||||
)
|
||||
host.run_cmd("sudo apt-get update")
|
||||
time.sleep(3)
|
||||
host.run_cmd("sudo apt-get update")
|
||||
|
||||
|
||||
def install_condaforge(
|
||||
host: RemoteHost, suffix: str = "latest/download/Miniforge3-Linux-aarch64.sh"
|
||||
) -> None:
|
||||
print("Install conda-forge")
|
||||
host.run_cmd(f"curl -OL https://github.com/conda-forge/miniforge/releases/{suffix}")
|
||||
host.run_cmd(f"sh -f {os.path.basename(suffix)} -b")
|
||||
host.run_cmd(f"rm -f {os.path.basename(suffix)}")
|
||||
if host.using_docker():
|
||||
host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc")
|
||||
else:
|
||||
host.run_cmd(
|
||||
[
|
||||
"sed",
|
||||
"-i",
|
||||
"'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH'",
|
||||
".bashrc",
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
|
||||
if python_version == "3.6":
|
||||
# Python-3.6 EOLed and not compatible with conda-4.11
|
||||
install_condaforge(
|
||||
host, suffix="download/4.10.3-10/Miniforge3-4.10.3-10-Linux-aarch64.sh"
|
||||
)
|
||||
host.run_cmd(f"conda install -y python={python_version} numpy pyyaml")
|
||||
else:
|
||||
install_condaforge(
|
||||
host, suffix="download/4.11.0-4/Miniforge3-4.11.0-4-Linux-aarch64.sh"
|
||||
)
|
||||
# Pytorch-1.10 or older are not compatible with setuptools=59.6 or newer
|
||||
host.run_cmd(
|
||||
f"conda install -y python={python_version} numpy pyyaml setuptools>=59.5.0"
|
||||
)
|
||||
|
||||
|
||||
def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
|
||||
host.run_cmd("pip3 install auditwheel")
|
||||
host.run_cmd(
|
||||
"conda install -y patchelf" if use_conda else "sudo apt-get install -y patchelf"
|
||||
)
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
with NamedTemporaryFile() as tmp:
|
||||
tmp.write(embed_library_script.encode("utf-8"))
|
||||
tmp.flush()
|
||||
host.upload_file(tmp.name, "embed_library.py")
|
||||
|
||||
print("Embedding libgomp into wheel")
|
||||
if host.using_docker():
|
||||
host.run_cmd(f"python3 embed_library.py {wheel_name} --update-tag")
|
||||
else:
|
||||
host.run_cmd(f"python3 embed_library.py {wheel_name}")
|
||||
|
||||
|
||||
def checkout_repo(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
url: str,
|
||||
git_clone_flags: str,
|
||||
mapping: dict[str, tuple[str, str]],
|
||||
) -> Optional[str]:
|
||||
for prefix in mapping:
|
||||
if not branch.startswith(prefix):
|
||||
continue
|
||||
tag = f"v{mapping[prefix][0]}-{mapping[prefix][1]}"
|
||||
host.run_cmd(f"git clone {url} -b {tag} {git_clone_flags}")
|
||||
return mapping[prefix][0]
|
||||
|
||||
host.run_cmd(f"git clone {url} -b {branch} {git_clone_flags}")
|
||||
return None
|
||||
|
||||
|
||||
def build_torchvision(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
use_conda: bool = True,
|
||||
git_clone_flags: str,
|
||||
run_smoke_tests: bool = True,
|
||||
) -> str:
|
||||
print("Checking out TorchVision repo")
|
||||
build_version = checkout_repo(
|
||||
host,
|
||||
branch=branch,
|
||||
url="https://github.com/pytorch/vision",
|
||||
git_clone_flags=git_clone_flags,
|
||||
mapping={
|
||||
"v1.7.1": ("0.8.2", "rc2"),
|
||||
"v1.8.0": ("0.9.0", "rc3"),
|
||||
"v1.8.1": ("0.9.1", "rc1"),
|
||||
"v1.9.0": ("0.10.0", "rc1"),
|
||||
"v1.10.0": ("0.11.1", "rc1"),
|
||||
"v1.10.1": ("0.11.2", "rc1"),
|
||||
"v1.10.2": ("0.11.3", "rc1"),
|
||||
"v1.11.0": ("0.12.0", "rc1"),
|
||||
"v1.12.0": ("0.13.0", "rc4"),
|
||||
"v1.12.1": ("0.13.1", "rc6"),
|
||||
"v1.13.0": ("0.14.0", "rc4"),
|
||||
"v1.13.1": ("0.14.1", "rc2"),
|
||||
"v2.0.0": ("0.15.1", "rc2"),
|
||||
"v2.0.1": ("0.15.2", "rc2"),
|
||||
},
|
||||
)
|
||||
print("Building TorchVision wheel")
|
||||
|
||||
    # Please note libpng and libjpeg are required to build the image.so extension
|
||||
if use_conda:
|
||||
host.run_cmd("conda install -y libpng jpeg")
|
||||
# Remove .so files to force static linking
|
||||
host.run_cmd(
|
||||
"rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so"
|
||||
)
|
||||
# And patch setup.py to include libz dependency for libpng
|
||||
host.run_cmd(
|
||||
[
|
||||
'sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py'
|
||||
]
|
||||
)
|
||||
|
||||
build_vars = ""
|
||||
if branch == "nightly":
|
||||
version = host.check_output(
|
||||
["if [ -f vision/version.txt ]; then cat vision/version.txt; fi"]
|
||||
).strip()
|
||||
if len(version) == 0:
|
||||
# In older revisions, version was embedded in setup.py
|
||||
version = (
|
||||
host.check_output(["grep", '"version = \'"', "vision/setup.py"])
|
||||
.strip()
|
||||
.split("'")[1][:-2]
|
||||
)
|
||||
build_date = (
|
||||
host.check_output("cd vision && git log --pretty=format:%s -1")
|
||||
.strip()
|
||||
.split()[0]
|
||||
.replace("-", "")
|
||||
)
|
||||
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
|
||||
elif build_version is not None:
|
||||
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
|
||||
if host.using_docker():
|
||||
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
|
||||
|
||||
host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
|
||||
vision_wheel_name = host.list_dir("vision/dist")[0]
|
||||
embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))
|
||||
|
||||
print("Copying TorchVision wheel")
|
||||
host.download_wheel(os.path.join("vision", "dist", vision_wheel_name))
|
||||
if run_smoke_tests:
|
||||
host.run_cmd(
|
||||
f"pip3 install {os.path.join('vision', 'dist', vision_wheel_name)}"
|
||||
)
|
||||
host.run_cmd("python3 vision/test/smoke_test.py")
|
||||
print("Delete vision checkout")
|
||||
host.run_cmd("rm -rf vision")
|
||||
|
||||
return vision_wheel_name
|
||||
|
||||
|
||||
def build_torchdata(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
use_conda: bool = True,
|
||||
git_clone_flags: str = "",
|
||||
) -> str:
|
||||
print("Checking out TorchData repo")
|
||||
git_clone_flags += " --recurse-submodules"
|
||||
build_version = checkout_repo(
|
||||
host,
|
||||
branch=branch,
|
||||
url="https://github.com/pytorch/data",
|
||||
git_clone_flags=git_clone_flags,
|
||||
mapping={
|
||||
"v1.13.1": ("0.5.1", ""),
|
||||
"v2.0.0": ("0.6.0", "rc5"),
|
||||
"v2.0.1": ("0.6.1", "rc1"),
|
||||
},
|
||||
)
|
||||
print("Building TorchData wheel")
|
||||
build_vars = ""
|
||||
if branch == "nightly":
|
||||
version = host.check_output(
|
||||
["if [ -f data/version.txt ]; then cat data/version.txt; fi"]
|
||||
).strip()
|
||||
build_date = (
|
||||
host.check_output("cd data && git log --pretty=format:%s -1")
|
||||
.strip()
|
||||
.split()[0]
|
||||
.replace("-", "")
|
||||
)
|
||||
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
|
||||
elif build_version is not None:
|
||||
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
|
||||
if host.using_docker():
|
||||
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
|
||||
|
||||
host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")
|
||||
wheel_name = host.list_dir("data/dist")[0]
|
||||
embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))
|
||||
|
||||
print("Copying TorchData wheel")
|
||||
host.download_wheel(os.path.join("data", "dist", wheel_name))
|
||||
|
||||
return wheel_name
|
||||
|
||||
|
||||
def build_torchtext(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
use_conda: bool = True,
|
||||
git_clone_flags: str = "",
|
||||
) -> str:
|
||||
print("Checking out TorchText repo")
|
||||
git_clone_flags += " --recurse-submodules"
|
||||
build_version = checkout_repo(
|
||||
host,
|
||||
branch=branch,
|
||||
url="https://github.com/pytorch/text",
|
||||
git_clone_flags=git_clone_flags,
|
||||
mapping={
|
||||
"v1.9.0": ("0.10.0", "rc1"),
|
||||
"v1.10.0": ("0.11.0", "rc2"),
|
||||
"v1.10.1": ("0.11.1", "rc1"),
|
||||
"v1.10.2": ("0.11.2", "rc1"),
|
||||
"v1.11.0": ("0.12.0", "rc1"),
|
||||
"v1.12.0": ("0.13.0", "rc2"),
|
||||
"v1.12.1": ("0.13.1", "rc5"),
|
||||
"v1.13.0": ("0.14.0", "rc3"),
|
||||
"v1.13.1": ("0.14.1", "rc1"),
|
||||
"v2.0.0": ("0.15.1", "rc2"),
|
||||
"v2.0.1": ("0.15.2", "rc2"),
|
||||
},
|
||||
)
|
||||
print("Building TorchText wheel")
|
||||
build_vars = ""
|
||||
if branch == "nightly":
|
||||
version = host.check_output(
|
||||
["if [ -f text/version.txt ]; then cat text/version.txt; fi"]
|
||||
).strip()
|
||||
build_date = (
|
||||
host.check_output("cd text && git log --pretty=format:%s -1")
|
||||
.strip()
|
||||
.split()[0]
|
||||
.replace("-", "")
|
||||
)
|
||||
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
|
||||
elif build_version is not None:
|
||||
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
|
||||
if host.using_docker():
|
||||
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
|
||||
|
||||
host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")
|
||||
wheel_name = host.list_dir("text/dist")[0]
|
||||
embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))
|
||||
|
||||
print("Copying TorchText wheel")
|
||||
host.download_wheel(os.path.join("text", "dist", wheel_name))
|
||||
|
||||
return wheel_name
|
||||
|
||||
|
||||
def build_torchaudio(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
use_conda: bool = True,
|
||||
git_clone_flags: str = "",
|
||||
) -> str:
|
||||
print("Checking out TorchAudio repo")
|
||||
git_clone_flags += " --recurse-submodules"
|
||||
build_version = checkout_repo(
|
||||
host,
|
||||
branch=branch,
|
||||
url="https://github.com/pytorch/audio",
|
||||
git_clone_flags=git_clone_flags,
|
||||
mapping={
|
||||
"v1.9.0": ("0.9.0", "rc2"),
|
||||
"v1.10.0": ("0.10.0", "rc5"),
|
||||
"v1.10.1": ("0.10.1", "rc1"),
|
||||
"v1.10.2": ("0.10.2", "rc1"),
|
||||
"v1.11.0": ("0.11.0", "rc1"),
|
||||
"v1.12.0": ("0.12.0", "rc3"),
|
||||
"v1.12.1": ("0.12.1", "rc5"),
|
||||
"v1.13.0": ("0.13.0", "rc4"),
|
||||
"v1.13.1": ("0.13.1", "rc2"),
|
||||
"v2.0.0": ("2.0.1", "rc3"),
|
||||
"v2.0.1": ("2.0.2", "rc2"),
|
||||
},
|
||||
)
|
||||
print("Building TorchAudio wheel")
|
||||
build_vars = ""
|
||||
if branch == "nightly":
|
||||
version = (
|
||||
host.check_output(["grep", '"version = \'"', "audio/setup.py"])
|
||||
.strip()
|
||||
.split("'")[1][:-2]
|
||||
)
|
||||
build_date = (
|
||||
host.check_output("cd audio && git log --pretty=format:%s -1")
|
||||
.strip()
|
||||
.split()[0]
|
||||
.replace("-", "")
|
||||
)
|
||||
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
|
||||
elif build_version is not None:
|
||||
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
|
||||
if host.using_docker():
|
||||
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
|
||||
|
||||
host.run_cmd(
|
||||
f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
|
||||
&& ./packaging/ffmpeg/build.sh \
|
||||
&& {build_vars} python3 -m build --wheel --no-isolation"
|
||||
)
|
||||
|
||||
wheel_name = host.list_dir("audio/dist")[0]
|
||||
embed_libgomp(host, use_conda, os.path.join("audio", "dist", wheel_name))
|
||||
|
||||
print("Copying TorchAudio wheel")
|
||||
host.download_wheel(os.path.join("audio", "dist", wheel_name))
|
||||
|
||||
return wheel_name
|
||||
|
||||
|
||||
def configure_system(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
compiler: str = "gcc-8",
|
||||
use_conda: bool = True,
|
||||
python_version: str = "3.8",
|
||||
) -> None:
|
||||
if use_conda:
|
||||
install_condaforge_python(host, python_version)
|
||||
|
||||
print("Configuring the system")
|
||||
if not host.using_docker():
|
||||
update_apt_repo(host)
|
||||
host.run_cmd("sudo apt-get install -y ninja-build g++ git cmake gfortran unzip")
|
||||
else:
|
||||
host.run_cmd("yum install -y sudo")
|
||||
host.run_cmd("conda install -y ninja scons")
|
||||
|
||||
if not use_conda:
|
||||
host.run_cmd(
|
||||
"sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
|
||||
)
|
||||
host.run_cmd("pip3 install dataclasses typing-extensions")
|
||||
if not use_conda:
|
||||
print("Installing Cython + numpy from PyPy")
|
||||
host.run_cmd("sudo pip3 install Cython")
|
||||
host.run_cmd("sudo pip3 install numpy")
|
||||
|
||||
|
||||
def build_domains(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
use_conda: bool = True,
|
||||
git_clone_flags: str = "",
|
||||
) -> tuple[str, str, str, str]:
|
||||
vision_wheel_name = build_torchvision(
|
||||
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
|
||||
)
|
||||
audio_wheel_name = build_torchaudio(
|
||||
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
|
||||
)
|
||||
data_wheel_name = build_torchdata(
|
||||
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
|
||||
)
|
||||
text_wheel_name = build_torchtext(
|
||||
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
|
||||
)
|
||||
return (vision_wheel_name, audio_wheel_name, data_wheel_name, text_wheel_name)
|
||||
|
||||
|
||||
def start_build(
|
||||
host: RemoteHost,
|
||||
*,
|
||||
branch: str = "main",
|
||||
compiler: str = "gcc-8",
|
||||
use_conda: bool = True,
|
||||
python_version: str = "3.8",
|
||||
pytorch_only: bool = False,
|
||||
pytorch_build_number: Optional[str] = None,
|
||||
shallow_clone: bool = True,
|
||||
enable_mkldnn: bool = False,
|
||||
) -> tuple[str, str, str, str, str]:
|
||||
git_clone_flags = " --depth 1 --shallow-submodules" if shallow_clone else ""
|
||||
if host.using_docker() and not use_conda:
|
||||
print("Auto-selecting conda option for docker images")
|
||||
use_conda = True
|
||||
if not host.using_docker():
|
||||
print("Disable mkldnn for host builds")
|
||||
enable_mkldnn = False
|
||||
|
||||
configure_system(
|
||||
host, compiler=compiler, use_conda=use_conda, python_version=python_version
|
||||
)
|
||||
|
||||
if host.using_docker():
|
||||
print("Move libgfortant.a into a standard location")
|
||||
# HACK: pypa gforntran.a is compiled without PIC, which leads to the following error
|
||||
# libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17' # noqa: E501, B950
|
||||
# Workaround by copying gfortran library from the host
|
||||
host.run_ssh_cmd("sudo apt-get install -y gfortran-8")
|
||||
host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8")
|
||||
host.run_ssh_cmd(
|
||||
[
|
||||
"docker",
|
||||
"cp",
|
||||
"/usr/lib/gcc/aarch64-linux-gnu/8/libgfortran.a",
|
||||
f"{host.container_id}:/opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/",
|
||||
]
|
||||
)
|
||||
|
||||
print("Checking out PyTorch repo")
|
||||
host.run_cmd(
|
||||
f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
|
||||
)
|
||||
|
||||
host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")
|
||||
|
||||
print("Building PyTorch wheel")
|
||||
build_opts = ""
|
||||
if pytorch_build_number is not None:
|
||||
build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
|
||||
# Breakpad build fails on aarch64
|
||||
build_vars = "USE_BREAKPAD=0 "
|
||||
if branch == "nightly":
|
||||
build_date = (
|
||||
host.check_output("cd pytorch && git log --pretty=format:%s -1")
|
||||
.strip()
|
||||
.split()[0]
|
||||
.replace("-", "")
|
||||
)
|
||||
version = host.check_output("cat pytorch/version.txt").strip()[:-2]
|
||||
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
|
||||
if branch.startswith(("v1.", "v2.")):
|
||||
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
|
||||
if host.using_docker():
|
||||
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
|
||||
if enable_mkldnn:
|
||||
host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
|
||||
print("build pytorch with mkldnn+acl backend")
|
||||
build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
|
||||
build_vars += " BLAS=OpenBLAS"
|
||||
build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
|
||||
build_vars += " ACL_ROOT_DIR=/acl"
|
||||
host.run_cmd(
|
||||
f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
|
||||
)
|
||||
print("Repair the wheel")
|
||||
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
|
||||
ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
|
||||
host.run_cmd(
|
||||
f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
|
||||
)
|
||||
print("replace the original wheel with the repaired one")
|
||||
pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0]
|
||||
host.run_cmd(
|
||||
f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}"
|
||||
)
|
||||
else:
|
||||
print("build pytorch without mkldnn backend")
|
||||
host.run_cmd(
|
||||
f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
|
||||
)
|
||||
|
||||
print("Deleting build folder")
|
||||
host.run_cmd("cd pytorch && rm -rf build")
|
||||
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
|
||||
embed_libgomp(host, use_conda, os.path.join("pytorch", "dist", pytorch_wheel_name))
|
||||
print("Copying the wheel")
|
||||
host.download_wheel(os.path.join("pytorch", "dist", pytorch_wheel_name))
|
||||
|
||||
print("Installing PyTorch wheel")
|
||||
host.run_cmd(f"pip3 install pytorch/dist/{pytorch_wheel_name}")
|
||||
|
||||
if pytorch_only:
|
||||
return (pytorch_wheel_name, None, None, None, None)
|
||||
domain_wheels = build_domains(
|
||||
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
|
||||
)
|
||||
|
||||
return (pytorch_wheel_name, *domain_wheels)
|
||||
|
||||
|
||||
embed_library_script = """
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from auditwheel.patcher import Patchelf
|
||||
from auditwheel.wheeltools import InWheelCtx
|
||||
from auditwheel.elfutils import elf_file_filter
|
||||
from auditwheel.repair import copylib
|
||||
from auditwheel.lddtree import lddtree
|
||||
from subprocess import check_call
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
|
||||
def replace_tag(filename):
|
||||
with open(filename, 'r') as f:
|
||||
lines = f.read().split("\\n")
|
||||
for i,line in enumerate(lines):
|
||||
if not line.startswith("Tag: "):
|
||||
continue
|
||||
lines[i] = line.replace("-linux_", "-manylinux2014_")
|
||||
print(f'Updated tag from {line} to {lines[i]}')
|
||||
|
||||
with open(filename, 'w') as f:
|
||||
f.write("\\n".join(lines))
|
||||
|
||||
|
||||
class AlignedPatchelf(Patchelf):
|
||||
def set_soname(self, file_name: str, new_soname: str) -> None:
|
||||
check_call(['patchelf', '--page-size', '65536', '--set-soname', new_soname, file_name])
|
||||
|
||||
def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
|
||||
check_call(['patchelf', '--page-size', '65536', '--replace-needed', soname, new_soname, file_name])
|
||||
|
||||
|
||||
def embed_library(whl_path, lib_soname, update_tag=False):
|
||||
patcher = AlignedPatchelf()
|
||||
out_dir = TemporaryDirectory()
|
||||
whl_name = os.path.basename(whl_path)
|
||||
tmp_whl_name = os.path.join(out_dir.name, whl_name)
|
||||
with InWheelCtx(whl_path) as ctx:
|
||||
torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib')
|
||||
ctx.out_wheel=tmp_whl_name
|
||||
new_lib_path, new_lib_soname = None, None
|
||||
for filename, elf in elf_file_filter(ctx.iter_files()):
|
||||
if not filename.startswith('torch/lib'):
|
||||
continue
|
||||
libtree = lddtree(filename)
|
||||
if lib_soname not in libtree['needed']:
|
||||
continue
|
||||
lib_path = libtree['libs'][lib_soname]['path']
|
||||
if lib_path is None:
|
||||
print(f"Can't embed {lib_soname} as it could not be found")
|
||||
break
|
||||
if lib_path.startswith(torchlib_path):
|
||||
continue
|
||||
|
||||
if new_lib_path is None:
|
||||
new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
|
||||
patcher.replace_needed(filename, lib_soname, new_lib_soname)
|
||||
print(f'Replacing {lib_soname} with {new_lib_soname} for {filename}')
|
||||
if update_tag:
|
||||
# Add manylinux2014 tag
|
||||
for filename in ctx.iter_files():
|
||||
if os.path.basename(filename) != 'WHEEL':
|
||||
continue
|
||||
replace_tag(filename)
|
||||
shutil.move(tmp_whl_name, whl_path)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
embed_library(sys.argv[1], 'libgomp.so.1', len(sys.argv) > 2 and sys.argv[2] == '--update-tag')
|
||||
"""
|
||||
|
||||
|
||||
def run_tests(host: RemoteHost, whl: str, branch="main") -> None:
|
||||
print("Configuring the system")
|
||||
update_apt_repo(host)
|
||||
host.run_cmd("sudo apt-get install -y python3-pip git")
|
||||
host.run_cmd("sudo pip3 install Cython")
|
||||
host.run_cmd("sudo pip3 install numpy")
|
||||
host.upload_file(whl, ".")
|
||||
host.run_cmd(f"sudo pip3 install {whl}")
|
||||
host.run_cmd("python3 -c 'import torch;print(torch.rand((3,3))'")
|
||||
host.run_cmd(f"git clone -b {branch} https://github.com/pytorch/pytorch")
|
||||
host.run_cmd("cd pytorch/test; python3 test_torch.py -v")
|
||||
|
||||
|
||||
def get_instance_name(instance) -> Optional[str]:
|
||||
if instance.tags is None:
|
||||
return None
|
||||
for tag in instance.tags:
|
||||
if tag["Key"] == "Name":
|
||||
return tag["Value"]
|
||||
return None
|
||||
|
||||
|
||||
def list_instances(instance_type: str) -> None:
|
||||
print(f"All instances of type {instance_type}")
|
||||
for instance in ec2_instances_of_type(instance_type):
|
||||
ifaces = instance.network_interfaces
|
||||
az = ifaces[0].subnet.availability_zone if len(ifaces) > 0 else None
|
||||
print(
|
||||
f"{instance.id} {get_instance_name(instance)} {instance.public_dns_name} {instance.state['Name']} {az}"
|
||||
)
|
||||
|
||||
|
||||
def terminate_instances(instance_type: str) -> None:
|
||||
print(f"Terminating all instances of type {instance_type}")
|
||||
instances = list(ec2_instances_of_type(instance_type))
|
||||
for instance in instances:
|
||||
print(f"Terminating {instance.id}")
|
||||
instance.terminate()
|
||||
print("Waiting for termination to complete")
|
||||
for instance in instances:
|
||||
instance.wait_until_terminated()
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
from argparse import ArgumentParser
|
||||
|
||||
parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
|
||||
parser.add_argument("--key-name", type=str)
|
||||
parser.add_argument("--debug", action="store_true")
|
||||
parser.add_argument("--build-only", action="store_true")
|
||||
parser.add_argument("--test-only", type=str)
|
||||
group = parser.add_mutually_exclusive_group()
|
||||
group.add_argument("--os", type=str, choices=list(os_amis.keys()))
|
||||
group.add_argument("--ami", type=str)
|
||||
parser.add_argument(
|
||||
"--python-version",
|
||||
type=str,
|
||||
choices=[f"3.{d}" for d in range(6, 12)],
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument("--alloc-instance", action="store_true")
|
||||
parser.add_argument("--list-instances", action="store_true")
|
||||
parser.add_argument("--pytorch-only", action="store_true")
|
||||
parser.add_argument("--keep-running", action="store_true")
|
||||
parser.add_argument("--terminate-instances", action="store_true")
|
||||
parser.add_argument("--instance-type", type=str, default="t4g.2xlarge")
|
||||
parser.add_argument("--ebs-size", type=int, default=50)
|
||||
parser.add_argument("--branch", type=str, default="main")
|
||||
parser.add_argument("--use-docker", action="store_true")
|
||||
parser.add_argument(
|
||||
"--compiler",
|
||||
type=str,
|
||||
choices=["gcc-7", "gcc-8", "gcc-9", "clang"],
|
||||
default="gcc-8",
|
||||
)
|
||||
parser.add_argument("--use-torch-from-pypi", action="store_true")
|
||||
parser.add_argument("--pytorch-build-number", type=str, default=None)
|
||||
parser.add_argument("--disable-mkldnn", action="store_true")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_arguments()
|
||||
ami = (
|
||||
args.ami
|
||||
if args.ami is not None
|
||||
else os_amis[args.os]
|
||||
if args.os is not None
|
||||
else ubuntu20_04_ami
|
||||
)
|
||||
keyfile_path, key_name = compute_keyfile_path(args.key_name)
|
||||
|
||||
if args.list_instances:
|
||||
list_instances(args.instance_type)
|
||||
sys.exit(0)
|
||||
|
||||
if args.terminate_instances:
|
||||
terminate_instances(args.instance_type)
|
||||
sys.exit(0)
|
||||
|
||||
if len(key_name) == 0:
|
||||
raise RuntimeError("""
|
||||
Cannot start build without key_name, please specify
|
||||
--key-name argument or AWS_KEY_NAME environment variable.""")
|
||||
if len(keyfile_path) == 0 or not os.path.exists(keyfile_path):
|
||||
raise RuntimeError(f"""
|
||||
Cannot find keyfile with name: [{key_name}] in path: [{keyfile_path}], please
|
||||
check `~/.ssh/` folder or manually set SSH_KEY_PATH environment variable.""")
|
||||
|
||||
# Starting the instance
|
||||
inst = start_instance(
|
||||
key_name, ami=ami, instance_type=args.instance_type, ebs_size=args.ebs_size
|
||||
)
|
||||
instance_name = f"{args.key_name}-{args.os}"
|
||||
if args.python_version is not None:
|
||||
instance_name += f"-py{args.python_version}"
|
||||
inst.create_tags(
|
||||
DryRun=False,
|
||||
Tags=[
|
||||
{
|
||||
"Key": "Name",
|
||||
"Value": instance_name,
|
||||
}
|
||||
],
|
||||
)
|
||||
addr = inst.public_dns_name
|
||||
wait_for_connection(addr, 22)
|
||||
host = RemoteHost(addr, keyfile_path)
|
||||
host.ami = ami
|
||||
if args.use_docker:
|
||||
update_apt_repo(host)
|
||||
host.start_docker()
|
||||
|
||||
if args.test_only:
|
||||
run_tests(host, args.test_only)
|
||||
sys.exit(0)
|
||||
|
||||
if args.alloc_instance:
|
||||
if args.python_version is None:
|
||||
sys.exit(0)
|
||||
install_condaforge_python(host, args.python_version)
|
||||
sys.exit(0)
|
||||
|
||||
python_version = args.python_version if args.python_version is not None else "3.10"
|
||||
|
||||
if args.use_torch_from_pypi:
|
||||
configure_system(host, compiler=args.compiler, python_version=python_version)
|
||||
print("Installing PyTorch wheel")
|
||||
host.run_cmd("pip3 install torch")
|
||||
build_domains(
|
||||
host, branch=args.branch, git_clone_flags=" --depth 1 --shallow-submodules"
|
||||
)
|
||||
else:
|
||||
start_build(
|
||||
host,
|
||||
branch=args.branch,
|
||||
compiler=args.compiler,
|
||||
python_version=python_version,
|
||||
pytorch_only=args.pytorch_only,
|
||||
pytorch_build_number=args.pytorch_build_number,
|
||||
enable_mkldnn=not args.disable_mkldnn,
|
||||
)
|
||||
if not args.keep_running:
|
||||
print(f"Waiting for instance {inst.id} to terminate")
|
||||
inst.terminate()
|
||||
inst.wait_until_terminated()
|
||||
@ -1,87 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from subprocess import check_call
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from auditwheel.elfutils import elf_file_filter
|
||||
from auditwheel.lddtree import lddtree
|
||||
from auditwheel.patcher import Patchelf
|
||||
from auditwheel.repair import copylib
|
||||
from auditwheel.wheeltools import InWheelCtx
|
||||
|
||||
|
||||
def replace_tag(filename):
|
||||
with open(filename) as f:
|
||||
lines = f.read().split("\\n")
|
||||
for i, line in enumerate(lines):
|
||||
if not line.startswith("Tag: "):
|
||||
continue
|
||||
lines[i] = line.replace("-linux_", "-manylinux2014_")
|
||||
print(f"Updated tag from {line} to {lines[i]}")
|
||||
|
||||
with open(filename, "w") as f:
|
||||
f.write("\\n".join(lines))
|
||||
|
||||
|
||||
class AlignedPatchelf(Patchelf):
|
||||
def set_soname(self, file_name: str, new_soname: str) -> None:
|
||||
check_call(
|
||||
["patchelf", "--page-size", "65536", "--set-soname", new_soname, file_name]
|
||||
)
|
||||
|
||||
def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
|
||||
check_call(
|
||||
[
|
||||
"patchelf",
|
||||
"--page-size",
|
||||
"65536",
|
||||
"--replace-needed",
|
||||
soname,
|
||||
new_soname,
|
||||
file_name,
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def embed_library(whl_path, lib_soname, update_tag=False):
|
||||
patcher = AlignedPatchelf()
|
||||
out_dir = TemporaryDirectory()
|
||||
whl_name = os.path.basename(whl_path)
|
||||
tmp_whl_name = os.path.join(out_dir.name, whl_name)
|
||||
with InWheelCtx(whl_path) as ctx:
|
||||
torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib")
|
||||
ctx.out_wheel = tmp_whl_name
|
||||
new_lib_path, new_lib_soname = None, None
|
||||
for filename, _ in elf_file_filter(ctx.iter_files()):
|
||||
if not filename.startswith("torch/lib"):
|
||||
continue
|
||||
libtree = lddtree(filename)
|
||||
if lib_soname not in libtree["needed"]:
|
||||
continue
|
||||
lib_path = libtree["libs"][lib_soname]["path"]
|
||||
if lib_path is None:
|
||||
print(f"Can't embed {lib_soname} as it could not be found")
|
||||
break
|
||||
if lib_path.startswith(torchlib_path):
|
||||
continue
|
||||
|
||||
if new_lib_path is None:
|
||||
new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
|
||||
patcher.replace_needed(filename, lib_soname, new_lib_soname)
|
||||
print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}")
|
||||
if update_tag:
|
||||
# Add manylinux2014 tag
|
||||
for filename in ctx.iter_files():
|
||||
if os.path.basename(filename) != "WHEEL":
|
||||
continue
|
||||
replace_tag(filename)
|
||||
shutil.move(tmp_whl_name, whl_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
embed_library(
|
||||
sys.argv[1], "libgomp.so.1", len(sys.argv) > 2 and sys.argv[2] == "--update-tag"
|
||||
)
|
||||
@ -188,7 +188,7 @@ case "$tag" in
|
||||
fi
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
ROCM_VERSION=7.1
|
||||
ROCM_VERSION=7.0
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
KATEX=yes
|
||||
|
||||
@ -60,16 +60,14 @@ EOF
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
|
||||
fi
|
||||
|
||||
if [[ $(ver $ROCM_VERSION) -lt $(ver 7.1) ]]; then
|
||||
# precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5, removed in ROCm 7.1
|
||||
# search for all unversioned packages
|
||||
# if search fails it will abort this script; use true to avoid case where search fails
|
||||
MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
|
||||
if [[ "x${MIOPENHIPGFX}" = x ]]; then
|
||||
echo "miopen-hip-gfx package not available" && exit 1
|
||||
else
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
|
||||
fi
|
||||
# precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
|
||||
# search for all unversioned packages
|
||||
# if search fails it will abort this script; use true to avoid case where search fails
|
||||
MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
|
||||
if [[ "x${MIOPENHIPGFX}" = x ]]; then
|
||||
echo "miopen-hip-gfx package not available" && exit 1
|
||||
else
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
|
||||
fi
|
||||
|
||||
# ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
|
||||
|
||||
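The branch above keys off a `ver` helper (defined earlier in this script, not shown in the hunk) so that only ROCm releases before 7.1 install the precompiled miopen-hip-gfx kernels that were dropped in 7.1. A rough Python equivalent of that version gate, shown only to make the comparison explicit; the real shell helper may work differently.

```python
def ver(version: str) -> tuple[int, ...]:
    # "7.0" -> (7, 0); stands in for the shell helper, whose exact form is not shown here
    return tuple(int(part) for part in version.split("."))


def needs_precompiled_miopen(rocm_version: str) -> bool:
    # precompiled miopen kernels were removed in ROCm 7.1
    return ver(rocm_version) < ver("7.1")


assert needs_precompiled_miopen("7.0")
assert not needs_precompiled_miopen("7.1")
```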
@ -12,8 +12,8 @@ function do_install() {
|
||||
|
||||
rocm_version_nodot=${rocm_version//./}
|
||||
|
||||
# https://github.com/icl-utk-edu/magma/pull/65
|
||||
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
|
||||
# post merge of https://github.com/icl-utk-edu/magma/pull/65
|
||||
MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
|
||||
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
|
||||
|
||||
rocm_dir="/opt/rocm"
|
||||
|
||||
@ -4,14 +4,17 @@ set -ex
|
||||
|
||||
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||
|
||||
# Source the common build script for architecture-specific configurations (MKLDNN, ACL, etc.)
|
||||
source "${SCRIPTPATH}/../pytorch/build.sh" || true
|
||||
|
||||
case "${GPU_ARCH_TYPE:-BLANK}" in
|
||||
cuda)
|
||||
cuda | cuda-aarch64)
|
||||
bash "${SCRIPTPATH}/build_cuda.sh"
|
||||
;;
|
||||
rocm)
|
||||
bash "${SCRIPTPATH}/build_rocm.sh"
|
||||
;;
|
||||
cpu | cpu-cxx11-abi | cpu-s390x)
|
||||
cpu | cpu-cxx11-abi | cpu-aarch64 | cpu-s390x)
|
||||
bash "${SCRIPTPATH}/build_cpu.sh"
|
||||
;;
|
||||
xpu)
|
||||
|
||||
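The net effect of this hunk is that the aarch64 variants (cuda-aarch64, cpu-aarch64) now route through the same per-backend build scripts as x86_64 rather than a separate path. A small Python sketch of the routing, with script names taken from the case statement above (the xpu branch is truncated in this hunk, so it is omitted here):

```python
BUILD_SCRIPTS = {
    "cuda": "build_cuda.sh",
    "cuda-aarch64": "build_cuda.sh",   # new: shares the common CUDA script
    "rocm": "build_rocm.sh",
    "cpu": "build_cpu.sh",
    "cpu-cxx11-abi": "build_cpu.sh",
    "cpu-aarch64": "build_cpu.sh",     # new: shares the common CPU script
    "cpu-s390x": "build_cpu.sh",
}


def build_script_for(gpu_arch_type: str) -> str:
    try:
        return BUILD_SCRIPTS[gpu_arch_type]
    except KeyError:
        raise SystemExit(f"unsupported GPU_ARCH_TYPE: {gpu_arch_type}")
```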
@ -18,12 +18,31 @@ retry () {
|
||||
$* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
|
||||
}
|
||||
|
||||
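The retry helper above reruns a command up to five times with a doubling back-off (1, 2, 4, 8 seconds between attempts). The same idea as a short Python sketch, assuming any callable that raises on failure:

```python
import time


def retry(fn, attempts: int = 5, first_delay: float = 1.0):
    """Call fn(), sleeping 1, 2, 4, ... seconds between failed attempts."""
    delay = first_delay
    for attempt in range(attempts):
        try:
            return fn()
        except Exception:
            if attempt == attempts - 1:
                raise
            time.sleep(delay)
            delay *= 2
```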
# Detect architecture first
|
||||
ARCH=$(uname -m)
|
||||
echo "Detected architecture: $ARCH"
|
||||
|
||||
PLATFORM=""
|
||||
# TODO move this into the Docker images
|
||||
OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
|
||||
if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
|
||||
retry yum install -q -y zip openssl
|
||||
PLATFORM="manylinux_2_28_x86_64"
|
||||
# Set platform based on architecture
|
||||
case $ARCH in
|
||||
x86_64)
|
||||
PLATFORM="manylinux_2_28_x86_64"
|
||||
;;
|
||||
aarch64)
|
||||
PLATFORM="manylinux_2_28_aarch64"
|
||||
;;
|
||||
s390x)
|
||||
PLATFORM="manylinux_2_28_s390x"
|
||||
;;
|
||||
*)
|
||||
echo "Unsupported architecture: $ARCH"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
|
||||
retry dnf install -q -y zip openssl
|
||||
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
|
||||
@ -38,6 +57,8 @@ else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Platform set to: $PLATFORM"
|
||||
|
||||
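For reference, the architecture-to-platform mapping above as a compact Python sketch; it mirrors only the case block (in the script, PLATFORM is populated this way only on the AlmaLinux image):

```python
import platform

MANYLINUX_TAGS = {
    "x86_64": "manylinux_2_28_x86_64",
    "aarch64": "manylinux_2_28_aarch64",
    "s390x": "manylinux_2_28_s390x",
}


def manylinux_platform() -> str:
    arch = platform.machine()
    try:
        return MANYLINUX_TAGS[arch]
    except KeyError:
        raise SystemExit(f"Unsupported architecture: {arch}")
```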
# We use the package name to test the package by passing this to 'pip install'
|
||||
# This is the env variable that setup.py uses to name the package. Note that
|
||||
# pip 'normalizes' the name first by changing all - to _
|
||||
@ -299,8 +320,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
|
||||
# ROCm workaround for roctracer dlopens
|
||||
if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
|
||||
patchedpath=$(fname_without_so_number $destpath)
|
||||
# Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load
|
||||
elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
|
||||
# Keep the so number for XPU dependencies, libgomp.so.1, ACL libraries, and NVPL libraries to avoid twice load
|
||||
elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" || "$filename" == libarm_compute* || "$filename" == libnvpl* || "$filename" == "libgfortran.so.5" ]]; then
|
||||
patchedpath=$destpath
|
||||
else
|
||||
patchedpath=$(fname_with_sha256 $destpath)
|
||||
@ -346,9 +367,22 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
|
||||
done
|
||||
|
||||
# create Manylinux 2_28 tag this needs to happen before regenerate the RECORD
|
||||
if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
|
||||
# Support all architectures (x86_64, aarch64, s390x)
|
||||
if [[ "$IS_MANYLINUX2_28" == "1" && $GPU_ARCH_TYPE != "xpu" ]]; then
|
||||
wheel_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/WHEEL/g')
|
||||
sed -i -e s#linux_x86_64#"${PLATFORM}"# $wheel_file;
|
||||
echo "Updating wheel tag for $ARCH architecture"
|
||||
# Replace linux_* with manylinux_2_28_* based on architecture
|
||||
case $ARCH in
|
||||
x86_64)
|
||||
sed -i -e 's#linux_x86_64#manylinux_2_28_x86_64#g' $wheel_file
|
||||
;;
|
||||
aarch64)
|
||||
sed -i -e 's#linux_aarch64#manylinux_2_28_aarch64#g' $wheel_file
|
||||
;;
|
||||
s390x)
|
||||
sed -i -e 's#linux_s390x#manylinux_2_28_s390x#g' $wheel_file
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
|
||||
# regenerate the RECORD file with new hashes
|
||||
|
||||
@ -15,6 +15,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
|
||||
EXTRA_CAFFE2_CMAKE_FLAGS=()
|
||||
fi
|
||||
|
||||
# Detect architecture
|
||||
ARCH=$(uname -m)
|
||||
echo "Building CPU wheel for architecture: $ARCH"
|
||||
|
||||
WHEELHOUSE_DIR="wheelhousecpu"
|
||||
LIBTORCH_HOUSE_DIR="libtorch_housecpu"
|
||||
if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
|
||||
@ -34,8 +38,10 @@ elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
|
||||
elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
|
||||
LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
|
||||
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
|
||||
if [[ "$(uname -m)" == "s390x" ]]; then
|
||||
if [[ "$ARCH" == "s390x" ]]; then
|
||||
LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
|
||||
elif [[ "$ARCH" == "aarch64" ]]; then
|
||||
LIBGOMP_PATH="/usr/lib/aarch64-linux-gnu/libgomp.so.1"
|
||||
else
|
||||
LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
|
||||
fi
|
||||
@ -49,6 +55,32 @@ DEPS_SONAME=(
|
||||
"libgomp.so.1"
|
||||
)
|
||||
|
||||
# Add ARM-specific library dependencies for CPU builds
|
||||
if [[ "$ARCH" == "aarch64" ]]; then
|
||||
echo "Adding ARM-specific CPU library dependencies"
|
||||
|
||||
# ARM Compute Library (if available)
|
||||
if [[ -d "/acl/build" ]]; then
|
||||
echo "Adding ARM Compute Library for CPU"
|
||||
DEPS_LIST+=(
|
||||
"/acl/build/libarm_compute.so"
|
||||
"/acl/build/libarm_compute_graph.so"
|
||||
)
|
||||
DEPS_SONAME+=(
|
||||
"libarm_compute.so"
|
||||
"libarm_compute_graph.so"
|
||||
)
|
||||
fi
|
||||
|
||||
# ARM system libraries
|
||||
DEPS_LIST+=(
|
||||
"/usr/lib64/libgfortran.so.5"
|
||||
)
|
||||
DEPS_SONAME+=(
|
||||
"libgfortran.so.5"
|
||||
)
|
||||
fi
|
||||
|
||||
rm -rf /usr/local/cuda*
|
||||
|
||||
SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
|
||||
|
||||
@ -29,6 +29,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
|
||||
EXTRA_CAFFE2_CMAKE_FLAGS=()
|
||||
fi
|
||||
|
||||
# Detect architecture
|
||||
ARCH=$(uname -m)
|
||||
echo "Building for architecture: $ARCH"
|
||||
|
||||
# Determine CUDA version and architectures to build for
|
||||
#
|
||||
# NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
|
||||
@ -53,34 +57,60 @@ fi
|
||||
cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
|
||||
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
|
||||
|
||||
# Function to remove architectures from a list
|
||||
remove_archs() {
|
||||
local result="$1"
|
||||
shift
|
||||
for arch in "$@"; do
|
||||
result="${result//${arch};/}"
|
||||
done
|
||||
echo "$result"
|
||||
}
|
||||
|
||||
# Function to filter CUDA architectures for aarch64
|
||||
# aarch64 ARM GPUs only support certain compute capabilities
|
||||
# Keep: 8.0 (A100), 9.0+ (Hopper, Grace Hopper, newer)
|
||||
# Remove: < 8.0 (no ARM GPUs), 8.6 (x86_64 RTX 3090/A6000 only)
|
||||
filter_aarch64_archs() {
|
||||
local arch_list="$1"
|
||||
# Explicitly remove architectures not needed on aarch64
|
||||
arch_list=$(remove_archs "$arch_list" "5.0" "6.0" "7.0" "7.5" "8.6")
|
||||
echo "$arch_list"
|
||||
}
|
||||
|
||||
# Base: Common architectures across all modern CUDA versions
|
||||
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0"
|
||||
|
||||
case ${CUDA_VERSION} in
|
||||
#removing sm_50-sm_60 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
|
||||
#however we would like to keep sm_70 architecture see: https://github.com/pytorch/pytorch/issues/157517
|
||||
12.8)
|
||||
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0"
|
||||
;;
|
||||
12.9)
|
||||
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX"
|
||||
# WAR to resolve the ld error in libtorch build with CUDA 12.9
|
||||
12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;${TORCH_CUDA_ARCH_LIST}" ;; # Only 12.6 includes Legacy Maxwell/Pascal that will be removed in future releases
|
||||
12.8) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0" ;; # +Hopper/Blackwell support
|
||||
12.9) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0+PTX" # +Hopper/Blackwell support + PTX for forward compatibility
|
||||
if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
|
||||
TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
|
||||
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//7.0;/}" # Remove 7.0 to resolve the ld error
|
||||
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//8.6;/}" # Remove 8.6 for libtorch
|
||||
fi
|
||||
;;
|
||||
13.0)
|
||||
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
|
||||
;;
|
||||
12.6)
|
||||
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
|
||||
;;
|
||||
*)
|
||||
echo "unknown cuda version $CUDA_VERSION"
|
||||
exit 1
|
||||
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0+PTX"
|
||||
export TORCH_NVCC_FLAGS="-compress-mode=size"
|
||||
export BUILD_BUNDLE_PTXAS=1
|
||||
;;
|
||||
*) echo "unknown cuda version $CUDA_VERSION"; exit 1 ;;
|
||||
esac
|
||||
|
||||
# Filter for aarch64: Remove < 8.0 and 8.6
|
||||
[[ "$ARCH" == "aarch64" ]] && TORCH_CUDA_ARCH_LIST=$(filter_aarch64_archs "$TORCH_CUDA_ARCH_LIST")
|
||||
|
||||
echo "TORCH_CUDA_ARCH_LIST set to: $TORCH_CUDA_ARCH_LIST"
|
||||
export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
|
||||
echo "${TORCH_CUDA_ARCH_LIST}"
|
||||
|
||||
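Reading the hunk as a whole, the final TORCH_CUDA_ARCH_LIST appears to be the per-CUDA-version list with the pre-8.0 entries and 8.6 stripped on aarch64. A Python sketch of that selection, with values copied from the case statement above (the CUDA 12.9 libtorch-specific trimming is omitted); treat it as illustrative, not the build's source of truth.

```python
def torch_cuda_arch_list(cuda_version: str, machine: str) -> str:
    base = "7.0;7.5;8.0;8.6;9.0"
    if cuda_version == "12.6":
        arch_list = "5.0;6.0;" + base            # only 12.6 keeps legacy Maxwell/Pascal
    elif cuda_version == "12.8":
        arch_list = base + ";10.0;12.0"          # + Blackwell
    elif cuda_version == "12.9":
        arch_list = base + ";10.0;12.0+PTX"      # + Blackwell, PTX for forward compat
    elif cuda_version == "13.0":
        extra = "11.0;" if machine == "aarch64" else ""
        arch_list = "7.5;8.0;8.6;9.0;10.0;" + extra + "12.0+PTX"
    else:
        raise SystemExit(f"unknown cuda version {cuda_version}")

    if machine == "aarch64":
        # no aarch64 boards ship pre-8.0 GPUs, and 8.6 is x86_64-only
        for arch in ("5.0", "6.0", "7.0", "7.5", "8.6"):
            arch_list = arch_list.replace(f"{arch};", "")
    return arch_list


assert torch_cuda_arch_list("12.8", "aarch64") == "8.0;9.0;10.0;12.0"
```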
# Disable MAGMA for aarch64 as pre-built libraries are x86-64 only
|
||||
if [[ "$ARCH" == "aarch64" ]]; then
|
||||
echo "Disabling MAGMA for aarch64 architecture"
|
||||
export USE_MAGMA=0
|
||||
fi
|
||||
|
||||
# Package directories
|
||||
WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot"
|
||||
LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot"
|
||||
@ -244,6 +274,51 @@ else
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Add ARM-specific library dependencies
|
||||
if [[ "$ARCH" == "aarch64" ]]; then
|
||||
echo "Adding ARM-specific library dependencies"
|
||||
|
||||
# ARM Compute Library (if available)
|
||||
if [[ -d "/acl/build" ]]; then
|
||||
echo "Adding ARM Compute Library"
|
||||
DEPS_LIST+=(
|
||||
"/acl/build/libarm_compute.so"
|
||||
"/acl/build/libarm_compute_graph.so"
|
||||
)
|
||||
DEPS_SONAME+=(
|
||||
"libarm_compute.so"
|
||||
"libarm_compute_graph.so"
|
||||
)
|
||||
fi
|
||||
|
||||
# ARM system libraries
|
||||
DEPS_LIST+=(
|
||||
"/lib64/libgomp.so.1"
|
||||
"/usr/lib64/libgfortran.so.5"
|
||||
)
|
||||
DEPS_SONAME+=(
|
||||
"libgomp.so.1"
|
||||
"libgfortran.so.5"
|
||||
)
|
||||
|
||||
# NVPL libraries (ARM optimized BLAS/LAPACK)
|
||||
if [[ -d "/usr/local/lib" && -f "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0" ]]; then
|
||||
echo "Adding NVPL libraries for ARM"
|
||||
DEPS_LIST+=(
|
||||
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0"
|
||||
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0"
|
||||
"/usr/local/lib/libnvpl_lapack_core.so.0"
|
||||
"/usr/local/lib/libnvpl_blas_core.so.0"
|
||||
)
|
||||
DEPS_SONAME+=(
|
||||
"libnvpl_lapack_lp64_gomp.so.0"
|
||||
"libnvpl_blas_lp64_gomp.so.0"
|
||||
"libnvpl_lapack_core.so.0"
|
||||
"libnvpl_blas_core.so.0"
|
||||
)
|
||||
fi
|
||||
fi
|
||||
|
||||
# run_tests.sh requires DESIRED_CUDA to know what tests to exclude
|
||||
export DESIRED_CUDA="$cuda_version_nodot"
|
||||
|
||||
@ -251,9 +326,11 @@ export DESIRED_CUDA="$cuda_version_nodot"
|
||||
rm -rf /usr/local/cuda || true
|
||||
ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda
|
||||
|
||||
# Switch `/usr/local/magma` to the desired CUDA version
|
||||
rm -rf /usr/local/magma || true
|
||||
ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
|
||||
# Switch `/usr/local/magma` to the desired CUDA version (skip for aarch64)
|
||||
if [[ "$ARCH" != "aarch64" ]]; then
|
||||
rm -rf /usr/local/magma || true
|
||||
ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
|
||||
fi
|
||||
|
||||
export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
|
||||
export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0
|
||||
|
||||
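The two exports above recover the full and major.minor CUDA version from the name of the bundled libcudart. The same parsing as a small Python sketch, using a hypothetical path for illustration:

```python
from pathlib import Path


def cuda_versions_from_libcudart(libcudart: str) -> tuple[str, str]:
    """'.../libcudart.so.10.0.130' -> ('10.0.130', '10.0')."""
    full = Path(libcudart).name.split("libcudart.so.", 1)[1]
    short = ".".join(full.split(".")[:2])
    return full, short


assert cuda_versions_from_libcudart(
    "/usr/local/cuda/lib64/libcudart.so.10.0.130"
) == ("10.0.130", "10.0")
```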
@ -21,87 +21,3 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
|
||||
fi
|
||||
|
||||
mkdir -p "$pytest_reports_dir" || true
|
||||
|
||||
##########################################
|
||||
# copied from .ci/pytorch/common_utils.sh
|
||||
##########################################
|
||||
|
||||
function get_pinned_commit() {
|
||||
cat .github/ci_commit_pins/"${1}".txt
|
||||
}
|
||||
|
||||
function pip_install_whl() {
|
||||
# This is used to install PyTorch and other build artifacts wheel locally
|
||||
# without using any network connection
|
||||
|
||||
# Convert the input arguments into an array
|
||||
local args=("$@")
|
||||
|
||||
# Check if the first argument contains multiple paths separated by spaces
|
||||
if [[ "${args[0]}" == *" "* ]]; then
|
||||
# Split the string by spaces into an array
|
||||
IFS=' ' read -r -a paths <<< "${args[0]}"
|
||||
# Loop through each path and install individually
|
||||
for path in "${paths[@]}"; do
|
||||
echo "Installing $path"
|
||||
python3 -mpip install --no-index --no-deps "$path"
|
||||
done
|
||||
else
|
||||
# Loop through each argument and install individually
|
||||
for path in "${args[@]}"; do
|
||||
echo "Installing $path"
|
||||
python3 -mpip install --no-index --no-deps "$path"
|
||||
done
|
||||
fi
|
||||
}
|
||||
|
||||
function pip_build_and_install() {
|
||||
local build_target=$1
|
||||
local wheel_dir=$2
|
||||
|
||||
local found_whl=0
|
||||
for file in "${wheel_dir}"/*.whl
|
||||
do
|
||||
if [[ -f "${file}" ]]; then
|
||||
found_whl=1
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
# Build the wheel if it doesn't exist
|
||||
if [ "${found_whl}" == "0" ]; then
|
||||
python3 -m pip wheel \
|
||||
--no-build-isolation \
|
||||
--no-deps \
|
||||
-w "${wheel_dir}" \
|
||||
"${build_target}"
|
||||
fi
|
||||
|
||||
for file in "${wheel_dir}"/*.whl
|
||||
do
|
||||
pip_install_whl "${file}"
|
||||
done
|
||||
}
|
||||
|
||||
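pip_build_and_install builds a wheel into wheel_dir only when no cached wheel is already present, then installs every wheel it finds without touching the network. The same flow rendered in Python (the pip flags are the ones used above; this is a sketch, not the CI helper itself):

```python
import subprocess
from pathlib import Path


def pip_build_and_install(build_target: str, wheel_dir: str) -> None:
    wheels = Path(wheel_dir)
    wheels.mkdir(parents=True, exist_ok=True)

    # Build the wheel only if nothing is cached yet
    if not any(wheels.glob("*.whl")):
        subprocess.check_call([
            "python3", "-m", "pip", "wheel",
            "--no-build-isolation", "--no-deps",
            "-w", str(wheels), build_target,
        ])

    # Install every cached wheel offline
    for wheel in sorted(wheels.glob("*.whl")):
        subprocess.check_call([
            "python3", "-m", "pip", "install",
            "--no-index", "--no-deps", str(wheel),
        ])
```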
function install_torchvision() {
|
||||
local orig_preload
|
||||
local commit
|
||||
commit=$(get_pinned_commit vision)
|
||||
orig_preload=${LD_PRELOAD}
|
||||
if [ -n "${LD_PRELOAD}" ]; then
|
||||
# Silence dlerror to work-around glibc ASAN bug, see https://sourceware.org/bugzilla/show_bug.cgi?id=27653#c9
|
||||
echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c -
|
||||
LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so
|
||||
fi
|
||||
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
|
||||
# Not sure if both are needed, but why not
|
||||
export FORCE_CUDA=1
|
||||
export WITH_CUDA=1
|
||||
fi
|
||||
pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision
|
||||
|
||||
if [ -n "${LD_PRELOAD}" ]; then
|
||||
LD_PRELOAD=${orig_preload}
|
||||
fi
|
||||
}
|
||||
|
||||
@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
|
||||
# TODO: This can be removed later once vision is also part of the Docker image
|
||||
install_torchvision
|
||||
pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
|
||||
# JIT C++ extensions require ninja, so put it into PATH.
|
||||
export PATH="/var/lib/jenkins/.local/bin:$PATH"
|
||||
# NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we
|
||||
|
||||
@ -86,10 +86,20 @@ else
|
||||
fi
|
||||
fi
|
||||
|
||||
# Enable MKLDNN with ARM Compute Library for ARM builds
|
||||
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
|
||||
export USE_MKLDNN=1
|
||||
|
||||
# ACL is required for aarch64 builds
|
||||
if [[ ! -d "/acl" ]]; then
|
||||
echo "ERROR: ARM Compute Library not found at /acl"
|
||||
echo "ACL is required for aarch64 builds. Check Docker image setup."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
export USE_MKLDNN_ACL=1
|
||||
export ACL_ROOT_DIR=/acl
|
||||
echo "ARM Compute Library enabled for MKLDNN: ACL_ROOT_DIR=/acl"
|
||||
fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then
|
||||
|
||||
@ -100,6 +100,337 @@ def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None:
|
||||
)
|
||||
|
||||
|
||||
def _compile_and_extract_symbols(
|
||||
cpp_content: str, compile_flags: list[str], exclude_list: list[str] | None = None
|
||||
) -> list[str]:
|
||||
"""
|
||||
Helper to compile a C++ file and extract all symbols.
|
||||
|
||||
Args:
|
||||
cpp_content: C++ source code to compile
|
||||
compile_flags: Compilation flags
|
||||
exclude_list: List of symbol names to exclude. Defaults to ["main"].
|
||||
|
||||
Returns:
|
||||
List of all symbols found in the object file (excluding those in exclude_list).
|
||||
"""
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
if exclude_list is None:
|
||||
exclude_list = ["main"]
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmppath = Path(tmpdir)
|
||||
cpp_file = tmppath / "test.cpp"
|
||||
obj_file = tmppath / "test.o"
|
||||
|
||||
cpp_file.write_text(cpp_content)
|
||||
|
||||
result = subprocess.run(
|
||||
compile_flags + [str(cpp_file), "-o", str(obj_file)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"Compilation failed: {result.stderr}")
|
||||
|
||||
symbols = get_symbols(str(obj_file))
|
||||
|
||||
# Return all symbol names, excluding those in the exclude list
|
||||
return [name for _addr, _stype, name in symbols if name not in exclude_list]
|
||||
|
||||
|
||||
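_compile_and_extract_symbols relies on get_symbols, which is defined earlier in this script and not shown in the hunk. For readers following along, a rough stand-in based on nm output might look like the following; the actual helper may differ.

```python
import subprocess


def get_symbols(obj_file: str) -> list[tuple[str, str, str]]:
    """Return (address, type, name) triples for every symbol nm reports.

    Rough stand-in for the helper defined earlier in this script; assumes `nm`
    is on PATH. Undefined symbols have no address column.
    """
    output = subprocess.check_output(["nm", obj_file], text=True)
    symbols = []
    for line in output.splitlines():
        parts = line.split()
        if len(parts) == 3:          # "<addr> <type> <name>"
            symbols.append((parts[0], parts[1], parts[2]))
        elif len(parts) == 2:        # "<type> <name>" (undefined symbol)
            symbols.append(("", parts[0], parts[1]))
    return symbols
```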
def check_stable_only_symbols(install_root: Path) -> None:
|
||||
"""
|
||||
Test TORCH_STABLE_ONLY and TORCH_TARGET_VERSION by compiling test code and comparing symbol counts.
|
||||
|
||||
This approach tests:
|
||||
1. WITHOUT macros -> many torch symbols exposed
|
||||
2. WITH TORCH_STABLE_ONLY -> zero torch symbols (all hidden)
|
||||
3. WITH TORCH_TARGET_VERSION -> zero torch symbols (all hidden)
|
||||
4. WITH both macros -> zero torch symbols (all hidden)
|
||||
"""
|
||||
include_dir = install_root / "include"
|
||||
assert include_dir.exists(), f"Expected {include_dir} to be present"
|
||||
|
||||
test_cpp_content = """
|
||||
// Main torch C++ API headers
|
||||
#include <torch/torch.h>
|
||||
#include <torch/all.h>
|
||||
|
||||
// ATen tensor library
|
||||
#include <ATen/ATen.h>
|
||||
|
||||
// Core c10 headers (commonly used)
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/core/DeviceType.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <c10/core/TensorOptions.h>
|
||||
#include <c10/util/Optional.h>
|
||||
|
||||
int main() { return 0; }
|
||||
"""
|
||||
|
||||
base_compile_flags = [
|
||||
"g++",
|
||||
"-std=c++17",
|
||||
f"-I{include_dir}",
|
||||
f"-I{include_dir}/torch/csrc/api/include",
|
||||
"-c", # Compile only, don't link
|
||||
]
|
||||
|
||||
# Compile WITHOUT any macros
|
||||
symbols_without = _compile_and_extract_symbols(
|
||||
cpp_content=test_cpp_content,
|
||||
compile_flags=base_compile_flags,
|
||||
)
|
||||
|
||||
# We expect constexpr symbols, inline functions used by other headers etc.
|
||||
# to produce symbols
|
||||
num_symbols_without = len(symbols_without)
|
||||
print(f"Found {num_symbols_without} symbols without any macros defined")
|
||||
assert num_symbols_without != 0, (
|
||||
"Expected a non-zero number of symbols without any macros"
|
||||
)
|
||||
|
||||
# Compile WITH TORCH_STABLE_ONLY (expect 0 symbols)
|
||||
compile_flags_with_stable_only = base_compile_flags + ["-DTORCH_STABLE_ONLY"]
|
||||
|
||||
symbols_with_stable_only = _compile_and_extract_symbols(
|
||||
cpp_content=test_cpp_content,
|
||||
compile_flags=compile_flags_with_stable_only,
|
||||
)
|
||||
|
||||
num_symbols_with_stable_only = len(symbols_with_stable_only)
|
||||
assert num_symbols_with_stable_only == 0, (
|
||||
f"Expected no symbols with TORCH_STABLE_ONLY macro, but found {num_symbols_with_stable_only}"
|
||||
)
|
||||
|
||||
# Compile WITH TORCH_TARGET_VERSION (expect 0 symbols)
|
||||
compile_flags_with_target_version = base_compile_flags + [
|
||||
"-DTORCH_TARGET_VERSION=1"
|
||||
]
|
||||
|
||||
symbols_with_target_version = _compile_and_extract_symbols(
|
||||
cpp_content=test_cpp_content,
|
||||
compile_flags=compile_flags_with_target_version,
|
||||
)
|
||||
|
||||
num_symbols_with_target_version = len(symbols_with_target_version)
|
||||
assert num_symbols_with_target_version == 0, (
|
||||
f"Expected no symbols with TORCH_TARGET_VERSION macro, but found {num_symbols_with_target_version}"
|
||||
)
|
||||
|
||||
# Compile WITH both macros (expect 0 symbols)
|
||||
compile_flags_with_both = base_compile_flags + [
|
||||
"-DTORCH_STABLE_ONLY",
|
||||
"-DTORCH_TARGET_VERSION=1",
|
||||
]
|
||||
|
||||
symbols_with_both = _compile_and_extract_symbols(
|
||||
cpp_content=test_cpp_content,
|
||||
compile_flags=compile_flags_with_both,
|
||||
)
|
||||
|
||||
num_symbols_with_both = len(symbols_with_both)
|
||||
assert num_symbols_with_both == 0, (
|
||||
f"Expected no symbols with both macros, but found {num_symbols_with_both}"
|
||||
)
|
||||
|
||||
|
||||
def check_stable_api_symbols(install_root: Path) -> None:
|
||||
"""
|
||||
Test that stable API headers still expose symbols with TORCH_STABLE_ONLY.
|
||||
The torch/csrc/stable/c/shim.h header is tested in check_stable_c_shim_symbols
|
||||
"""
|
||||
include_dir = install_root / "include"
|
||||
assert include_dir.exists(), f"Expected {include_dir} to be present"
|
||||
|
||||
stable_dir = include_dir / "torch" / "csrc" / "stable"
|
||||
assert stable_dir.exists(), f"Expected {stable_dir} to be present"
|
||||
|
||||
stable_headers = list(stable_dir.rglob("*.h"))
|
||||
if not stable_headers:
|
||||
raise RuntimeError("Could not find any stable headers")
|
||||
|
||||
includes = []
|
||||
for header in stable_headers:
|
||||
rel_path = header.relative_to(include_dir)
|
||||
includes.append(f"#include <{rel_path.as_posix()}>")
|
||||
|
||||
includes_str = "\n".join(includes)
|
||||
test_stable_content = f"""
|
||||
{includes_str}
|
||||
int main() {{ return 0; }}
|
||||
"""
|
||||
|
||||
compile_flags = [
|
||||
"g++",
|
||||
"-std=c++17",
|
||||
f"-I{include_dir}",
|
||||
f"-I{include_dir}/torch/csrc/api/include",
|
||||
"-c",
|
||||
"-DTORCH_STABLE_ONLY",
|
||||
]
|
||||
|
||||
symbols_stable = _compile_and_extract_symbols(
|
||||
cpp_content=test_stable_content,
|
||||
compile_flags=compile_flags,
|
||||
)
|
||||
num_symbols_stable = len(symbols_stable)
|
||||
print(f"Found {num_symbols_stable} symbols in torch/csrc/stable")
|
||||
assert num_symbols_stable > 0, (
|
||||
f"Expected stable headers to expose symbols with TORCH_STABLE_ONLY, "
|
||||
f"but found {num_symbols_stable} symbols"
|
||||
)
|
||||
|
||||
|
||||
def check_headeronly_symbols(install_root: Path) -> None:
|
||||
"""
|
||||
Test that header-only utility headers still expose symbols with TORCH_STABLE_ONLY.
|
||||
"""
|
||||
include_dir = install_root / "include"
|
||||
assert include_dir.exists(), f"Expected {include_dir} to be present"
|
||||
|
||||
# Find all headers in torch/headeronly
|
||||
headeronly_dir = include_dir / "torch" / "headeronly"
|
||||
assert headeronly_dir.exists(), f"Expected {headeronly_dir} to be present"
|
||||
headeronly_headers = list(headeronly_dir.rglob("*.h"))
|
||||
if not headeronly_headers:
|
||||
raise RuntimeError("Could not find any headeronly headers")
|
||||
|
||||
# Filter out platform-specific headers that may not compile everywhere
|
||||
platform_specific_keywords = [
|
||||
"cpu/vec",
|
||||
]
|
||||
|
||||
filtered_headers = []
|
||||
for header in headeronly_headers:
|
||||
rel_path = header.relative_to(include_dir).as_posix()
|
||||
if not any(
|
||||
keyword in rel_path.lower() for keyword in platform_specific_keywords
|
||||
):
|
||||
filtered_headers.append(header)
|
||||
|
||||
includes = []
|
||||
for header in filtered_headers:
|
||||
rel_path = header.relative_to(include_dir)
|
||||
includes.append(f"#include <{rel_path.as_posix()}>")
|
||||
|
||||
includes_str = "\n".join(includes)
|
||||
test_headeronly_content = f"""
|
||||
{includes_str}
|
||||
int main() {{ return 0; }}
|
||||
"""
|
||||
|
||||
compile_flags = [
|
||||
"g++",
|
||||
"-std=c++17",
|
||||
f"-I{include_dir}",
|
||||
f"-I{include_dir}/torch/csrc/api/include",
|
||||
"-c",
|
||||
"-DTORCH_STABLE_ONLY",
|
||||
]
|
||||
|
||||
symbols_headeronly = _compile_and_extract_symbols(
|
||||
cpp_content=test_headeronly_content,
|
||||
compile_flags=compile_flags,
|
||||
)
|
||||
num_symbols_headeronly = len(symbols_headeronly)
|
||||
print(f"Found {num_symbols_headeronly} symbols in torch/headeronly")
|
||||
assert num_symbols_headeronly > 0, (
|
||||
f"Expected headeronly headers to expose symbols with TORCH_STABLE_ONLY, "
|
||||
f"but found {num_symbols_headeronly} symbols"
|
||||
)
|
||||
|
||||
|
||||
def check_aoti_shim_symbols(install_root: Path) -> None:
|
||||
"""
|
||||
Test that AOTI shim headers still expose symbols with TORCH_STABLE_ONLY.
|
||||
"""
|
||||
include_dir = install_root / "include"
|
||||
assert include_dir.exists(), f"Expected {include_dir} to be present"
|
||||
|
||||
# There are no constexpr symbols etc., so we need to actually use functions
|
||||
# so that some symbols are found.
|
||||
test_shim_content = """
|
||||
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
|
||||
int main() {
|
||||
int32_t (*fp1)() = &aoti_torch_device_type_cpu;
|
||||
int32_t (*fp2)() = &aoti_torch_dtype_float32;
|
||||
(void)fp1; (void)fp2;
|
||||
return 0;
|
||||
}
|
||||
"""
|
||||
|
||||
compile_flags = [
|
||||
"g++",
|
||||
"-std=c++17",
|
||||
f"-I{include_dir}",
|
||||
f"-I{include_dir}/torch/csrc/api/include",
|
||||
"-c",
|
||||
"-DTORCH_STABLE_ONLY",
|
||||
]
|
||||
|
||||
symbols_shim = _compile_and_extract_symbols(
|
||||
cpp_content=test_shim_content,
|
||||
compile_flags=compile_flags,
|
||||
)
|
||||
num_symbols_shim = len(symbols_shim)
|
||||
assert num_symbols_shim > 0, (
|
||||
f"Expected shim headers to expose symbols with TORCH_STABLE_ONLY, "
|
||||
f"but found {num_symbols_shim} symbols"
|
||||
)
|
||||
|
||||
|
||||
def check_stable_c_shim_symbols(install_root: Path) -> None:
|
||||
"""
|
||||
Test that stable C shim headers still expose symbols with TORCH_STABLE_ONLY.
|
||||
"""
|
||||
include_dir = install_root / "include"
|
||||
assert include_dir.exists(), f"Expected {include_dir} to be present"
|
||||
|
||||
# Check if the stable C shim exists
|
||||
stable_shim = include_dir / "torch" / "csrc" / "stable" / "c" / "shim.h"
|
||||
if not stable_shim.exists():
|
||||
raise RuntimeError("Could not find stable c shim")
|
||||
|
||||
# There are no constexpr symbols etc., so we need to actually use functions
|
||||
# so that some symbols are found.
|
||||
test_stable_shim_content = """
|
||||
#include <torch/csrc/stable/c/shim.h>
|
||||
int main() {
|
||||
// Reference stable C API functions to create undefined symbols
|
||||
AOTITorchError (*fp1)(const char*, uint32_t*, int32_t*) = &torch_parse_device_string;
|
||||
AOTITorchError (*fp2)(uint32_t*) = &torch_get_num_threads;
|
||||
(void)fp1; (void)fp2;
|
||||
return 0;
|
||||
}
|
||||
"""
|
||||
|
||||
compile_flags = [
|
||||
"g++",
|
||||
"-std=c++17",
|
||||
f"-I{include_dir}",
|
||||
f"-I{include_dir}/torch/csrc/api/include",
|
||||
"-c",
|
||||
"-DTORCH_STABLE_ONLY",
|
||||
]
|
||||
|
||||
symbols_stable_shim = _compile_and_extract_symbols(
|
||||
cpp_content=test_stable_shim_content,
|
||||
compile_flags=compile_flags,
|
||||
)
|
||||
num_symbols_stable_shim = len(symbols_stable_shim)
|
||||
assert num_symbols_stable_shim > 0, (
|
||||
f"Expected stable C shim headers to expose symbols with TORCH_STABLE_ONLY, "
|
||||
f"but found {num_symbols_stable_shim} symbols"
|
||||
)
|
||||
|
||||
|
||||
def check_lib_symbols_for_abi_correctness(lib: str) -> None:
|
||||
print(f"lib: {lib}")
|
||||
cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS)
|
||||
@ -129,6 +460,13 @@ def main() -> None:
|
||||
check_lib_symbols_for_abi_correctness(libtorch_cpu_path)
|
||||
check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path)
|
||||
|
||||
# Check symbols when TORCH_STABLE_ONLY is defined
|
||||
check_stable_only_symbols(install_root)
|
||||
check_stable_api_symbols(install_root)
|
||||
check_headeronly_symbols(install_root)
|
||||
check_aoti_shim_symbols(install_root)
|
||||
check_stable_c_shim_symbols(install_root)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@ -389,13 +389,6 @@ test_lazy_tensor_meta_reference_disabled() {
|
||||
export -n TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE
|
||||
}
|
||||
|
||||
test_dynamo_core() {
|
||||
time python test/run_test.py \
|
||||
--include-dynamo-core-tests \
|
||||
--verbose \
|
||||
--upload-artifacts-while-running
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
test_dynamo_wrapped_shard() {
|
||||
if [[ -z "$NUM_TEST_SHARDS" ]]; then
|
||||
@ -1250,97 +1243,6 @@ test_custom_script_ops() {
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
test_libtorch_agnostic_targetting() {
|
||||
echo "Testing libtorch_agnostic runs correctly on TORCH_TARGET_VERSION"
|
||||
|
||||
REPO_DIR=$(pwd)
|
||||
WHEEL_DIR="${REPO_DIR}/test/cpp_extensions/.wheels"
|
||||
|
||||
# Build wheel with current PyTorch (this has TORCH_TARGET_VERSION 2_9_0)
|
||||
echo "Building 2.9 extension wheel with current PyTorch..."
|
||||
pushd test/cpp_extensions/libtorch_agnostic_2_9_extension
|
||||
time python setup.py bdist_wheel
|
||||
|
||||
# Save the wheel
|
||||
mkdir -p "$WHEEL_DIR"
|
||||
cp dist/*.whl "$WHEEL_DIR/"
|
||||
WHEEL_FILE=$(find "$WHEEL_DIR" -maxdepth 1 -name "*.whl" -type f | head -1)
|
||||
echo "Built wheel: $(basename "$WHEEL_FILE")"
|
||||
popd
|
||||
|
||||
# Create venv and install PyTorch 2.9
|
||||
python -m venv venv_pytorch_2_9
|
||||
# shellcheck disable=SC1091
|
||||
. venv_pytorch_2_9/bin/activate
|
||||
|
||||
# Clear PYTHONPATH to avoid using the development PyTorch
|
||||
echo "Clearing PYTHONPATH to use only venv packages..."
|
||||
unset PYTHONPATH
|
||||
|
||||
# Upgrade pip to latest version
|
||||
echo "Upgrading pip to latest version..."
|
||||
pip install --upgrade pip
|
||||
pip --version
|
||||
|
||||
echo "Installing PyTorch 2.9..."
|
||||
|
||||
# Install from release channel only
|
||||
PYTORCH_VERSION="2.9.0"
|
||||
|
||||
# Extract CUDA version from BUILD_ENVIRONMENT (e.g., "cuda12.1" -> "cu121")
|
||||
if [[ "$BUILD_ENVIRONMENT" =~ cuda([0-9]+)\.([0-9]+) ]]; then
|
||||
CUDA_MAJOR="${BASH_REMATCH[1]}"
|
||||
CUDA_MINOR="${BASH_REMATCH[2]}"
|
||||
CUDA_VERSION="cu${CUDA_MAJOR}${CUDA_MINOR}"
|
||||
echo " Detected CUDA ${CUDA_MAJOR}.${CUDA_MINOR} from BUILD_ENVIRONMENT, using ${CUDA_VERSION}"
|
||||
else
|
||||
# Default to CPU build
|
||||
CUDA_VERSION="cpu"
|
||||
echo " No CUDA detected in BUILD_ENVIRONMENT, using CPU build"
|
||||
fi
|
||||
|
||||
if pip install torch=="${PYTORCH_VERSION}" --index-url https://download.pytorch.org/whl/${CUDA_VERSION}/; then
|
||||
echo "Installed PyTorch ${PYTORCH_VERSION} from release channel (${CUDA_VERSION})"
|
||||
else
|
||||
echo " FAILED to install PyTorch 2.9.0 from release channel"
|
||||
echo " URL: https://download.pytorch.org/whl/${CUDA_VERSION}/"
|
||||
deactivate
|
||||
rm -rf venv_pytorch_2_9
|
||||
return 1
|
||||
fi
|
||||
|
||||
INSTALLED_VERSION=$(python -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown")
|
||||
echo " Installed version: $INSTALLED_VERSION"
|
||||
|
||||
# Install test dependencies
|
||||
echo "Installing test dependencies..."
|
||||
pip install expecttest numpy unittest-xml-reporting
|
||||
|
||||
# Install the pre-built wheel
|
||||
echo ""
|
||||
echo "Installing pre-built 2.9 extension wheel (built with PyTorch 2.10)..."
|
||||
pip install "$WHEEL_FILE"
|
||||
echo "Installed $(basename "$WHEEL_FILE") into PyTorch 2.9 environment"
|
||||
|
||||
# Run tests with PyTorch 2.9 runtime (2.10 tests will be skipped automatically)
|
||||
echo ""
|
||||
echo "Running tests with PyTorch 2.9 runtime (using wheel built on PyTorch 2.10)..."
|
||||
if time python test/cpp_extensions/test_libtorch_agnostic.py -v; then
|
||||
echo ""
|
||||
echo " Wheel built with current torch and TORCH_TARGET_VERSION 2_9_0 works with PyTorch 2.9 runtime!"
|
||||
else
|
||||
echo "targeting test failed"
|
||||
deactivate
|
||||
rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
|
||||
return 1
|
||||
fi
|
||||
|
||||
deactivate
|
||||
rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
|
||||
|
||||
assert_git_not_dirty
|
||||
}
|
||||
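The function above derives the wheel index for the PyTorch 2.9 install by parsing the CUDA version out of BUILD_ENVIRONMENT and falling back to the CPU index. A compact Python sketch of that derivation, with the URL pattern taken from the pip command above:

```python
import re


def release_index_url(build_environment: str) -> str:
    """e.g. 'linux-jammy-cuda12.1-py3.10-gcc11' -> 'https://download.pytorch.org/whl/cu121/'."""
    match = re.search(r"cuda(\d+)\.(\d+)", build_environment)
    cuda = f"cu{match.group(1)}{match.group(2)}" if match else "cpu"
    return f"https://download.pytorch.org/whl/{cuda}/"


assert release_index_url("linux-jammy-cuda12.1-py3.10-gcc11").endswith("/cu121/")
assert release_index_url("linux-jammy-py3-clang12").endswith("/cpu/")
```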
|
||||
test_jit_hooks() {
|
||||
echo "Testing jit hooks in cpp"
|
||||
HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
|
||||
@ -1813,8 +1715,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]];
|
||||
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
|
||||
test_forward_backward_compatibility
|
||||
# Do NOT add tests after bc check tests, see its comment.
|
||||
elif [[ "${TEST_CONFIG}" == *libtorch_agnostic_targetting* ]]; then
|
||||
test_libtorch_agnostic_targetting
|
||||
elif [[ "${TEST_CONFIG}" == *xla* ]]; then
|
||||
install_torchvision
|
||||
build_xla
|
||||
@ -1914,8 +1814,6 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
|
||||
test_inductor_shard "${SHARD_NUMBER}"
|
||||
elif [[ "${TEST_CONFIG}" == *einops* ]]; then
|
||||
test_einops
|
||||
elif [[ "${TEST_CONFIG}" == *dynamo_core* ]]; then
|
||||
test_dynamo_core
|
||||
elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
|
||||
install_torchvision
|
||||
test_dynamo_wrapped_shard "${SHARD_NUMBER}"
|
||||
|
||||
.github/ci_commit_pins/audio.txt
@ -1 +1 @@
ee1a1350eb37804b94334768f328144f058f14e9
07b6cbde121417a70e4dc871adb6d27030e0ce3f
.github/ci_commit_pins/vision.txt
@ -1 +1 @@
2d82dc5caa336d179d9b46ac4a0fb8c43d84c5cc
acccf86477759b2d3500f1ae1be065f7b1e409ec
.github/ci_commit_pins/xla.txt
@ -1 +1 @@
94631807d22c09723dd006f7be5beb649d5f88d0
e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
.github/labeler.yml
@ -91,6 +91,13 @@
|
||||
"ciflow/trunk":
|
||||
- .ci/docker/ci_commit_pins/triton.txt
|
||||
|
||||
"oncall: distributed":
|
||||
- torch/csrc/distributed/**
|
||||
- torch/distributed/**
|
||||
- torch/nn/parallel/**
|
||||
- test/distributed/**
|
||||
- torch/testing/_internal/distributed/**
|
||||
|
||||
"release notes: distributed (checkpoint)":
|
||||
- torch/distributed/checkpoint/**
|
||||
- test/distributed/checkpoint/**
|
||||
|
||||
.github/pytorch-probot.yml
@ -7,7 +7,6 @@ ciflow_push_tags:
|
||||
- ciflow/binaries
|
||||
- ciflow/binaries_libtorch
|
||||
- ciflow/binaries_wheel
|
||||
- ciflow/dynamo
|
||||
- ciflow/h100
|
||||
- ciflow/h100-cutlass-backend
|
||||
- ciflow/h100-distributed
|
||||
|
||||
.github/workflows/_binary-build-linux.yml
@ -260,11 +260,8 @@ jobs:
|
||||
"${DOCKER_IMAGE}"
|
||||
)
|
||||
docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
|
||||
if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
|
||||
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
|
||||
else
|
||||
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
|
||||
fi
|
||||
# Unified build script for all architectures (x86_64, aarch64, s390x)
|
||||
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
|
||||
|
||||
- name: Chown artifacts
|
||||
if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}
|
||||
|
||||
.github/workflows/_linux-test.yml
@ -326,7 +326,7 @@ jobs:
|
||||
SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
|
||||
SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
|
||||
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
|
||||
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
DOCKER_IMAGE: ${{ inputs.docker-image }}
|
||||
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
|
||||
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
|
||||
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
|
||||
|
||||
.github/workflows/dynamo-unittest.yml
@ -1,70 +0,0 @@
|
||||
# Workflow: Dynamo Unit Test
|
||||
# runs unit tests for dynamo.
|
||||
name: dynamo-unittest
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/dynamo/*
|
||||
workflow_call:
|
||||
schedule:
|
||||
- cron: 29 8 * * * # about 1:29am PDT
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
opt_out_experiments: lf
|
||||
|
||||
dynamo-build:
|
||||
name: dynamo-build
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ['3.11', '3.12']
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-py${{ matrix.python-version }}-clang12
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
dynamo-test:
|
||||
name: dynamo-test
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: [get-label-type, dynamo-build]
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ['3.11', '3.12']
|
||||
with:
|
||||
build-environment: linux-jammy-py${{ matrix.python-version }}-clang12
|
||||
docker-image: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
]}
|
||||
secrets: inherit
|
||||
.github/workflows/pull.yml
@ -70,7 +70,6 @@ jobs:
|
||||
{ config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
{ config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
|
||||
{ config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
.github/workflows/trunk.yml
@ -83,7 +83,6 @@ jobs:
|
||||
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
|
||||
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
|
||||
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
|
||||
{ config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
@ -144,7 +144,7 @@ inline std::bitset<kVmapNumLevels> createVmapLevelsBitset(BatchDimsRef bdims) {
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) {
|
||||
out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ')';
|
||||
out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ")";
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ namespace indexing {
|
||||
const EllipsisIndexType Ellipsis = EllipsisIndexType();
|
||||
|
||||
std::ostream& operator<<(std::ostream& stream, const Slice& slice) {
|
||||
stream << slice.start() << ':' << slice.stop() << ':' << slice.step();
|
||||
stream << slice.start() << ":" << slice.stop() << ":" << slice.step();
|
||||
return stream;
|
||||
}
|
||||
|
||||
@ -31,12 +31,12 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index)
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& stream, const std::vector<TensorIndex>& tensor_indices) {
|
||||
stream << '(';
|
||||
stream << "(";
|
||||
for (const auto i : c10::irange(tensor_indices.size())) {
|
||||
stream << tensor_indices[i];
|
||||
if (i < tensor_indices.size() - 1) stream << ", ";
|
||||
}
|
||||
stream << ')';
|
||||
stream << ")";
|
||||
return stream;
|
||||
}
|
||||
|
||||
|
||||
@ -113,7 +113,7 @@ void TensorNames::checkUnique(const char* op_name) const {
|
||||
std::ostream& operator<<(std::ostream& out, const TensorName& tensorname) {
|
||||
out << tensorname.name_ << " (index ";
|
||||
out << tensorname.origin_idx_ << " of ";
|
||||
out << tensorname.origin_ << ')';
|
||||
out << tensorname.origin_ << ")";
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -13,9 +13,9 @@ std::ostream& operator<<(std::ostream & out, const TensorGeometryArg& t) {
|
||||
if (t.pos == 0) {
|
||||
// 0 is distinguished; it usually indicates 'self' or the return
|
||||
// tensor
|
||||
out << '\'' << t.name << '\'';
|
||||
out << "'" << t.name << "'";
|
||||
} else {
|
||||
out << "argument #" << t.pos << " '" << t.name << '\'';
|
||||
out << "argument #" << t.pos << " '" << t.name << "'";
|
||||
}
|
||||
return out;
|
||||
}
|
||||
@ -154,7 +154,7 @@ void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) {
|
||||
oss << "Tensor for " << t2 << " is on CPU, ";
|
||||
}
|
||||
oss << "but expected " << ((!t1->is_cpu() && !t2->is_cpu()) ? "them" : "it")
|
||||
<< " to be on GPU (while checking arguments for " << c << ')';
|
||||
<< " to be on GPU (while checking arguments for " << c << ")";
|
||||
TORCH_CHECK(false, oss.str());
|
||||
}
|
||||
TORCH_CHECK(
|
||||
@ -199,7 +199,7 @@ void checkScalarTypes(CheckedFrom c, const TensorArg& t,
|
||||
i++;
|
||||
}
|
||||
oss << "; but got " << t->toString()
|
||||
<< " instead (while checking arguments for " << c << ')';
|
||||
<< " instead (while checking arguments for " << c << ")";
|
||||
TORCH_CHECK(false, oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
@ -43,8 +43,8 @@ std::string get_mkldnn_version() {
|
||||
// https://github.com/intel/ideep/issues/29
|
||||
{
|
||||
const dnnl_version_t* ver = dnnl_version();
|
||||
ss << "Intel(R) MKL-DNN v" << ver->major << '.' << ver->minor << '.' << ver->patch
|
||||
<< " (Git Hash " << ver->hash << ')';
|
||||
ss << "Intel(R) MKL-DNN v" << ver->major << "." << ver->minor << "." << ver->patch
|
||||
<< " (Git Hash " << ver->hash << ")";
|
||||
}
|
||||
#else
|
||||
ss << "MKLDNN not found";
|
||||
@ -81,7 +81,7 @@ std::string get_openmp_version() {
|
||||
break;
|
||||
}
|
||||
if (ver_str) {
|
||||
ss << " (a.k.a. OpenMP " << ver_str << ')';
|
||||
ss << " (a.k.a. OpenMP " << ver_str << ")";
|
||||
}
|
||||
}
|
||||
#else
|
||||
@ -135,38 +135,38 @@ std::string show_config() {
|
||||
|
||||
#if defined(__GNUC__)
|
||||
{
|
||||
ss << " - GCC " << __GNUC__ << '.' << __GNUC_MINOR__ << '\n';
|
||||
ss << " - GCC " << __GNUC__ << "." << __GNUC_MINOR__ << "\n";
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__cplusplus)
|
||||
{
|
||||
ss << " - C++ Version: " << __cplusplus << '\n';
|
||||
ss << " - C++ Version: " << __cplusplus << "\n";
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__clang_major__)
|
||||
{
|
||||
ss << " - clang " << __clang_major__ << '.' << __clang_minor__ << '.' << __clang_patchlevel__ << '\n';
|
||||
ss << " - clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__ << "\n";
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
{
|
||||
ss << " - MSVC " << _MSC_FULL_VER << '\n';
|
||||
ss << " - MSVC " << _MSC_FULL_VER << "\n";
|
||||
}
|
||||
#endif
|
||||
|
||||
#if AT_MKL_ENABLED()
|
||||
ss << " - " << get_mkl_version() << '\n';
|
||||
ss << " - " << get_mkl_version() << "\n";
|
||||
#endif
|
||||
|
||||
#if AT_MKLDNN_ENABLED()
|
||||
ss << " - " << get_mkldnn_version() << '\n';
|
||||
ss << " - " << get_mkldnn_version() << "\n";
|
||||
#endif
|
||||
|
||||
#ifdef _OPENMP
|
||||
ss << " - " << get_openmp_version() << '\n';
|
||||
ss << " - " << get_openmp_version() << "\n";
|
||||
#endif
|
||||
|
||||
#if AT_BUILD_WITH_LAPACK()
|
||||
@ -183,7 +183,7 @@ std::string show_config() {
|
||||
ss << " - Cross compiling on MacOSX\n";
|
||||
#endif
|
||||
|
||||
ss << " - "<< used_cpu_capability() << '\n';
|
||||
ss << " - "<< used_cpu_capability() << "\n";
|
||||
|
||||
if (hasCUDA()) {
|
||||
ss << detail::getCUDAHooks().showConfig();
|
||||
@ -200,10 +200,10 @@ std::string show_config() {
|
||||
ss << " - Build settings: ";
|
||||
for (const auto& pair : caffe2::GetBuildOptions()) {
|
||||
if (!pair.second.empty()) {
|
||||
ss << pair.first << '=' << pair.second << ", ";
|
||||
ss << pair.first << "=" << pair.second << ", ";
|
||||
}
|
||||
}
|
||||
ss << '\n';
|
||||
ss << "\n";
|
||||
|
||||
// TODO: do HIP
|
||||
// TODO: do XLA
|
||||
|
||||
@ -209,7 +209,7 @@ struct CodeTemplate {
|
||||
// to indent correctly in the context.
|
||||
void emitIndent(std::ostream& out, size_t indent) const {
|
||||
for ([[maybe_unused]] const auto i : c10::irange(indent)) {
|
||||
out << ' ';
|
||||
out << " ";
|
||||
}
|
||||
}
|
||||
void emitStringWithIndents(
|
||||
|
||||
@ -10,7 +10,7 @@ std::ostream& operator<<(std::ostream& out, const Dimname& dimname) {
|
||||
if (dimname.type() == NameType::WILDCARD) {
|
||||
out << "None";
|
||||
} else {
|
||||
out << '\'' << dimname.symbol().toUnqualString() << '\'';
|
||||
out << "'" << dimname.symbol().toUnqualString() << "'";
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
namespace at {
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const Range& range) {
|
||||
out << "Range[" << range.begin << ", " << range.end << ']';
|
||||
out << "Range[" << range.begin << ", " << range.end << "]";
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -71,7 +71,7 @@ void TensorBase::enforce_invariants() {
|
||||
|
||||
void TensorBase::print() const {
|
||||
if (defined()) {
|
||||
std::cerr << '[' << toString() << ' ' << sizes() << ']' << '\n';
|
||||
std::cerr << "[" << toString() << " " << sizes() << "]" << '\n';
|
||||
} else {
|
||||
std::cerr << "[UndefinedTensor]" << '\n';
|
||||
}
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
#include <torch/headeronly/core/TensorAccessor.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/ArrayRef.h>
|
||||
#include <c10/util/Deprecated.h>
|
||||
@ -12,37 +11,252 @@
|
||||
|
||||
namespace at {
|
||||
|
||||
using torch::headeronly::DefaultPtrTraits;
|
||||
// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor
|
||||
// is used to enable the __restrict__ keyword/modifier for the data
|
||||
// passed to cuda.
|
||||
template <typename T>
|
||||
struct DefaultPtrTraits {
|
||||
typedef T* PtrType;
|
||||
};
|
||||
|
||||
#if defined(__CUDACC__) || defined(__HIPCC__)
|
||||
using torch::headeronly::RestrictPtrTraits;
|
||||
template <typename T>
|
||||
struct RestrictPtrTraits {
|
||||
typedef T* __restrict__ PtrType;
|
||||
};
|
||||
#endif
|
||||
|
||||
// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors.
|
||||
// For CUDA tensors it is used in device code (only). This means that we restrict ourselves
|
||||
// to functions and types available there (e.g. IntArrayRef isn't).
|
||||
|
||||
// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers.
|
||||
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
|
||||
using TensorAccessorBase = torch::headeronly::detail::TensorAccessorBase<c10::IntArrayRef, T, N, PtrTraits, index_t>;
|
||||
class TensorAccessorBase {
|
||||
public:
|
||||
typedef typename PtrTraits<T>::PtrType PtrType;
|
||||
|
||||
C10_HOST_DEVICE TensorAccessorBase(
|
||||
PtrType data_,
|
||||
const index_t* sizes_,
|
||||
const index_t* strides_)
|
||||
: data_(data_), sizes_(sizes_), strides_(strides_) {}
|
||||
C10_HOST IntArrayRef sizes() const {
|
||||
return IntArrayRef(sizes_,N);
|
||||
}
|
||||
C10_HOST IntArrayRef strides() const {
|
||||
return IntArrayRef(strides_,N);
|
||||
}
|
||||
C10_HOST_DEVICE index_t stride(index_t i) const {
|
||||
return strides_[i];
|
||||
}
|
||||
C10_HOST_DEVICE index_t size(index_t i) const {
|
||||
return sizes_[i];
|
||||
}
|
||||
C10_HOST_DEVICE PtrType data() {
|
||||
return data_;
|
||||
}
|
||||
C10_HOST_DEVICE const PtrType data() const {
|
||||
return data_;
|
||||
}
|
||||
protected:
|
||||
PtrType data_;
|
||||
const index_t* sizes_;
|
||||
const index_t* strides_;
|
||||
};
|
||||
|
||||
// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using
|
||||
// `Tensor.accessor<T, N>()`.
|
||||
// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and only
|
||||
// indexing on the device uses `TensorAccessor`s.
|
||||
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
using TensorAccessor = torch::headeronly::detail::TensorAccessor<c10::IntArrayRef, T, N, PtrTraits, index_t>;

namespace detail {

template <size_t N, typename index_t>
struct IndexBoundsCheck {
  IndexBoundsCheck(index_t i) {
    TORCH_CHECK_INDEX(
        0 <= i && i < index_t{N},
        "Index ",
        i,
        " is not within bounds of a tensor of dimension ",
        N);
  }
};

} // namespace detail

template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class TensorAccessor : public TensorAccessorBase<T,N,PtrTraits,index_t> {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;

  C10_HOST_DEVICE TensorAccessor(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : TensorAccessorBase<T, N, PtrTraits, index_t>(data_,sizes_,strides_) {}

  C10_HOST_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
  }

  C10_HOST_DEVICE const TensorAccessor<T, N-1, PtrTraits, index_t> operator[](index_t i) const {
    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
  }
};

template<typename T, template <typename U> class PtrTraits, typename index_t>
class TensorAccessor<T,1,PtrTraits,index_t> : public TensorAccessorBase<T,1,PtrTraits,index_t> {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;

  C10_HOST_DEVICE TensorAccessor(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : TensorAccessorBase<T, 1, PtrTraits, index_t>(data_,sizes_,strides_) {}
  C10_HOST_DEVICE T & operator[](index_t i) {
    // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
    return this->data_[this->strides_[0]*i];
  }
  C10_HOST_DEVICE const T & operator[](index_t i) const {
    return this->data_[this->strides_[0]*i];
  }
};

// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used on for CUDA `Tensor`s on the host
// and as
// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host)
// in order to transfer them on the device when calling kernels.
// On the device, indexing of multidimensional tensors gives to `TensorAccessor`s.
// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__.
// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available
// on the device, so those functions are host only.
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class GenericPackedTensorAccessorBase {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;
  C10_HOST GenericPackedTensorAccessorBase(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : data_(data_) {
    std::copy(sizes_, sizes_ + N, std::begin(this->sizes_));
    std::copy(strides_, strides_ + N, std::begin(this->strides_));
  }

  // if index_t is not int64_t, we want to have an int64_t constructor
  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
  C10_HOST GenericPackedTensorAccessorBase(
      PtrType data_,
      const source_index_t* sizes_,
      const source_index_t* strides_)
      : data_(data_) {
    for (const auto i : c10::irange(N)) {
      this->sizes_[i] = sizes_[i];
      this->strides_[i] = strides_[i];
    }
  }

  C10_HOST_DEVICE index_t stride(index_t i) const {
    return strides_[i];
  }
  C10_HOST_DEVICE index_t size(index_t i) const {
    return sizes_[i];
  }
  C10_HOST_DEVICE PtrType data() {
    return data_;
  }
  C10_HOST_DEVICE const PtrType data() const {
    return data_;
  }
protected:
  PtrType data_;
  // NOLINTNEXTLINE(*c-arrays*)
  index_t sizes_[N];
  // NOLINTNEXTLINE(*c-arrays*)
  index_t strides_[N];
  C10_HOST void bounds_check_(index_t i) const {
    TORCH_CHECK_INDEX(
        0 <= i && i < index_t{N},
        "Index ",
        i,
        " is not within bounds of a tensor of dimension ",
        N);
  }
};

template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
using GenericPackedTensorAccessorBase = torch::headeronly::detail::GenericPackedTensorAccessorBase<detail::IndexBoundsCheck<N, index_t>, T, N, PtrTraits, index_t>;

template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase<T,N,PtrTraits,index_t> {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;

  C10_HOST GenericPackedTensorAccessor(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}

  // if index_t is not int64_t, we want to have an int64_t constructor
  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
  C10_HOST GenericPackedTensorAccessor(
      PtrType data_,
      const source_index_t* sizes_,
      const source_index_t* strides_)
      : GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}

  C10_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
    index_t* new_sizes = this->sizes_ + 1;
    index_t* new_strides = this->strides_ + 1;
    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
  }

  C10_DEVICE const TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) const {
    const index_t* new_sizes = this->sizes_ + 1;
    const index_t* new_strides = this->strides_ + 1;
    return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
  }

  /// Returns a PackedTensorAccessor of the same dimension after transposing the
  /// two dimensions given. Does not actually move elements; transposition is
  /// made by permuting the size/stride arrays. If the dimensions are not valid,
  /// asserts.
  C10_HOST GenericPackedTensorAccessor<T, N, PtrTraits, index_t> transpose(
      index_t dim1,
      index_t dim2) const {
    this->bounds_check_(dim1);
    this->bounds_check_(dim2);
    GenericPackedTensorAccessor<T, N, PtrTraits, index_t> result(
        this->data_, this->sizes_, this->strides_);
    std::swap(result.strides_[dim1], result.strides_[dim2]);
    std::swap(result.sizes_[dim1], result.sizes_[dim2]);
    return result;
  }
};

template<typename T, template <typename U> class PtrTraits, typename index_t>
class GenericPackedTensorAccessor<T,1,PtrTraits,index_t> : public GenericPackedTensorAccessorBase<T,1,PtrTraits,index_t> {
public:
  typedef typename PtrTraits<T>::PtrType PtrType;
  C10_HOST GenericPackedTensorAccessor(
      PtrType data_,
      const index_t* sizes_,
      const index_t* strides_)
      : GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}

  // if index_t is not int64_t, we want to have an int64_t constructor
  template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
  C10_HOST GenericPackedTensorAccessor(
      PtrType data_,
      const source_index_t* sizes_,
      const source_index_t* strides_)
      : GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}

  C10_DEVICE T & operator[](index_t i) {
    return this->data_[this->strides_[0] * i];
  }
  C10_DEVICE const T& operator[](index_t i) const {
    return this->data_[this->strides_[0]*i];
  }

  // Same as in the general N-dimensional case, but note that in the
  // 1-dimensional case the returned PackedTensorAccessor will always be an
  // identical copy of the original
  C10_HOST GenericPackedTensorAccessor<T, 1, PtrTraits, index_t> transpose(
      index_t dim1,
      index_t dim2) const {
    this->bounds_check_(dim1);
    this->bounds_check_(dim2);
    return GenericPackedTensorAccessor<T, 1, PtrTraits, index_t>(
        this->data_, this->sizes_, this->strides_);
  }
};

template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
using GenericPackedTensorAccessor = torch::headeronly::detail::GenericPackedTensorAccessor<TensorAccessor<T, N-1, PtrTraits, index_t>, detail::IndexBoundsCheck<N, index_t>, T, N, PtrTraits, index_t>;

// Can't put this directly into the macro function args because of commas
#define AT_X GenericPackedTensorAccessor<T, N, PtrTraits, index_t>
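The accessor classes above are easiest to read from a call site. Below is a minimal host-side sketch, not part of the diff, showing how a 2-D `TensorAccessor` obtained through `Tensor::accessor` walks memory with the stored sizes and strides; the tensor shape and the helper name are invented for illustration.

```cpp
#include <ATen/ATen.h>

// Sum a 2-D float tensor element by element through a TensorAccessor.
// accessor<float, 2>() checks dtype and dimensionality once up front;
// operator[] then applies strides_[0]/strides_[1] exactly as defined above.
float sum2d(const at::Tensor& t) {
  auto acc = t.accessor<float, 2>();   // host-side TensorAccessor<float, 2>
  float total = 0.f;
  for (int64_t i = 0; i < acc.size(0); ++i) {
    for (int64_t j = 0; j < acc.size(1); ++j) {
      total += acc[i][j];              // acc[i] yields a TensorAccessor<float, 1>
    }
  }
  return total;
}
```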
@@ -245,9 +245,6 @@ class TORCH_API TensorBase {
  size_t weak_use_count() const noexcept {
    return impl_.weak_use_count();
  }
  bool is_uniquely_owned() const noexcept {
    return impl_.is_uniquely_owned();
  }

  std::string toString() const;

@@ -9,8 +9,8 @@ APIVitals VitalsAPI;

std::ostream& operator<<(std::ostream& os, TorchVital const& tv) {
  for (const auto& m : tv.attrs) {
    os << "[TORCH_VITAL] " << tv.name << '.' << m.first << "\t\t "
       << m.second.value << '\n';
    os << "[TORCH_VITAL] " << tv.name << "." << m.first << "\t\t "
       << m.second.value << "\n";
  }
  return os;
}

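Most of the hunks that follow differ only in whether a single character is streamed as a char literal (`'.'`) or as a one-character string (`"."`). The two forms produce identical output; the char overload of `operator<<` simply avoids treating the argument as a NUL-terminated buffer. A standalone sketch, independent of the PyTorch sources:

```cpp
#include <iostream>
#include <sstream>

int main() {
  std::ostringstream a, b;
  a << "torch" << '.' << "ops" << '\n';       // char overloads of operator<<
  b << "torch" << "." << "ops" << "\n";       // const char* overloads
  std::cout << (a.str() == b.str()) << '\n';  // prints 1: the text is identical
  return 0;
}
```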
@ -100,18 +100,18 @@ inline bool operator==(const AliasInfo& lhs, const AliasInfo& rhs) {
|
||||
|
||||
// this does match the way things are represented in the schema
|
||||
inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
|
||||
out << '(';
|
||||
out << "(";
|
||||
bool first = true;
|
||||
for (const auto& set : aliasInfo.beforeSets()) {
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
out << '|';
|
||||
out << "|";
|
||||
}
|
||||
out << set.toUnqualString();
|
||||
}
|
||||
if (aliasInfo.isWrite()) {
|
||||
out << '!';
|
||||
out << "!";
|
||||
}
|
||||
if (aliasInfo.beforeSets() != aliasInfo.afterSets()) {
|
||||
out << " -> ";
|
||||
@ -120,12 +120,12 @@ inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
out << '|';
|
||||
out << "|";
|
||||
}
|
||||
out << set.toUnqualString();
|
||||
}
|
||||
}
|
||||
out << ')';
|
||||
out << ")";
|
||||
return out;
|
||||
}
|
||||
} // namespace c10
|
||||
|
||||
@ -198,7 +198,7 @@ inline void swap(Blob& lhs, Blob& rhs) noexcept {
|
||||
}
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& out, const Blob& v) {
|
||||
return out << "Blob[" << v.TypeName() << ']';
|
||||
return out << "Blob[" << v.TypeName() << "]";
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
@ -456,8 +456,8 @@ bool ClassType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const {
|
||||
*why_not << "Method on class '" << repr_str()
|
||||
<< "' (1) is not compatible with interface '"
|
||||
<< rhs.repr_str() << "' (2)\n"
|
||||
<< " (1) " << self_method->getSchema() << '\n'
|
||||
<< " (2) " << schema << '\n';
|
||||
<< " (1) " << self_method->getSchema() << "\n"
|
||||
<< " (2) " << schema << "\n";
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -100,7 +100,7 @@ struct TORCH_API ClassType : public NamedType {
|
||||
std::string repr_str() const override {
|
||||
std::stringstream ss;
|
||||
ss << str()
|
||||
<< " (of Python compilation unit at: " << compilation_unit().get() << ')';
|
||||
<< " (of Python compilation unit at: " << compilation_unit().get() << ")";
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
|
||||
@ -58,12 +58,12 @@ std::string DispatchKeyExtractor::dumpState() const {
|
||||
std::ostringstream oss;
|
||||
for (const auto i : c10::irange(c10::utils::bitset::NUM_BITS())) {
|
||||
if (dispatch_arg_indices_reverse_.get(i)) {
|
||||
oss << '1';
|
||||
oss << "1";
|
||||
} else {
|
||||
oss << '0';
|
||||
oss << "0";
|
||||
}
|
||||
}
|
||||
oss << ' ' << nonFallthroughKeys_ << '\n';
|
||||
oss << " " << nonFallthroughKeys_ << "\n";
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
|
||||
@ -69,8 +69,8 @@ private:
|
||||
|
||||
void _print_dispatch_trace(const std::string& label, const std::string& op_name, const DispatchKeySet& dispatchKeySet) {
|
||||
auto nesting_value = dispatch_trace_nesting_value();
|
||||
for (int64_t i = 0; i < nesting_value; ++i) std::cerr << ' ';
|
||||
std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << ']' << std::endl;
|
||||
for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
|
||||
std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
|
||||
}
|
||||
} // namespace detail
|
||||
|
||||
|
||||
@ -570,7 +570,7 @@ void OperatorEntry::checkInvariants() const {
|
||||
|
||||
std::string OperatorEntry::listAllDispatchKeys() const {
|
||||
std::ostringstream str;
|
||||
str << '[';
|
||||
str << "[";
|
||||
|
||||
bool has_kernels = false;
|
||||
for (auto k : allDispatchKeysInFullSet()) {
|
||||
@ -584,7 +584,7 @@ std::string OperatorEntry::listAllDispatchKeys() const {
|
||||
str << k;
|
||||
has_kernels = true;
|
||||
}
|
||||
str << ']';
|
||||
str << "]";
|
||||
return str.str();
|
||||
}
|
||||
|
||||
@ -683,12 +683,12 @@ void OperatorEntry::setReportErrorCallback_(std::unique_ptr<c10::SafePyObject> c
|
||||
// This WON'T report backend fallbacks.
|
||||
std::string OperatorEntry::dumpState() const {
|
||||
std::ostringstream oss;
|
||||
oss << "name: " << name_ << '\n';
|
||||
oss << "name: " << name_ << "\n";
|
||||
if (schema_) {
|
||||
oss << "schema: " << schema_->schema << '\n';
|
||||
oss << "debug: " << schema_->debug << '\n';
|
||||
oss << "schema: " << schema_->schema << "\n";
|
||||
oss << "debug: " << schema_->debug << "\n";
|
||||
oss << "alias analysis kind: " << toString(schema_->schema.aliasAnalysis())
|
||||
<< (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << '\n';
|
||||
<< (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << "\n";
|
||||
} else {
|
||||
oss << "schema: (none)\n";
|
||||
}
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
namespace c10 {
|
||||
|
||||
void FunctionSchema::dump() const {
|
||||
std::cout << *this << '\n';
|
||||
std::cout << *this << "\n";
|
||||
}
|
||||
|
||||
const std::vector<Argument>& FunctionSchema::getCorrectList(SchemaArgType type) const {
|
||||
@ -210,9 +210,9 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
|
||||
|
||||
out << schema.name();
|
||||
if (!schema.overload_name().empty()) {
|
||||
out << '.' << schema.overload_name();
|
||||
out << "." << schema.overload_name();
|
||||
}
|
||||
out << '(';
|
||||
out << "(";
|
||||
|
||||
bool seen_kwarg_only = false;
|
||||
for (const auto i : c10::irange(schema.arguments().size())) {
|
||||
@ -273,7 +273,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
|
||||
}
|
||||
|
||||
if (need_paren) {
|
||||
out << '(';
|
||||
out << "(";
|
||||
}
|
||||
for (const auto i : c10::irange(returns.size())) {
|
||||
if (i > 0) {
|
||||
@ -288,7 +288,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
|
||||
out << "...";
|
||||
}
|
||||
if (need_paren) {
|
||||
out << ')';
|
||||
out << ")";
|
||||
}
|
||||
return out;
|
||||
}
|
||||
@ -471,7 +471,7 @@ bool FunctionSchema::isForwardCompatibleWith(
|
||||
if (!arguments().at(i).isForwardCompatibleWith(old.arguments().at(i))) {
|
||||
if (why_not) {
|
||||
why_not
|
||||
<< '\'' << arguments().at(i).name() << '\''
|
||||
<< "'" << arguments().at(i).name() << "'"
|
||||
<< " is not forward compatible with the older version of the schema";
|
||||
}
|
||||
return false;
|
||||
@ -511,7 +511,7 @@ bool FunctionSchema::isForwardCompatibleWith(
|
||||
.isForwardCompatibleWith(old.arguments().at(i))) {
|
||||
if (why_not) {
|
||||
why_not << "Out argument '"
|
||||
<< '\'' << arguments().at(i).name()
|
||||
<< "'" << arguments().at(i).name()
|
||||
<< " is not FC with the older version of the schema";
|
||||
}
|
||||
return false;
|
||||
|
||||
@ -571,7 +571,7 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
|
||||
if (arg.N()) {
|
||||
N = std::to_string(*arg.N());
|
||||
}
|
||||
out << '[' << N << ']';
|
||||
out << "[" << N << "]";
|
||||
} else {
|
||||
out << unopt_type->str();
|
||||
}
|
||||
@ -582,15 +582,15 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
|
||||
}
|
||||
|
||||
if (is_opt) {
|
||||
out << '?';
|
||||
out << "?";
|
||||
}
|
||||
|
||||
if (!arg.name().empty()) {
|
||||
out << ' ' << arg.name();
|
||||
out << " " << arg.name();
|
||||
}
|
||||
|
||||
if (arg.default_value()) {
|
||||
out << '=';
|
||||
out << "=";
|
||||
if ((type->kind() == c10::TypeKind::StringType ||
|
||||
unopt_type->kind() == c10::TypeKind::StringType) &&
|
||||
arg.default_value().value().isString()) {
|
||||
|
||||
@ -66,7 +66,7 @@ bool operator==(const ivalue::Tuple& lhs, const ivalue::Tuple& rhs) {
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
|
||||
out << v.qualifiedClassName() << '.' << v.name();
|
||||
out << v.qualifiedClassName() << "." << v.name();
|
||||
return out;
|
||||
}
|
||||
|
||||
@ -526,7 +526,7 @@ std::ostream& printMaybeAnnotatedList(
|
||||
!elementTypeCanBeInferredFromMembers(list_elem_type)) {
|
||||
out << "annotate(" << the_list.type<c10::Type>()->annotation_str() << ", ";
|
||||
printList(out, the_list.toListRef(), "[", "]", formatter);
|
||||
out << ')';
|
||||
out << ")";
|
||||
return out;
|
||||
} else {
|
||||
return printList(out, the_list.toListRef(), "[", "]", formatter);
|
||||
@ -538,7 +538,7 @@ std::ostream& printDict(
|
||||
std::ostream& out,
|
||||
const Dict& v,
|
||||
const IValueFormatter& formatter) {
|
||||
out << '{';
|
||||
out << "{";
|
||||
|
||||
bool first = true;
|
||||
for (const auto& pair : v) {
|
||||
@ -552,7 +552,7 @@ std::ostream& printDict(
|
||||
first = false;
|
||||
}
|
||||
|
||||
out << '}';
|
||||
out << "}";
|
||||
return out;
|
||||
}
|
||||
}
|
||||
@ -565,8 +565,8 @@ static std::ostream& printMaybeAnnotatedDict(
|
||||
auto value_type = the_dict.type()->castRaw<DictType>()->getValueType();
|
||||
if (the_dict.toGenericDict().empty() ||
|
||||
!elementTypeCanBeInferredFromMembers(value_type)) {
|
||||
out << "annotate(" << the_dict.type<c10::Type>()->annotation_str() << ',';
|
||||
printDict(out, the_dict.toGenericDict(), formatter) << ')';
|
||||
out << "annotate(" << the_dict.type<c10::Type>()->annotation_str() << ",";
|
||||
printDict(out, the_dict.toGenericDict(), formatter) << ")";
|
||||
} else {
|
||||
return printDict(out, the_dict.toGenericDict(), formatter);
|
||||
}
|
||||
@ -577,7 +577,7 @@ static std::ostream& printComplex(std::ostream & out, const IValue & v) {
|
||||
c10::complex<double> d = v.toComplexDouble();
|
||||
IValue real(d.real()), imag(std::abs(d.imag()));
|
||||
auto sign = d.imag() >= 0 ? '+' : '-';
|
||||
return out << real << sign << imag << 'j';
|
||||
return out << real << sign << imag << "j";
|
||||
}
|
||||
|
||||
std::ostream& IValue::repr(
|
||||
@ -605,9 +605,9 @@ std::ostream& IValue::repr(
|
||||
if (static_cast<double>(i) == d) {
|
||||
// -0.0 (signed zero) needs to be parsed as -0.
|
||||
if (i == 0 && std::signbit(d)) {
|
||||
return out << '-' << i << '.';
|
||||
return out << "-" << i << ".";
|
||||
}
|
||||
return out << i << '.';
|
||||
return out << i << ".";
|
||||
}
|
||||
}
|
||||
auto orig_prec = out.precision();
|
||||
@ -643,20 +643,20 @@ std::ostream& IValue::repr(
|
||||
device_stream << v.toDevice();
|
||||
out << "torch.device(";
|
||||
c10::printQuotedString(out, device_stream.str());
|
||||
return out << ')';
|
||||
return out << ")";
|
||||
}
|
||||
case IValue::Tag::Generator: {
|
||||
auto generator = v.toGenerator();
|
||||
out << "torch.Generator(device=";
|
||||
c10::printQuotedString(out, generator.device().str());
|
||||
out << ", seed=" << generator.current_seed() << ')';
|
||||
out << ", seed=" << generator.current_seed() << ")";
|
||||
return out;
|
||||
}
|
||||
case IValue::Tag::GenericDict:
|
||||
return printMaybeAnnotatedDict(out, v, formatter);
|
||||
case IValue::Tag::Enum: {
|
||||
auto enum_holder = v.toEnumHolder();
|
||||
return out << enum_holder->qualifiedClassName() << '.' <<
|
||||
return out << enum_holder->qualifiedClassName() << "." <<
|
||||
enum_holder->name();
|
||||
}
|
||||
case IValue::Tag::Object: {
|
||||
@ -801,7 +801,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
|
||||
if (c == FP_NORMAL || c == FP_ZERO) {
|
||||
int64_t i = static_cast<int64_t>(d);
|
||||
if (static_cast<double>(i) == d) {
|
||||
return out << i << '.';
|
||||
return out << i << ".";
|
||||
}
|
||||
}
|
||||
auto orig_prec = out.precision();
|
||||
@ -852,7 +852,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
|
||||
return printDict(out, v.toGenericDict(), formatter);
|
||||
case IValue::Tag::PyObject: {
|
||||
auto py_obj = v.toPyObject();
|
||||
return out << "<PyObject at" << py_obj << '>';
|
||||
return out << "<PyObject at" << py_obj << ">";
|
||||
}
|
||||
case IValue::Tag::Generator:
|
||||
return out << "Generator";
|
||||
@ -862,22 +862,22 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
|
||||
// TODO we should attempt to call __str__ if the object defines it.
|
||||
auto obj = v.toObject();
|
||||
// print this out the way python would do it
|
||||
return out << '<' << obj->name() << " object at " << obj.get() << '>';
|
||||
return out << "<" << obj->name() << " object at " << obj.get() << ">";
|
||||
}
|
||||
case IValue::Tag::Enum: {
|
||||
auto enum_holder = v.toEnumHolder();
|
||||
return out << "Enum<" << enum_holder->unqualifiedClassName() << '.' <<
|
||||
enum_holder->name() << '>';
|
||||
return out << "Enum<" << enum_holder->unqualifiedClassName() << "." <<
|
||||
enum_holder->name() << ">";
|
||||
}
|
||||
|
||||
}
|
||||
return out << "<Invalid IValue tag=" << std::to_string(static_cast<uint32_t>(v.tag)) << '>';
|
||||
return out << "<Invalid IValue tag=" << std::to_string(static_cast<uint32_t>(v.tag)) << ">";
|
||||
}
|
||||
|
||||
#undef TORCH_FORALL_TAGS
|
||||
|
||||
void IValue::dump() const {
|
||||
std::cout << *this << '\n';
|
||||
std::cout << *this << "\n";
|
||||
}
|
||||
|
||||
std::shared_ptr<ClassType> ivalue::Object::type() const {
|
||||
@ -1050,7 +1050,7 @@ c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy(
|
||||
std::stringstream err;
|
||||
err << "Cannot serialize custom bound C++ class";
|
||||
if (auto qualname = type()->name()) {
|
||||
err << ' ' << qualname->qualifiedName();
|
||||
err << " " << qualname->qualifiedName();
|
||||
}
|
||||
err << ". Please define serialization methods via def_pickle() for "
|
||||
"this class.";
|
||||
|
||||
@ -211,7 +211,7 @@ struct TORCH_API OptionalType : public UnionType {
|
||||
|
||||
std::string str() const override {
|
||||
std::stringstream ss;
|
||||
ss << getElementType()->str() << '?';
|
||||
ss << getElementType()->str() << "?";
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
@ -240,7 +240,7 @@ struct TORCH_API OptionalType : public UnionType {
|
||||
|
||||
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
|
||||
std::stringstream ss;
|
||||
ss << "Optional[" << getElementType()->annotation_str(printer) << ']';
|
||||
ss << "Optional[" << getElementType()->annotation_str(printer) << "]";
|
||||
return ss.str();
|
||||
}
|
||||
};
|
||||
@ -906,7 +906,7 @@ struct TORCH_API ListType
|
||||
|
||||
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
|
||||
std::stringstream ss;
|
||||
ss << "List[" << getElementType()->annotation_str(printer) << ']';
|
||||
ss << "List[" << getElementType()->annotation_str(printer) << "]";
|
||||
return ss.str();
|
||||
}
|
||||
};
|
||||
@ -946,7 +946,7 @@ struct TORCH_API DictType : public SharedType {
|
||||
std::string str() const override {
|
||||
std::stringstream ss;
|
||||
ss << "Dict(" << getKeyType()->str() << ", " << getValueType()->str()
|
||||
<< ')';
|
||||
<< ")";
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
@ -1018,7 +1018,7 @@ struct TORCH_API FutureType
|
||||
|
||||
std::string str() const override {
|
||||
std::stringstream ss;
|
||||
ss << "Future(" << getElementType()->str() << ')';
|
||||
ss << "Future(" << getElementType()->str() << ")";
|
||||
return ss.str();
|
||||
}
|
||||
TypePtr createWithContained(
|
||||
@ -1041,7 +1041,7 @@ struct TORCH_API FutureType
|
||||
|
||||
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
|
||||
std::stringstream ss;
|
||||
ss << "Future[" << getElementType()->annotation_str(printer) << ']';
|
||||
ss << "Future[" << getElementType()->annotation_str(printer) << "]";
|
||||
return ss.str();
|
||||
}
|
||||
};
|
||||
@ -1060,7 +1060,7 @@ struct TORCH_API AwaitType
|
||||
|
||||
std::string str() const override {
|
||||
std::stringstream ss;
|
||||
ss << "Await(" << getElementType()->str() << ')';
|
||||
ss << "Await(" << getElementType()->str() << ")";
|
||||
return ss.str();
|
||||
}
|
||||
TypePtr createWithContained(
|
||||
@ -1083,7 +1083,7 @@ struct TORCH_API AwaitType
|
||||
|
||||
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
|
||||
std::stringstream ss;
|
||||
ss << "Await[" << getElementType()->annotation_str(printer) << ']';
|
||||
ss << "Await[" << getElementType()->annotation_str(printer) << "]";
|
||||
return ss.str();
|
||||
}
|
||||
};
|
||||
@ -1102,7 +1102,7 @@ struct TORCH_API RRefType
|
||||
|
||||
std::string str() const override {
|
||||
std::stringstream ss;
|
||||
ss << "RRef(" << getElementType()->str() << ')';
|
||||
ss << "RRef(" << getElementType()->str() << ")";
|
||||
return ss.str();
|
||||
}
|
||||
TypePtr createWithContained(
|
||||
@ -1115,7 +1115,7 @@ struct TORCH_API RRefType
|
||||
|
||||
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
|
||||
std::stringstream ss;
|
||||
ss << "RRef[" << getElementType()->annotation_str(printer) << ']';
|
||||
ss << "RRef[" << getElementType()->annotation_str(printer) << "]";
|
||||
return ss.str();
|
||||
}
|
||||
};
|
||||
|
||||
@ -11,7 +11,7 @@ std::string toString(const OperatorName& opName) {
|
||||
std::ostream& operator<<(std::ostream& os, const OperatorName& opName) {
|
||||
os << opName.name;
|
||||
if (!opName.overload_name.empty()) {
|
||||
os << '.' << opName.overload_name;
|
||||
os << "." << opName.overload_name;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
@ -65,7 +65,7 @@ VaryingShape<T> VaryingShape<T>::merge(const VaryingShape<T>& other) const {
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& out, const VaryingShape<T>& vs) {
|
||||
out << '(';
|
||||
out << "(";
|
||||
if (!vs.size()) {
|
||||
out << "*)";
|
||||
return out;
|
||||
@ -79,10 +79,10 @@ std::ostream& operator<<(std::ostream& out, const VaryingShape<T>& vs) {
|
||||
if (v.has_value()) {
|
||||
out << v.value();
|
||||
} else {
|
||||
out << '*';
|
||||
out << "*";
|
||||
}
|
||||
}
|
||||
out << ')';
|
||||
out << ")";
|
||||
return out;
|
||||
}
|
||||
|
||||
@ -105,7 +105,7 @@ std::ostream& operator<<(
|
||||
}
|
||||
auto sizes_opt = ss.sizes();
|
||||
|
||||
os << '(';
|
||||
os << "(";
|
||||
for (size_t i = 0; i < rank_opt.value(); i++) {
|
||||
if (i > 0) {
|
||||
os << ", ";
|
||||
@ -113,10 +113,10 @@ std::ostream& operator<<(
|
||||
if(sizes_opt.has_value() && sizes_opt.value()[i].is_static()) {
|
||||
os << sizes_opt.value()[i];
|
||||
} else {
|
||||
os << '*';
|
||||
os << "*";
|
||||
}
|
||||
}
|
||||
os << ')';
|
||||
os << ")";
|
||||
|
||||
return os;
|
||||
}
|
||||
@ -131,17 +131,17 @@ std::ostream& operator<<(std::ostream& os, const ShapeSymbol& s) {
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const Stride& s) {
|
||||
os << '{';
|
||||
os << "{";
|
||||
if (s.stride_index_.has_value()) {
|
||||
os << *s.stride_index_;
|
||||
} else {
|
||||
os << '*';
|
||||
os << "*";
|
||||
}
|
||||
os << ':';
|
||||
os << ":";
|
||||
if (s.stride_.has_value()) {
|
||||
os << *s.stride_;
|
||||
} else {
|
||||
os << '*';
|
||||
os << "*";
|
||||
}
|
||||
os << '}';
|
||||
return os;
|
||||
|
||||
@ -67,7 +67,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
bool has_valid_strides_info = ndim > 0 &&
|
||||
value->strides().isComplete() && value->strides().size() == ndim;
|
||||
|
||||
out << '(';
|
||||
out << "(";
|
||||
size_t i = 0;
|
||||
bool symbolic = type_verbosity() == TypeVerbosity::Symbolic;
|
||||
for (i = 0; i < *ndim; ++i) {
|
||||
@ -79,7 +79,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
} else if (symbolic) {
|
||||
out << value->symbolic_sizes().at(i);
|
||||
} else {
|
||||
out << '*';
|
||||
out << "*";
|
||||
}
|
||||
}
|
||||
if (has_valid_strides_info &&
|
||||
@ -91,7 +91,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
}
|
||||
out << value->strides()[i].value();
|
||||
}
|
||||
out << ']';
|
||||
out << "]";
|
||||
}
|
||||
if (type_verbosity() >= TypeVerbosity::Full) {
|
||||
if (value->requiresGrad()) {
|
||||
@ -107,12 +107,12 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
out << "device=" << *value->device();
|
||||
}
|
||||
}
|
||||
out << ')';
|
||||
out << ")";
|
||||
} else {
|
||||
if (type_verbosity() >= TypeVerbosity::Full) {
|
||||
size_t i = 0;
|
||||
if (value->requiresGrad()) {
|
||||
out << '('
|
||||
out << "("
|
||||
<< "requires_grad=" << *value->requiresGrad();
|
||||
i++;
|
||||
}
|
||||
@ -120,7 +120,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
out << ((i++ > 0) ? ", " : "(") << "device=" << *value->device();
|
||||
}
|
||||
if (i > 0) {
|
||||
out << ')';
|
||||
out << ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -133,18 +133,18 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
out << *prim << "[]";
|
||||
} else if (t.kind() == TypeKind::OptionalType) {
|
||||
auto prim = t.castRaw<OptionalType>()->getElementType();
|
||||
out << *prim << '?';
|
||||
out << *prim << "?";
|
||||
} else if(t.kind() == TypeKind::FutureType) {
|
||||
auto elem = t.castRaw<FutureType>()->getElementType();
|
||||
out << "Future[" << *elem << ']';
|
||||
out << "Future[" << *elem << "]";
|
||||
} else if(t.kind() == TypeKind::RRefType) {
|
||||
auto elem = t.castRaw<RRefType>()->getElementType();
|
||||
out << "RRef[" << *elem << ']';
|
||||
out << "RRef[" << *elem << "]";
|
||||
} else if(auto tup = t.cast<TupleType>()) {
|
||||
if (tup->schema()) {
|
||||
out << "NamedTuple";
|
||||
}
|
||||
out << '(';
|
||||
out << "(";
|
||||
for(size_t i = 0; i < tup->elements().size(); ++i) {
|
||||
if(i > 0)
|
||||
out << ", ";
|
||||
@ -160,7 +160,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
|
||||
out << *(tup->elements()[i]);
|
||||
}
|
||||
}
|
||||
out << ')';
|
||||
out << ")";
|
||||
} else if (t.kind() == TypeKind::FunctionType) {
|
||||
out << "Function";
|
||||
} else {
|
||||
@ -475,7 +475,7 @@ std::optional<TypePtr> unifyTypeList(
|
||||
why_not << "Could not unify type list since element " << i << " of type "
|
||||
<< elements.at(i)->repr_str()
|
||||
<< " did not match the types before it ("
|
||||
<< ret_type->repr_str() << ')';
|
||||
<< ret_type->repr_str() << ")";
|
||||
return std::nullopt;
|
||||
}
|
||||
ret_type = *maybe_unified;
|
||||
@ -907,13 +907,13 @@ std::string TupleType::str() const {
|
||||
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
|
||||
ss << name()->qualifiedName();
|
||||
} else {
|
||||
ss << '(';
|
||||
ss << "(";
|
||||
for(size_t i = 0; i < elements().size(); ++i) {
|
||||
if(i > 0)
|
||||
ss << ", ";
|
||||
ss << elements()[i]->str();
|
||||
}
|
||||
ss << ')';
|
||||
ss << ")";
|
||||
}
|
||||
return ss.str();
|
||||
}
|
||||
@ -1003,8 +1003,8 @@ bool InterfaceType::isSubTypeImpl(
|
||||
*why_not << "Method on interface '" << lhs.repr_str()
|
||||
<< "' (1) is not compatible with interface '"
|
||||
<< rhs.repr_str() << "' (2)\n"
|
||||
<< " (1) " << *self_schema << '\n'
|
||||
<< " (2) " << schema << '\n';
|
||||
<< " (1) " << *self_schema << "\n"
|
||||
<< " (2) " << schema << "\n";
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
@ -1078,7 +1078,7 @@ SymbolicShape SymbolicShape::merge(const SymbolicShape& other) const {
|
||||
}
|
||||
|
||||
void SymbolicShape::dump() const {
|
||||
std::cout << *this << '\n';
|
||||
std::cout << *this << "\n";
|
||||
}
|
||||
|
||||
bool EnumType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const {
|
||||
|
||||
@ -205,9 +205,9 @@ UnionType::UnionType(std::vector<TypePtr> reference, TypeKind kind) : SharedType
|
||||
for (const auto i : c10::irange(reference.size())) {
|
||||
msg << reference[i]->repr_str();
|
||||
if (i > 0) {
|
||||
msg << ',';
|
||||
msg << ",";
|
||||
}
|
||||
msg << ' ';
|
||||
msg << " ";
|
||||
}
|
||||
msg << "} has the single type " << types_[0]->repr_str()
|
||||
<< ". Use the common supertype instead of creating a Union"
|
||||
|
||||
@ -80,7 +80,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << ']';
|
||||
stream << "]";
|
||||
return stream;
|
||||
}
|
||||
|
||||
|
||||
@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << ']';
|
||||
stream << "]";
|
||||
return stream;
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@

#include <cstdint>
#include <map>
#include <shared_mutex>

#include <cuda_runtime_api.h>
#include <cusparse.h>
@@ -89,13 +88,8 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();

TORCH_CUDA_CPP_API void clearCublasWorkspaces();
struct WorkspaceMapWithMutex {
  std::map<std::tuple<void*, void*>, at::DataPtr> map;
  std::shared_mutex mutex;
};

TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace();
TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace();
TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize();
TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize();
TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace();

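For context on the struct declared above: pairing the map with a `std::shared_mutex` lets many readers look up a cached workspace concurrently while a writer takes exclusive ownership to insert or clear entries. A self-contained sketch of that pattern, with invented names rather than the PyTorch implementation:

```cpp
#include <map>
#include <mutex>
#include <shared_mutex>
#include <string>

struct CacheWithMutex {
  std::map<int, std::string> map;
  std::shared_mutex mutex;
};

// Readers share the lock; only a missing entry upgrades to an exclusive lock.
std::string lookup_or_insert(CacheWithMutex& cache, int key) {
  {
    std::shared_lock<std::shared_mutex> lock(cache.mutex);   // fast path: shared
    auto it = cache.map.find(key);
    if (it != cache.map.end()) {
      return it->second;
    }
  }
  std::unique_lock<std::shared_mutex> lock(cache.mutex);     // slow path: exclusive
  auto it = cache.map.try_emplace(key, "value-" + std::to_string(key)).first;
  return it->second;
}
```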
@@ -175,24 +175,17 @@ void CUDAGraph::instantiate() {
    // Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people,
    // who prefer not to report error message through these arguments moving forward
    // (they prefer return value, or errors on api calls internal to the capture)
    // ROCM appears to fail with HIP error: invalid argument
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && !defined(USE_ROCM)
    AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, cudaGraphInstantiateFlagUseNodePriority));
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000)
    AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, 0));
#else
    AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
#endif
    //Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory.
    //It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch.
  } else {
#if !defined(USE_ROCM)
    AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
                                                graph_,
                                                cudaGraphInstantiateFlagAutoFreeOnLaunch | cudaGraphInstantiateFlagUseNodePriority));
#else
    AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
                                                graph_,
                                                cudaGraphInstantiateFlagAutoFreeOnLaunch));
#endif
  }
  has_graph_exec_ = true;
}

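As background for the `instantiate()` change above, the CUDA Graphs host API these calls come from looks roughly like this when used directly. This is a hedged sketch of the generic capture/instantiate/launch flow, not PyTorch code; the captured work and the stream are placeholders.

```cpp
#include <cuda_runtime.h>

// Capture work submitted to `stream` into a graph, then instantiate and replay it.
// Error handling is collapsed for brevity.
void capture_and_replay(cudaStream_t stream) {
  cudaGraph_t graph = nullptr;
  cudaGraphExec_t graph_exec = nullptr;

  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  // ... enqueue kernels / memcpys on `stream` here ...
  cudaStreamEndCapture(stream, &graph);

#if defined(CUDA_VERSION) && CUDA_VERSION >= 11040
  // Flags variant: e.g. free graph-owned allocations automatically after launch.
  cudaGraphInstantiateWithFlags(&graph_exec, graph, cudaGraphInstantiateFlagAutoFreeOnLaunch);
#else
  cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);
#endif

  cudaGraphLaunch(graph_exec, stream);
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(graph_exec);
  cudaGraphDestroy(graph);
}
```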
@@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) {
// - Comments of @soumith copied from cuDNN handle pool implementation
#ifdef NO_CUDNN_DESTROY_HANDLE
#else
    cublasDestroy(handle);
    cublasDestroy(handle);
#endif
}

@@ -107,27 +107,19 @@ using CuBlasPoolType = DeviceThreadHandlePool<cublasHandle_t, createCublasHandle

} // namespace

WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() {
  static auto& instance = *new WorkspaceMapWithMutex;
std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace() {
  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
  return instance;
}

WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() {
  static auto& instance = *new WorkspaceMapWithMutex;
std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace() {
  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
  return instance;
}

void clearCublasWorkspaces() {
  {
    auto& workspace = cublas_handle_stream_to_workspace();
    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
    workspace.map.clear();
  }
  {
    auto& workspace = cublaslt_handle_stream_to_workspace();
    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
    workspace.map.clear();
  }
  cublas_handle_stream_to_workspace().clear();
  cublaslt_handle_stream_to_workspace().clear();
}

size_t parseChosenWorkspaceSize() {
|
||||
@ -241,38 +233,6 @@ at::DataPtr getNewCUDABlasLtWorkspace() {
|
||||
return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize());
|
||||
}
|
||||
|
||||
void setWorkspaceForHandle(cublasHandle_t handle, c10::cuda::CUDAStream stream) {
|
||||
cudaStream_t _stream = stream;
|
||||
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
|
||||
|
||||
auto& workspace = cublas_handle_stream_to_workspace();
|
||||
|
||||
size_t workspace_size = getChosenWorkspaceSize();
|
||||
|
||||
// Fast path: check if workspace already exists
|
||||
{
|
||||
std::shared_lock<std::shared_mutex> lock(workspace.mutex);
|
||||
auto workspace_it = workspace.map.find(key);
|
||||
if (workspace_it != workspace.map.end()) {
|
||||
TORCH_CUDABLAS_CHECK(cublasSetWorkspace(
|
||||
handle, workspace_it->second.get(), workspace_size));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Slow path: allocate workspace outside the lock
|
||||
auto new_workspace = getNewWorkspace();
|
||||
|
||||
// Insert with lock (double-check in case another thread inserted while we
|
||||
// were allocating)
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
|
||||
auto workspace_it = workspace.map.try_emplace(key, std::move(new_workspace)).first;
|
||||
TORCH_CUDABLAS_CHECK(
|
||||
cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size));
|
||||
}
|
||||
}
|
||||
|
||||
void* getCUDABlasLtWorkspace() {
|
||||
#ifndef USE_ROCM
|
||||
static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true;
|
||||
@ -281,10 +241,8 @@ void* getCUDABlasLtWorkspace() {
|
||||
auto stream = c10::cuda::getCurrentCUDAStream();
|
||||
cudaStream_t _stream = stream;
|
||||
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
|
||||
auto& workspace = at::cuda::cublas_handle_stream_to_workspace();
|
||||
std::shared_lock<std::shared_mutex> lock(workspace.mutex);
|
||||
auto workspace_it = workspace.map.find(key);
|
||||
TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end());
|
||||
auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
|
||||
TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
|
||||
return workspace_it->second.mutable_get();
|
||||
}
|
||||
#endif
|
||||
@ -292,29 +250,11 @@ void* getCUDABlasLtWorkspace() {
|
||||
auto stream = c10::cuda::getCurrentCUDAStream();
|
||||
cudaStream_t _stream = stream;
|
||||
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
|
||||
|
||||
auto& workspace = cublaslt_handle_stream_to_workspace();
|
||||
|
||||
// Fast path: check if workspace already exists
|
||||
{
|
||||
std::shared_lock<std::shared_mutex> lock(workspace.mutex);
|
||||
auto workspace_it = workspace.map.find(key);
|
||||
if (workspace_it != workspace.map.end()) {
|
||||
return workspace_it->second.mutable_get();
|
||||
}
|
||||
}
|
||||
|
||||
// Slow path: allocate workspace outside the lock
|
||||
auto new_workspace = getNewCUDABlasLtWorkspace();
|
||||
|
||||
// Insert with lock (double-check in case another thread inserted while we
|
||||
// were allocating)
|
||||
{
|
||||
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
|
||||
auto workspace_it =
|
||||
workspace.map.try_emplace(key, std::move(new_workspace)).first;
|
||||
return workspace_it->second.mutable_get();
|
||||
auto workspace_it = cublaslt_handle_stream_to_workspace().find(key);
|
||||
if (workspace_it == cublaslt_handle_stream_to_workspace().end()) {
|
||||
workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()});
|
||||
}
|
||||
return workspace_it->second.mutable_get();
|
||||
}
|
||||
|
||||
cublasHandle_t getCurrentCUDABlasHandle() {
|
||||
@@ -358,8 +298,13 @@ cublasHandle_t getCurrentCUDABlasHandle() {
  // will allocate memory dynamically (even if they're cheap) outside
  // PyTorch's CUDA caching allocator. It's possible that CCA used up
  // all the memory and cublas's cudaMallocAsync will return OOM
  setWorkspaceForHandle(handle, stream);

  cudaStream_t _stream = stream;
  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
  auto workspace_it = cublas_handle_stream_to_workspace().find(key);
  if (workspace_it == cublas_handle_stream_to_workspace().end()) {
    workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()});
  }
  TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize()));
#if !defined(USE_ROCM)
  // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup
  // FP32 data type calculations based on the value of the allow_tf32 flag.

@@ -411,16 +411,16 @@ std::string CUDAHooks::showConfig() const {
    // HIP_VERSION value format was changed after ROCm v4.2 to include the patch number
    if(v < 500) {
      // If major=xx, minor=yy then format -> xxyy
      oss << (v / 100) << '.' << (v % 10);
      oss << (v / 100) << "." << (v % 10);
    }
    else {
      // If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz
      oss << (v / 10000000) << '.' << (v / 100000 % 100) << '.' << (v % 100000);
      oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000);
    }
#else
    oss << (v / 1000) << '.' << (v / 10 % 100);
    oss << (v / 1000) << "." << (v / 10 % 100);
    if (v % 10 != 0) {
      oss << '.' << (v % 10);
      oss << "." << (v % 10);
    }
#endif
  };
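The arithmetic in this lambda decodes the packed integer the CUDA runtime reports (major * 1000 + minor * 10 + patch). A standalone check of that decoding with a made-up version number:

```cpp
#include <iostream>

// Decode a CUDA-style packed version, mirroring the arithmetic above.
int main() {
  size_t v = 12041;  // hypothetical runtime version: 12.4.1
  std::cout << (v / 1000) << '.' << (v / 10 % 100);
  if (v % 10 != 0) {
    std::cout << '.' << (v % 10);
  }
  std::cout << '\n';  // prints 12.4.1
  return 0;
}
```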
@ -431,16 +431,16 @@ std::string CUDAHooks::showConfig() const {
|
||||
oss << " - HIP Runtime ";
|
||||
#endif
|
||||
printCudaStyleVersion(runtimeVersion);
|
||||
oss << '\n';
|
||||
oss << "\n";
|
||||
|
||||
// TODO: Make HIPIFY understand CUDART_VERSION macro
|
||||
#if !defined(USE_ROCM)
|
||||
if (runtimeVersion != CUDART_VERSION) {
|
||||
oss << " - Built with CUDA Runtime ";
|
||||
printCudaStyleVersion(CUDART_VERSION);
|
||||
oss << '\n';
|
||||
oss << "\n";
|
||||
}
|
||||
oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << '\n';
|
||||
oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << "\n";
|
||||
#endif
|
||||
|
||||
#if !defined(USE_ROCM)
|
||||
@ -448,9 +448,9 @@ std::string CUDAHooks::showConfig() const {
|
||||
|
||||
|
||||
auto printCudnnStyleVersion = [&](size_t v) {
|
||||
oss << (v / 1000) << '.' << (v / 100 % 10);
|
||||
oss << (v / 1000) << "." << (v / 100 % 10);
|
||||
if (v % 100 != 0) {
|
||||
oss << '.' << (v % 100);
|
||||
oss << "." << (v % 100);
|
||||
}
|
||||
};
|
||||
|
||||
@ -461,22 +461,22 @@ std::string CUDAHooks::showConfig() const {
|
||||
if (cudnnCudartVersion != CUDART_VERSION) {
|
||||
oss << " (built against CUDA ";
|
||||
printCudaStyleVersion(cudnnCudartVersion);
|
||||
oss << ')';
|
||||
oss << ")";
|
||||
}
|
||||
oss << '\n';
|
||||
oss << "\n";
|
||||
if (cudnnVersion != CUDNN_VERSION) {
|
||||
oss << " - Built with CuDNN ";
|
||||
printCudnnStyleVersion(CUDNN_VERSION);
|
||||
oss << '\n';
|
||||
oss << "\n";
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
// TODO: Check if miopen has the functions above and unify
|
||||
oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << '.' << MIOPEN_VERSION_MINOR << '.' << MIOPEN_VERSION_PATCH << '\n';
|
||||
oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << "." << MIOPEN_VERSION_MINOR << "." << MIOPEN_VERSION_PATCH << "\n";
|
||||
#endif
|
||||
|
||||
#if AT_MAGMA_ENABLED()
|
||||
oss << " - Magma " << MAGMA_VERSION_MAJOR << '.' << MAGMA_VERSION_MINOR << '.' << MAGMA_VERSION_MICRO << '\n';
|
||||
oss << " - Magma " << MAGMA_VERSION_MAJOR << "." << MAGMA_VERSION_MINOR << "." << MAGMA_VERSION_MICRO << "\n";
|
||||
#endif
|
||||
|
||||
return oss.str();
|
||||
|
||||
@ -42,7 +42,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic(
|
||||
|
||||
// The cache key includes all the parameters to generate_code + vec_size + dev_idx
|
||||
std::stringstream ss;
|
||||
ss << nInputs << '_' << nOutputs << f;
|
||||
ss << nInputs << "_" << nOutputs << f;
|
||||
ss << f_inputs_type_str << compute_type_str << result_type_str;
|
||||
ss << static_cast<int>(at::cuda::jit::BinaryFuncVariant::NoScalar);
|
||||
ss << extra_args_types;
|
||||
@ -144,7 +144,7 @@ static inline void launch_jitted_unrolled_kernel_dynamic(
|
||||
|
||||
// The cache key includes all the parameters to generate_code + dev_idx
|
||||
std::stringstream ss;
|
||||
ss << nInputs << '_' << nOutputs << f;
|
||||
ss << nInputs << "_" << nOutputs << f;
|
||||
ss << f_inputs_type_str << compute_type_str << result_type_str;
|
||||
ss << contiguous << dynamic_casting;
|
||||
ss << static_cast<int>(at::cuda::jit::BinaryFuncVariant::NoScalar);
|
||||
|
||||
@ -52,10 +52,10 @@ TuningContext* getTuningContext() {
|
||||
std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry) {
|
||||
static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1";
|
||||
if (!blaslog) {
|
||||
return stream << entry.key_ << ',' << entry.time_;
|
||||
return stream << entry.key_ << "," << entry.time_;
|
||||
}
|
||||
else {
|
||||
return stream << entry.key_ << ',' << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_;
|
||||
return stream << entry.key_ << "," << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_;
|
||||
}
|
||||
}
|
||||
|
||||
@ -156,10 +156,10 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std
|
||||
if (isNew) {
|
||||
static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1";
|
||||
if (!blaslog) {
|
||||
untuned_file << op_signature << ',' << params_signature << std::endl;
|
||||
untuned_file << op_signature << "," << params_signature << std::endl;
|
||||
}
|
||||
else {
|
||||
untuned_file << op_signature << ',' << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl;
|
||||
untuned_file << op_signature << "," << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl;
|
||||
}
|
||||
TUNABLE_LOG3("Untuned,", op_signature, ",", params_signature);
|
||||
}
|
||||
@ -201,7 +201,7 @@ void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const
|
||||
|
||||
if(!file_exists || file_empty) {
|
||||
for(const auto& [key, val] : validators) {
|
||||
(*realtime_out_) << "Validator," << key << ',' << val << std::endl;
|
||||
(*realtime_out_) << "Validator," << key << "," << val << std::endl;
|
||||
realtime_out_->flush();
|
||||
}
|
||||
validators_written_ = true;
|
||||
@ -219,7 +219,7 @@ void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std
|
||||
return;
|
||||
}
|
||||
|
||||
(*realtime_out_) << op_sig << ',' << param_sig << ',' << result << std::endl;
|
||||
(*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl;
|
||||
realtime_out_->flush(); //ensure immediate write to disk
|
||||
|
||||
TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result);
|
||||
|
||||
@ -93,31 +93,31 @@ std::string cudnnTypeToString(cudnnDataType_t dtype) {
|
||||
return "CUDNN_DATA_UINT8x4";
|
||||
default:
|
||||
std::ostringstream oss;
|
||||
oss << "(unknown data-type " << static_cast<int>(dtype) << ')';
|
||||
oss << "(unknown data-type " << static_cast<int>(dtype) << ")";
|
||||
return oss.str();
|
||||
}
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
|
||||
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << '\n';
|
||||
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
|
||||
int nbDims = 0;
|
||||
int dimA[CUDNN_DIM_MAX];
|
||||
int strideA[CUDNN_DIM_MAX];
|
||||
cudnnDataType_t dtype{};
|
||||
cudnnGetTensorNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &nbDims, dimA, strideA);
|
||||
out << " type = " << cudnnTypeToString(dtype) << '\n';
|
||||
out << " nbDims = " << nbDims << '\n';
|
||||
out << " type = " << cudnnTypeToString(dtype) << "\n";
|
||||
out << " nbDims = " << nbDims << "\n";
|
||||
// Read out only nbDims of the arrays!
|
||||
out << " dimA = ";
|
||||
for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
|
||||
out << i << ", ";
|
||||
}
|
||||
out << '\n';
|
||||
out << "\n";
|
||||
out << " strideA = ";
|
||||
for (auto i : ArrayRef<int>{strideA, static_cast<size_t>(nbDims)}) {
|
||||
out << i << ", ";
|
||||
}
|
||||
out << '\n';
|
||||
out << "\n";
|
||||
return out;
|
||||
}
|
||||
|
||||
@ -168,27 +168,27 @@ std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) {
|
||||
return "CUDNN_TENSOR_NHWC";
|
||||
default:
|
||||
std::ostringstream oss;
|
||||
oss << "(unknown cudnn tensor format " << static_cast<int>(tformat) << ')';
|
||||
oss << "(unknown cudnn tensor format " << static_cast<int>(tformat) << ")";
|
||||
return oss.str();
|
||||
}
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d) {
|
||||
out << "FilterDescriptor " << static_cast<void*>(d.desc()) << '\n';
|
||||
out << "FilterDescriptor " << static_cast<void*>(d.desc()) << "\n";
|
||||
int nbDims = 0;
|
||||
int dimA[CUDNN_DIM_MAX];
|
||||
cudnnDataType_t dtype{};
|
||||
cudnnTensorFormat_t tformat{};
|
||||
cudnnGetFilterNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &tformat, &nbDims, dimA);
|
||||
out << " type = " << cudnnTypeToString(dtype) << '\n';
|
||||
out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << '\n';
|
||||
out << " nbDims = " << nbDims << '\n';
|
||||
out << " type = " << cudnnTypeToString(dtype) << "\n";
|
||||
out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << "\n";
|
||||
out << " nbDims = " << nbDims << "\n";
|
||||
// Read out only nbDims of the arrays!
|
||||
out << " dimA = ";
|
||||
for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
|
||||
out << i << ", ";
|
||||
}
|
||||
out << '\n';
|
||||
out << "\n";
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -346,15 +346,15 @@ void foreachTensorInplaceWithFlag(std::vector<IValue>& args, int64_t begin, int6
|
||||
}
|
||||
|
||||
std::ostream& operator<< (std::ostream& os, const DynamicLayer& layer) {
|
||||
os << layer.layerId() << ':' << layer.key();
|
||||
os << layer.layerId() << ":" << layer.key();
|
||||
return os;
|
||||
}
|
||||
std::ostream& operator<< (std::ostream& os, const std::vector<DynamicLayer>& dls) {
|
||||
os << "DynamicLayerStack[ ";
|
||||
for (const auto& layer : dls) {
|
||||
os << layer << ' ';
|
||||
os << layer << " ";
|
||||
}
|
||||
os << ']';
|
||||
os << "]";
|
||||
return os;
|
||||
}
|
||||
|
||||
|
||||
@ -22,7 +22,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) {
|
||||
if (batched) {
|
||||
ss << "Batched[lvl=" << batched->level() << " dim=" << batched->bdim() << ", ";
|
||||
dumpTensor(ss, batched->value());
|
||||
ss << ']';
|
||||
ss << "]";
|
||||
return;
|
||||
}
|
||||
ss << "Tensor" << tensor.sizes();
|
||||
@ -36,7 +36,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) {
|
||||
ss << "dead, ";
|
||||
}
|
||||
dumpTensor(ss, wrapped->value());
|
||||
ss << ']';
|
||||
ss << "]";
|
||||
}
|
||||
|
||||
void TensorWrapper::refreshMetadata() {
|
||||
|
||||
@ -73,32 +73,32 @@ std::string miopenTypeToString(miopenDataType_t dtype) {
|
||||
return "miopenBFloat16";
|
||||
default:
|
||||
std::ostringstream oss;
|
||||
oss << "(unknown data-type " << static_cast<int>(dtype) << ')';
|
||||
oss << "(unknown data-type " << static_cast<int>(dtype) << ")";
|
||||
return oss.str();
|
||||
}
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
|
||||
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << '\n';
|
||||
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
|
||||
int nbDims = 0;
|
||||
int dimA[MIOPEN_DIM_MAX];
|
||||
int strideA[MIOPEN_DIM_MAX];
|
||||
miopenDataType_t dtype;
|
||||
miopenGetTensorDescriptorSize(d.desc(), &nbDims);
|
||||
miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA);
|
||||
out << " type = " << miopenTypeToString(dtype) << '\n';
|
||||
out << " nbDims = " << nbDims << '\n';
|
||||
out << " type = " << miopenTypeToString(dtype) << "\n";
|
||||
out << " nbDims = " << nbDims << "\n";
|
||||
// Read out only nbDims of the arrays!
|
||||
out << " dimA = ";
|
||||
for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
|
||||
out << i << ", ";
|
||||
}
|
||||
out << '\n';
|
||||
out << "\n";
|
||||
out << " strideA = ";
|
||||
for (auto i : ArrayRef<int>{strideA, static_cast<size_t>(nbDims)}) {
|
||||
out << i << ", ";
|
||||
}
|
||||
out << '\n';
|
||||
out << "\n";
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -91,7 +91,7 @@ struct OperationInfo : BaseInfo {
|
||||
std::stringstream kernelStr;
|
||||
kernelStr << kernelName;
|
||||
for (const Tensor& tensor : tensors) {
|
||||
kernelStr << ':' << BaseInfo::buildTensorString(tensor, includeBufferId);
|
||||
kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId);
|
||||
}
|
||||
return kernelStr.str();
|
||||
}
|
||||
|
||||
@ -39,9 +39,9 @@ std::string BaseInfo::buildTensorString(const Tensor& tensor, bool includeBuffer
|
||||
// see comments for INCLUDE_BUFFER_ID
|
||||
if (includeBufferId && deviceType == at::kMPS) {
|
||||
id<MTLBuffer> buffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
|
||||
tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ':' << buffer.retainCount << ')';
|
||||
tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ":" << buffer.retainCount << ")";
|
||||
}
|
||||
tensorStr << ':' << tensor.scalar_type() << tensor.sizes();
|
||||
tensorStr << ":" << tensor.scalar_type() << tensor.sizes();
|
||||
return tensorStr.str();
|
||||
} else {
|
||||
return "undefined";
|
||||
|
||||
@ -167,7 +167,7 @@ static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, co
|
||||
std::stringstream ss;
|
||||
ss << arg_name << " should be greater than zero but got (";
|
||||
std::copy(args.begin(), args.end() - 1, std::ostream_iterator<int>(ss,", "));
|
||||
ss << args.back() << ")" << " (while checking arguments for " << c << ')';
|
||||
ss << args.back() << ")" << " (while checking arguments for " << c << ")";
|
||||
TORCH_CHECK(false, ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
@ -639,7 +639,7 @@ static std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params)
|
||||
<< " deterministic = " << params.deterministic
|
||||
<< " cudnn_enabled = " << params.cudnn_enabled
|
||||
<< " allow_tf32 = " << params.allow_tf32
|
||||
<< '}';
|
||||
<< "}";
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@@ -1936,7 +1936,7 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_o

  // We order the tensors. t1 will be the larger tensor
  // We can always transpose tensor2 as the dimensions are always >= 1 (precondition from matmul)
  // and tensor1_larger iff tensor2.dim() > tensor1.dim(9
  // and tensor1_larger iff tensor2.dim() > tensor1.dim()
  const auto t1 = tensor1_larger ? MaybeOwned<Tensor>::borrowed(tensor1)
                                 : MaybeOwned<Tensor>::owned(tensor2.mT());
  const int64_t dim_t1 = t1->dim();
@@ -1948,20 +1948,11 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_o
    return false;
  }

  // In this case we *do* incur in an extra copy to avoid creating an unnecessary large tensor in the backward
  // Suppose we don't fold here. Let t1.shape = [b, m, n] t2.shape = [n, k] like in a transformer
  // t2 will be expanded to a tensor of shape [b, n, k] and then we do t1.bmm(t2_expanded)
  // The issue appears in the backward.
  // The output gradient g of this operation would have shape [b, m, k]
  // The backward wrt. t2 of bmm would be given by t1.mH @ g, which has shape [b, n, k]
  // Then, the backward of expand is simply `sum(0)`. As such, we are instantiating a tensor
  // of shape [b, n, k] unnecessarily, which may cause a large memory footprint, and in the
  // worst case, an OOM
  bool t2_requires_grad = tensor1_larger ? tensor2.requires_grad() : tensor1.requires_grad();
  if (t2_requires_grad && !has_out) {
    // We should be checking !at::GradMode::is_enabled(), but apparently
    // this regresses performance in some cases:
    // https://github.com/pytorch/pytorch/issues/118548#issuecomment-1916022394
  // If we require a gradient, we should fold to minimize backward memory usage - even if this
  // leads to a copy in forward because is needed in backward,
  // only time we avoid this strict pre-allocated memory usage (has_out = True)
  bool requires_grad = tensor1.requires_grad() || tensor2.requires_grad();
  if (requires_grad && !has_out) {
    return true;
  }

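To make the shape bookkeeping in the folding comments concrete, here is a small libtorch sketch with invented sizes: folding the batched operand into a single 2-D mm avoids materializing the [b, n, k] expanded copy of t2 that the bmm path would otherwise recreate in the backward pass. This only illustrates the public `matmul` behaviour, not the internal heuristic.

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  const int64_t b = 8, m = 128, n = 64, k = 32;
  auto t1 = torch::randn({b, m, n});                        // activations
  auto t2 = torch::randn({n, k}, torch::requires_grad());   // weight

  // matmul may fold this into a single [b*m, n] x [n, k] mm; the bmm
  // alternative would expand t2 to [b, n, k] and instantiate a tensor of
  // that shape again when computing t2's gradient.
  auto out = torch::matmul(t1, t2);                         // [b, m, k]
  out.sum().backward();
  std::cout << t2.grad().sizes() << '\n';                   // [64, 32]
  return 0;
}
```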
@ -847,7 +847,7 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const std::optional<int64_t
|
||||
<< ", hop_length=" << hop_length << ", win_length=" << win_length \
|
||||
<< ", window="; \
|
||||
if (window.defined()) { \
|
||||
SS << window.toString() << '{' << window.sizes() << '}'; \
|
||||
SS << window.toString() << "{" << window.sizes() << "}"; \
|
||||
} else { \
|
||||
SS << "None"; \
|
||||
} \
|
||||
@ -1046,7 +1046,7 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const std::optional<int64_
|
||||
<< ", hop_length=" << hop_length << ", win_length=" << win_length \
|
||||
<< ", window="; \
|
||||
if (window.defined()) { \
|
||||
SS << window.toString() << '{' << window.sizes() << '}'; \
|
||||
SS << window.toString() << "{" << window.sizes() << "}"; \
|
||||
} else { \
|
||||
SS << "None"; \
|
||||
} \
@ -1087,8 +1087,7 @@ TORCH_IMPL_FUNC(index_copy_out)
|
||||
result.copy_(self);
|
||||
|
||||
// See Note [Enabling Deterministic Operations]
|
||||
if ((result.is_cuda() || result.is_xpu()) &&
|
||||
globalContext().deterministicAlgorithms()) {
|
||||
if (result.is_cuda() && globalContext().deterministicAlgorithms()) {
|
||||
torch::List<std::optional<Tensor>> indices;
|
||||
indices.resize(dim + 1);
|
||||
indices.set(dim, index);
@ -523,7 +523,7 @@ Tensor _functional_assert_async_msg_cpu(
|
||||
}
|
||||
|
||||
void _print(std::string_view s) {
|
||||
std::cout << s << '\n';
|
||||
std::cout << s << "\n";
|
||||
}

// Sorting-based algorithm for isin(); used when the number of test elements is

@ -296,7 +296,7 @@ template <typename scalar_t, typename res_scalar_t = scalar_t>
|
||||
bool launchGemmAndBiasCublasLt(
|
||||
// args contains result which is modified
|
||||
cublasCommonArgs& args,
|
||||
const std::optional<Tensor>& self,
|
||||
const Tensor& self,
|
||||
const Scalar& alpha,
|
||||
Activation activation = Activation::None
|
||||
) {
|
||||
@ -304,8 +304,12 @@ bool launchGemmAndBiasCublasLt(
|
||||
// or when it can be squeezed to 1D.
|
||||
// self_ptr == nullptr implies ignore bias epilogue
|
||||
// and use standard gemm-like API.
|
||||
const auto* self_ptr = self.has_value() ? self.value().const_data_ptr<scalar_t>() : static_cast<const scalar_t*>(nullptr);
|
||||
|
||||
const auto* self_ptr = [&]() -> auto {
|
||||
if (self.dim() == 1 || self.squeeze().dim() == 1) {
|
||||
return self.const_data_ptr<scalar_t>();
|
||||
}
|
||||
return static_cast<const scalar_t*>(nullptr);
|
||||
}();
|
||||
|
||||
const auto tuning_ctx = at::cuda::tunable::getTuningContext();
|
||||
if (tuning_ctx->IsTunableOpEnabled()) {
|
||||
@ -388,30 +392,35 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
|
||||
#ifdef USE_ROCM
|
||||
// Conditioned on the device index, which is not persistent
|
||||
disable_addmm_cuda_lt = disable_addmm_cuda_lt || isGloballyDisabledAddmmCudaLt(self.device());
|
||||
disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
|
||||
#endif
|
||||
// Condition on the input
|
||||
disable_addmm_cuda_lt = disable_addmm_cuda_lt || !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation);
|
||||
disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
|
||||
// }
|
||||
|
||||
at::ScalarType scalar_type = mat1.scalar_type();
|
||||
bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;
|
||||
|
||||
#ifdef USE_ROCM
|
||||
disable_addmm_cuda_lt = disable_addmm_cuda_lt || is_float_output_with_half_input;
|
||||
#endif
|
||||
|
||||
bool use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
|
||||
// for float output with half input cublasLT with bias produces wrong results
|
||||
use_bias_ptr_lt &= !is_float_output_with_half_input;
|
||||
|
||||
// Handle result/self shapes
|
||||
if (!result.is_same(self)) {
|
||||
at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});
|
||||
|
||||
// We do not copy bias only when we need the bias ptr
|
||||
// We use bias ptr in the Lt path only when bias is 1D
|
||||
const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
|
||||
const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
|
||||
if (!use_bias_ptr_lt) {
|
||||
// We do expand self even before
|
||||
// check for beta != 0.0 to make sure that
|
||||
// test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
|
||||
// runs green.
|
||||
return expand_size(self, result.sizes(), "addmm");
|
||||
}
|
||||
return c10::MaybeOwned<Tensor>::borrowed(self);
|
||||
}();
|
||||
// We do not copy bias only when we need the bias ptr
|
||||
if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
|
||||
// NOTE: self should broadcast over result
|
||||
at::native::copy_(result, *expand_size(self, result.sizes(), "addmm"));
|
||||
at::native::copy_(result, *self_maybe_expanded);
|
||||
}
|
||||
}
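Note: an illustrative restatement of the bias handling above (not text from the diff). addmm computes

```
\mathrm{addmm}(\mathrm{self}, m_1, m_2;\ \beta, \alpha) \;=\; \beta\,\mathrm{self} + \alpha\,(m_1 m_2),
\qquad m_1 \in \mathbb{R}^{M\times K},\; m_2 \in \mathbb{R}^{K\times N},\; \mathrm{self}\ \text{broadcastable to}\ \mathbb{R}^{M\times N}.
```

When `self` is 1-D of length N and the cublasLt path is taken, the bias epilogue adds it to every row of m1·m2 inside the GEMM, so `self` is neither expanded nor copied into `result`; in every other broadcasting case `self` is first expanded to M×N and copied into `result` when beta is non-zero, as the surrounding hunks show.
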
@ -459,7 +468,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
scalar_type,
|
||||
"addmm_cuda_lt",
|
||||
[&] {
|
||||
lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
|
||||
lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, self, alpha, activation);
|
||||
}
|
||||
);
|
||||
#endif
|
||||
@ -471,7 +480,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
scalar_type,
|
||||
"addmm_cuda_lt",
|
||||
[&] {
|
||||
lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
|
||||
lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, self, alpha, activation);
|
||||
}
|
||||
);
|
||||
} // end is_float_output_with_half_input
|
||||
@ -927,7 +936,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
|
||||
return _int_mm_out_cuda(self, mat2, result);
|
||||
}
|
||||
|
||||
static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
|
||||
static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
|
||||
// ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm
|
||||
TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor");
|
||||
TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor");
|
||||
@ -951,7 +960,7 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
|
||||
(out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
|
||||
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
|
||||
|
||||
if (self_baddbmm.has_value()) {
|
||||
if (!is_bmm && self_baddbmm.has_value()) {
|
||||
const auto& self = self_baddbmm.value();
|
||||
TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor");
|
||||
TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output");
|
||||
@ -959,12 +968,15 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
|
||||
}
|
||||
|
||||
Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
|
||||
Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
|
||||
IntArrayRef batch1_sizes = batch1.sizes();
|
||||
IntArrayRef batch2_sizes = batch2.sizes();
|
||||
|
||||
Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype));
|
||||
return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out);
|
||||
}
|
||||
|
||||
Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) {
|
||||
baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype);
|
||||
baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true);
|
||||
Scalar beta(0.0);
|
||||
Scalar alpha(1.0);
|
||||
{
|
||||
@ -976,16 +988,14 @@ Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at
|
||||
}
|
||||
|
||||
Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
|
||||
TORCH_CHECK(self.scalar_type() == out_dtype || self.scalar_type() == batch1.dtype(),
|
||||
"self dtype must match either out_dtype or batch1 dtype");
|
||||
Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
|
||||
return _baddbmm_out_dtype_cuda(self, batch1, batch2, out_dtype, beta, alpha, out);
|
||||
// We need to copy the tensor
|
||||
Tensor out = self.clone().to(self.options().dtype(out_dtype));
|
||||
|
||||
return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out);
|
||||
}
|
||||
|
||||
Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
|
||||
baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, out);
|
||||
// We need to copy the tensor
|
||||
out.copy_(self);
|
||||
baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self);
|
||||
{
|
||||
NoNamesGuard guard;
|
||||
baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
|
||||
@ -1020,27 +1030,24 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca
|
||||
}
|
||||
|
||||
Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
|
||||
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
|
||||
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
|
||||
Tensor result = at::empty({mat1.size(0), mat2.size(1)}, self.options().dtype(out_dtype));
|
||||
Tensor result = at::empty(self.sizes(), self.options().dtype(out_dtype));
|
||||
return _addmm_dtype_out_cuda(self, mat1, mat2, out_dtype, beta, alpha, result);
|
||||
}
|
||||
|
||||
Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
|
||||
// repeat dimensionality checks for direct calls to `out` overload
|
||||
TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type());
|
||||
TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
|
||||
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
|
||||
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
|
||||
TORCH_CHECK(
|
||||
mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
|
||||
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
|
||||
TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
|
||||
TORCH_CHECK(out_dtype == mat1.scalar_type() ||
|
||||
(out_dtype == at::ScalarType::Float && (mat1.scalar_type() == at::ScalarType::Half || mat1.scalar_type() == at::ScalarType::BFloat16)),
|
||||
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
|
||||
|
||||
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
|
||||
TORCH_CHECK(out_dtype == self.scalar_type() || self.scalar_type() == mat1.scalar_type(),
|
||||
"self dtype must match either out_dtype or mat1 dtype");
|
||||
TORCH_CHECK(out_dtype == self.scalar_type() ||
|
||||
(out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),
|
||||
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
|
||||
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
|
||||
|
||||
addmm_out_cuda_impl(out, self, mat1, mat2, beta, alpha);
|
||||
|
||||
|
||||
@ -78,18 +78,9 @@ __global__ void EmbeddingBag_updateOutputKernel_max(
|
||||
scalar_t weightFeatMax = 0;
|
||||
int64_t bag_size_ = 0;
|
||||
int64_t maxWord = -1;
|
||||
|
||||
// Separate validation loop reduces register pressure in the main loop below.
|
||||
// No early exit (break) on invalid input as benchmarking shows it degrades performance.
|
||||
bool has_invalid_index = false;
|
||||
for (int64_t emb = begin; emb < end; emb++) {
|
||||
index_t input_idx = input[emb];
|
||||
has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows);
|
||||
}
|
||||
CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)");
|
||||
|
||||
for (int64_t emb = begin; emb < end; emb++) {
|
||||
bool pad = (input[emb] == padding_idx);
|
||||
CUDA_KERNEL_ASSERT(input[emb] < numRows);
|
||||
const int64_t weightRow = input[emb] * weight_stride0;
|
||||
scalar_t weightValue = weightFeat[weightRow];
|
||||
if (bag_size_ == 0 || weightValue > weightFeatMax) {
|
||||
@ -138,19 +129,10 @@ __global__ void EmbeddingBag_updateOutputKernel_sum_mean(
|
||||
CUDA_KERNEL_ASSERT(end >= begin);
|
||||
accscalar_t weightFeatSum = 0;
|
||||
int64_t bag_size_ = 0;
|
||||
|
||||
// Separate validation loop reduces register pressure in the main loop below.
|
||||
// No early exit (break) on invalid input as benchmarking shows it degrades performance.
|
||||
bool has_invalid_index = false;
|
||||
for (int64_t emb = begin; emb < end; emb++) {
|
||||
index_t input_idx = input[emb];
|
||||
has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows);
|
||||
}
|
||||
CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)");
|
||||
|
||||
for (int64_t emb = begin; emb < end; emb++) {
|
||||
index_t input_idx = input[emb];
|
||||
bool pad = (input_idx == padding_idx);
|
||||
CUDA_KERNEL_ASSERT(0 <= input_idx && input_idx < numRows);
|
||||
const int64_t weightRow = input_idx * weight_stride0;
|
||||
scalar_t weightValue = weightFeat[weightRow];
|
||||
weightValue = pad ? static_cast<scalar_t>(0) : weightValue;
|
||||
|
||||
@ -78,9 +78,9 @@ _mx8_mx8_bf16_grouped_mm_fbgemm(
|
||||
const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
const Tensor& scale_a,
|
||||
const SwizzleType swizzle_a,
|
||||
const SwizzleType& swizzle_a,
|
||||
const Tensor& scale_b,
|
||||
const SwizzleType swizzle_b,
|
||||
const SwizzleType& swizzle_b,
|
||||
const std::optional<at::Tensor>& offs,
|
||||
Tensor& out) {
|
||||
const bool a_is_2d = mat_a.dim() == 2;
|
||||
@ -607,8 +607,6 @@ _scaled_grouped_mm_cuda_v2(
|
||||
// scale shape checks
|
||||
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
|
||||
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
|
||||
// swizze checks
|
||||
TORCH_CHECK_VALUE(swizzle_a_enum.size() == 1 && swizzle_b_enum.size() == 1, "Expected single swizzle argument");
|
||||
return _mx8_mx8_bf16_grouped_mm_fbgemm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
|
||||
@ -2,250 +2,18 @@
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/native/DispatchStub.h>
|
||||
#include <ATen/native/cuda/Loops.cuh>
|
||||
#include <ATen/native/cuda/JitLoops.cuh>
|
||||
#include <ATen/native/cuda/jit_utils.h>
|
||||
#include <ATen/native/cuda/ScanUtils.cuh>
|
||||
#include <ATen/native/TensorIterator.h>
|
||||
#include <ATen/native/BinaryOps.h>
|
||||
#include <ATen/OpMathType.h>
|
||||
#include <c10/util/MathConstants.h>
|
||||
#include <c10/util/complex.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
|
||||
// NOTE: CUDA on Windows requires that the enclosing function
|
||||
// of a __device__ lambda not have internal linkage.
|
||||
|
||||
namespace at::native {
|
||||
|
||||
// custom min and max to be used in logaddexp for complex arguments
|
||||
template <typename scalar_t, bool min>
|
||||
__host__ __device__ c10::complex<scalar_t> _logaddexp_minmax(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
|
||||
scalar_t xr = std::real(x);
|
||||
scalar_t yr = std::real(y);
|
||||
if (::isnan(yr) || (::isnan(std::imag(y)))) {
|
||||
return y;
|
||||
} else if (::isnan(xr) || (::isnan(std::imag(x)))) {
|
||||
return x;
|
||||
} else if (min) { // min
|
||||
return (xr < yr) ? x : y;
|
||||
} else { // max
|
||||
return (xr >= yr) ? x : y;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) {
|
||||
// Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
|
||||
// Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM
|
||||
const auto isnan_x = at::_isnan(x);
|
||||
const auto isnan_y = at::_isnan(y);
|
||||
scalar_t min = isnan_y ? y : (isnan_x ? x : std::min(x, y));
|
||||
scalar_t max = isnan_y ? y : (isnan_x ? x : std::max(x, y));
|
||||
if (min != max || ::isfinite(min)) {
|
||||
// nan will be propagated here
|
||||
return ::log1p(std::exp(min - max)) + max;
|
||||
} else {
|
||||
// special case to correctly handle infinite cases
|
||||
return x;
|
||||
}
|
||||
}
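Note: the helper above is the standard numerically stable log-add-exp; for reference (not part of the diff):

```
\log\!\big(e^{x} + e^{y}\big) \;=\; \max(x,y) \;+\; \operatorname{log1p}\!\big(e^{\,\min(x,y)-\max(x,y)}\big),
```

so the exponent is never positive and the sum cannot overflow; the `min != max || isfinite(min)` branch keeps the identity from producing NaN when both arguments are the same infinity.
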
template <typename scalar_t>
|
||||
__host__ __device__ c10::complex<scalar_t> _fast_build_exp(const c10::complex<scalar_t>& x) {
|
||||
// complex exponential function, but implemented manually to get fast compilation time
|
||||
// this function only handles the case where the x is finite (not inf nor nan)
|
||||
const auto xreal = std::real(x);
|
||||
const auto ximag = std::imag(x);
|
||||
const auto exp_x_abs = std::exp(xreal);
|
||||
auto exp_x_real = exp_x_abs * std::cos(ximag);
|
||||
auto exp_x_imag = exp_x_abs * std::sin(ximag);
|
||||
return {exp_x_real, exp_x_imag};
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__host__ __device__ c10::complex<scalar_t> _fast_build_exp_inf(const c10::complex<scalar_t>& x) {
|
||||
// complex exponential function, but implemented manually to get fast compilation time
|
||||
// this function only handles the case where the real part of x is infinite
|
||||
const auto ximag = std::imag(x);
|
||||
constexpr auto exp_x_abs = std::numeric_limits<scalar_t>::infinity();
|
||||
if (!::isfinite(ximag)) { // add this to make consitent with std::exp(x+yi)
|
||||
return {exp_x_abs, std::numeric_limits<scalar_t>::quiet_NaN()};
|
||||
}
|
||||
const auto sin = std::sin(ximag);
|
||||
const auto cos = std::cos(ximag);
|
||||
// special case if the angle is exactly the multiple of pi/2
|
||||
auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos;
|
||||
auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin;
|
||||
return {exp_x_real, exp_x_imag};
|
||||
}
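Note: both `_fast_build_exp` helpers above are manual evaluations of Euler's formula (recorded here for reference, not part of the diff):

```
\exp(x + iy) \;=\; e^{x}\cos y \;+\; i\,e^{x}\sin y,
```

where the `_inf` variant substitutes e^x = +inf and forces a component to exactly 0 whenever cos y or sin y is exactly 0, so that inf * 0 does not turn into NaN, and returns a NaN imaginary part when y itself is not finite, mirroring std::exp.
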
template <typename scalar_t>
|
||||
__host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
|
||||
c10::complex<scalar_t> min = _logaddexp_minmax<scalar_t, /*min=*/true>(x, y);
|
||||
c10::complex<scalar_t> max = _logaddexp_minmax<scalar_t, /*min=*/false>(x, y);
|
||||
scalar_t min_real = std::real(min);
|
||||
scalar_t max_real = std::real(max);
|
||||
|
||||
if (::isnan(min_real) || ::isnan(std::imag(min))) {
|
||||
// handling the "infectious" NaNs
|
||||
return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
|
||||
}
|
||||
else if ((!::isfinite(min_real)) && (min_real == max_real)) {
|
||||
if (min_real < 0) {
|
||||
// handle the -inf case, the imaginary part here does not really matter as the exp(value)
|
||||
// will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
|
||||
// It does not matter if we're taking the exp of this value
|
||||
return min;
|
||||
} else {
|
||||
// handle the +inf case, we don't need the special precision for log1p for small values
|
||||
// and to avoid producing nan in case of real(max) == real(min) == +inf
|
||||
const auto exp_min = _fast_build_exp_inf(min);
|
||||
const auto exp_max = _fast_build_exp_inf(max);
|
||||
return ::log1p(exp_min + exp_max - 1); // log1p(x - 1) builds faster than log
|
||||
}
|
||||
} else {
|
||||
const auto minmax = min - max;
|
||||
c10::complex<scalar_t> exp_minmax;
|
||||
if (!::isfinite(minmax.real())) {
|
||||
exp_minmax = minmax.real() < 0 ? c10::complex<scalar_t>{0.0, 0.0} : _fast_build_exp_inf(minmax);
|
||||
} else {
|
||||
exp_minmax = _fast_build_exp(minmax);
|
||||
}
|
||||
return ::log1p(exp_minmax) + max;
|
||||
}
|
||||
}
|
||||
|
||||
// Complex logaddexp jiterator string
|
||||
const auto logaddexp_complex_string = jiterator_stringify(
|
||||
template<typename T>
|
||||
std::complex<T> log1p(const std::complex<T>& z)
|
||||
{
|
||||
using complex_t = std::complex<T>;
|
||||
T x = z.real();
|
||||
T y = z.imag();
|
||||
T zabs = abs(z);
|
||||
T theta = atan2(y, x + T(1));
|
||||
if (zabs < 0.5) {
|
||||
T r = x * (T(2) + x) + y * y;
|
||||
if (r == 0) { // handle underflow
|
||||
return complex_t(x, theta);
|
||||
}
|
||||
return complex_t(T(0.5) * std::log1p(r), theta);
|
||||
} else {
|
||||
T z0 = std::hypot(x + 1, y);
|
||||
return complex_t(log(z0), theta);
|
||||
}
|
||||
}
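Note: the jiterator `log1p` above uses the usual complex identity (for reference, not part of the diff):

```
\log(1+z) \;=\; \tfrac12 \log\!\big((1+x)^2 + y^2\big) \;+\; i\,\operatorname{atan2}(y,\, 1+x),
\qquad (1+x)^2 + y^2 \;=\; 1 + \underbrace{x(2+x) + y^2}_{r},
```

so for |z| < 0.5 the real part is evaluated as 0.5 * log1p(r), which keeps precision when 1+z is close to 1; otherwise it falls back to log(hypot(1+x, y)).
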
// separated _logaddexp_minmax into 2 different functions for jiterator_string
|
||||
template <typename T>
|
||||
std::complex<T> logaddexp_min(const std::complex<T>& x, const std::complex<T>& y) {
|
||||
T xr = x.real();
|
||||
T yr = y.real();
|
||||
if (isnan(yr) || isnan(y.imag())) {
|
||||
return y;
|
||||
} else if (isnan(xr) || isnan(x.imag())) {
|
||||
return x;
|
||||
} else {
|
||||
return (xr < yr) ? x : y;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::complex<T> logaddexp_max(const std::complex<T>& x, const std::complex<T>& y) {
|
||||
T xr = x.real();
|
||||
T yr = y.real();
|
||||
if (isnan(yr) || isnan(y.imag())) {
|
||||
return y;
|
||||
} else if (isnan(xr) || isnan(x.imag())) {
|
||||
return x;
|
||||
} else {
|
||||
return (xr >= yr) ? x : y;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::complex<T> fast_build_exp(const std::complex<T>& x) {
|
||||
const auto xreal = x.real();
|
||||
const auto ximag = x.imag();
|
||||
const auto exp_x_abs = exp(xreal);
|
||||
auto exp_x_real = exp_x_abs * cos(ximag);
|
||||
auto exp_x_imag = exp_x_abs * sin(ximag);
|
||||
return std::complex<T>(exp_x_real, exp_x_imag);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::complex<T> fast_build_exp_inf(const std::complex<T>& x) {
|
||||
using complex_t = std::complex<T>;
|
||||
const auto ximag = x.imag();
|
||||
const T exp_x_abs = INFINITY;
|
||||
if (!isfinite(ximag)) {
|
||||
return complex_t(exp_x_abs, NAN);
|
||||
}
|
||||
const auto sin_val = sin(ximag);
|
||||
const auto cos_val = cos(ximag);
|
||||
auto exp_x_real = (cos_val == T(0)) ? T(0) : exp_x_abs * cos_val;
|
||||
auto exp_x_imag = (sin_val == T(0)) ? T(0) : exp_x_abs * sin_val;
|
||||
return complex_t(exp_x_real, exp_x_imag);
|
||||
}
|
||||
|
||||
template <typename complex_t>
|
||||
complex_t logaddexp_complex(complex_t x, complex_t y) {
|
||||
using T = typename complex_t::value_type;
|
||||
complex_t min_val = logaddexp_min(x, y);
|
||||
complex_t max_val = logaddexp_max(x, y);
|
||||
T min_real = min_val.real();
|
||||
T max_real = max_val.real();
|
||||
|
||||
if (isnan(min_real) || isnan(min_val.imag())) {
|
||||
return complex_t(NAN, NAN);
|
||||
}
|
||||
else if ((!isfinite(min_real)) && (min_real == max_real)) {
|
||||
if (min_real < T(0)) {
|
||||
return min_val;
|
||||
} else {
|
||||
const auto exp_min = fast_build_exp_inf<T>(min_val);
|
||||
const auto exp_max = fast_build_exp_inf<T>(max_val);
|
||||
return log1p(exp_min + exp_max - complex_t(1, 0));
|
||||
}
|
||||
} else {
|
||||
const auto minmax = min_val - max_val;
|
||||
complex_t exp_minmax;
|
||||
if (!isfinite(minmax.real())) {
|
||||
exp_minmax = (minmax.real() < T(0)) ? complex_t(0, 0) : fast_build_exp_inf<T>(minmax);
|
||||
} else {
|
||||
exp_minmax = fast_build_exp<T>(minmax);
|
||||
}
|
||||
return log1p(exp_minmax) + max_val;
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
constexpr char logaddexp_complex_name[] = "logaddexp_complex";
|
||||
void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
|
||||
if (at::isComplexType(iter.dtype())) {
|
||||
#if AT_USE_JITERATOR()
|
||||
AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
|
||||
jitted_gpu_kernel<
|
||||
/*name=*/logaddexp_complex_name,
|
||||
/*return_dtype=*/scalar_t,
|
||||
/*common_dtype=*/scalar_t,
|
||||
/*arity=*/2>(iter, logaddexp_complex_string);
|
||||
});
|
||||
#else
|
||||
AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
|
||||
using opmath_t = at::opmath_type<scalar_t>;
|
||||
gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t {
|
||||
const auto a = static_cast<opmath_t>(a_);
|
||||
const auto b = static_cast<opmath_t>(b_);
|
||||
return static_cast<scalar_t>(_log_add_exp_helper(a, b));
|
||||
});
|
||||
});
|
||||
#endif
|
||||
} else {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(
|
||||
ScalarType::BFloat16, ScalarType::Half,
|
||||
iter.dtype(), "logaddexp_cuda",
|
||||
[&]() {
|
||||
@ -261,7 +29,6 @@ void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void logaddexp2_kernel_cuda(TensorIteratorBase& iter) {
|
||||
|
||||
@ -11,7 +11,7 @@ static inline std::ostream& operator<<(std::ostream& out, dim3 dim) {
|
||||
if (dim.y == 1 && dim.z == 1) {
|
||||
out << dim.x;
|
||||
} else {
|
||||
out << '[' << dim.x << ',' << dim.y << ',' << dim.z << ']';
|
||||
out << "[" << dim.x << "," << dim.y << "," << dim.z << "]";
|
||||
}
|
||||
return out;
|
||||
}
|
||||
@ -27,7 +27,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
|
||||
out << "input_mult=[";
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (i != 0) {
|
||||
out << ',';
|
||||
out << ",";
|
||||
}
|
||||
out << config.input_mult[i];
|
||||
}
|
||||
@ -35,7 +35,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
|
||||
out << "output_mult=[";
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (i != 0) {
|
||||
out << ',';
|
||||
out << ",";
|
||||
}
|
||||
out << config.output_mult[i];
|
||||
}
|
||||
@ -49,7 +49,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
|
||||
out << "block=" << config.block() << ", ";
|
||||
out << "grid=" << config.grid() << ", ";
|
||||
out << "global_memory_size=" << config.global_memory_size();
|
||||
out << ')';
|
||||
out << ")";
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -740,12 +740,7 @@ _scaled_rowwise_rowwise(
|
||||
TORCH_CHECK_VALUE(scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, "scale_a must have ", mat_a.size(0), " Float elements, got ", scale_a.numel())
|
||||
TORCH_CHECK_VALUE(scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, "scale_b must have ", mat_b.size(1), " Float elements, got ", scale_b.numel())
|
||||
|
||||
// if we have a scale of shape [256, 1] (say), then stride can be [1, 0] - handle this case
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_a.stride(1) == 1 ||
|
||||
scale_a.size(1) == 1,
|
||||
"expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1)
|
||||
);
|
||||
TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1));
|
||||
TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));
|
||||
|
||||
auto scaling_choice_a = ScalingType::RowWise;
|
||||
@ -1101,19 +1096,6 @@ _scaled_mxfp8_mxfp8(
|
||||
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
|
||||
}
|
||||
|
||||
void
|
||||
_check_mxfp4_support() {
|
||||
#ifndef USE_ROCM
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
// Only on B200 GPUs
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
// B200 = 10.0, B300 = 10.3
|
||||
dprops->major == 10,
|
||||
"MXFP4 scaling only supported in CUDA for B200/B300"
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
Tensor&
|
||||
_scaled_mxfp4_mxfp4(
|
||||
@ -1126,7 +1108,6 @@ _scaled_mxfp4_mxfp4(
|
||||
#if defined(_WIN32) || (!defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI))
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only");
|
||||
#else
|
||||
_check_mxfp4_support();
|
||||
// Restrictions:
|
||||
// A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
|
||||
TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",
|
||||
|
||||
@ -364,9 +364,9 @@ void f8f8bf16_grouped_gemm_impl_sm90(
|
||||
// reinterpret_cast<ProblemShape::UnderlyingProblemShape*>(
|
||||
// stride_output_h + group_count);
|
||||
|
||||
// std::cout << "PTRS " << mat_a.data_ptr() << ' ' << mat_b.data_ptr() << "
|
||||
// std::cout << "PTRS " << mat_a.data_ptr() << " " << mat_b.data_ptr() << "
|
||||
// "
|
||||
// << out.data_ptr() << ' ' << scale_a.data_ptr() << ' '
|
||||
// << out.data_ptr() << " " << scale_a.data_ptr() << " "
|
||||
// << scale_b.data_ptr() << "\n";
|
||||
// for (int i = 0; i < group_count; i++) {
|
||||
// std::cout << "A " << (void*)inputA_ptrs_h[i] << "\n";
|
||||
|
||||
@ -1057,14 +1057,14 @@ std::string generate_code(
|
||||
// TODO these arrays are potentially of the different types, use function
|
||||
// traits to determine the types
|
||||
declare_load_arrays << f_inputs_type << " arg" << std::to_string(i)
|
||||
<< '[' << std::to_string(thread_work_size) << "];\n";
|
||||
<< "[" << std::to_string(thread_work_size) << "];\n";
|
||||
}
|
||||
env.s("declare_load_arrays", declare_load_arrays.str());
|
||||
|
||||
std::stringstream declare_store_arrays;
|
||||
for (int i = 0; i < nOutputs; i++) {
|
||||
declare_store_arrays << result_type << " out" << std::to_string(i)
|
||||
<< '[' << std::to_string(thread_work_size) << "];\n";
|
||||
<< "[" << std::to_string(thread_work_size) << "];\n";
|
||||
}
|
||||
env.s("declare_store_arrays", declare_store_arrays.str());
|
||||
|
||||
@ -1217,7 +1217,7 @@ std::string generate_code(
|
||||
for (const auto i : c10::irange(nInputs)){
|
||||
auto i_string = std::to_string(i);
|
||||
vector_inputs << "auto * input" << i_string <<
|
||||
" = reinterpret_cast<const scalar_t*>(data[" << i_string << '+' << nOutputs << "])" <<
|
||||
" = reinterpret_cast<const scalar_t*>(data[" << i_string << "+" << nOutputs << "])" <<
|
||||
" + block_work_size * idx;\n";
|
||||
}
|
||||
env.s("vector_inputs", vector_inputs.str());
|
||||
@ -1543,17 +1543,17 @@ NvrtcFunction jit_pwise_function(
|
||||
|
||||
// Constructs file path by appending constructed cubin name to cache path
|
||||
std::stringstream ss;
|
||||
ss << *cache_dir << '/';
|
||||
ss << *cache_dir << "/";
|
||||
ss << kernel_name;
|
||||
#ifdef USE_ROCM
|
||||
ss << "_arch" << prop->gcnArchName;
|
||||
#else
|
||||
ss << "_arch" << cuda_major << '.' << cuda_minor;
|
||||
ss << "_arch" << cuda_major << "." << cuda_minor;
|
||||
#endif
|
||||
ss << "_nvrtc" << nvrtc_major << '.' << nvrtc_minor;
|
||||
ss << "_nvrtc" << nvrtc_major << "." << nvrtc_minor;
|
||||
ss << (compile_to_sass ? "_sass" : "_ptx");
|
||||
ss << '_' << code.length();
|
||||
ss << '_' << hash_code;
|
||||
ss << "_" << code.length();
|
||||
ss << "_" << hash_code;
|
||||
file_path = ss.str();
|
||||
|
||||
std::ifstream readin{file_path, std::ios::in | std::ifstream::binary};
|
||||
|
||||
@ -82,15 +82,15 @@ namespace native {
|
||||
|
||||
std::ostream& operator<<(std::ostream& out, const ConvolutionParams& params) {
|
||||
out << "ConvolutionParams \n"
|
||||
<< " memory_format = " << params.memory_format << '\n'
|
||||
<< " data_type = " << cudnnTypeToString(params.dataType) << '\n'
|
||||
<< " padding = " << ArrayRef<int>{params.padding} << '\n'
|
||||
<< " stride = " << ArrayRef<int>{params.stride} << '\n'
|
||||
<< " dilation = " << ArrayRef<int>{params.dilation} << '\n'
|
||||
<< " groups = " << params.groups << '\n'
|
||||
<< " memory_format = " << params.memory_format << "\n"
|
||||
<< " data_type = " << cudnnTypeToString(params.dataType) << "\n"
|
||||
<< " padding = " << ArrayRef<int>{params.padding} << "\n"
|
||||
<< " stride = " << ArrayRef<int>{params.stride} << "\n"
|
||||
<< " dilation = " << ArrayRef<int>{params.dilation} << "\n"
|
||||
<< " groups = " << params.groups << "\n"
|
||||
<< " deterministic = " << (params.deterministic ? "true" : "false")
|
||||
<< '\n'
|
||||
<< " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << '\n';
|
||||
<< "\n"
|
||||
<< " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << "\n";
|
||||
|
||||
return out;
|
||||
}
|
||||
@ -173,16 +173,16 @@ std::string repro_from_args(const ConvolutionParams& params) {
|
||||
at::globalContext().float32Precision(
|
||||
at::Float32Backend::CUDA, at::Float32Op::MATMUL) ==
|
||||
at::Float32Precision::TF32)
|
||||
<< '\n';
|
||||
<< "\n";
|
||||
ss << "torch.backends.cudnn.benchmark = "
|
||||
<< pybool(at::globalContext().benchmarkCuDNN()) << '\n';
|
||||
<< pybool(at::globalContext().benchmarkCuDNN()) << "\n";
|
||||
ss << "torch.backends.cudnn.deterministic = " << pybool(params.deterministic)
|
||||
<< '\n';
|
||||
<< "\n";
|
||||
ss << "torch.backends.cudnn.allow_tf32 = " << pybool(params.allow_tf32)
|
||||
<< '\n';
|
||||
<< "\n";
|
||||
ss << "data = torch.randn(" << ArrayRef<int>(params.input_size, dim)
|
||||
<< ", dtype=" << full_dtype << ", ";
|
||||
ss << "device='cuda', requires_grad=True)" << to_channels_last << '\n';
|
||||
ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n";
|
||||
ss << "net = torch.nn.Conv" << dim - 2 << "d(" << in_channels << ", "
|
||||
<< out_channels << ", ";
|
||||
ss << "kernel_size=" << ArrayRef<int>(¶ms.weight_size[2], dim - 2)
|
||||
@ -192,7 +192,7 @@ std::string repro_from_args(const ConvolutionParams& params) {
|
||||
ss << "dilation=" << ArrayRef<int>(params.dilation, dim - 2) << ", ";
|
||||
ss << "groups=" << params.groups << ")\n";
|
||||
ss << "net = net.cuda()." << partial_dtype << "()" << to_channels_last
|
||||
<< '\n';
|
||||
<< "\n";
|
||||
ss << "out = net(data)\n";
|
||||
ss << "out.backward(torch.randn_like(out))\n";
|
||||
ss << "torch.cuda.synchronize()\n\n";
|
||||
|
||||
@ -93,10 +93,11 @@ std::ostream& operator<<(std::ostream& out, const ConvolutionArgs& args) {
|
||||
<< "input: " << args.idesc // already has a trailing newline
|
||||
<< "output: " << args.odesc // already has a trailing newline
|
||||
<< "weight: " << args.wdesc // already has a trailing newline
|
||||
<< "Pointer addresses: " << '\n'
|
||||
<< " input: " << args.input.const_data_ptr() << '\n'
|
||||
<< " output: " << args.output.const_data_ptr() << '\n'
|
||||
<< " weight: " << args.weight.const_data_ptr() << '\n';
|
||||
<< "Pointer addresses: "
|
||||
<< "\n"
|
||||
<< " input: " << args.input.const_data_ptr() << "\n"
|
||||
<< " output: " << args.output.const_data_ptr() << "\n"
|
||||
<< " weight: " << args.weight.const_data_ptr() << "\n";
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
@ -115,7 +115,7 @@ std::ostream& operator<<(
|
||||
std::copy(
|
||||
strides.begin(), strides.end() - 1, std::ostream_iterator<int>(oss, ","));
|
||||
oss << sizes.back();
|
||||
output << oss.str() << '}';
|
||||
output << oss.str() << "}";
|
||||
return output;
|
||||
}
|
||||
|
||||
|
||||
@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& out, const ConvParams& params) {
|
||||
<< " transposed = " << params.transposed
|
||||
<< " output_padding = " << IntArrayRef{params.output_padding}
|
||||
<< " groups = " << params.groups << " benchmark = " << params.benchmark
|
||||
<< " deterministic = " << params.deterministic << '}';
|
||||
<< " deterministic = " << params.deterministic << "}";
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
@ -5,7 +5,6 @@
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
|
||||
#include <ATen/native/xpu/Blas.h>
|
||||
#include <ATen/xpu/XPUScaledBlas.h>
|
||||
#include <torch/library.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
@ -340,399 +339,4 @@ Tensor _scaled_mm_xpu(
|
||||
out);
|
||||
}
|
||||
|
||||
using acceptance_fn = std::function<bool(
|
||||
c10::ScalarType,
|
||||
std::vector<ScalingType>&,
|
||||
ArrayRef<Tensor>&,
|
||||
c10::ScalarType,
|
||||
std::vector<ScalingType>&,
|
||||
ArrayRef<Tensor>&)>;
|
||||
using namespace std::placeholders;
|
||||
|
||||
namespace scaled_blas = at::native::onednn::scaled;
|
||||
using scaled_blas::convert_int_to_enum;
|
||||
using scaled_blas::ScaledGemmImplementation;
|
||||
|
||||
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2>
|
||||
scale_kernel_dispatch = {{
|
||||
{"tensorwise_tensorwise",
|
||||
scaled_blas::check_tensorwise_recipe,
|
||||
ScaledGemmImplementation::TENSORWISE_TENSORWISE},
|
||||
{"rowwise_rowwise",
|
||||
scaled_blas::check_rowwise_recipe,
|
||||
ScaledGemmImplementation::ROWWISE_ROWWISE},
|
||||
|
||||
}};
|
||||
|
||||
Tensor& _scaled_tensorwise_tensorwise(
|
||||
const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
const Tensor& scale_a,
|
||||
const Tensor& scale_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const c10::ScalarType out_dtype,
|
||||
bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
// Restrictions:
|
||||
// A, B are FP8, scales are fp32
|
||||
|
||||
TORCH_CHECK_VALUE(
|
||||
isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
|
||||
"mat_a and mat_b must be fp8 types, got: ",
|
||||
mat_a.scalar_type(),
|
||||
mat_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_a.numel() == 1 && scale_a.scalar_type() == kFloat,
|
||||
"scale_a must have 1 Float element")
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_b.numel() == 1 && scale_b.scalar_type() == kFloat,
|
||||
"scale_b must have 1 Float element")
|
||||
|
||||
auto scaling_choice_a = ScalingType::TensorWise;
|
||||
auto scaling_choice_b = ScalingType::TensorWise;
|
||||
|
||||
_scaled_gemm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a,
|
||||
scale_b,
|
||||
scaling_choice_a,
|
||||
scaling_choice_b,
|
||||
bias,
|
||||
use_fast_accum,
|
||||
out);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
Tensor& _scaled_rowwise_rowwise(
|
||||
const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
const Tensor& scale_a,
|
||||
const Tensor& scale_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const c10::ScalarType out_dtype,
|
||||
bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
// Restrictions:
|
||||
// A, B are FP8, scales are fp32, shape M/N for A/B
|
||||
TORCH_CHECK_VALUE(
|
||||
isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
|
||||
"mat_a and mat_b must be fp8 types, got: ",
|
||||
mat_a.scalar_type(),
|
||||
mat_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1,
|
||||
"scale_a must have shape [",
|
||||
mat_a.size(0),
|
||||
", 1], got [",
|
||||
scale_a.sizes(),
|
||||
"]");
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat,
|
||||
"scale_a must have ",
|
||||
mat_a.size(0),
|
||||
" Float elements, got ",
|
||||
scale_a.numel())
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat,
|
||||
"scale_b must have ",
|
||||
mat_b.size(1),
|
||||
" Float elements, got ",
|
||||
scale_b.numel())
|
||||
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_a.stride(1) == 1,
|
||||
"expected scale_a.stride(1) to be 1, but got ",
|
||||
scale_a.stride(1));
|
||||
TORCH_CHECK_VALUE(
|
||||
scale_b.stride(1) == 1,
|
||||
"expected scale_b.stride(1) to be 1, but got ",
|
||||
scale_b.stride(1));
|
||||
|
||||
auto scaling_choice_a = ScalingType::RowWise;
|
||||
auto scaling_choice_b = ScalingType::RowWise;
|
||||
|
||||
_scaled_gemm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a,
|
||||
scale_b,
|
||||
scaling_choice_a,
|
||||
scaling_choice_b,
|
||||
bias,
|
||||
use_fast_accum,
|
||||
out);
|
||||
|
||||
return out;
|
||||
}
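Note: a shorthand for the two recipes dispatched above (my reading of the checks, with s denoting the inverse/dequantization scales described in the comment below; not text from the diff):

```
\text{tensor-wise: } \mathrm{out} \;\approx\; s_a\, s_b\,(A B),\qquad s_a, s_b \in \mathbb{R},
\text{row-wise: } \mathrm{out}_{ij} \;\approx\; s_a[i]\; s_b[j]\,(A B)_{ij},\qquad s_a \in \mathbb{R}^{M\times 1},\; s_b \in \mathbb{R}^{1\times N},
```

with A an fp8 M×K matrix and B an fp8 K×N matrix, which is why the checks insist on a single float scale per tensor, or one per row of A and one per column of B with unit stride in the last dimension.
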
// V2: Computes matrix multiply + bias while applying scaling to input and
|
||||
// output matrices Scales are only applicable when matrices are of Float8 type
|
||||
// and assumed to be equal to 1.0 by default. If output matrix type is 16 or
|
||||
// 32-bit type, scale_result is not applied. Known limitations:
|
||||
// - Only works if mat1 is row-major and mat2 is column-major
|
||||
// - Only works if matrices sizes are divisible by 32
|
||||
// - If 1-dimensional tensors are used then scale_a should be size =
|
||||
// mat1.size(0)
|
||||
// and scale_b should have size = to mat2.size(1)
|
||||
// Arguments:
|
||||
// - `mat_a`: the first operand of the matrix multiply, can be type
|
||||
// `torch.float8_e4m3fn` or `torch.float8_e5m2`
|
||||
// - `mat_b`: the second operand of the matrix multiply, can be type
|
||||
// `torch.float8_e4m3fn` or `torch.float8_e5m2`
|
||||
// - `scale_a`: a tensor with the inverse scale of `mat1`, whose
|
||||
// shape/strides/dtype depend on the scaling scheme
|
||||
// - `scale_recipe_a`: An integer corresponding to an enum describing the
|
||||
// scaling scheme used for `scale_a`
|
||||
// - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing
|
||||
// the swizzling scheme for `scale_a`.
|
||||
// Not supported for XPU for now.
|
||||
// - `scale_b`: a tensor with the inverse scale of `mat2`, whose
|
||||
// shape/strides/dtype depend on the scaling scheme
|
||||
// - `scale_recipe_b`: An integer corresponding to an enum describing the
|
||||
// scaling scheme used for `scale_b`
|
||||
// - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing
|
||||
// the swizzling scheme for `scale_b`.
|
||||
// Not supported for XPU for now.
|
||||
// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
|
||||
// - `out_dtype`: the output dtype, can either be a float8 or a higher
|
||||
// precision floating point type
|
||||
// - `contraction_dim`: describe which dimensions are `K` in the matmul.
|
||||
// Not supported for XPU. Should always be empty.
|
||||
// - `use_fast_accum`: Not supported for XPU, should always be false.
|
||||
// - `out`: a reference to the output tensor
|
||||
Tensor& _scaled_mm_xpu_v2_out(
|
||||
const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
ArrayRef<Tensor> scale_a,
|
||||
IntArrayRef scale_recipe_a,
|
||||
IntArrayRef swizzle_a,
|
||||
ArrayRef<Tensor> scale_b,
|
||||
IntArrayRef scale_recipe_b,
|
||||
IntArrayRef swizzle_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const std::optional<c10::ScalarType> out_dtype,
|
||||
IntArrayRef contraction_dim,
|
||||
bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix");
|
||||
TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix");
|
||||
|
||||
// If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm
|
||||
// kernels do not support this case).
|
||||
if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) {
|
||||
// `out` was created with `at::empty`. In the case where we are multiplying
|
||||
// MxK by KxN and K is the zero dim, we need to initialize here to properly
|
||||
// return a tensor of zeros.
|
||||
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
|
||||
if (mat_a.size(1) == 0) {
|
||||
out.zero_();
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
// Note: The `contraction_dim` is not actually used for now. We will need to
|
||||
// align this code when upstreamed CUDA code is done. Currently, only keeps
|
||||
// the code here for check.
|
||||
|
||||
// Check if the input matrix sizes can be multiplied
|
||||
// - if optional contraction dims are provided, use those
|
||||
// -- mostly for < 1B formats (i.e. nvfp4x2) where cheap .t() is not
|
||||
// available.
|
||||
if (contraction_dim.size() > 0) {
|
||||
TORCH_CHECK_VALUE(
|
||||
contraction_dim.size() == 2,
|
||||
"contraction_dim must have exactly 2 elements");
|
||||
auto mat_a_dim = contraction_dim[0];
|
||||
auto mat_b_dim = contraction_dim[1];
|
||||
TORCH_CHECK_VALUE(
|
||||
mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim),
|
||||
"mat_a and mat_b shapes cannot be multiplied (",
|
||||
mat_a.size(0),
|
||||
"x",
|
||||
mat_a.size(1),
|
||||
" and ",
|
||||
mat_b.size(0),
|
||||
"x",
|
||||
mat_b.size(1),
|
||||
") ",
|
||||
"with contraction dims mat_a: ",
|
||||
mat_a_dim,
|
||||
", mat_b: ",
|
||||
mat_b_dim);
|
||||
} else {
|
||||
TORCH_CHECK_VALUE(
|
||||
mat_a.size(1) == mat_b.size(0),
|
||||
"mat_a and mat_b shapes cannot be multiplied (",
|
||||
mat_a.size(0),
|
||||
"x",
|
||||
mat_a.size(1),
|
||||
" and ",
|
||||
mat_b.size(0),
|
||||
"x",
|
||||
mat_b.size(1),
|
||||
")");
|
||||
}
|
||||
|
||||
TORCH_CHECK_VALUE(
|
||||
!bias || bias->numel() == mat_b.sizes()[1],
|
||||
"Bias must be size ",
|
||||
mat_b.sizes()[1],
|
||||
" but got ",
|
||||
bias->numel());
|
||||
|
||||
TORCH_CHECK_VALUE(
|
||||
!out_dtype || *out_dtype == out.scalar_type(),
|
||||
"out_dtype must match output matrix type");
|
||||
|
||||
if (bias) {
|
||||
TORCH_CHECK_VALUE(
|
||||
bias->scalar_type() == kFloat ||
|
||||
bias->scalar_type() == c10::ScalarType::BFloat16 ||
|
||||
bias->scalar_type() == c10::ScalarType::Half,
|
||||
"Bias must be Float32 or BFloat16 or Half, but got ",
|
||||
bias->scalar_type());
|
||||
}
|
||||
{
|
||||
auto bias_ = bias.value_or(Tensor());
|
||||
// NOLINTNEXTLINE(*c-array*)
|
||||
TensorArg targs[]{
|
||||
{out, "out", 0},
|
||||
{mat_a, "mat_a", 1},
|
||||
{mat_b, "mat_b", 2},
|
||||
{bias_, "bias", 3},
|
||||
{scale_a[0], "scale_a", 4},
|
||||
{scale_b[0], "scale_b", 5}};
|
||||
checkAllSameGPU(__func__, targs);
|
||||
}
|
||||
// Align with CUDA's default out to be bf16
|
||||
auto out_dtype_ = out_dtype.value_or(c10::ScalarType::BFloat16);
|
||||
|
||||
// Conversion of implicitly-defined enums to explicit
|
||||
auto scale_recipe_a_enum = convert_int_to_enum<ScalingType>(scale_recipe_a);
|
||||
auto swizzle_a_enum = convert_int_to_enum<SwizzleType>(swizzle_a);
|
||||
auto scale_recipe_b_enum = convert_int_to_enum<ScalingType>(scale_recipe_b);
|
||||
auto swizzle_b_enum = convert_int_to_enum<SwizzleType>(swizzle_b);
|
||||
|
||||
// XPU does not support swizzle for now. So directly return false.
|
||||
TORCH_CHECK_VALUE(
|
||||
swizzle_a_enum[0] == at::blas::SwizzleType::NO_SWIZZLE &&
|
||||
swizzle_b_enum[0] == at::blas::SwizzleType::NO_SWIZZLE,
|
||||
"XPU does not support swizzle yet.");
|
||||
|
||||
// at this point we can start working out what we want to be doing
|
||||
// Try to do as few steps as possible.
|
||||
// NOTE: support is deliberately sparse, can explicitly enumerate all
|
||||
// combinations allowed. Do this via a list of defined (name, acceptance,
|
||||
// concrete_impl) tuples.
|
||||
bool found_impl = false;
|
||||
ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE;
|
||||
|
||||
for (const auto& fn_entry : scale_kernel_dispatch) {
|
||||
const auto [name, accept_fn, scaled_gemm_impl] = fn_entry;
|
||||
bool ok = accept_fn(
|
||||
mat_a.scalar_type(),
|
||||
scale_recipe_a_enum,
|
||||
scale_a,
|
||||
mat_b.scalar_type(),
|
||||
scale_recipe_b_enum,
|
||||
scale_b);
|
||||
if (ok) {
|
||||
gemm_impl = scaled_gemm_impl;
|
||||
found_impl = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
TORCH_CHECK_VALUE(
|
||||
found_impl,
|
||||
"Invalid scaling configuration.\n"
|
||||
"- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n"
|
||||
"- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (",
|
||||
mat_a.size(0),
|
||||
", 1) and scale_b should be (1, ",
|
||||
mat_b.size(1),
|
||||
"), and both should be contiguous.\n"
|
||||
"Got mat_a.dtype()=",
|
||||
mat_a.scalar_type(),
|
||||
", scale_a[0].dtype()=",
|
||||
scale_a[0].scalar_type(),
|
||||
", scale_a[0].size()=",
|
||||
scale_a[0].sizes(),
|
||||
", scale_a[0].stride()=",
|
||||
scale_a[0].strides(),
|
||||
", ",
|
||||
"mat_b.dtype()=",
|
||||
mat_b.scalar_type(),
|
||||
", scale_b[0].dtype()=",
|
||||
scale_b[0].scalar_type(),
|
||||
", scale_b[0].size()=",
|
||||
scale_b[0].sizes(),
|
||||
" and scale_b[0].stride()=",
|
||||
scale_b[0].strides());
|
||||
|
||||
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
|
||||
|
||||
auto bias_ = bias.value_or(Tensor());
|
||||
|
||||
// dispatch to appropriate lower-level calls for error checking & execution
|
||||
if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) {
|
||||
return _scaled_tensorwise_tensorwise(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a[0],
|
||||
scale_b[0],
|
||||
bias,
|
||||
out_dtype_,
|
||||
use_fast_accum,
|
||||
out);
|
||||
} else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) {
|
||||
return _scaled_rowwise_rowwise(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a[0],
|
||||
scale_b[0],
|
||||
bias,
|
||||
out_dtype_,
|
||||
use_fast_accum,
|
||||
out);
|
||||
} else {
|
||||
TORCH_CHECK_VALUE(
|
||||
false, "Invalid state - found an implementation, but not really");
|
||||
}
|
||||
}
|
||||
|
||||
Tensor _scaled_mm_xpu_v2(
|
||||
const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
ArrayRef<Tensor> scale_a,
|
||||
IntArrayRef scale_recipe_a,
|
||||
IntArrayRef swizzle_a,
|
||||
ArrayRef<Tensor> scale_b,
|
||||
IntArrayRef scale_recipe_b,
|
||||
IntArrayRef swizzle_b,
|
||||
const std::optional<Tensor>& bias,
|
||||
const std::optional<c10::ScalarType> out_dtype,
|
||||
IntArrayRef contraction_dim,
|
||||
bool use_fast_accum) {
|
||||
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
|
||||
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
|
||||
|
||||
return _scaled_mm_xpu_v2_out(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a,
|
||||
scale_recipe_a,
|
||||
swizzle_a,
|
||||
scale_b,
|
||||
scale_recipe_b,
|
||||
swizzle_b,
|
||||
bias,
|
||||
out_dtype,
|
||||
contraction_dim,
|
||||
use_fast_accum,
|
||||
out);
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
@ -82,7 +82,6 @@ NSArray<NSNumber*>* getTensorAxes(const TensorBase& t);
|
||||
NSArray<NSNumber*>* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim);
|
||||
std::string getMPSShapeString(MPSShape* shape);
|
||||
std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true, bool exclude_shape = false);
|
||||
std::string to_hex_key(float);
|
||||
std::string getArrayRefString(const IntArrayRef s);
|
||||
// use has_storage() on the returned tensor to determine if src actually is a view
|
||||
Tensor gatherViewTensor(const Tensor& src, Tensor& dst);
|
||||
|
||||
@ -301,10 +301,6 @@ std::string getArrayRefString(const IntArrayRef s) {
|
||||
return fmt::to_string(fmt::join(s, ","));
|
||||
}
|
||||
|
||||
std::string to_hex_key(float f) {
|
||||
return fmt::format("{:a}", f);
|
||||
}
|
||||
|
||||
std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype, bool exclude_shape) {
|
||||
fmt::basic_memory_buffer<char, 100> buffer;
|
||||
auto buf_iterator = std::back_inserter(buffer);
|
||||
|
||||
@ -91,30 +91,25 @@ static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
|
||||
#include <ATen/native/mps/Repeat_metallib.h>
|
||||
#endif
|
||||
|
||||
Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
|
||||
TORCH_CHECK(repeat.dim() == 1, "repeat_interleave only accept 1D vector as repeat");
|
||||
template <typename index_t>
|
||||
void computeRepeatIndices(const index_t* repeat_ptr,
|
||||
const int64_t* cumsum_ptr,
|
||||
index_t* result_ptr,
|
||||
int64_t size,
|
||||
int64_t result_size) {
|
||||
id<MTLBuffer> repeatBuffer = reinterpret_cast<id<MTLBuffer>>(repeat_ptr);
|
||||
id<MTLBuffer> cumsumBuffer = reinterpret_cast<id<MTLBuffer>>(cumsum_ptr);
|
||||
id<MTLBuffer> resultBuffer = reinterpret_cast<id<MTLBuffer>>(result_ptr);
|
||||
TORCH_CHECK(repeatBuffer && cumsumBuffer && resultBuffer);
|
||||
|
||||
std::string scalar_type;
|
||||
if (repeat.scalar_type() == kInt) {
|
||||
if constexpr (std::is_same_v<index_t, int32_t>) {
|
||||
scalar_type = "int32_t";
|
||||
} else if (repeat.scalar_type() == kLong) {
|
||||
} else if constexpr (std::is_same_v<index_t, int64_t>) {
|
||||
scalar_type = "int64_t";
|
||||
} else {
|
||||
TORCH_CHECK(false, "repeats has to be Long or Int tensor");
|
||||
TORCH_CHECK(false, "repeat_interleave: unsupported indexing data type");
|
||||
}
|
||||
if (repeat.size(0) == 0) {
|
||||
return at::empty_like(repeat, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
|
||||
}
|
||||
Tensor repeat_ = repeat.contiguous();
|
||||
Tensor cumsum = repeat.cumsum(0);
|
||||
int64_t total = 0;
|
||||
if (output_size.has_value()) {
|
||||
total = output_size.value();
|
||||
} else {
|
||||
total = cumsum[-1].item<int64_t>();
|
||||
TORCH_CHECK((repeat >= 0).all().item<uint8_t>(), "repeats can not be negative");
|
||||
}
|
||||
|
||||
auto result = at::empty({total}, repeat.options());
|
||||
|
||||
MPSStream* mpsStream = getCurrentMPSStream();
|
||||
dispatch_sync(mpsStream->queue(), ^() {
|
||||
@ -126,13 +121,20 @@ Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output
|
||||
getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false);
|
||||
|
||||
[computeEncoder setComputePipelineState:pipelineState];
|
||||
mps::mtl_setArgs(computeEncoder, repeat_, cumsum, result, repeat.size(0));
|
||||
mps::mtl_dispatch1DJob(computeEncoder, pipelineState, repeat.size(0));
|
||||
mps::mtl_setArgs(computeEncoder, repeatBuffer, cumsumBuffer, resultBuffer, size);
|
||||
mps::mtl_dispatch1DJob(computeEncoder, pipelineState, size);
|
||||
|
||||
getMPSProfiler().endProfileKernel(pipelineState);
|
||||
}
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
|
||||
Tensor output;
|
||||
AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() {
|
||||
output = repeat_interleave_common<index_t, computeRepeatIndices<index_t>>(repeat, output_size);
|
||||
});
|
||||
return output;
|
||||
}
|
||||
|
||||
} // namespace at::native
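Note: to make the repeat/cumsum indexing above concrete, here is a minimal standalone sketch of the reference semantics (plain C++ for illustration only; `repeat_interleave_ref` is a hypothetical helper, not an ATen function):

```
#include <cstdint>
#include <iostream>
#include <vector>

// Reference semantics of repeat_interleave on a 1-D repeats vector:
// source index i is emitted repeats[i] times, so for repeats = {2, 0, 3}
// the cumulative sum is {2, 2, 5} and the result is {0, 0, 2, 2, 2}.
std::vector<int64_t> repeat_interleave_ref(const std::vector<int64_t>& repeats) {
  std::vector<int64_t> out;
  for (int64_t i = 0; i < static_cast<int64_t>(repeats.size()); ++i) {
    for (int64_t r = 0; r < repeats[i]; ++r) {
      out.push_back(i);  // emit the source index i, repeats[i] times
    }
  }
  return out;
}

int main() {
  for (int64_t v : repeat_interleave_ref({2, 0, 3})) {
    std::cout << v << ' ';  // prints: 0 0 2 2 2
  }
  std::cout << '\n';
  return 0;
}
```
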
@@ -5,7 +5,6 @@
#include <ATen/native/Resize.h>
#include <ATen/native/TensorCompare.h>
#include <ATen/native/mps/OperationUtils.h>
#include <algorithm>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@@ -90,21 +89,13 @@ static void check_min_max_dims(const OptionalTensorRef clamp_opt, const Tensor&
auto clamp_shape = clamp_opt->sizes();
auto input_shape = input_t.sizes();

if (num_clamp_dims > num_input_dims) {
auto leading_dims = num_clamp_dims - num_input_dims;
for (int64_t i = 0; i < leading_dims; ++i) {
TORCH_CHECK(clamp_shape[i] == 1,
op_name + ": clamp tensor leading shape must be 1 to broadcast with input tensor");
}
}
TORCH_CHECK(num_clamp_dims <= num_input_dims,
op_name + ": clamp tensor number of dims must not be greater than that of input tensor")

auto clamp_idx = num_clamp_dims - 1;
auto input_idx = num_input_dims - 1;
auto common_dims = std::min(num_clamp_dims, num_input_dims);
for (int64_t i = 0; i < common_dims; ++i)
for (int i = 0; i < num_clamp_dims; i++)
// One of the indices is allowed to be 1; will be handled by broadcast
TORCH_CHECK(clamp_shape[clamp_idx - i] == input_shape[input_idx - i] || clamp_shape[clamp_idx - i] == 1 ||
input_shape[input_idx - i] == 1,
TORCH_CHECK(clamp_shape[num_clamp_dims - 1 - i] == input_shape[num_input_dims - 1 - i] ||
clamp_shape[num_clamp_dims - 1 - i] == 1 || input_shape[num_input_dims - 1 - i] == 1,
op_name + ": clamp tensor trailing shape must match input tensor")
}
}
@@ -145,6 +136,9 @@ static void clamp_tensor_out_mps(const Tensor& input_t,

auto result_type = output_t.scalar_type();

IntArrayRef new_min_shape;
IntArrayRef new_max_shape;

auto num_min_dims = min_opt->dim();
auto num_max_dims = max_opt->dim();
auto num_input_dims = input_t.dim();
@@ -152,32 +146,24 @@ static void clamp_tensor_out_mps(const Tensor& input_t,
std::vector<int64_t> new_min_arr(num_input_dims);
std::vector<int64_t> new_max_arr(num_input_dims);

if (has_min && num_min_dims < num_input_dims) {
fill_new_shape(num_input_dims, num_min_dims, new_min_arr.data(), min_opt->sizes());
new_min_shape = IntArrayRef(new_min_arr);
}

if (has_max && num_max_dims < num_input_dims) {
fill_new_shape(num_input_dims, num_max_dims, new_max_arr.data(), max_opt->sizes());
new_max_shape = IntArrayRef(new_max_arr);
}

Tensor min_opt_tensor;
Tensor max_opt_tensor;

auto reshape_clamp_tensor = [&](const OptionalTensorRef clamp_tensor_ref,
int64_t num_clamp_dims,
std::vector<int64_t>& new_shape_storage) -> Tensor {
IntArrayRef clamp_shape = clamp_tensor_ref->sizes();
bool requires_view = false;

if (num_clamp_dims > num_input_dims) {
clamp_shape = clamp_shape.slice(num_clamp_dims - num_input_dims);
requires_view = true;
} else if (num_clamp_dims < num_input_dims) {
fill_new_shape(num_input_dims, num_clamp_dims, new_shape_storage.data(), clamp_shape);
clamp_shape = IntArrayRef(new_shape_storage);
requires_view = true;
}

return requires_view ? (*clamp_tensor_ref).view(clamp_shape) : *clamp_tensor_ref;
};

if (has_min) {
min_opt_tensor = reshape_clamp_tensor(min_opt, num_min_dims, new_min_arr);
min_opt_tensor = (num_min_dims < num_input_dims) ? (*min_opt).view(new_min_shape) : *min_opt;
}
if (has_max) {
max_opt_tensor = reshape_clamp_tensor(max_opt, num_max_dims, new_max_arr);
max_opt_tensor = (num_max_dims < num_input_dims) ? (*max_opt).view(new_max_shape) : *max_opt;
}

@autoreleasepool {
@@ -258,8 +244,8 @@ static void clamp_scalar_out_mps(const Tensor& input_t,

@autoreleasepool {
// the optional min/max refs could affect how we build the cached graph
std::string key = op_name + (has_min ? ("_min:" + to_hex_key(min_scalar)) : "") +
(has_max ? ("_max:" + to_hex_key(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t});
std::string key = op_name + (has_min ? ("_min:" + std::to_string(min_scalar)) : "") +
(has_max ? ("_max:" + std::to_string(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t});
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
if (has_min)
newCachedGraph->minTensor = [mpsGraph constantWithScalar:min_scalar

@@ -4225,7 +4225,7 @@
MTIA: mm_out_mtia
MPS: mm_out_mps
XPU: mm_out_xpu
SparseCPU, SparseCUDA, SparseMPS: _sparse_mm_out
SparseCPU, SparseCUDA: _sparse_mm_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out

- func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor

@@ -301,12 +301,12 @@ class AvgPoolMicrokernelTester {
ASSERT_NEAR(
float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f)
<< "at pixel " << i << ", channel " << k << ", n = " << n()
<< ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< ", ks = " << kh() << "x" << kw() << " (" << ks()
<< "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
ASSERT_EQ(
uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
<< "at pixel " << i << ", channel " << k << ", n = " << n()
<< ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< ", ks = " << kh() << "x" << kw() << " (" << ks()
<< "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
}
}
@@ -396,12 +396,12 @@ class AvgPoolMicrokernelTester {
ASSERT_NEAR(
float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f)
<< "at pixel " << i << ", channel " << k << ", n = " << n()
<< ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< ", ks = " << kh() << "x" << kw() << " (" << ks()
<< "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
ASSERT_EQ(
uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
<< "at pixel " << i << ", channel " << k << ", n = " << n()
<< ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< ", ks = " << kh() << "x" << kw() << " (" << ks()
<< "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
}
}

@@ -232,7 +232,7 @@ class MaxPoolMicrokernelTester {
ASSERT_EQ(
uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
<< "at pixel " << i << ", channel " << k << ", n = " << n()
<< ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< ", ks = " << kh() << "x" << kw() << " (" << ks()
<< "), kc = " << kc();
}
}

@@ -17,7 +17,7 @@ inline std::vector<T> _expand_param_if_needed(
std::ostringstream ss;
ss << "expected " << param_name << " to be a single integer value or a "
<< "list of " << expected_dim << " values to match the convolution "
<< "dimensions, but got " << param_name << '=' << list_param;
<< "dimensions, but got " << param_name << "=" << list_param;
TORCH_CHECK(false, ss.str());
} else {
return list_param.vec();

@@ -358,9 +358,9 @@ std::string Adapter::stringize() const {
std::string device_type = get_device_type_str(properties.deviceType);
VkPhysicalDeviceLimits limits = properties.limits;

ss << '{' << std::endl;
ss << "{" << std::endl;
ss << " Physical Device Info {" << std::endl;
ss << " apiVersion: " << v_major << '.' << v_minor << std::endl;
ss << " apiVersion: " << v_major << "." << v_minor << std::endl;
ss << " driverversion: " << properties.driverVersion << std::endl;
ss << " deviceType: " << device_type << std::endl;
ss << " deviceName: " << properties.deviceName << std::endl;
@@ -371,7 +371,7 @@ std::string Adapter::stringize() const {

#define PRINT_LIMIT_PROP_VEC3(name) \
ss << " " << std::left << std::setw(36) << #name << limits.name[0] \
<< ',' << limits.name[1] << ',' << limits.name[2] << std::endl;
<< "," << limits.name[1] << "," << limits.name[2] << std::endl;

ss << " Physical Device Limits {" << std::endl;
PRINT_LIMIT_PROP(maxImageDimension1D);
@@ -425,7 +425,7 @@ std::string Adapter::stringize() const {
;
}
ss << " ]" << std::endl;
ss << '}';
ss << "}";

return ss.str();
}

@@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) {
VK_RESULT_CASE(VK_ERROR_FORMAT_NOT_SUPPORTED)
VK_RESULT_CASE(VK_ERROR_FRAGMENTED_POOL)
default:
out << "VK_ERROR_UNKNOWN (VkResult " << result << ')';
out << "VK_ERROR_UNKNOWN (VkResult " << result << ")";
break;
}
return out;
@@ -46,7 +46,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) {
//

std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) {
out << loc.function << " at " << loc.file << ':' << loc.line;
out << loc.function << " at " << loc.file << ":" << loc.line;
return out;
}

@@ -66,7 +66,7 @@ Error::Error(SourceLocation source_location, const char* cond, std::string msg)
: msg_(std::move(msg)), source_location_{source_location} {
std::ostringstream oss;
oss << "Exception raised from " << source_location_ << ": ";
oss << '(' << cond << ") is false! ";
oss << "(" << cond << ") is false! ";
oss << msg_;
what_ = oss.str();
}

@@ -173,8 +173,8 @@ void QueryPool::extract_results() {

static std::string stringize(const VkExtent3D& extents) {
std::stringstream ss;
ss << '{' << extents.width << ", " << extents.height << ", " << extents.depth
<< '}';
ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth
<< "}";
return ss.str();
}

@@ -149,7 +149,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn(
(void)flags;

std::stringstream stream;
stream << layer_prefix << ' ' << message_code << ' ' << message << std::endl;
stream << layer_prefix << " " << message_code << " " << message << std::endl;
const std::string log = stream.str();

std::cout << log;

Some files were not shown because too many files have changed in this diff.