Compare commits

..

2 Commits

Author SHA1 Message Date
6e16bfcd3e add partition meta 2025-11-14 13:22:25 -08:00
d1f572a901 Add test for unbacked symint expression
Add backend node meta to invoke subgraph
2025-11-14 13:06:29 -08:00
551 changed files with 6065 additions and 13554 deletions

View File

@ -1,19 +0,0 @@
# Aarch64 (ARM/Graviton) Support Scripts
Scripts for building aarch64 PyTorch PIP Wheels. These scripts build the following wheels:
* torch
* torchvision
* torchaudio
* torchtext
* torchdata
## Aarch64_ci_build.sh
This script is design to support CD operations within PyPi manylinux aarch64 container, and be executed in the container. It prepares the container and then executes __aarch64_wheel_ci_build.py__ to build the wheels. The script "assumes" the PyTorch repo is located at: ```/pytorch``` and will put the wheels into ```/artifacts```.
### Usage
```DESIRED_PYTHON=<PythonVersion> aarch64_ci_build.sh```
__NOTE:__ CI build is currently __EXPERMINTAL__
## Build_aarch64_wheel.py
This app allows a person to build using AWS EC3 resources and requires AWS-CLI and Boto3 with AWS credentials to support building EC2 instances for the wheel builds. Can be used in a codebuild CD or from a local system.
### Usage
```build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>```

View File

@ -1,53 +0,0 @@
#!/bin/bash
set -eux -o pipefail
GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
# Set CUDA architecture lists to match x86 build_cuda.sh
if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
fi
# Compress the fatbin with -compress-mode=size for CUDA 13
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
export TORCH_NVCC_FLAGS="-compress-mode=size"
# Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
export BUILD_BUNDLE_PTXAS=1
fi
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source $SCRIPTPATH/aarch64_ci_setup.sh
###############################################################################
# Run aarch64 builder python
###############################################################################
cd /
# adding safe directory for git as the permissions will be
# on the mounted pytorch repo
git config --global --add safe.directory /pytorch
pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
# Check if we should use NVIDIA libs from PyPI (similar to x86 build_cuda.sh logic)
if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
echo "Bundling CUDA libraries with wheel for aarch64."
else
echo "Using nvidia libs from pypi for aarch64."
echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
export USE_NVIDIA_PYPI_LIBS=1
fi
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi

View File

@ -1,21 +0,0 @@
#!/bin/bash
set -eux -o pipefail
# This script is used to prepare the Docker container for aarch64_ci_wheel_build.py python script
# By creating symlinks from desired /opt/python to /usr/local/bin/
NUMPY_VERSION=2.0.2
if [[ "$DESIRED_PYTHON" == "3.13" || "$DESIRED_PYTHON" == "3.13t" ]]; then
NUMPY_VERSION=2.1.2
fi
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
source $SCRIPTPATH/../manywheel/set_desired_python.sh
pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2
for tool in python python3 pip pip3 ninja scons patchelf; do
ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
done
python --version

View File

@ -1,333 +0,0 @@
#!/usr/bin/env python3
# encoding: UTF-8
import os
import shutil
from subprocess import check_call, check_output
def list_dir(path: str) -> list[str]:
"""'
Helper for getting paths for Python
"""
return check_output(["ls", "-1", path]).decode().split("\n")
def replace_tag(filename) -> None:
with open(filename) as f:
lines = f.readlines()
for i, line in enumerate(lines):
if line.startswith("Tag:"):
lines[i] = line.replace("-linux_", "-manylinux_2_28_")
print(f"Updated tag from {line} to {lines[i]}")
break
with open(filename, "w") as f:
f.writelines(lines)
def patch_library_rpath(
folder: str,
lib_name: str,
use_nvidia_pypi_libs: bool = False,
desired_cuda: str = "",
) -> None:
"""Apply patchelf to set RPATH for a library in torch/lib"""
lib_path = f"{folder}/tmp/torch/lib/{lib_name}"
if use_nvidia_pypi_libs:
# For PyPI NVIDIA libraries, construct CUDA RPATH
cuda_rpaths = [
"$ORIGIN/../../nvidia/cudnn/lib",
"$ORIGIN/../../nvidia/nvshmem/lib",
"$ORIGIN/../../nvidia/nccl/lib",
"$ORIGIN/../../nvidia/cusparselt/lib",
]
if "130" in desired_cuda:
cuda_rpaths.append("$ORIGIN/../../nvidia/cu13/lib")
else:
cuda_rpaths.extend(
[
"$ORIGIN/../../nvidia/cublas/lib",
"$ORIGIN/../../nvidia/cuda_cupti/lib",
"$ORIGIN/../../nvidia/cuda_nvrtc/lib",
"$ORIGIN/../../nvidia/cuda_runtime/lib",
"$ORIGIN/../../nvidia/cufft/lib",
"$ORIGIN/../../nvidia/curand/lib",
"$ORIGIN/../../nvidia/cusolver/lib",
"$ORIGIN/../../nvidia/cusparse/lib",
"$ORIGIN/../../nvidia/nvtx/lib",
"$ORIGIN/../../nvidia/cufile/lib",
]
)
# Add $ORIGIN for local torch libs
rpath = ":".join(cuda_rpaths) + ":$ORIGIN"
else:
# For bundled libraries, just use $ORIGIN
rpath = "$ORIGIN"
if os.path.exists(lib_path):
os.system(
f"cd {folder}/tmp/torch/lib/; "
f"patchelf --set-rpath '{rpath}' --force-rpath {lib_name}"
)
def copy_and_patch_library(
src_path: str,
folder: str,
use_nvidia_pypi_libs: bool = False,
desired_cuda: str = "",
) -> None:
"""Copy a library to torch/lib and patch its RPATH"""
if os.path.exists(src_path):
lib_name = os.path.basename(src_path)
shutil.copy2(src_path, f"{folder}/tmp/torch/lib/{lib_name}")
patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"""
Package the cuda wheel libraries
"""
folder = os.path.dirname(wheel_path)
os.mkdir(f"{folder}/tmp")
os.system(f"unzip {wheel_path} -d {folder}/tmp")
# Delete original wheel since it will be repackaged
os.system(f"rm {wheel_path}")
# Check if we should use PyPI NVIDIA libraries or bundle system libraries
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
if use_nvidia_pypi_libs:
print("Using nvidia libs from pypi - skipping CUDA library bundling")
# For PyPI approach, we don't bundle CUDA libraries - they come from PyPI packages
# We only need to bundle non-NVIDIA libraries
minimal_libs_to_copy = [
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
]
# Copy minimal libraries to unzipped_folder/torch/lib
for lib_path in minimal_libs_to_copy:
copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
# Patch torch libraries used for searching libraries
torch_libs_to_patch = [
"libtorch.so",
"libtorch_cpu.so",
"libtorch_cuda.so",
"libtorch_cuda_linalg.so",
"libtorch_global_deps.so",
"libtorch_python.so",
"libtorch_nvshmem.so",
"libc10.so",
"libc10_cuda.so",
"libcaffe2_nvrtc.so",
"libshm.so",
]
for lib_name in torch_libs_to_patch:
patch_library_rpath(folder, lib_name, use_nvidia_pypi_libs, desired_cuda)
else:
print("Bundling CUDA libraries with wheel")
# Original logic for bundling system CUDA libraries
# Common libraries for all CUDA versions
common_libs = [
# Non-NVIDIA system libraries
"/lib64/libgomp.so.1",
"/usr/lib64/libgfortran.so.5",
"/acl/build/libarm_compute.so",
"/acl/build/libarm_compute_graph.so",
# Common CUDA libraries (same for all versions)
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
"/usr/local/lib/libnvpl_lapack_core.so.0",
"/usr/local/lib/libnvpl_blas_core.so.0",
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
"/usr/local/cuda/lib64/libcudnn.so.9",
"/usr/local/cuda/lib64/libcusparseLt.so.0",
"/usr/local/cuda/lib64/libcurand.so.10",
"/usr/local/cuda/lib64/libnccl.so.2",
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
"/usr/local/cuda/lib64/libcudnn_graph.so.9",
"/usr/local/cuda/lib64/libcudnn_ops.so.9",
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
"/usr/local/cuda/lib64/libcufile.so.0",
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
"/usr/local/cuda/lib64/libcusparse.so.12",
]
# CUDA version-specific libraries
if "13" in desired_cuda:
minor_version = desired_cuda[-1]
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
"/usr/local/cuda/lib64/libcublas.so.13",
"/usr/local/cuda/lib64/libcublasLt.so.13",
"/usr/local/cuda/lib64/libcudart.so.13",
"/usr/local/cuda/lib64/libcufft.so.12",
"/usr/local/cuda/lib64/libcusolver.so.12",
"/usr/local/cuda/lib64/libnvJitLink.so.13",
"/usr/local/cuda/lib64/libnvrtc.so.13",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
]
elif "12" in desired_cuda:
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
minor_version = desired_cuda[-1]
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
"/usr/local/cuda/lib64/libcublas.so.12",
"/usr/local/cuda/lib64/libcublasLt.so.12",
"/usr/local/cuda/lib64/libcudart.so.12",
"/usr/local/cuda/lib64/libcufft.so.11",
"/usr/local/cuda/lib64/libcusolver.so.11",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
]
else:
raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")
# Combine all libraries
libs_to_copy = common_libs + version_specific_libs
# Copy libraries to unzipped_folder/torch/lib
for lib_path in libs_to_copy:
copy_and_patch_library(lib_path, folder, use_nvidia_pypi_libs, desired_cuda)
# Make sure the wheel is tagged with manylinux_2_28
for f in os.scandir(f"{folder}/tmp/"):
if f.is_dir() and f.name.endswith(".dist-info"):
replace_tag(f"{f.path}/WHEEL")
break
os.system(f"wheel pack {folder}/tmp/ -d {folder}")
os.system(f"rm -rf {folder}/tmp/")
def complete_wheel(folder: str) -> str:
"""
Complete wheel build and put in artifact location
"""
wheel_name = list_dir(f"/{folder}/dist")[0]
# Please note for cuda we don't run auditwheel since we use custom script to package
# the cuda dependencies to the wheel file using update_wheel() method.
# However we need to make sure filename reflects the correct Manylinux platform.
if "pytorch" in folder and not enable_cuda:
print("Repairing Wheel with AuditWheel")
check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0]
print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
os.rename(
f"/{folder}/wheelhouse/{repaired_wheel_name}",
f"/{folder}/dist/{repaired_wheel_name}",
)
else:
repaired_wheel_name = list_dir(f"/{folder}/dist")[0]
print(f"Copying {repaired_wheel_name} to artifacts")
shutil.copy2(
f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}"
)
return repaired_wheel_name
def parse_arguments():
"""
Parse inline arguments
"""
from argparse import ArgumentParser
parser = ArgumentParser("AARCH64 wheels python CD")
parser.add_argument("--debug", action="store_true")
parser.add_argument("--build-only", action="store_true")
parser.add_argument("--test-only", type=str)
parser.add_argument("--enable-mkldnn", action="store_true")
parser.add_argument("--enable-cuda", action="store_true")
return parser.parse_args()
if __name__ == "__main__":
"""
Entry Point
"""
args = parse_arguments()
enable_mkldnn = args.enable_mkldnn
enable_cuda = args.enable_cuda
branch = check_output(
["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd="/pytorch"
).decode()
print("Building PyTorch wheel")
build_vars = ""
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
# Handle PyPI NVIDIA libraries vs bundled libraries
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
if use_nvidia_pypi_libs:
print("Configuring build for PyPI NVIDIA libraries")
# Configure for dynamic linking (matching x86 logic)
build_vars += "ATEN_STATIC_CUDA=0 USE_CUDA_STATIC_LINK=0 USE_CUPTI_SO=1 "
else:
print("Configuring build for bundled NVIDIA libraries")
# Keep existing static linking approach - already configured above
override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
desired_cuda = os.getenv("DESIRED_CUDA")
if override_package_version is not None:
version = override_package_version
build_vars += (
f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
)
elif branch in ["nightly", "main"]:
build_date = (
check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
.decode()
.replace("-", "")
)
version = (
check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
)
if enable_cuda:
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
else:
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
elif branch.startswith(("v1.", "v2.")):
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
if enable_mkldnn:
print("build pytorch with mkldnn+acl backend")
build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
build_vars += "ACL_ROOT_DIR=/acl "
if enable_cuda:
build_vars += "BLAS=NVPL "
else:
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
else:
print("build pytorch without mkldnn backend")
os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
if enable_cuda:
print("Updating Cuda Dependency")
filename = os.listdir("/pytorch/dist/")
wheel_path = f"/pytorch/dist/{filename[0]}"
package_cuda_wheel(wheel_path, desired_cuda)
pytorch_wheel_name = complete_wheel("/pytorch/")
print(f"Build Complete. Created {pytorch_wheel_name}..")

View File

@ -1,999 +0,0 @@
#!/usr/bin/env python3
# This script is for building AARCH64 wheels using AWS EC2 instances.
# To generate binaries for the release follow these steps:
# 1. Update mappings for each of the Domain Libraries by adding new row to a table like this:
# "v1.11.0": ("0.11.0", "rc1"),
# 2. Run script with following arguments for each of the supported python versions and required tag, for example:
# build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch v1.11.0-rc3
import os
import subprocess
import sys
import time
from typing import Optional, Union
import boto3
# AMI images for us-east-1, change the following based on your ~/.aws/config
os_amis = {
"ubuntu20_04": "ami-052eac90edaa9d08f", # login_name: ubuntu
"ubuntu22_04": "ami-0c6c29c5125214c77", # login_name: ubuntu
"redhat8": "ami-0698b90665a2ddcf1", # login_name: ec2-user
}
ubuntu20_04_ami = os_amis["ubuntu20_04"]
def compute_keyfile_path(key_name: Optional[str] = None) -> tuple[str, str]:
if key_name is None:
key_name = os.getenv("AWS_KEY_NAME")
if key_name is None:
return os.getenv("SSH_KEY_PATH", ""), ""
homedir_path = os.path.expanduser("~")
default_path = os.path.join(homedir_path, ".ssh", f"{key_name}.pem")
return os.getenv("SSH_KEY_PATH", default_path), key_name
ec2 = boto3.resource("ec2")
def ec2_get_instances(filter_name, filter_value):
return ec2.instances.filter(
Filters=[{"Name": filter_name, "Values": [filter_value]}]
)
def ec2_instances_of_type(instance_type="t4g.2xlarge"):
return ec2_get_instances("instance-type", instance_type)
def ec2_instances_by_id(instance_id):
rc = list(ec2_get_instances("instance-id", instance_id))
return rc[0] if len(rc) > 0 else None
def start_instance(
key_name, ami=ubuntu20_04_ami, instance_type="t4g.2xlarge", ebs_size: int = 50
):
inst = ec2.create_instances(
ImageId=ami,
InstanceType=instance_type,
SecurityGroups=["ssh-allworld"],
KeyName=key_name,
MinCount=1,
MaxCount=1,
BlockDeviceMappings=[
{
"DeviceName": "/dev/sda1",
"Ebs": {
"DeleteOnTermination": True,
"VolumeSize": ebs_size,
"VolumeType": "standard",
},
}
],
)[0]
print(f"Create instance {inst.id}")
inst.wait_until_running()
running_inst = ec2_instances_by_id(inst.id)
print(f"Instance started at {running_inst.public_dns_name}")
return running_inst
class RemoteHost:
addr: str
keyfile_path: str
login_name: str
container_id: Optional[str] = None
ami: Optional[str] = None
def __init__(self, addr: str, keyfile_path: str, login_name: str = "ubuntu"):
self.addr = addr
self.keyfile_path = keyfile_path
self.login_name = login_name
def _gen_ssh_prefix(self) -> list[str]:
return [
"ssh",
"-o",
"StrictHostKeyChecking=no",
"-i",
self.keyfile_path,
f"{self.login_name}@{self.addr}",
"--",
]
@staticmethod
def _split_cmd(args: Union[str, list[str]]) -> list[str]:
return args.split() if isinstance(args, str) else args
def run_ssh_cmd(self, args: Union[str, list[str]]) -> None:
subprocess.check_call(self._gen_ssh_prefix() + self._split_cmd(args))
def check_ssh_output(self, args: Union[str, list[str]]) -> str:
return subprocess.check_output(
self._gen_ssh_prefix() + self._split_cmd(args)
).decode("utf-8")
def scp_upload_file(self, local_file: str, remote_file: str) -> None:
subprocess.check_call(
[
"scp",
"-i",
self.keyfile_path,
local_file,
f"{self.login_name}@{self.addr}:{remote_file}",
]
)
def scp_download_file(
self, remote_file: str, local_file: Optional[str] = None
) -> None:
if local_file is None:
local_file = "."
subprocess.check_call(
[
"scp",
"-i",
self.keyfile_path,
f"{self.login_name}@{self.addr}:{remote_file}",
local_file,
]
)
def start_docker(self, image="quay.io/pypa/manylinux2014_aarch64:latest") -> None:
self.run_ssh_cmd("sudo apt-get install -y docker.io")
self.run_ssh_cmd(f"sudo usermod -a -G docker {self.login_name}")
self.run_ssh_cmd("sudo service docker start")
self.run_ssh_cmd(f"docker pull {image}")
self.container_id = self.check_ssh_output(
f"docker run -t -d -w /root {image}"
).strip()
def using_docker(self) -> bool:
return self.container_id is not None
def run_cmd(self, args: Union[str, list[str]]) -> None:
if not self.using_docker():
return self.run_ssh_cmd(args)
assert self.container_id is not None
docker_cmd = self._gen_ssh_prefix() + [
"docker",
"exec",
"-i",
self.container_id,
"bash",
]
p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE)
p.communicate(
input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
"utf-8"
)
)
rc = p.wait()
if rc != 0:
raise subprocess.CalledProcessError(rc, docker_cmd)
def check_output(self, args: Union[str, list[str]]) -> str:
if not self.using_docker():
return self.check_ssh_output(args)
assert self.container_id is not None
docker_cmd = self._gen_ssh_prefix() + [
"docker",
"exec",
"-i",
self.container_id,
"bash",
]
p = subprocess.Popen(docker_cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
(out, err) = p.communicate(
input=" ".join(["source .bashrc && "] + self._split_cmd(args)).encode(
"utf-8"
)
)
rc = p.wait()
if rc != 0:
raise subprocess.CalledProcessError(rc, docker_cmd, output=out, stderr=err)
return out.decode("utf-8")
def upload_file(self, local_file: str, remote_file: str) -> None:
if not self.using_docker():
return self.scp_upload_file(local_file, remote_file)
tmp_file = os.path.join("/tmp", os.path.basename(local_file))
self.scp_upload_file(local_file, tmp_file)
self.run_ssh_cmd(
["docker", "cp", tmp_file, f"{self.container_id}:/root/{remote_file}"]
)
self.run_ssh_cmd(["rm", tmp_file])
def download_file(self, remote_file: str, local_file: Optional[str] = None) -> None:
if not self.using_docker():
return self.scp_download_file(remote_file, local_file)
tmp_file = os.path.join("/tmp", os.path.basename(remote_file))
self.run_ssh_cmd(
["docker", "cp", f"{self.container_id}:/root/{remote_file}", tmp_file]
)
self.scp_download_file(tmp_file, local_file)
self.run_ssh_cmd(["rm", tmp_file])
def download_wheel(
self, remote_file: str, local_file: Optional[str] = None
) -> None:
if self.using_docker() and local_file is None:
basename = os.path.basename(remote_file)
local_file = basename.replace(
"-linux_aarch64.whl", "-manylinux2014_aarch64.whl"
)
self.download_file(remote_file, local_file)
def list_dir(self, path: str) -> list[str]:
return self.check_output(["ls", "-1", path]).split("\n")
def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
import socket
for i in range(attempt_cnt):
try:
with socket.create_connection((addr, port), timeout=timeout):
return
except (ConnectionRefusedError, TimeoutError): # noqa: PERF203
if i == attempt_cnt - 1:
raise
time.sleep(timeout)
def update_apt_repo(host: RemoteHost) -> None:
time.sleep(5)
host.run_cmd("sudo systemctl stop apt-daily.service || true")
host.run_cmd("sudo systemctl stop unattended-upgrades.service || true")
host.run_cmd(
"while systemctl is-active --quiet apt-daily.service; do sleep 1; done"
)
host.run_cmd(
"while systemctl is-active --quiet unattended-upgrades.service; do sleep 1; done"
)
host.run_cmd("sudo apt-get update")
time.sleep(3)
host.run_cmd("sudo apt-get update")
def install_condaforge(
host: RemoteHost, suffix: str = "latest/download/Miniforge3-Linux-aarch64.sh"
) -> None:
print("Install conda-forge")
host.run_cmd(f"curl -OL https://github.com/conda-forge/miniforge/releases/{suffix}")
host.run_cmd(f"sh -f {os.path.basename(suffix)} -b")
host.run_cmd(f"rm -f {os.path.basename(suffix)}")
if host.using_docker():
host.run_cmd("echo 'PATH=$HOME/miniforge3/bin:$PATH'>>.bashrc")
else:
host.run_cmd(
[
"sed",
"-i",
"'/^# If not running interactively.*/i PATH=$HOME/miniforge3/bin:$PATH'",
".bashrc",
]
)
def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
if python_version == "3.6":
# Python-3.6 EOLed and not compatible with conda-4.11
install_condaforge(
host, suffix="download/4.10.3-10/Miniforge3-4.10.3-10-Linux-aarch64.sh"
)
host.run_cmd(f"conda install -y python={python_version} numpy pyyaml")
else:
install_condaforge(
host, suffix="download/4.11.0-4/Miniforge3-4.11.0-4-Linux-aarch64.sh"
)
# Pytorch-1.10 or older are not compatible with setuptools=59.6 or newer
host.run_cmd(
f"conda install -y python={python_version} numpy pyyaml setuptools>=59.5.0"
)
def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
host.run_cmd("pip3 install auditwheel")
host.run_cmd(
"conda install -y patchelf" if use_conda else "sudo apt-get install -y patchelf"
)
from tempfile import NamedTemporaryFile
with NamedTemporaryFile() as tmp:
tmp.write(embed_library_script.encode("utf-8"))
tmp.flush()
host.upload_file(tmp.name, "embed_library.py")
print("Embedding libgomp into wheel")
if host.using_docker():
host.run_cmd(f"python3 embed_library.py {wheel_name} --update-tag")
else:
host.run_cmd(f"python3 embed_library.py {wheel_name}")
def checkout_repo(
host: RemoteHost,
*,
branch: str = "main",
url: str,
git_clone_flags: str,
mapping: dict[str, tuple[str, str]],
) -> Optional[str]:
for prefix in mapping:
if not branch.startswith(prefix):
continue
tag = f"v{mapping[prefix][0]}-{mapping[prefix][1]}"
host.run_cmd(f"git clone {url} -b {tag} {git_clone_flags}")
return mapping[prefix][0]
host.run_cmd(f"git clone {url} -b {branch} {git_clone_flags}")
return None
def build_torchvision(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str,
run_smoke_tests: bool = True,
) -> str:
print("Checking out TorchVision repo")
build_version = checkout_repo(
host,
branch=branch,
url="https://github.com/pytorch/vision",
git_clone_flags=git_clone_flags,
mapping={
"v1.7.1": ("0.8.2", "rc2"),
"v1.8.0": ("0.9.0", "rc3"),
"v1.8.1": ("0.9.1", "rc1"),
"v1.9.0": ("0.10.0", "rc1"),
"v1.10.0": ("0.11.1", "rc1"),
"v1.10.1": ("0.11.2", "rc1"),
"v1.10.2": ("0.11.3", "rc1"),
"v1.11.0": ("0.12.0", "rc1"),
"v1.12.0": ("0.13.0", "rc4"),
"v1.12.1": ("0.13.1", "rc6"),
"v1.13.0": ("0.14.0", "rc4"),
"v1.13.1": ("0.14.1", "rc2"),
"v2.0.0": ("0.15.1", "rc2"),
"v2.0.1": ("0.15.2", "rc2"),
},
)
print("Building TorchVision wheel")
# Please note libnpg and jpeg are required to build image.so extension
if use_conda:
host.run_cmd("conda install -y libpng jpeg")
# Remove .so files to force static linking
host.run_cmd(
"rm miniforge3/lib/libpng.so miniforge3/lib/libpng16.so miniforge3/lib/libjpeg.so"
)
# And patch setup.py to include libz dependency for libpng
host.run_cmd(
[
'sed -i -e \'s/image_link_flags\\.append("png")/image_link_flags += ["png", "z"]/\' vision/setup.py'
]
)
build_vars = ""
if branch == "nightly":
version = host.check_output(
["if [ -f vision/version.txt ]; then cat vision/version.txt; fi"]
).strip()
if len(version) == 0:
# In older revisions, version was embedded in setup.py
version = (
host.check_output(["grep", '"version = \'"', "vision/setup.py"])
.strip()
.split("'")[1][:-2]
)
build_date = (
host.check_output("cd vision && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
vision_wheel_name = host.list_dir("vision/dist")[0]
embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))
print("Copying TorchVision wheel")
host.download_wheel(os.path.join("vision", "dist", vision_wheel_name))
if run_smoke_tests:
host.run_cmd(
f"pip3 install {os.path.join('vision', 'dist', vision_wheel_name)}"
)
host.run_cmd("python3 vision/test/smoke_test.py")
print("Delete vision checkout")
host.run_cmd("rm -rf vision")
return vision_wheel_name
def build_torchdata(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str = "",
) -> str:
print("Checking out TorchData repo")
git_clone_flags += " --recurse-submodules"
build_version = checkout_repo(
host,
branch=branch,
url="https://github.com/pytorch/data",
git_clone_flags=git_clone_flags,
mapping={
"v1.13.1": ("0.5.1", ""),
"v2.0.0": ("0.6.0", "rc5"),
"v2.0.1": ("0.6.1", "rc1"),
},
)
print("Building TorchData wheel")
build_vars = ""
if branch == "nightly":
version = host.check_output(
["if [ -f data/version.txt ]; then cat data/version.txt; fi"]
).strip()
build_date = (
host.check_output("cd data && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")
wheel_name = host.list_dir("data/dist")[0]
embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))
print("Copying TorchData wheel")
host.download_wheel(os.path.join("data", "dist", wheel_name))
return wheel_name
def build_torchtext(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str = "",
) -> str:
print("Checking out TorchText repo")
git_clone_flags += " --recurse-submodules"
build_version = checkout_repo(
host,
branch=branch,
url="https://github.com/pytorch/text",
git_clone_flags=git_clone_flags,
mapping={
"v1.9.0": ("0.10.0", "rc1"),
"v1.10.0": ("0.11.0", "rc2"),
"v1.10.1": ("0.11.1", "rc1"),
"v1.10.2": ("0.11.2", "rc1"),
"v1.11.0": ("0.12.0", "rc1"),
"v1.12.0": ("0.13.0", "rc2"),
"v1.12.1": ("0.13.1", "rc5"),
"v1.13.0": ("0.14.0", "rc3"),
"v1.13.1": ("0.14.1", "rc1"),
"v2.0.0": ("0.15.1", "rc2"),
"v2.0.1": ("0.15.2", "rc2"),
},
)
print("Building TorchText wheel")
build_vars = ""
if branch == "nightly":
version = host.check_output(
["if [ -f text/version.txt ]; then cat text/version.txt; fi"]
).strip()
build_date = (
host.check_output("cd text && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")
wheel_name = host.list_dir("text/dist")[0]
embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))
print("Copying TorchText wheel")
host.download_wheel(os.path.join("text", "dist", wheel_name))
return wheel_name
def build_torchaudio(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str = "",
) -> str:
print("Checking out TorchAudio repo")
git_clone_flags += " --recurse-submodules"
build_version = checkout_repo(
host,
branch=branch,
url="https://github.com/pytorch/audio",
git_clone_flags=git_clone_flags,
mapping={
"v1.9.0": ("0.9.0", "rc2"),
"v1.10.0": ("0.10.0", "rc5"),
"v1.10.1": ("0.10.1", "rc1"),
"v1.10.2": ("0.10.2", "rc1"),
"v1.11.0": ("0.11.0", "rc1"),
"v1.12.0": ("0.12.0", "rc3"),
"v1.12.1": ("0.12.1", "rc5"),
"v1.13.0": ("0.13.0", "rc4"),
"v1.13.1": ("0.13.1", "rc2"),
"v2.0.0": ("2.0.1", "rc3"),
"v2.0.1": ("2.0.2", "rc2"),
},
)
print("Building TorchAudio wheel")
build_vars = ""
if branch == "nightly":
version = (
host.check_output(["grep", '"version = \'"', "audio/setup.py"])
.strip()
.split("'")[1][:-2]
)
build_date = (
host.check_output("cd audio && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
build_vars += f"BUILD_VERSION={version}.dev{build_date}"
elif build_version is not None:
build_vars += f"BUILD_VERSION={build_version} PYTORCH_VERSION={branch[1:].split('-', maxsplit=1)[0]}"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
host.run_cmd(
f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
&& ./packaging/ffmpeg/build.sh \
&& {build_vars} python3 -m build --wheel --no-isolation"
)
wheel_name = host.list_dir("audio/dist")[0]
embed_libgomp(host, use_conda, os.path.join("audio", "dist", wheel_name))
print("Copying TorchAudio wheel")
host.download_wheel(os.path.join("audio", "dist", wheel_name))
return wheel_name
def configure_system(
host: RemoteHost,
*,
compiler: str = "gcc-8",
use_conda: bool = True,
python_version: str = "3.8",
) -> None:
if use_conda:
install_condaforge_python(host, python_version)
print("Configuring the system")
if not host.using_docker():
update_apt_repo(host)
host.run_cmd("sudo apt-get install -y ninja-build g++ git cmake gfortran unzip")
else:
host.run_cmd("yum install -y sudo")
host.run_cmd("conda install -y ninja scons")
if not use_conda:
host.run_cmd(
"sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
)
host.run_cmd("pip3 install dataclasses typing-extensions")
if not use_conda:
print("Installing Cython + numpy from PyPy")
host.run_cmd("sudo pip3 install Cython")
host.run_cmd("sudo pip3 install numpy")
def build_domains(
host: RemoteHost,
*,
branch: str = "main",
use_conda: bool = True,
git_clone_flags: str = "",
) -> tuple[str, str, str, str]:
vision_wheel_name = build_torchvision(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
audio_wheel_name = build_torchaudio(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
data_wheel_name = build_torchdata(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
text_wheel_name = build_torchtext(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
return (vision_wheel_name, audio_wheel_name, data_wheel_name, text_wheel_name)
def start_build(
host: RemoteHost,
*,
branch: str = "main",
compiler: str = "gcc-8",
use_conda: bool = True,
python_version: str = "3.8",
pytorch_only: bool = False,
pytorch_build_number: Optional[str] = None,
shallow_clone: bool = True,
enable_mkldnn: bool = False,
) -> tuple[str, str, str, str, str]:
git_clone_flags = " --depth 1 --shallow-submodules" if shallow_clone else ""
if host.using_docker() and not use_conda:
print("Auto-selecting conda option for docker images")
use_conda = True
if not host.using_docker():
print("Disable mkldnn for host builds")
enable_mkldnn = False
configure_system(
host, compiler=compiler, use_conda=use_conda, python_version=python_version
)
if host.using_docker():
print("Move libgfortant.a into a standard location")
# HACK: pypa gforntran.a is compiled without PIC, which leads to the following error
# libgfortran.a(error.o)(.text._gfortrani_st_printf+0x34): unresolvable R_AARCH64_ADR_PREL_PG_HI21 relocation against symbol `__stack_chk_guard@@GLIBC_2.17' # noqa: E501, B950
# Workaround by copying gfortran library from the host
host.run_ssh_cmd("sudo apt-get install -y gfortran-8")
host.run_cmd("mkdir -p /usr/lib/gcc/aarch64-linux-gnu/8")
host.run_ssh_cmd(
[
"docker",
"cp",
"/usr/lib/gcc/aarch64-linux-gnu/8/libgfortran.a",
f"{host.container_id}:/opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/",
]
)
print("Checking out PyTorch repo")
host.run_cmd(
f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
)
host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")
print("Building PyTorch wheel")
build_opts = ""
if pytorch_build_number is not None:
build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
# Breakpad build fails on aarch64
build_vars = "USE_BREAKPAD=0 "
if branch == "nightly":
build_date = (
host.check_output("cd pytorch && git log --pretty=format:%s -1")
.strip()
.split()[0]
.replace("-", "")
)
version = host.check_output("cat pytorch/version.txt").strip()[:-2]
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
if branch.startswith(("v1.", "v2.")):
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
if enable_mkldnn:
host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
print("build pytorch with mkldnn+acl backend")
build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
build_vars += " BLAS=OpenBLAS"
build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
build_vars += " ACL_ROOT_DIR=/acl"
host.run_cmd(
f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
)
print("Repair the wheel")
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
host.run_cmd(
f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
)
print("replace the original wheel with the repaired one")
pytorch_repaired_wheel_name = host.list_dir("wheelhouse")[0]
host.run_cmd(
f"cp $HOME/wheelhouse/{pytorch_repaired_wheel_name} $HOME/pytorch/dist/{pytorch_wheel_name}"
)
else:
print("build pytorch without mkldnn backend")
host.run_cmd(
f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
)
print("Deleting build folder")
host.run_cmd("cd pytorch && rm -rf build")
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
embed_libgomp(host, use_conda, os.path.join("pytorch", "dist", pytorch_wheel_name))
print("Copying the wheel")
host.download_wheel(os.path.join("pytorch", "dist", pytorch_wheel_name))
print("Installing PyTorch wheel")
host.run_cmd(f"pip3 install pytorch/dist/{pytorch_wheel_name}")
if pytorch_only:
return (pytorch_wheel_name, None, None, None, None)
domain_wheels = build_domains(
host, branch=branch, use_conda=use_conda, git_clone_flags=git_clone_flags
)
return (pytorch_wheel_name, *domain_wheels)
embed_library_script = """
#!/usr/bin/env python3
from auditwheel.patcher import Patchelf
from auditwheel.wheeltools import InWheelCtx
from auditwheel.elfutils import elf_file_filter
from auditwheel.repair import copylib
from auditwheel.lddtree import lddtree
from subprocess import check_call
import os
import shutil
import sys
from tempfile import TemporaryDirectory
def replace_tag(filename):
with open(filename, 'r') as f:
lines = f.read().split("\\n")
for i,line in enumerate(lines):
if not line.startswith("Tag: "):
continue
lines[i] = line.replace("-linux_", "-manylinux2014_")
print(f'Updated tag from {line} to {lines[i]}')
with open(filename, 'w') as f:
f.write("\\n".join(lines))
class AlignedPatchelf(Patchelf):
def set_soname(self, file_name: str, new_soname: str) -> None:
check_call(['patchelf', '--page-size', '65536', '--set-soname', new_soname, file_name])
def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
check_call(['patchelf', '--page-size', '65536', '--replace-needed', soname, new_soname, file_name])
def embed_library(whl_path, lib_soname, update_tag=False):
patcher = AlignedPatchelf()
out_dir = TemporaryDirectory()
whl_name = os.path.basename(whl_path)
tmp_whl_name = os.path.join(out_dir.name, whl_name)
with InWheelCtx(whl_path) as ctx:
torchlib_path = os.path.join(ctx._tmpdir.name, 'torch', 'lib')
ctx.out_wheel=tmp_whl_name
new_lib_path, new_lib_soname = None, None
for filename, elf in elf_file_filter(ctx.iter_files()):
if not filename.startswith('torch/lib'):
continue
libtree = lddtree(filename)
if lib_soname not in libtree['needed']:
continue
lib_path = libtree['libs'][lib_soname]['path']
if lib_path is None:
print(f"Can't embed {lib_soname} as it could not be found")
break
if lib_path.startswith(torchlib_path):
continue
if new_lib_path is None:
new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
patcher.replace_needed(filename, lib_soname, new_lib_soname)
print(f'Replacing {lib_soname} with {new_lib_soname} for {filename}')
if update_tag:
# Add manylinux2014 tag
for filename in ctx.iter_files():
if os.path.basename(filename) != 'WHEEL':
continue
replace_tag(filename)
shutil.move(tmp_whl_name, whl_path)
if __name__ == '__main__':
embed_library(sys.argv[1], 'libgomp.so.1', len(sys.argv) > 2 and sys.argv[2] == '--update-tag')
"""
def run_tests(host: RemoteHost, whl: str, branch="main") -> None:
print("Configuring the system")
update_apt_repo(host)
host.run_cmd("sudo apt-get install -y python3-pip git")
host.run_cmd("sudo pip3 install Cython")
host.run_cmd("sudo pip3 install numpy")
host.upload_file(whl, ".")
host.run_cmd(f"sudo pip3 install {whl}")
host.run_cmd("python3 -c 'import torch;print(torch.rand((3,3))'")
host.run_cmd(f"git clone -b {branch} https://github.com/pytorch/pytorch")
host.run_cmd("cd pytorch/test; python3 test_torch.py -v")
def get_instance_name(instance) -> Optional[str]:
if instance.tags is None:
return None
for tag in instance.tags:
if tag["Key"] == "Name":
return tag["Value"]
return None
def list_instances(instance_type: str) -> None:
print(f"All instances of type {instance_type}")
for instance in ec2_instances_of_type(instance_type):
ifaces = instance.network_interfaces
az = ifaces[0].subnet.availability_zone if len(ifaces) > 0 else None
print(
f"{instance.id} {get_instance_name(instance)} {instance.public_dns_name} {instance.state['Name']} {az}"
)
def terminate_instances(instance_type: str) -> None:
print(f"Terminating all instances of type {instance_type}")
instances = list(ec2_instances_of_type(instance_type))
for instance in instances:
print(f"Terminating {instance.id}")
instance.terminate()
print("Waiting for termination to complete")
for instance in instances:
instance.wait_until_terminated()
def parse_arguments():
from argparse import ArgumentParser
parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
parser.add_argument("--key-name", type=str)
parser.add_argument("--debug", action="store_true")
parser.add_argument("--build-only", action="store_true")
parser.add_argument("--test-only", type=str)
group = parser.add_mutually_exclusive_group()
group.add_argument("--os", type=str, choices=list(os_amis.keys()))
group.add_argument("--ami", type=str)
parser.add_argument(
"--python-version",
type=str,
choices=[f"3.{d}" for d in range(6, 12)],
default=None,
)
parser.add_argument("--alloc-instance", action="store_true")
parser.add_argument("--list-instances", action="store_true")
parser.add_argument("--pytorch-only", action="store_true")
parser.add_argument("--keep-running", action="store_true")
parser.add_argument("--terminate-instances", action="store_true")
parser.add_argument("--instance-type", type=str, default="t4g.2xlarge")
parser.add_argument("--ebs-size", type=int, default=50)
parser.add_argument("--branch", type=str, default="main")
parser.add_argument("--use-docker", action="store_true")
parser.add_argument(
"--compiler",
type=str,
choices=["gcc-7", "gcc-8", "gcc-9", "clang"],
default="gcc-8",
)
parser.add_argument("--use-torch-from-pypi", action="store_true")
parser.add_argument("--pytorch-build-number", type=str, default=None)
parser.add_argument("--disable-mkldnn", action="store_true")
return parser.parse_args()
if __name__ == "__main__":
args = parse_arguments()
ami = (
args.ami
if args.ami is not None
else os_amis[args.os]
if args.os is not None
else ubuntu20_04_ami
)
keyfile_path, key_name = compute_keyfile_path(args.key_name)
if args.list_instances:
list_instances(args.instance_type)
sys.exit(0)
if args.terminate_instances:
terminate_instances(args.instance_type)
sys.exit(0)
if len(key_name) == 0:
raise RuntimeError("""
Cannot start build without key_name, please specify
--key-name argument or AWS_KEY_NAME environment variable.""")
if len(keyfile_path) == 0 or not os.path.exists(keyfile_path):
raise RuntimeError(f"""
Cannot find keyfile with name: [{key_name}] in path: [{keyfile_path}], please
check `~/.ssh/` folder or manually set SSH_KEY_PATH environment variable.""")
# Starting the instance
inst = start_instance(
key_name, ami=ami, instance_type=args.instance_type, ebs_size=args.ebs_size
)
instance_name = f"{args.key_name}-{args.os}"
if args.python_version is not None:
instance_name += f"-py{args.python_version}"
inst.create_tags(
DryRun=False,
Tags=[
{
"Key": "Name",
"Value": instance_name,
}
],
)
addr = inst.public_dns_name
wait_for_connection(addr, 22)
host = RemoteHost(addr, keyfile_path)
host.ami = ami
if args.use_docker:
update_apt_repo(host)
host.start_docker()
if args.test_only:
run_tests(host, args.test_only)
sys.exit(0)
if args.alloc_instance:
if args.python_version is None:
sys.exit(0)
install_condaforge_python(host, args.python_version)
sys.exit(0)
python_version = args.python_version if args.python_version is not None else "3.10"
if args.use_torch_from_pypi:
configure_system(host, compiler=args.compiler, python_version=python_version)
print("Installing PyTorch wheel")
host.run_cmd("pip3 install torch")
build_domains(
host, branch=args.branch, git_clone_flags=" --depth 1 --shallow-submodules"
)
else:
start_build(
host,
branch=args.branch,
compiler=args.compiler,
python_version=python_version,
pytorch_only=args.pytorch_only,
pytorch_build_number=args.pytorch_build_number,
enable_mkldnn=not args.disable_mkldnn,
)
if not args.keep_running:
print(f"Waiting for instance {inst.id} to terminate")
inst.terminate()
inst.wait_until_terminated()

View File

@ -1,87 +0,0 @@
#!/usr/bin/env python3
import os
import shutil
import sys
from subprocess import check_call
from tempfile import TemporaryDirectory
from auditwheel.elfutils import elf_file_filter
from auditwheel.lddtree import lddtree
from auditwheel.patcher import Patchelf
from auditwheel.repair import copylib
from auditwheel.wheeltools import InWheelCtx
def replace_tag(filename):
with open(filename) as f:
lines = f.read().split("\\n")
for i, line in enumerate(lines):
if not line.startswith("Tag: "):
continue
lines[i] = line.replace("-linux_", "-manylinux2014_")
print(f"Updated tag from {line} to {lines[i]}")
with open(filename, "w") as f:
f.write("\\n".join(lines))
class AlignedPatchelf(Patchelf):
def set_soname(self, file_name: str, new_soname: str) -> None:
check_call(
["patchelf", "--page-size", "65536", "--set-soname", new_soname, file_name]
)
def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
check_call(
[
"patchelf",
"--page-size",
"65536",
"--replace-needed",
soname,
new_soname,
file_name,
]
)
def embed_library(whl_path, lib_soname, update_tag=False):
patcher = AlignedPatchelf()
out_dir = TemporaryDirectory()
whl_name = os.path.basename(whl_path)
tmp_whl_name = os.path.join(out_dir.name, whl_name)
with InWheelCtx(whl_path) as ctx:
torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib")
ctx.out_wheel = tmp_whl_name
new_lib_path, new_lib_soname = None, None
for filename, _ in elf_file_filter(ctx.iter_files()):
if not filename.startswith("torch/lib"):
continue
libtree = lddtree(filename)
if lib_soname not in libtree["needed"]:
continue
lib_path = libtree["libs"][lib_soname]["path"]
if lib_path is None:
print(f"Can't embed {lib_soname} as it could not be found")
break
if lib_path.startswith(torchlib_path):
continue
if new_lib_path is None:
new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
patcher.replace_needed(filename, lib_soname, new_lib_soname)
print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}")
if update_tag:
# Add manylinux2014 tag
for filename in ctx.iter_files():
if os.path.basename(filename) != "WHEEL":
continue
replace_tag(filename)
shutil.move(tmp_whl_name, whl_path)
if __name__ == "__main__":
embed_library(
sys.argv[1], "libgomp.so.1", len(sys.argv) > 2 and sys.argv[2] == "--update-tag"
)

View File

@ -188,7 +188,7 @@ case "$tag" in
fi
GCC_VERSION=11
VISION=yes
ROCM_VERSION=7.1
ROCM_VERSION=7.0
NINJA_VERSION=1.9.0
TRITON=yes
KATEX=yes

View File

@ -60,16 +60,14 @@ EOF
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
fi
if [[ $(ver $ROCM_VERSION) -lt $(ver 7.1) ]]; then
# precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5, removed in ROCm 7.1
# search for all unversioned packages
# if search fails it will abort this script; use true to avoid case where search fails
MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
else
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
fi
# precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
# search for all unversioned packages
# if search fails it will abort this script; use true to avoid case where search fails
MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
else
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
fi
# ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime

View File

@ -12,8 +12,8 @@ function do_install() {
rocm_version_nodot=${rocm_version//./}
# https://github.com/icl-utk-edu/magma/pull/65
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
# post merge of https://github.com/icl-utk-edu/magma/pull/65
MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
rocm_dir="/opt/rocm"

View File

@ -4,14 +4,17 @@ set -ex
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
# Source the common build script for architecture-specific configurations (MKLDNN, ACL, etc.)
source "${SCRIPTPATH}/../pytorch/build.sh" || true
case "${GPU_ARCH_TYPE:-BLANK}" in
cuda)
cuda | cuda-aarch64)
bash "${SCRIPTPATH}/build_cuda.sh"
;;
rocm)
bash "${SCRIPTPATH}/build_rocm.sh"
;;
cpu | cpu-cxx11-abi | cpu-s390x)
cpu | cpu-cxx11-abi | cpu-aarch64 | cpu-s390x)
bash "${SCRIPTPATH}/build_cpu.sh"
;;
xpu)

View File

@ -18,12 +18,31 @@ retry () {
$* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}
# Detect architecture first
ARCH=$(uname -m)
echo "Detected architecture: $ARCH"
PLATFORM=""
# TODO move this into the Docker images
OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
retry yum install -q -y zip openssl
PLATFORM="manylinux_2_28_x86_64"
# Set platform based on architecture
case $ARCH in
x86_64)
PLATFORM="manylinux_2_28_x86_64"
;;
aarch64)
PLATFORM="manylinux_2_28_aarch64"
;;
s390x)
PLATFORM="manylinux_2_28_s390x"
;;
*)
echo "Unsupported architecture: $ARCH"
exit 1
;;
esac
elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
retry dnf install -q -y zip openssl
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
@ -38,6 +57,8 @@ else
exit 1
fi
echo "Platform set to: $PLATFORM"
# We use the package name to test the package by passing this to 'pip install'
# This is the env variable that setup.py uses to name the package. Note that
# pip 'normalizes' the name first by changing all - to _
@ -299,8 +320,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
# ROCm workaround for roctracer dlopens
if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
patchedpath=$(fname_without_so_number $destpath)
# Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load
elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
# Keep the so number for XPU dependencies, libgomp.so.1, ACL libraries, and NVPL libraries to avoid twice load
elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" || "$filename" == libarm_compute* || "$filename" == libnvpl* || "$filename" == "libgfortran.so.5" ]]; then
patchedpath=$destpath
else
patchedpath=$(fname_with_sha256 $destpath)
@ -346,9 +367,22 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
done
# create Manylinux 2_28 tag this needs to happen before regenerate the RECORD
if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
# Support all architectures (x86_64, aarch64, s390x)
if [[ "$IS_MANYLINUX2_28" == "1" && $GPU_ARCH_TYPE != "xpu" ]]; then
wheel_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/WHEEL/g')
sed -i -e s#linux_x86_64#"${PLATFORM}"# $wheel_file;
echo "Updating wheel tag for $ARCH architecture"
# Replace linux_* with manylinux_2_28_* based on architecture
case $ARCH in
x86_64)
sed -i -e 's#linux_x86_64#manylinux_2_28_x86_64#g' $wheel_file
;;
aarch64)
sed -i -e 's#linux_aarch64#manylinux_2_28_aarch64#g' $wheel_file
;;
s390x)
sed -i -e 's#linux_s390x#manylinux_2_28_s390x#g' $wheel_file
;;
esac
fi
# regenerate the RECORD file with new hashes

View File

@ -15,6 +15,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
EXTRA_CAFFE2_CMAKE_FLAGS=()
fi
# Detect architecture
ARCH=$(uname -m)
echo "Building CPU wheel for architecture: $ARCH"
WHEELHOUSE_DIR="wheelhousecpu"
LIBTORCH_HOUSE_DIR="libtorch_housecpu"
if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
@ -34,8 +38,10 @@ elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
if [[ "$(uname -m)" == "s390x" ]]; then
if [[ "$ARCH" == "s390x" ]]; then
LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
elif [[ "$ARCH" == "aarch64" ]]; then
LIBGOMP_PATH="/usr/lib/aarch64-linux-gnu/libgomp.so.1"
else
LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
fi
@ -49,6 +55,32 @@ DEPS_SONAME=(
"libgomp.so.1"
)
# Add ARM-specific library dependencies for CPU builds
if [[ "$ARCH" == "aarch64" ]]; then
echo "Adding ARM-specific CPU library dependencies"
# ARM Compute Library (if available)
if [[ -d "/acl/build" ]]; then
echo "Adding ARM Compute Library for CPU"
DEPS_LIST+=(
"/acl/build/libarm_compute.so"
"/acl/build/libarm_compute_graph.so"
)
DEPS_SONAME+=(
"libarm_compute.so"
"libarm_compute_graph.so"
)
fi
# ARM system libraries
DEPS_LIST+=(
"/usr/lib64/libgfortran.so.5"
)
DEPS_SONAME+=(
"libgfortran.so.5"
)
fi
rm -rf /usr/local/cuda*
SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"

View File

@ -29,6 +29,10 @@ if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
EXTRA_CAFFE2_CMAKE_FLAGS=()
fi
# Detect architecture
ARCH=$(uname -m)
echo "Building for architecture: $ARCH"
# Determine CUDA version and architectures to build for
#
# NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
@ -53,34 +57,60 @@ fi
cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
# Function to remove architectures from a list
remove_archs() {
local result="$1"
shift
for arch in "$@"; do
result="${result//${arch};/}"
done
echo "$result"
}
# Function to filter CUDA architectures for aarch64
# aarch64 ARM GPUs only support certain compute capabilities
# Keep: 8.0 (A100), 9.0+ (Hopper, Grace Hopper, newer)
# Remove: < 8.0 (no ARM GPUs), 8.6 (x86_64 RTX 3090/A6000 only)
filter_aarch64_archs() {
local arch_list="$1"
# Explicitly remove architectures not needed on aarch64
arch_list=$(remove_archs "$arch_list" "5.0" "6.0" "7.0" "7.5" "8.6")
echo "$arch_list"
}
# Base: Common architectures across all modern CUDA versions
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0"
case ${CUDA_VERSION} in
#removing sm_50-sm_60 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
#however we would like to keep sm_70 architecture see: https://github.com/pytorch/pytorch/issues/157517
12.8)
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0"
;;
12.9)
TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;9.0;10.0;12.0+PTX"
# WAR to resolve the ld error in libtorch build with CUDA 12.9
12.6) TORCH_CUDA_ARCH_LIST="5.0;6.0;${TORCH_CUDA_ARCH_LIST}" ;; # Only 12.6 includes Legacy Maxwell/Pascal that will be removed in future releases
12.8) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0" ;; # +Hopper/Blackwell support
12.9) TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};10.0;12.0+PTX" # +Hopper/Blackwell support + PTX for forward compatibility
if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//7.0;/}" # Remove 7.0 to resolve the ld error
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST//8.6;/}" # Remove 8.6 for libtorch
fi
;;
13.0)
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
;;
12.6)
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
;;
*)
echo "unknown cuda version $CUDA_VERSION"
exit 1
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;$([[ "$ARCH" == "aarch64" ]] && echo "11.0;" || echo "")12.0+PTX"
export TORCH_NVCC_FLAGS="-compress-mode=size"
export BUILD_BUNDLE_PTXAS=1
;;
*) echo "unknown cuda version $CUDA_VERSION"; exit 1 ;;
esac
# Filter for aarch64: Remove < 8.0 and 8.6
[[ "$ARCH" == "aarch64" ]] && TORCH_CUDA_ARCH_LIST=$(filter_aarch64_archs "$TORCH_CUDA_ARCH_LIST")
echo "TORCH_CUDA_ARCH_LIST set to: $TORCH_CUDA_ARCH_LIST"
export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
echo "${TORCH_CUDA_ARCH_LIST}"
# Disable MAGMA for aarch64 as pre-built libraries are x86-64 only
if [[ "$ARCH" == "aarch64" ]]; then
echo "Disabling MAGMA for aarch64 architecture"
export USE_MAGMA=0
fi
# Package directories
WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot"
LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot"
@ -244,6 +274,51 @@ else
exit 1
fi
# Add ARM-specific library dependencies
if [[ "$ARCH" == "aarch64" ]]; then
echo "Adding ARM-specific library dependencies"
# ARM Compute Library (if available)
if [[ -d "/acl/build" ]]; then
echo "Adding ARM Compute Library"
DEPS_LIST+=(
"/acl/build/libarm_compute.so"
"/acl/build/libarm_compute_graph.so"
)
DEPS_SONAME+=(
"libarm_compute.so"
"libarm_compute_graph.so"
)
fi
# ARM system libraries
DEPS_LIST+=(
"/lib64/libgomp.so.1"
"/usr/lib64/libgfortran.so.5"
)
DEPS_SONAME+=(
"libgomp.so.1"
"libgfortran.so.5"
)
# NVPL libraries (ARM optimized BLAS/LAPACK)
if [[ -d "/usr/local/lib" && -f "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0" ]]; then
echo "Adding NVPL libraries for ARM"
DEPS_LIST+=(
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0"
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0"
"/usr/local/lib/libnvpl_lapack_core.so.0"
"/usr/local/lib/libnvpl_blas_core.so.0"
)
DEPS_SONAME+=(
"libnvpl_lapack_lp64_gomp.so.0"
"libnvpl_blas_lp64_gomp.so.0"
"libnvpl_lapack_core.so.0"
"libnvpl_blas_core.so.0"
)
fi
fi
# run_tests.sh requires DESIRED_CUDA to know what tests to exclude
export DESIRED_CUDA="$cuda_version_nodot"
@ -251,9 +326,11 @@ export DESIRED_CUDA="$cuda_version_nodot"
rm -rf /usr/local/cuda || true
ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda
# Switch `/usr/local/magma` to the desired CUDA version
rm -rf /usr/local/magma || true
ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
# Switch `/usr/local/magma` to the desired CUDA version (skip for aarch64)
if [[ "$ARCH" != "aarch64" ]]; then
rm -rf /usr/local/magma || true
ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
fi
export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0

View File

@ -21,87 +21,3 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
fi
mkdir -p "$pytest_reports_dir" || true
##########################################
# copied from .ci/pytorch/common_utils.sh
##########################################
function get_pinned_commit() {
cat .github/ci_commit_pins/"${1}".txt
}
function pip_install_whl() {
# This is used to install PyTorch and other build artifacts wheel locally
# without using any network connection
# Convert the input arguments into an array
local args=("$@")
# Check if the first argument contains multiple paths separated by spaces
if [[ "${args[0]}" == *" "* ]]; then
# Split the string by spaces into an array
IFS=' ' read -r -a paths <<< "${args[0]}"
# Loop through each path and install individually
for path in "${paths[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
done
else
# Loop through each argument and install individually
for path in "${args[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
done
fi
}
function pip_build_and_install() {
local build_target=$1
local wheel_dir=$2
local found_whl=0
for file in "${wheel_dir}"/*.whl
do
if [[ -f "${file}" ]]; then
found_whl=1
break
fi
done
# Build the wheel if it doesn't exist
if [ "${found_whl}" == "0" ]; then
python3 -m pip wheel \
--no-build-isolation \
--no-deps \
-w "${wheel_dir}" \
"${build_target}"
fi
for file in "${wheel_dir}"/*.whl
do
pip_install_whl "${file}"
done
}
function install_torchvision() {
local orig_preload
local commit
commit=$(get_pinned_commit vision)
orig_preload=${LD_PRELOAD}
if [ -n "${LD_PRELOAD}" ]; then
# Silence dlerror to work-around glibc ASAN bug, see https://sourceware.org/bugzilla/show_bug.cgi?id=27653#c9
echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c -
LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so
fi
if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
# Not sure if both are needed, but why not
export FORCE_CUDA=1
export WITH_CUDA=1
fi
pip_build_and_install "git+https://github.com/pytorch/vision.git@${commit}" dist/vision
if [ -n "${LD_PRELOAD}" ]; then
LD_PRELOAD=${orig_preload}
fi
}

View File

@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace
if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# TODO: This can be removed later once vision is also part of the Docker image
install_torchvision
pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
# JIT C++ extensions require ninja, so put it into PATH.
export PATH="/var/lib/jenkins/.local/bin:$PATH"
# NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we

View File

@ -86,10 +86,20 @@ else
fi
fi
# Enable MKLDNN with ARM Compute Library for ARM builds
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
export USE_MKLDNN=1
# ACL is required for aarch64 builds
if [[ ! -d "/acl" ]]; then
echo "ERROR: ARM Compute Library not found at /acl"
echo "ACL is required for aarch64 builds. Check Docker image setup."
exit 1
fi
export USE_MKLDNN_ACL=1
export ACL_ROOT_DIR=/acl
echo "ARM Compute Library enabled for MKLDNN: ACL_ROOT_DIR=/acl"
fi
if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then

View File

@ -100,6 +100,337 @@ def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None:
)
def _compile_and_extract_symbols(
cpp_content: str, compile_flags: list[str], exclude_list: list[str] | None = None
) -> list[str]:
"""
Helper to compile a C++ file and extract all symbols.
Args:
cpp_content: C++ source code to compile
compile_flags: Compilation flags
exclude_list: List of symbol names to exclude. Defaults to ["main"].
Returns:
List of all symbols found in the object file (excluding those in exclude_list).
"""
import subprocess
import tempfile
if exclude_list is None:
exclude_list = ["main"]
with tempfile.TemporaryDirectory() as tmpdir:
tmppath = Path(tmpdir)
cpp_file = tmppath / "test.cpp"
obj_file = tmppath / "test.o"
cpp_file.write_text(cpp_content)
result = subprocess.run(
compile_flags + [str(cpp_file), "-o", str(obj_file)],
capture_output=True,
text=True,
timeout=60,
)
if result.returncode != 0:
raise RuntimeError(f"Compilation failed: {result.stderr}")
symbols = get_symbols(str(obj_file))
# Return all symbol names, excluding those in the exclude list
return [name for _addr, _stype, name in symbols if name not in exclude_list]
def check_stable_only_symbols(install_root: Path) -> None:
"""
Test TORCH_STABLE_ONLY and TORCH_TARGET_VERSION by compiling test code and comparing symbol counts.
This approach tests:
1. WITHOUT macros -> many torch symbols exposed
2. WITH TORCH_STABLE_ONLY -> zero torch symbols (all hidden)
3. WITH TORCH_TARGET_VERSION -> zero torch symbols (all hidden)
4. WITH both macros -> zero torch symbols (all hidden)
"""
include_dir = install_root / "include"
assert include_dir.exists(), f"Expected {include_dir} to be present"
test_cpp_content = """
// Main torch C++ API headers
#include <torch/torch.h>
#include <torch/all.h>
// ATen tensor library
#include <ATen/ATen.h>
// Core c10 headers (commonly used)
#include <c10/core/Device.h>
#include <c10/core/DeviceType.h>
#include <c10/core/ScalarType.h>
#include <c10/core/TensorOptions.h>
#include <c10/util/Optional.h>
int main() { return 0; }
"""
base_compile_flags = [
"g++",
"-std=c++17",
f"-I{include_dir}",
f"-I{include_dir}/torch/csrc/api/include",
"-c", # Compile only, don't link
]
# Compile WITHOUT any macros
symbols_without = _compile_and_extract_symbols(
cpp_content=test_cpp_content,
compile_flags=base_compile_flags,
)
# We expect constexpr symbols, inline functions used by other headers etc.
# to produce symbols
num_symbols_without = len(symbols_without)
print(f"Found {num_symbols_without} symbols without any macros defined")
assert num_symbols_without != 0, (
"Expected a non-zero number of symbols without any macros"
)
# Compile WITH TORCH_STABLE_ONLY (expect 0 symbols)
compile_flags_with_stable_only = base_compile_flags + ["-DTORCH_STABLE_ONLY"]
symbols_with_stable_only = _compile_and_extract_symbols(
cpp_content=test_cpp_content,
compile_flags=compile_flags_with_stable_only,
)
num_symbols_with_stable_only = len(symbols_with_stable_only)
assert num_symbols_with_stable_only == 0, (
f"Expected no symbols with TORCH_STABLE_ONLY macro, but found {num_symbols_with_stable_only}"
)
# Compile WITH TORCH_TARGET_VERSION (expect 0 symbols)
compile_flags_with_target_version = base_compile_flags + [
"-DTORCH_TARGET_VERSION=1"
]
symbols_with_target_version = _compile_and_extract_symbols(
cpp_content=test_cpp_content,
compile_flags=compile_flags_with_target_version,
)
num_symbols_with_target_version = len(symbols_with_target_version)
assert num_symbols_with_target_version == 0, (
f"Expected no symbols with TORCH_TARGET_VERSION macro, but found {num_symbols_with_target_version}"
)
# Compile WITH both macros (expect 0 symbols)
compile_flags_with_both = base_compile_flags + [
"-DTORCH_STABLE_ONLY",
"-DTORCH_TARGET_VERSION=1",
]
symbols_with_both = _compile_and_extract_symbols(
cpp_content=test_cpp_content,
compile_flags=compile_flags_with_both,
)
num_symbols_with_both = len(symbols_with_both)
assert num_symbols_with_both == 0, (
f"Expected no symbols with both macros, but found {num_symbols_with_both}"
)
def check_stable_api_symbols(install_root: Path) -> None:
"""
Test that stable API headers still expose symbols with TORCH_STABLE_ONLY.
The torch/csrc/stable/c/shim.h header is tested in check_stable_c_shim_symbols
"""
include_dir = install_root / "include"
assert include_dir.exists(), f"Expected {include_dir} to be present"
stable_dir = include_dir / "torch" / "csrc" / "stable"
assert stable_dir.exists(), f"Expected {stable_dir} to be present"
stable_headers = list(stable_dir.rglob("*.h"))
if not stable_headers:
raise RuntimeError("Could not find any stable headers")
includes = []
for header in stable_headers:
rel_path = header.relative_to(include_dir)
includes.append(f"#include <{rel_path.as_posix()}>")
includes_str = "\n".join(includes)
test_stable_content = f"""
{includes_str}
int main() {{ return 0; }}
"""
compile_flags = [
"g++",
"-std=c++17",
f"-I{include_dir}",
f"-I{include_dir}/torch/csrc/api/include",
"-c",
"-DTORCH_STABLE_ONLY",
]
symbols_stable = _compile_and_extract_symbols(
cpp_content=test_stable_content,
compile_flags=compile_flags,
)
num_symbols_stable = len(symbols_stable)
print(f"Found {num_symbols_stable} symbols in torch/csrc/stable")
assert num_symbols_stable > 0, (
f"Expected stable headers to expose symbols with TORCH_STABLE_ONLY, "
f"but found {num_symbols_stable} symbols"
)
def check_headeronly_symbols(install_root: Path) -> None:
"""
Test that header-only utility headers still expose symbols with TORCH_STABLE_ONLY.
"""
include_dir = install_root / "include"
assert include_dir.exists(), f"Expected {include_dir} to be present"
# Find all headers in torch/headeronly
headeronly_dir = include_dir / "torch" / "headeronly"
assert headeronly_dir.exists(), f"Expected {headeronly_dir} to be present"
headeronly_headers = list(headeronly_dir.rglob("*.h"))
if not headeronly_headers:
raise RuntimeError("Could not find any headeronly headers")
# Filter out platform-specific headers that may not compile everywhere
platform_specific_keywords = [
"cpu/vec",
]
filtered_headers = []
for header in headeronly_headers:
rel_path = header.relative_to(include_dir).as_posix()
if not any(
keyword in rel_path.lower() for keyword in platform_specific_keywords
):
filtered_headers.append(header)
includes = []
for header in filtered_headers:
rel_path = header.relative_to(include_dir)
includes.append(f"#include <{rel_path.as_posix()}>")
includes_str = "\n".join(includes)
test_headeronly_content = f"""
{includes_str}
int main() {{ return 0; }}
"""
compile_flags = [
"g++",
"-std=c++17",
f"-I{include_dir}",
f"-I{include_dir}/torch/csrc/api/include",
"-c",
"-DTORCH_STABLE_ONLY",
]
symbols_headeronly = _compile_and_extract_symbols(
cpp_content=test_headeronly_content,
compile_flags=compile_flags,
)
num_symbols_headeronly = len(symbols_headeronly)
print(f"Found {num_symbols_headeronly} symbols in torch/headeronly")
assert num_symbols_headeronly > 0, (
f"Expected headeronly headers to expose symbols with TORCH_STABLE_ONLY, "
f"but found {num_symbols_headeronly} symbols"
)
def check_aoti_shim_symbols(install_root: Path) -> None:
"""
Test that AOTI shim headers still expose symbols with TORCH_STABLE_ONLY.
"""
include_dir = install_root / "include"
assert include_dir.exists(), f"Expected {include_dir} to be present"
# There are no constexpr symbols etc., so we need to actually use functions
# so that some symbols are found.
test_shim_content = """
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
int main() {
int32_t (*fp1)() = &aoti_torch_device_type_cpu;
int32_t (*fp2)() = &aoti_torch_dtype_float32;
(void)fp1; (void)fp2;
return 0;
}
"""
compile_flags = [
"g++",
"-std=c++17",
f"-I{include_dir}",
f"-I{include_dir}/torch/csrc/api/include",
"-c",
"-DTORCH_STABLE_ONLY",
]
symbols_shim = _compile_and_extract_symbols(
cpp_content=test_shim_content,
compile_flags=compile_flags,
)
num_symbols_shim = len(symbols_shim)
assert num_symbols_shim > 0, (
f"Expected shim headers to expose symbols with TORCH_STABLE_ONLY, "
f"but found {num_symbols_shim} symbols"
)
def check_stable_c_shim_symbols(install_root: Path) -> None:
"""
Test that stable C shim headers still expose symbols with TORCH_STABLE_ONLY.
"""
include_dir = install_root / "include"
assert include_dir.exists(), f"Expected {include_dir} to be present"
# Check if the stable C shim exists
stable_shim = include_dir / "torch" / "csrc" / "stable" / "c" / "shim.h"
if not stable_shim.exists():
raise RuntimeError("Could not find stable c shim")
# There are no constexpr symbols etc., so we need to actually use functions
# so that some symbols are found.
test_stable_shim_content = """
#include <torch/csrc/stable/c/shim.h>
int main() {
// Reference stable C API functions to create undefined symbols
AOTITorchError (*fp1)(const char*, uint32_t*, int32_t*) = &torch_parse_device_string;
AOTITorchError (*fp2)(uint32_t*) = &torch_get_num_threads;
(void)fp1; (void)fp2;
return 0;
}
"""
compile_flags = [
"g++",
"-std=c++17",
f"-I{include_dir}",
f"-I{include_dir}/torch/csrc/api/include",
"-c",
"-DTORCH_STABLE_ONLY",
]
symbols_stable_shim = _compile_and_extract_symbols(
cpp_content=test_stable_shim_content,
compile_flags=compile_flags,
)
num_symbols_stable_shim = len(symbols_stable_shim)
assert num_symbols_stable_shim > 0, (
f"Expected stable C shim headers to expose symbols with TORCH_STABLE_ONLY, "
f"but found {num_symbols_stable_shim} symbols"
)
def check_lib_symbols_for_abi_correctness(lib: str) -> None:
print(f"lib: {lib}")
cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS)
@ -129,6 +460,13 @@ def main() -> None:
check_lib_symbols_for_abi_correctness(libtorch_cpu_path)
check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path)
# Check symbols when TORCH_STABLE_ONLY is defined
check_stable_only_symbols(install_root)
check_stable_api_symbols(install_root)
check_headeronly_symbols(install_root)
check_aoti_shim_symbols(install_root)
check_stable_c_shim_symbols(install_root)
if __name__ == "__main__":
main()

View File

@ -389,13 +389,6 @@ test_lazy_tensor_meta_reference_disabled() {
export -n TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE
}
test_dynamo_core() {
time python test/run_test.py \
--include-dynamo-core-tests \
--verbose \
--upload-artifacts-while-running
assert_git_not_dirty
}
test_dynamo_wrapped_shard() {
if [[ -z "$NUM_TEST_SHARDS" ]]; then
@ -1250,97 +1243,6 @@ test_custom_script_ops() {
assert_git_not_dirty
}
test_libtorch_agnostic_targetting() {
echo "Testing libtorch_agnostic runs correctly on TORCH_TARGET_VERSION"
REPO_DIR=$(pwd)
WHEEL_DIR="${REPO_DIR}/test/cpp_extensions/.wheels"
# Build wheel with current PyTorch (this has TORCH_TARGET_VERSION 2_9_0)
echo "Building 2.9 extension wheel with current PyTorch..."
pushd test/cpp_extensions/libtorch_agnostic_2_9_extension
time python setup.py bdist_wheel
# Save the wheel
mkdir -p "$WHEEL_DIR"
cp dist/*.whl "$WHEEL_DIR/"
WHEEL_FILE=$(find "$WHEEL_DIR" -maxdepth 1 -name "*.whl" -type f | head -1)
echo "Built wheel: $(basename "$WHEEL_FILE")"
popd
# Create venv and install PyTorch 2.9
python -m venv venv_pytorch_2_9
# shellcheck disable=SC1091
. venv_pytorch_2_9/bin/activate
# Clear PYTHONPATH to avoid using the development PyTorch
echo "Clearing PYTHONPATH to use only venv packages..."
unset PYTHONPATH
# Upgrade pip to latest version
echo "Upgrading pip to latest version..."
pip install --upgrade pip
pip --version
echo "Installing PyTorch 2.9..."
# Install from release channel only
PYTORCH_VERSION="2.9.0"
# Extract CUDA version from BUILD_ENVIRONMENT (e.g., "cuda12.1" -> "cu121")
if [[ "$BUILD_ENVIRONMENT" =~ cuda([0-9]+)\.([0-9]+) ]]; then
CUDA_MAJOR="${BASH_REMATCH[1]}"
CUDA_MINOR="${BASH_REMATCH[2]}"
CUDA_VERSION="cu${CUDA_MAJOR}${CUDA_MINOR}"
echo " Detected CUDA ${CUDA_MAJOR}.${CUDA_MINOR} from BUILD_ENVIRONMENT, using ${CUDA_VERSION}"
else
# Default to CPU build
CUDA_VERSION="cpu"
echo " No CUDA detected in BUILD_ENVIRONMENT, using CPU build"
fi
if pip install torch=="${PYTORCH_VERSION}" --index-url https://download.pytorch.org/whl/${CUDA_VERSION}/; then
echo "Installed PyTorch ${PYTORCH_VERSION} from release channel (${CUDA_VERSION})"
else
echo " FAILED to install PyTorch 2.9.0 from release channel"
echo " URL: https://download.pytorch.org/whl/${CUDA_VERSION}/"
deactivate
rm -rf venv_pytorch_2_9
return 1
fi
INSTALLED_VERSION=$(python -c "import torch; print(torch.__version__)" 2>/dev/null || echo "unknown")
echo " Installed version: $INSTALLED_VERSION"
# Install test dependencies
echo "Installing test dependencies..."
pip install expecttest numpy unittest-xml-reporting
# Install the pre-built wheel
echo ""
echo "Installing pre-built 2.9 extension wheel (built with PyTorch 2.10)..."
pip install "$WHEEL_FILE"
echo "Installed $(basename "$WHEEL_FILE") into PyTorch 2.9 environment"
# Run tests with PyTorch 2.9 runtime (2.10 tests will be skipped automatically)
echo ""
echo "Running tests with PyTorch 2.9 runtime (using wheel built on PyTorch 2.10)..."
if time python test/cpp_extensions/test_libtorch_agnostic.py -v; then
echo ""
echo " Wheel built with current torch and TORCH_TARGET_VERSION 2_9_0 works with PyTorch 2.9 runtime!"
else
echo "targeting test failed"
deactivate
rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
return 1
fi
deactivate
rm -rf venv_pytorch_2_9 "$WHEEL_DIR"
assert_git_not_dirty
}
test_jit_hooks() {
echo "Testing jit hooks in cpp"
HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
@ -1813,8 +1715,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]];
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
test_forward_backward_compatibility
# Do NOT add tests after bc check tests, see its comment.
elif [[ "${TEST_CONFIG}" == *libtorch_agnostic_targetting* ]]; then
test_libtorch_agnostic_targetting
elif [[ "${TEST_CONFIG}" == *xla* ]]; then
install_torchvision
build_xla
@ -1914,8 +1814,6 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
test_inductor_shard "${SHARD_NUMBER}"
elif [[ "${TEST_CONFIG}" == *einops* ]]; then
test_einops
elif [[ "${TEST_CONFIG}" == *dynamo_core* ]]; then
test_dynamo_core
elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
install_torchvision
test_dynamo_wrapped_shard "${SHARD_NUMBER}"

View File

@ -1 +1 @@
ee1a1350eb37804b94334768f328144f058f14e9
07b6cbde121417a70e4dc871adb6d27030e0ce3f

View File

@ -1 +1 @@
2d82dc5caa336d179d9b46ac4a0fb8c43d84c5cc
acccf86477759b2d3500f1ae1be065f7b1e409ec

View File

@ -1 +1 @@
94631807d22c09723dd006f7be5beb649d5f88d0
e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a

7
.github/labeler.yml vendored
View File

@ -91,6 +91,13 @@
"ciflow/trunk":
- .ci/docker/ci_commit_pins/triton.txt
"oncall: distributed":
- torch/csrc/distributed/**
- torch/distributed/**
- torch/nn/parallel/**
- test/distributed/**
- torch/testing/_internal/distributed/**
"release notes: distributed (checkpoint)":
- torch/distributed/checkpoint/**
- test/distributed/checkpoint/**

View File

@ -7,7 +7,6 @@ ciflow_push_tags:
- ciflow/binaries
- ciflow/binaries_libtorch
- ciflow/binaries_wheel
- ciflow/dynamo
- ciflow/h100
- ciflow/h100-cutlass-backend
- ciflow/h100-distributed

View File

@ -260,11 +260,8 @@ jobs:
"${DOCKER_IMAGE}"
)
docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
else
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
fi
# Unified build script for all architectures (x86_64, aarch64, s390x)
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
- name: Chown artifacts
if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' && inputs.build_environment != 'linux-s390x-binary-manywheel' }}

View File

@ -326,7 +326,7 @@ jobs:
SCCACHE_BUCKET: ${{ !contains(matrix.runner, 'b200') && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ !contains(matrix.runner, 'b200') && 'us-east-1' || '' }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}

View File

@ -1,70 +0,0 @@
# Workflow: Dynamo Unit Test
# runs unit tests for dynamo.
name: dynamo-unittest
on:
push:
tags:
- ciflow/dynamo/*
workflow_call:
schedule:
- cron: 29 8 * * * # about 1:29am PDT
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
dynamo-build:
name: dynamo-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
strategy:
matrix:
python-version: ['3.11', '3.12']
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py${{ matrix.python-version }}-clang12
docker-image-name: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12
test-matrix: |
{ include: [
{ config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
]}
secrets: inherit
dynamo-test:
name: dynamo-test
uses: ./.github/workflows/_linux-test.yml
needs: [get-label-type, dynamo-build]
strategy:
matrix:
python-version: ['3.11', '3.12']
with:
build-environment: linux-jammy-py${{ matrix.python-version }}-clang12
docker-image: ci-image:pytorch-linux-jammy-py${{ matrix.python-version }}-clang12
test-matrix: |
{ include: [
{ config: "dynamo_core", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
]}
secrets: inherit

View File

@ -70,7 +70,6 @@ jobs:
{ config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" },
{ config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit

View File

@ -83,7 +83,6 @@ jobs:
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.12xlarge.nvidia.gpu" },
{ config: "pr_time_benchmarks", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
{ config: "libtorch_agnostic_targetting", shard: 1, num_shards: 1, runner: "linux.g4dn.metal.nvidia.gpu" },
]}
secrets: inherit

View File

@ -144,7 +144,7 @@ inline std::bitset<kVmapNumLevels> createVmapLevelsBitset(BatchDimsRef bdims) {
}
inline std::ostream& operator<<(std::ostream& out, const BatchDim& bdim) {
out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ')';
out << "(lvl=" << bdim.level() << ", dim=" << bdim.dim() << ")";
return out;
}

View File

@ -9,7 +9,7 @@ namespace indexing {
const EllipsisIndexType Ellipsis = EllipsisIndexType();
std::ostream& operator<<(std::ostream& stream, const Slice& slice) {
stream << slice.start() << ':' << slice.stop() << ':' << slice.step();
stream << slice.start() << ":" << slice.stop() << ":" << slice.step();
return stream;
}
@ -31,12 +31,12 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index)
}
std::ostream& operator<<(std::ostream& stream, const std::vector<TensorIndex>& tensor_indices) {
stream << '(';
stream << "(";
for (const auto i : c10::irange(tensor_indices.size())) {
stream << tensor_indices[i];
if (i < tensor_indices.size() - 1) stream << ", ";
}
stream << ')';
stream << ")";
return stream;
}

View File

@ -113,7 +113,7 @@ void TensorNames::checkUnique(const char* op_name) const {
std::ostream& operator<<(std::ostream& out, const TensorName& tensorname) {
out << tensorname.name_ << " (index ";
out << tensorname.origin_idx_ << " of ";
out << tensorname.origin_ << ')';
out << tensorname.origin_ << ")";
return out;
}

View File

@ -13,9 +13,9 @@ std::ostream& operator<<(std::ostream & out, const TensorGeometryArg& t) {
if (t.pos == 0) {
// 0 is distinguished; it usually indicates 'self' or the return
// tensor
out << '\'' << t.name << '\'';
out << "'" << t.name << "'";
} else {
out << "argument #" << t.pos << " '" << t.name << '\'';
out << "argument #" << t.pos << " '" << t.name << "'";
}
return out;
}
@ -154,7 +154,7 @@ void checkSameGPU(CheckedFrom c, const TensorArg& t1, const TensorArg& t2) {
oss << "Tensor for " << t2 << " is on CPU, ";
}
oss << "but expected " << ((!t1->is_cpu() && !t2->is_cpu()) ? "them" : "it")
<< " to be on GPU (while checking arguments for " << c << ')';
<< " to be on GPU (while checking arguments for " << c << ")";
TORCH_CHECK(false, oss.str());
}
TORCH_CHECK(
@ -199,7 +199,7 @@ void checkScalarTypes(CheckedFrom c, const TensorArg& t,
i++;
}
oss << "; but got " << t->toString()
<< " instead (while checking arguments for " << c << ')';
<< " instead (while checking arguments for " << c << ")";
TORCH_CHECK(false, oss.str());
}
}

View File

@ -43,8 +43,8 @@ std::string get_mkldnn_version() {
// https://github.com/intel/ideep/issues/29
{
const dnnl_version_t* ver = dnnl_version();
ss << "Intel(R) MKL-DNN v" << ver->major << '.' << ver->minor << '.' << ver->patch
<< " (Git Hash " << ver->hash << ')';
ss << "Intel(R) MKL-DNN v" << ver->major << "." << ver->minor << "." << ver->patch
<< " (Git Hash " << ver->hash << ")";
}
#else
ss << "MKLDNN not found";
@ -81,7 +81,7 @@ std::string get_openmp_version() {
break;
}
if (ver_str) {
ss << " (a.k.a. OpenMP " << ver_str << ')';
ss << " (a.k.a. OpenMP " << ver_str << ")";
}
}
#else
@ -135,38 +135,38 @@ std::string show_config() {
#if defined(__GNUC__)
{
ss << " - GCC " << __GNUC__ << '.' << __GNUC_MINOR__ << '\n';
ss << " - GCC " << __GNUC__ << "." << __GNUC_MINOR__ << "\n";
}
#endif
#if defined(__cplusplus)
{
ss << " - C++ Version: " << __cplusplus << '\n';
ss << " - C++ Version: " << __cplusplus << "\n";
}
#endif
#if defined(__clang_major__)
{
ss << " - clang " << __clang_major__ << '.' << __clang_minor__ << '.' << __clang_patchlevel__ << '\n';
ss << " - clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__ << "\n";
}
#endif
#if defined(_MSC_VER)
{
ss << " - MSVC " << _MSC_FULL_VER << '\n';
ss << " - MSVC " << _MSC_FULL_VER << "\n";
}
#endif
#if AT_MKL_ENABLED()
ss << " - " << get_mkl_version() << '\n';
ss << " - " << get_mkl_version() << "\n";
#endif
#if AT_MKLDNN_ENABLED()
ss << " - " << get_mkldnn_version() << '\n';
ss << " - " << get_mkldnn_version() << "\n";
#endif
#ifdef _OPENMP
ss << " - " << get_openmp_version() << '\n';
ss << " - " << get_openmp_version() << "\n";
#endif
#if AT_BUILD_WITH_LAPACK()
@ -183,7 +183,7 @@ std::string show_config() {
ss << " - Cross compiling on MacOSX\n";
#endif
ss << " - "<< used_cpu_capability() << '\n';
ss << " - "<< used_cpu_capability() << "\n";
if (hasCUDA()) {
ss << detail::getCUDAHooks().showConfig();
@ -200,10 +200,10 @@ std::string show_config() {
ss << " - Build settings: ";
for (const auto& pair : caffe2::GetBuildOptions()) {
if (!pair.second.empty()) {
ss << pair.first << '=' << pair.second << ", ";
ss << pair.first << "=" << pair.second << ", ";
}
}
ss << '\n';
ss << "\n";
// TODO: do HIP
// TODO: do XLA

View File

@ -209,7 +209,7 @@ struct CodeTemplate {
// to indent correctly in the context.
void emitIndent(std::ostream& out, size_t indent) const {
for ([[maybe_unused]] const auto i : c10::irange(indent)) {
out << ' ';
out << " ";
}
}
void emitStringWithIndents(

View File

@ -10,7 +10,7 @@ std::ostream& operator<<(std::ostream& out, const Dimname& dimname) {
if (dimname.type() == NameType::WILDCARD) {
out << "None";
} else {
out << '\'' << dimname.symbol().toUnqualString() << '\'';
out << "'" << dimname.symbol().toUnqualString() << "'";
}
return out;
}

View File

@ -5,7 +5,7 @@
namespace at {
std::ostream& operator<<(std::ostream& out, const Range& range) {
out << "Range[" << range.begin << ", " << range.end << ']';
out << "Range[" << range.begin << ", " << range.end << "]";
return out;
}

View File

@ -71,7 +71,7 @@ void TensorBase::enforce_invariants() {
void TensorBase::print() const {
if (defined()) {
std::cerr << '[' << toString() << ' ' << sizes() << ']' << '\n';
std::cerr << "[" << toString() << " " << sizes() << "]" << '\n';
} else {
std::cerr << "[UndefinedTensor]" << '\n';
}

View File

@ -1,6 +1,5 @@
#pragma once
#include <torch/headeronly/core/TensorAccessor.h>
#include <c10/macros/Macros.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/Deprecated.h>
@ -12,37 +11,252 @@
namespace at {
using torch::headeronly::DefaultPtrTraits;
// The PtrTraits argument to the TensorAccessor/GenericPackedTensorAccessor
// is used to enable the __restrict__ keyword/modifier for the data
// passed to cuda.
template <typename T>
struct DefaultPtrTraits {
typedef T* PtrType;
};
#if defined(__CUDACC__) || defined(__HIPCC__)
using torch::headeronly::RestrictPtrTraits;
template <typename T>
struct RestrictPtrTraits {
typedef T* __restrict__ PtrType;
};
#endif
// TensorAccessorBase and TensorAccessor are used for both CPU and CUDA tensors.
// For CUDA tensors it is used in device code (only). This means that we restrict ourselves
// to functions and types available there (e.g. IntArrayRef isn't).
// The PtrTraits argument is only relevant to cuda to support `__restrict__` pointers.
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
using TensorAccessorBase = torch::headeronly::detail::TensorAccessorBase<c10::IntArrayRef, T, N, PtrTraits, index_t>;
class TensorAccessorBase {
public:
typedef typename PtrTraits<T>::PtrType PtrType;
C10_HOST_DEVICE TensorAccessorBase(
PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: data_(data_), sizes_(sizes_), strides_(strides_) {}
C10_HOST IntArrayRef sizes() const {
return IntArrayRef(sizes_,N);
}
C10_HOST IntArrayRef strides() const {
return IntArrayRef(strides_,N);
}
C10_HOST_DEVICE index_t stride(index_t i) const {
return strides_[i];
}
C10_HOST_DEVICE index_t size(index_t i) const {
return sizes_[i];
}
C10_HOST_DEVICE PtrType data() {
return data_;
}
C10_HOST_DEVICE const PtrType data() const {
return data_;
}
protected:
PtrType data_;
const index_t* sizes_;
const index_t* strides_;
};
// The `TensorAccessor` is typically instantiated for CPU `Tensor`s using
// `Tensor.accessor<T, N>()`.
// For CUDA `Tensor`s, `GenericPackedTensorAccessor` is used on the host and only
// indexing on the device uses `TensorAccessor`s.
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
using TensorAccessor = torch::headeronly::detail::TensorAccessor<c10::IntArrayRef, T, N, PtrTraits, index_t>;
class TensorAccessor : public TensorAccessorBase<T,N,PtrTraits,index_t> {
public:
typedef typename PtrTraits<T>::PtrType PtrType;
namespace detail {
C10_HOST_DEVICE TensorAccessor(
PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: TensorAccessorBase<T, N, PtrTraits, index_t>(data_,sizes_,strides_) {}
template <size_t N, typename index_t>
struct IndexBoundsCheck {
IndexBoundsCheck(index_t i) {
TORCH_CHECK_INDEX(
C10_HOST_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
}
C10_HOST_DEVICE const TensorAccessor<T, N-1, PtrTraits, index_t> operator[](index_t i) const {
return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i,this->sizes_+1,this->strides_+1);
}
};
template<typename T, template <typename U> class PtrTraits, typename index_t>
class TensorAccessor<T,1,PtrTraits,index_t> : public TensorAccessorBase<T,1,PtrTraits,index_t> {
public:
typedef typename PtrTraits<T>::PtrType PtrType;
C10_HOST_DEVICE TensorAccessor(
PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: TensorAccessorBase<T, 1, PtrTraits, index_t>(data_,sizes_,strides_) {}
C10_HOST_DEVICE T & operator[](index_t i) {
// NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
return this->data_[this->strides_[0]*i];
}
C10_HOST_DEVICE const T & operator[](index_t i) const {
return this->data_[this->strides_[0]*i];
}
};
// GenericPackedTensorAccessorBase and GenericPackedTensorAccessor are used on for CUDA `Tensor`s on the host
// and as
// In contrast to `TensorAccessor`s, they copy the strides and sizes on instantiation (on the host)
// in order to transfer them on the device when calling kernels.
// On the device, indexing of multidimensional tensors gives to `TensorAccessor`s.
// Use RestrictPtrTraits as PtrTraits if you want the tensor's data pointer to be marked as __restrict__.
// Instantiation from data, sizes, strides is only needed on the host and std::copy isn't available
// on the device, so those functions are host only.
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
class GenericPackedTensorAccessorBase {
public:
typedef typename PtrTraits<T>::PtrType PtrType;
C10_HOST GenericPackedTensorAccessorBase(
PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: data_(data_) {
std::copy(sizes_, sizes_ + N, std::begin(this->sizes_));
std::copy(strides_, strides_ + N, std::begin(this->strides_));
}
// if index_t is not int64_t, we want to have an int64_t constructor
template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
C10_HOST GenericPackedTensorAccessorBase(
PtrType data_,
const source_index_t* sizes_,
const source_index_t* strides_)
: data_(data_) {
for (const auto i : c10::irange(N)) {
this->sizes_[i] = sizes_[i];
this->strides_[i] = strides_[i];
}
}
C10_HOST_DEVICE index_t stride(index_t i) const {
return strides_[i];
}
C10_HOST_DEVICE index_t size(index_t i) const {
return sizes_[i];
}
C10_HOST_DEVICE PtrType data() {
return data_;
}
C10_HOST_DEVICE const PtrType data() const {
return data_;
}
protected:
PtrType data_;
// NOLINTNEXTLINE(*c-arrays*)
index_t sizes_[N];
// NOLINTNEXTLINE(*c-arrays*)
index_t strides_[N];
C10_HOST void bounds_check_(index_t i) const {
TORCH_CHECK_INDEX(
0 <= i && i < index_t{N},
"Index ",
i,
" is not within bounds of a tensor of dimension ",
N);
}
}
};
} // namespace detail
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
using GenericPackedTensorAccessorBase = torch::headeronly::detail::GenericPackedTensorAccessorBase<detail::IndexBoundsCheck<N, index_t>, T, N, PtrTraits, index_t>;
class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase<T,N,PtrTraits,index_t> {
public:
typedef typename PtrTraits<T>::PtrType PtrType;
C10_HOST GenericPackedTensorAccessor(
PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}
// if index_t is not int64_t, we want to have an int64_t constructor
template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
C10_HOST GenericPackedTensorAccessor(
PtrType data_,
const source_index_t* sizes_,
const source_index_t* strides_)
: GenericPackedTensorAccessorBase<T, N, PtrTraits, index_t>(data_, sizes_, strides_) {}
C10_DEVICE TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) {
index_t* new_sizes = this->sizes_ + 1;
index_t* new_strides = this->strides_ + 1;
return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
}
C10_DEVICE const TensorAccessor<T, N - 1, PtrTraits, index_t> operator[](index_t i) const {
const index_t* new_sizes = this->sizes_ + 1;
const index_t* new_strides = this->strides_ + 1;
return TensorAccessor<T,N-1,PtrTraits,index_t>(this->data_ + this->strides_[0]*i, new_sizes, new_strides);
}
/// Returns a PackedTensorAccessor of the same dimension after transposing the
/// two dimensions given. Does not actually move elements; transposition is
/// made by permuting the size/stride arrays. If the dimensions are not valid,
/// asserts.
C10_HOST GenericPackedTensorAccessor<T, N, PtrTraits, index_t> transpose(
index_t dim1,
index_t dim2) const {
this->bounds_check_(dim1);
this->bounds_check_(dim2);
GenericPackedTensorAccessor<T, N, PtrTraits, index_t> result(
this->data_, this->sizes_, this->strides_);
std::swap(result.strides_[dim1], result.strides_[dim2]);
std::swap(result.sizes_[dim1], result.sizes_[dim2]);
return result;
}
};
template<typename T, template <typename U> class PtrTraits, typename index_t>
class GenericPackedTensorAccessor<T,1,PtrTraits,index_t> : public GenericPackedTensorAccessorBase<T,1,PtrTraits,index_t> {
public:
typedef typename PtrTraits<T>::PtrType PtrType;
C10_HOST GenericPackedTensorAccessor(
PtrType data_,
const index_t* sizes_,
const index_t* strides_)
: GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
// if index_t is not int64_t, we want to have an int64_t constructor
template <typename source_index_t, class = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
C10_HOST GenericPackedTensorAccessor(
PtrType data_,
const source_index_t* sizes_,
const source_index_t* strides_)
: GenericPackedTensorAccessorBase<T, 1, PtrTraits, index_t>(data_, sizes_, strides_) {}
C10_DEVICE T & operator[](index_t i) {
return this->data_[this->strides_[0] * i];
}
C10_DEVICE const T& operator[](index_t i) const {
return this->data_[this->strides_[0]*i];
}
// Same as in the general N-dimensional case, but note that in the
// 1-dimensional case the returned PackedTensorAccessor will always be an
// identical copy of the original
C10_HOST GenericPackedTensorAccessor<T, 1, PtrTraits, index_t> transpose(
index_t dim1,
index_t dim2) const {
this->bounds_check_(dim1);
this->bounds_check_(dim2);
return GenericPackedTensorAccessor<T, 1, PtrTraits, index_t>(
this->data_, this->sizes_, this->strides_);
}
};
template<typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
using GenericPackedTensorAccessor = torch::headeronly::detail::GenericPackedTensorAccessor<TensorAccessor<T, N-1, PtrTraits, index_t>, detail::IndexBoundsCheck<N, index_t>, T, N, PtrTraits, index_t>;
// Can't put this directly into the macro function args because of commas
#define AT_X GenericPackedTensorAccessor<T, N, PtrTraits, index_t>

View File

@ -245,9 +245,6 @@ class TORCH_API TensorBase {
size_t weak_use_count() const noexcept {
return impl_.weak_use_count();
}
bool is_uniquely_owned() const noexcept {
return impl_.is_uniquely_owned();
}
std::string toString() const;

View File

@ -9,8 +9,8 @@ APIVitals VitalsAPI;
std::ostream& operator<<(std::ostream& os, TorchVital const& tv) {
for (const auto& m : tv.attrs) {
os << "[TORCH_VITAL] " << tv.name << '.' << m.first << "\t\t "
<< m.second.value << '\n';
os << "[TORCH_VITAL] " << tv.name << "." << m.first << "\t\t "
<< m.second.value << "\n";
}
return os;
}

View File

@ -100,18 +100,18 @@ inline bool operator==(const AliasInfo& lhs, const AliasInfo& rhs) {
// this does match the way things are represented in the schema
inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
out << '(';
out << "(";
bool first = true;
for (const auto& set : aliasInfo.beforeSets()) {
if (first) {
first = false;
} else {
out << '|';
out << "|";
}
out << set.toUnqualString();
}
if (aliasInfo.isWrite()) {
out << '!';
out << "!";
}
if (aliasInfo.beforeSets() != aliasInfo.afterSets()) {
out << " -> ";
@ -120,12 +120,12 @@ inline std::ostream& operator<<(std::ostream& out, const AliasInfo& aliasInfo) {
if (first) {
first = false;
} else {
out << '|';
out << "|";
}
out << set.toUnqualString();
}
}
out << ')';
out << ")";
return out;
}
} // namespace c10

View File

@ -198,7 +198,7 @@ inline void swap(Blob& lhs, Blob& rhs) noexcept {
}
inline std::ostream& operator<<(std::ostream& out, const Blob& v) {
return out << "Blob[" << v.TypeName() << ']';
return out << "Blob[" << v.TypeName() << "]";
}
} // namespace caffe2

View File

@ -456,8 +456,8 @@ bool ClassType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const {
*why_not << "Method on class '" << repr_str()
<< "' (1) is not compatible with interface '"
<< rhs.repr_str() << "' (2)\n"
<< " (1) " << self_method->getSchema() << '\n'
<< " (2) " << schema << '\n';
<< " (1) " << self_method->getSchema() << "\n"
<< " (2) " << schema << "\n";
}
return false;
}

View File

@ -100,7 +100,7 @@ struct TORCH_API ClassType : public NamedType {
std::string repr_str() const override {
std::stringstream ss;
ss << str()
<< " (of Python compilation unit at: " << compilation_unit().get() << ')';
<< " (of Python compilation unit at: " << compilation_unit().get() << ")";
return ss.str();
}

View File

@ -58,12 +58,12 @@ std::string DispatchKeyExtractor::dumpState() const {
std::ostringstream oss;
for (const auto i : c10::irange(c10::utils::bitset::NUM_BITS())) {
if (dispatch_arg_indices_reverse_.get(i)) {
oss << '1';
oss << "1";
} else {
oss << '0';
oss << "0";
}
}
oss << ' ' << nonFallthroughKeys_ << '\n';
oss << " " << nonFallthroughKeys_ << "\n";
return oss.str();
}

View File

@ -69,8 +69,8 @@ private:
void _print_dispatch_trace(const std::string& label, const std::string& op_name, const DispatchKeySet& dispatchKeySet) {
auto nesting_value = dispatch_trace_nesting_value();
for (int64_t i = 0; i < nesting_value; ++i) std::cerr << ' ';
std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << ']' << std::endl;
for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
}
} // namespace detail

View File

@ -570,7 +570,7 @@ void OperatorEntry::checkInvariants() const {
std::string OperatorEntry::listAllDispatchKeys() const {
std::ostringstream str;
str << '[';
str << "[";
bool has_kernels = false;
for (auto k : allDispatchKeysInFullSet()) {
@ -584,7 +584,7 @@ std::string OperatorEntry::listAllDispatchKeys() const {
str << k;
has_kernels = true;
}
str << ']';
str << "]";
return str.str();
}
@ -683,12 +683,12 @@ void OperatorEntry::setReportErrorCallback_(std::unique_ptr<c10::SafePyObject> c
// This WON'T report backend fallbacks.
std::string OperatorEntry::dumpState() const {
std::ostringstream oss;
oss << "name: " << name_ << '\n';
oss << "name: " << name_ << "\n";
if (schema_) {
oss << "schema: " << schema_->schema << '\n';
oss << "debug: " << schema_->debug << '\n';
oss << "schema: " << schema_->schema << "\n";
oss << "debug: " << schema_->debug << "\n";
oss << "alias analysis kind: " << toString(schema_->schema.aliasAnalysis())
<< (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << '\n';
<< (schema_->schema.isDefaultAliasAnalysisKind() ? " (default)" : "") << "\n";
} else {
oss << "schema: (none)\n";
}

View File

@ -7,7 +7,7 @@
namespace c10 {
void FunctionSchema::dump() const {
std::cout << *this << '\n';
std::cout << *this << "\n";
}
const std::vector<Argument>& FunctionSchema::getCorrectList(SchemaArgType type) const {
@ -210,9 +210,9 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
out << schema.name();
if (!schema.overload_name().empty()) {
out << '.' << schema.overload_name();
out << "." << schema.overload_name();
}
out << '(';
out << "(";
bool seen_kwarg_only = false;
for (const auto i : c10::irange(schema.arguments().size())) {
@ -273,7 +273,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
}
if (need_paren) {
out << '(';
out << "(";
}
for (const auto i : c10::irange(returns.size())) {
if (i > 0) {
@ -288,7 +288,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
out << "...";
}
if (need_paren) {
out << ')';
out << ")";
}
return out;
}
@ -471,7 +471,7 @@ bool FunctionSchema::isForwardCompatibleWith(
if (!arguments().at(i).isForwardCompatibleWith(old.arguments().at(i))) {
if (why_not) {
why_not
<< '\'' << arguments().at(i).name() << '\''
<< "'" << arguments().at(i).name() << "'"
<< " is not forward compatible with the older version of the schema";
}
return false;
@ -511,7 +511,7 @@ bool FunctionSchema::isForwardCompatibleWith(
.isForwardCompatibleWith(old.arguments().at(i))) {
if (why_not) {
why_not << "Out argument '"
<< '\'' << arguments().at(i).name()
<< "'" << arguments().at(i).name()
<< " is not FC with the older version of the schema";
}
return false;

View File

@ -571,7 +571,7 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
if (arg.N()) {
N = std::to_string(*arg.N());
}
out << '[' << N << ']';
out << "[" << N << "]";
} else {
out << unopt_type->str();
}
@ -582,15 +582,15 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
}
if (is_opt) {
out << '?';
out << "?";
}
if (!arg.name().empty()) {
out << ' ' << arg.name();
out << " " << arg.name();
}
if (arg.default_value()) {
out << '=';
out << "=";
if ((type->kind() == c10::TypeKind::StringType ||
unopt_type->kind() == c10::TypeKind::StringType) &&
arg.default_value().value().isString()) {

View File

@ -66,7 +66,7 @@ bool operator==(const ivalue::Tuple& lhs, const ivalue::Tuple& rhs) {
}
std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
out << v.qualifiedClassName() << '.' << v.name();
out << v.qualifiedClassName() << "." << v.name();
return out;
}
@ -526,7 +526,7 @@ std::ostream& printMaybeAnnotatedList(
!elementTypeCanBeInferredFromMembers(list_elem_type)) {
out << "annotate(" << the_list.type<c10::Type>()->annotation_str() << ", ";
printList(out, the_list.toListRef(), "[", "]", formatter);
out << ')';
out << ")";
return out;
} else {
return printList(out, the_list.toListRef(), "[", "]", formatter);
@ -538,7 +538,7 @@ std::ostream& printDict(
std::ostream& out,
const Dict& v,
const IValueFormatter& formatter) {
out << '{';
out << "{";
bool first = true;
for (const auto& pair : v) {
@ -552,7 +552,7 @@ std::ostream& printDict(
first = false;
}
out << '}';
out << "}";
return out;
}
}
@ -565,8 +565,8 @@ static std::ostream& printMaybeAnnotatedDict(
auto value_type = the_dict.type()->castRaw<DictType>()->getValueType();
if (the_dict.toGenericDict().empty() ||
!elementTypeCanBeInferredFromMembers(value_type)) {
out << "annotate(" << the_dict.type<c10::Type>()->annotation_str() << ',';
printDict(out, the_dict.toGenericDict(), formatter) << ')';
out << "annotate(" << the_dict.type<c10::Type>()->annotation_str() << ",";
printDict(out, the_dict.toGenericDict(), formatter) << ")";
} else {
return printDict(out, the_dict.toGenericDict(), formatter);
}
@ -577,7 +577,7 @@ static std::ostream& printComplex(std::ostream & out, const IValue & v) {
c10::complex<double> d = v.toComplexDouble();
IValue real(d.real()), imag(std::abs(d.imag()));
auto sign = d.imag() >= 0 ? '+' : '-';
return out << real << sign << imag << 'j';
return out << real << sign << imag << "j";
}
std::ostream& IValue::repr(
@ -605,9 +605,9 @@ std::ostream& IValue::repr(
if (static_cast<double>(i) == d) {
// -0.0 (signed zero) needs to be parsed as -0.
if (i == 0 && std::signbit(d)) {
return out << '-' << i << '.';
return out << "-" << i << ".";
}
return out << i << '.';
return out << i << ".";
}
}
auto orig_prec = out.precision();
@ -643,20 +643,20 @@ std::ostream& IValue::repr(
device_stream << v.toDevice();
out << "torch.device(";
c10::printQuotedString(out, device_stream.str());
return out << ')';
return out << ")";
}
case IValue::Tag::Generator: {
auto generator = v.toGenerator();
out << "torch.Generator(device=";
c10::printQuotedString(out, generator.device().str());
out << ", seed=" << generator.current_seed() << ')';
out << ", seed=" << generator.current_seed() << ")";
return out;
}
case IValue::Tag::GenericDict:
return printMaybeAnnotatedDict(out, v, formatter);
case IValue::Tag::Enum: {
auto enum_holder = v.toEnumHolder();
return out << enum_holder->qualifiedClassName() << '.' <<
return out << enum_holder->qualifiedClassName() << "." <<
enum_holder->name();
}
case IValue::Tag::Object: {
@ -801,7 +801,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
if (c == FP_NORMAL || c == FP_ZERO) {
int64_t i = static_cast<int64_t>(d);
if (static_cast<double>(i) == d) {
return out << i << '.';
return out << i << ".";
}
}
auto orig_prec = out.precision();
@ -852,7 +852,7 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
return printDict(out, v.toGenericDict(), formatter);
case IValue::Tag::PyObject: {
auto py_obj = v.toPyObject();
return out << "<PyObject at" << py_obj << '>';
return out << "<PyObject at" << py_obj << ">";
}
case IValue::Tag::Generator:
return out << "Generator";
@ -862,22 +862,22 @@ std::ostream& operator<<(std::ostream & out, const IValue & v) {
// TODO we should attempt to call __str__ if the object defines it.
auto obj = v.toObject();
// print this out the way python would do it
return out << '<' << obj->name() << " object at " << obj.get() << '>';
return out << "<" << obj->name() << " object at " << obj.get() << ">";
}
case IValue::Tag::Enum: {
auto enum_holder = v.toEnumHolder();
return out << "Enum<" << enum_holder->unqualifiedClassName() << '.' <<
enum_holder->name() << '>';
return out << "Enum<" << enum_holder->unqualifiedClassName() << "." <<
enum_holder->name() << ">";
}
}
return out << "<Invalid IValue tag=" << std::to_string(static_cast<uint32_t>(v.tag)) << '>';
return out << "<Invalid IValue tag=" << std::to_string(static_cast<uint32_t>(v.tag)) << ">";
}
#undef TORCH_FORALL_TAGS
void IValue::dump() const {
std::cout << *this << '\n';
std::cout << *this << "\n";
}
std::shared_ptr<ClassType> ivalue::Object::type() const {
@ -1050,7 +1050,7 @@ c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy(
std::stringstream err;
err << "Cannot serialize custom bound C++ class";
if (auto qualname = type()->name()) {
err << ' ' << qualname->qualifiedName();
err << " " << qualname->qualifiedName();
}
err << ". Please define serialization methods via def_pickle() for "
"this class.";

View File

@ -211,7 +211,7 @@ struct TORCH_API OptionalType : public UnionType {
std::string str() const override {
std::stringstream ss;
ss << getElementType()->str() << '?';
ss << getElementType()->str() << "?";
return ss.str();
}
@ -240,7 +240,7 @@ struct TORCH_API OptionalType : public UnionType {
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
std::stringstream ss;
ss << "Optional[" << getElementType()->annotation_str(printer) << ']';
ss << "Optional[" << getElementType()->annotation_str(printer) << "]";
return ss.str();
}
};
@ -906,7 +906,7 @@ struct TORCH_API ListType
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
std::stringstream ss;
ss << "List[" << getElementType()->annotation_str(printer) << ']';
ss << "List[" << getElementType()->annotation_str(printer) << "]";
return ss.str();
}
};
@ -946,7 +946,7 @@ struct TORCH_API DictType : public SharedType {
std::string str() const override {
std::stringstream ss;
ss << "Dict(" << getKeyType()->str() << ", " << getValueType()->str()
<< ')';
<< ")";
return ss.str();
}
@ -1018,7 +1018,7 @@ struct TORCH_API FutureType
std::string str() const override {
std::stringstream ss;
ss << "Future(" << getElementType()->str() << ')';
ss << "Future(" << getElementType()->str() << ")";
return ss.str();
}
TypePtr createWithContained(
@ -1041,7 +1041,7 @@ struct TORCH_API FutureType
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
std::stringstream ss;
ss << "Future[" << getElementType()->annotation_str(printer) << ']';
ss << "Future[" << getElementType()->annotation_str(printer) << "]";
return ss.str();
}
};
@ -1060,7 +1060,7 @@ struct TORCH_API AwaitType
std::string str() const override {
std::stringstream ss;
ss << "Await(" << getElementType()->str() << ')';
ss << "Await(" << getElementType()->str() << ")";
return ss.str();
}
TypePtr createWithContained(
@ -1083,7 +1083,7 @@ struct TORCH_API AwaitType
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
std::stringstream ss;
ss << "Await[" << getElementType()->annotation_str(printer) << ']';
ss << "Await[" << getElementType()->annotation_str(printer) << "]";
return ss.str();
}
};
@ -1102,7 +1102,7 @@ struct TORCH_API RRefType
std::string str() const override {
std::stringstream ss;
ss << "RRef(" << getElementType()->str() << ')';
ss << "RRef(" << getElementType()->str() << ")";
return ss.str();
}
TypePtr createWithContained(
@ -1115,7 +1115,7 @@ struct TORCH_API RRefType
std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override {
std::stringstream ss;
ss << "RRef[" << getElementType()->annotation_str(printer) << ']';
ss << "RRef[" << getElementType()->annotation_str(printer) << "]";
return ss.str();
}
};

View File

@ -11,7 +11,7 @@ std::string toString(const OperatorName& opName) {
std::ostream& operator<<(std::ostream& os, const OperatorName& opName) {
os << opName.name;
if (!opName.overload_name.empty()) {
os << '.' << opName.overload_name;
os << "." << opName.overload_name;
}
return os;
}

View File

@ -65,7 +65,7 @@ VaryingShape<T> VaryingShape<T>::merge(const VaryingShape<T>& other) const {
template <typename T>
std::ostream& operator<<(std::ostream& out, const VaryingShape<T>& vs) {
out << '(';
out << "(";
if (!vs.size()) {
out << "*)";
return out;
@ -79,10 +79,10 @@ std::ostream& operator<<(std::ostream& out, const VaryingShape<T>& vs) {
if (v.has_value()) {
out << v.value();
} else {
out << '*';
out << "*";
}
}
out << ')';
out << ")";
return out;
}
@ -105,7 +105,7 @@ std::ostream& operator<<(
}
auto sizes_opt = ss.sizes();
os << '(';
os << "(";
for (size_t i = 0; i < rank_opt.value(); i++) {
if (i > 0) {
os << ", ";
@ -113,10 +113,10 @@ std::ostream& operator<<(
if(sizes_opt.has_value() && sizes_opt.value()[i].is_static()) {
os << sizes_opt.value()[i];
} else {
os << '*';
os << "*";
}
}
os << ')';
os << ")";
return os;
}
@ -131,17 +131,17 @@ std::ostream& operator<<(std::ostream& os, const ShapeSymbol& s) {
}
std::ostream& operator<<(std::ostream& os, const Stride& s) {
os << '{';
os << "{";
if (s.stride_index_.has_value()) {
os << *s.stride_index_;
} else {
os << '*';
os << "*";
}
os << ':';
os << ":";
if (s.stride_.has_value()) {
os << *s.stride_;
} else {
os << '*';
os << "*";
}
os << '}';
return os;

View File

@ -67,7 +67,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
bool has_valid_strides_info = ndim > 0 &&
value->strides().isComplete() && value->strides().size() == ndim;
out << '(';
out << "(";
size_t i = 0;
bool symbolic = type_verbosity() == TypeVerbosity::Symbolic;
for (i = 0; i < *ndim; ++i) {
@ -79,7 +79,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
} else if (symbolic) {
out << value->symbolic_sizes().at(i);
} else {
out << '*';
out << "*";
}
}
if (has_valid_strides_info &&
@ -91,7 +91,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
}
out << value->strides()[i].value();
}
out << ']';
out << "]";
}
if (type_verbosity() >= TypeVerbosity::Full) {
if (value->requiresGrad()) {
@ -107,12 +107,12 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
out << "device=" << *value->device();
}
}
out << ')';
out << ")";
} else {
if (type_verbosity() >= TypeVerbosity::Full) {
size_t i = 0;
if (value->requiresGrad()) {
out << '('
out << "("
<< "requires_grad=" << *value->requiresGrad();
i++;
}
@ -120,7 +120,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
out << ((i++ > 0) ? ", " : "(") << "device=" << *value->device();
}
if (i > 0) {
out << ')';
out << ")";
}
}
}
@ -133,18 +133,18 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
out << *prim << "[]";
} else if (t.kind() == TypeKind::OptionalType) {
auto prim = t.castRaw<OptionalType>()->getElementType();
out << *prim << '?';
out << *prim << "?";
} else if(t.kind() == TypeKind::FutureType) {
auto elem = t.castRaw<FutureType>()->getElementType();
out << "Future[" << *elem << ']';
out << "Future[" << *elem << "]";
} else if(t.kind() == TypeKind::RRefType) {
auto elem = t.castRaw<RRefType>()->getElementType();
out << "RRef[" << *elem << ']';
out << "RRef[" << *elem << "]";
} else if(auto tup = t.cast<TupleType>()) {
if (tup->schema()) {
out << "NamedTuple";
}
out << '(';
out << "(";
for(size_t i = 0; i < tup->elements().size(); ++i) {
if(i > 0)
out << ", ";
@ -160,7 +160,7 @@ std::ostream& operator<<(std::ostream & out, const Type & t) {
out << *(tup->elements()[i]);
}
}
out << ')';
out << ")";
} else if (t.kind() == TypeKind::FunctionType) {
out << "Function";
} else {
@ -475,7 +475,7 @@ std::optional<TypePtr> unifyTypeList(
why_not << "Could not unify type list since element " << i << " of type "
<< elements.at(i)->repr_str()
<< " did not match the types before it ("
<< ret_type->repr_str() << ')';
<< ret_type->repr_str() << ")";
return std::nullopt;
}
ret_type = *maybe_unified;
@ -907,13 +907,13 @@ std::string TupleType::str() const {
// NOLINTNEXTLINE(bugprone-unchecked-optional-access)
ss << name()->qualifiedName();
} else {
ss << '(';
ss << "(";
for(size_t i = 0; i < elements().size(); ++i) {
if(i > 0)
ss << ", ";
ss << elements()[i]->str();
}
ss << ')';
ss << ")";
}
return ss.str();
}
@ -1003,8 +1003,8 @@ bool InterfaceType::isSubTypeImpl(
*why_not << "Method on interface '" << lhs.repr_str()
<< "' (1) is not compatible with interface '"
<< rhs.repr_str() << "' (2)\n"
<< " (1) " << *self_schema << '\n'
<< " (2) " << schema << '\n';
<< " (1) " << *self_schema << "\n"
<< " (2) " << schema << "\n";
return false;
}
return false;
@ -1078,7 +1078,7 @@ SymbolicShape SymbolicShape::merge(const SymbolicShape& other) const {
}
void SymbolicShape::dump() const {
std::cout << *this << '\n';
std::cout << *this << "\n";
}
bool EnumType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const {

View File

@ -205,9 +205,9 @@ UnionType::UnionType(std::vector<TypePtr> reference, TypeKind kind) : SharedType
for (const auto i : c10::irange(reference.size())) {
msg << reference[i]->repr_str();
if (i > 0) {
msg << ',';
msg << ",";
}
msg << ' ';
msg << " ";
}
msg << "} has the single type " << types_[0]->repr_str()
<< ". Use the common supertype instead of creating a Union"

View File

@ -80,7 +80,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
}
stream << buf[i];
}
stream << ']';
stream << "]";
return stream;
}

View File

@ -55,7 +55,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
}
stream << buf[i];
}
stream << ']';
stream << "]";
return stream;
}

View File

@ -3,7 +3,6 @@
#include <cstdint>
#include <map>
#include <shared_mutex>
#include <cuda_runtime_api.h>
#include <cusparse.h>
@ -89,13 +88,8 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();
TORCH_CUDA_CPP_API void clearCublasWorkspaces();
struct WorkspaceMapWithMutex {
std::map<std::tuple<void*, void*>, at::DataPtr> map;
std::shared_mutex mutex;
};
TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace();
TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace();
TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize();
TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize();
TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace();

View File

@ -175,24 +175,17 @@ void CUDAGraph::instantiate() {
// Trailing NULL, NULL, 0 arguments were recommended by Cuda driver people,
// who prefer not to report error message through these arguments moving forward
// (they prefer return value, or errors on api calls internal to the capture)
// ROCM appears to fail with HIP error: invalid argument
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && !defined(USE_ROCM)
AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, cudaGraphInstantiateFlagUseNodePriority));
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000)
AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, 0));
#else
AT_CUDA_CHECK(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
#endif
//Since ROCm 6.2, we want to go down this path as hipGraphExecDestroy in the destructor will not immediately free the memory.
//It will wait for the next sync operation. cudaGraphInstantiateFlagAutoFreeOnLaunch will add async frees after graph launch.
} else {
#if !defined(USE_ROCM)
AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
graph_,
cudaGraphInstantiateFlagAutoFreeOnLaunch | cudaGraphInstantiateFlagUseNodePriority));
#else
AT_CUDA_CHECK(cudaGraphInstantiateWithFlags(&graph_exec_,
graph_,
cudaGraphInstantiateFlagAutoFreeOnLaunch));
#endif
}
has_graph_exec_ = true;
}

View File

@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) {
// - Comments of @soumith copied from cuDNN handle pool implementation
#ifdef NO_CUDNN_DESTROY_HANDLE
#else
cublasDestroy(handle);
cublasDestroy(handle);
#endif
}
@ -107,27 +107,19 @@ using CuBlasPoolType = DeviceThreadHandlePool<cublasHandle_t, createCublasHandle
} // namespace
WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() {
static auto& instance = *new WorkspaceMapWithMutex;
std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace() {
static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
return instance;
}
WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() {
static auto& instance = *new WorkspaceMapWithMutex;
std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace() {
static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
return instance;
}
void clearCublasWorkspaces() {
{
auto& workspace = cublas_handle_stream_to_workspace();
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
workspace.map.clear();
}
{
auto& workspace = cublaslt_handle_stream_to_workspace();
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
workspace.map.clear();
}
cublas_handle_stream_to_workspace().clear();
cublaslt_handle_stream_to_workspace().clear();
}
size_t parseChosenWorkspaceSize() {
@ -241,38 +233,6 @@ at::DataPtr getNewCUDABlasLtWorkspace() {
return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize());
}
void setWorkspaceForHandle(cublasHandle_t handle, c10::cuda::CUDAStream stream) {
cudaStream_t _stream = stream;
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
auto& workspace = cublas_handle_stream_to_workspace();
size_t workspace_size = getChosenWorkspaceSize();
// Fast path: check if workspace already exists
{
std::shared_lock<std::shared_mutex> lock(workspace.mutex);
auto workspace_it = workspace.map.find(key);
if (workspace_it != workspace.map.end()) {
TORCH_CUDABLAS_CHECK(cublasSetWorkspace(
handle, workspace_it->second.get(), workspace_size));
return;
}
}
// Slow path: allocate workspace outside the lock
auto new_workspace = getNewWorkspace();
// Insert with lock (double-check in case another thread inserted while we
// were allocating)
{
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
auto workspace_it = workspace.map.try_emplace(key, std::move(new_workspace)).first;
TORCH_CUDABLAS_CHECK(
cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size));
}
}
void* getCUDABlasLtWorkspace() {
#ifndef USE_ROCM
static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true;
@ -281,10 +241,8 @@ void* getCUDABlasLtWorkspace() {
auto stream = c10::cuda::getCurrentCUDAStream();
cudaStream_t _stream = stream;
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
auto& workspace = at::cuda::cublas_handle_stream_to_workspace();
std::shared_lock<std::shared_mutex> lock(workspace.mutex);
auto workspace_it = workspace.map.find(key);
TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end());
auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
return workspace_it->second.mutable_get();
}
#endif
@ -292,29 +250,11 @@ void* getCUDABlasLtWorkspace() {
auto stream = c10::cuda::getCurrentCUDAStream();
cudaStream_t _stream = stream;
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
auto& workspace = cublaslt_handle_stream_to_workspace();
// Fast path: check if workspace already exists
{
std::shared_lock<std::shared_mutex> lock(workspace.mutex);
auto workspace_it = workspace.map.find(key);
if (workspace_it != workspace.map.end()) {
return workspace_it->second.mutable_get();
}
}
// Slow path: allocate workspace outside the lock
auto new_workspace = getNewCUDABlasLtWorkspace();
// Insert with lock (double-check in case another thread inserted while we
// were allocating)
{
std::unique_lock<std::shared_mutex> lock(workspace.mutex);
auto workspace_it =
workspace.map.try_emplace(key, std::move(new_workspace)).first;
return workspace_it->second.mutable_get();
auto workspace_it = cublaslt_handle_stream_to_workspace().find(key);
if (workspace_it == cublaslt_handle_stream_to_workspace().end()) {
workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()});
}
return workspace_it->second.mutable_get();
}
cublasHandle_t getCurrentCUDABlasHandle() {
@ -358,8 +298,13 @@ cublasHandle_t getCurrentCUDABlasHandle() {
// will allocate memory dynamically (even if they're cheap) outside
// PyTorch's CUDA caching allocator. It's possible that CCA used up
// all the memory and cublas's cudaMallocAsync will return OOM
setWorkspaceForHandle(handle, stream);
cudaStream_t _stream = stream;
auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
auto workspace_it = cublas_handle_stream_to_workspace().find(key);
if (workspace_it == cublas_handle_stream_to_workspace().end()) {
workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()});
}
TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize()));
#if !defined(USE_ROCM)
// On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup
// FP32 data type calculations based on the value of the allow_tf32 flag.

View File

@ -411,16 +411,16 @@ std::string CUDAHooks::showConfig() const {
// HIP_VERSION value format was changed after ROCm v4.2 to include the patch number
if(v < 500) {
// If major=xx, minor=yy then format -> xxyy
oss << (v / 100) << '.' << (v % 10);
oss << (v / 100) << "." << (v % 10);
}
else {
// If major=xx, minor=yy & patch=zzzzz then format -> xxyyzzzzz
oss << (v / 10000000) << '.' << (v / 100000 % 100) << '.' << (v % 100000);
oss << (v / 10000000) << "." << (v / 100000 % 100) << "." << (v % 100000);
}
#else
oss << (v / 1000) << '.' << (v / 10 % 100);
oss << (v / 1000) << "." << (v / 10 % 100);
if (v % 10 != 0) {
oss << '.' << (v % 10);
oss << "." << (v % 10);
}
#endif
};
@ -431,16 +431,16 @@ std::string CUDAHooks::showConfig() const {
oss << " - HIP Runtime ";
#endif
printCudaStyleVersion(runtimeVersion);
oss << '\n';
oss << "\n";
// TODO: Make HIPIFY understand CUDART_VERSION macro
#if !defined(USE_ROCM)
if (runtimeVersion != CUDART_VERSION) {
oss << " - Built with CUDA Runtime ";
printCudaStyleVersion(CUDART_VERSION);
oss << '\n';
oss << "\n";
}
oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << '\n';
oss << " - NVCC architecture flags: " << NVCC_FLAGS_EXTRA << "\n";
#endif
#if !defined(USE_ROCM)
@ -448,9 +448,9 @@ std::string CUDAHooks::showConfig() const {
auto printCudnnStyleVersion = [&](size_t v) {
oss << (v / 1000) << '.' << (v / 100 % 10);
oss << (v / 1000) << "." << (v / 100 % 10);
if (v % 100 != 0) {
oss << '.' << (v % 100);
oss << "." << (v % 100);
}
};
@ -461,22 +461,22 @@ std::string CUDAHooks::showConfig() const {
if (cudnnCudartVersion != CUDART_VERSION) {
oss << " (built against CUDA ";
printCudaStyleVersion(cudnnCudartVersion);
oss << ')';
oss << ")";
}
oss << '\n';
oss << "\n";
if (cudnnVersion != CUDNN_VERSION) {
oss << " - Built with CuDNN ";
printCudnnStyleVersion(CUDNN_VERSION);
oss << '\n';
oss << "\n";
}
#endif
#else
// TODO: Check if miopen has the functions above and unify
oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << '.' << MIOPEN_VERSION_MINOR << '.' << MIOPEN_VERSION_PATCH << '\n';
oss << " - MIOpen " << MIOPEN_VERSION_MAJOR << "." << MIOPEN_VERSION_MINOR << "." << MIOPEN_VERSION_PATCH << "\n";
#endif
#if AT_MAGMA_ENABLED()
oss << " - Magma " << MAGMA_VERSION_MAJOR << '.' << MAGMA_VERSION_MINOR << '.' << MAGMA_VERSION_MICRO << '\n';
oss << " - Magma " << MAGMA_VERSION_MAJOR << "." << MAGMA_VERSION_MINOR << "." << MAGMA_VERSION_MICRO << "\n";
#endif
return oss.str();

View File

@ -42,7 +42,7 @@ static inline void launch_jitted_vectorized_kernel_dynamic(
// The cache key includes all the parameters to generate_code + vec_size + dev_idx
std::stringstream ss;
ss << nInputs << '_' << nOutputs << f;
ss << nInputs << "_" << nOutputs << f;
ss << f_inputs_type_str << compute_type_str << result_type_str;
ss << static_cast<int>(at::cuda::jit::BinaryFuncVariant::NoScalar);
ss << extra_args_types;
@ -144,7 +144,7 @@ static inline void launch_jitted_unrolled_kernel_dynamic(
// The cache key includes all the parameters to generate_code + dev_idx
std::stringstream ss;
ss << nInputs << '_' << nOutputs << f;
ss << nInputs << "_" << nOutputs << f;
ss << f_inputs_type_str << compute_type_str << result_type_str;
ss << contiguous << dynamic_casting;
ss << static_cast<int>(at::cuda::jit::BinaryFuncVariant::NoScalar);

View File

@ -52,10 +52,10 @@ TuningContext* getTuningContext() {
std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry) {
static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1";
if (!blaslog) {
return stream << entry.key_ << ',' << entry.time_;
return stream << entry.key_ << "," << entry.time_;
}
else {
return stream << entry.key_ << ',' << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_;
return stream << entry.key_ << "," << entry.time_ << ",BLAS_PARAMS: " << entry.blas_sig_;
}
}
@ -156,10 +156,10 @@ void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std
if (isNew) {
static const bool blaslog = c10::utils::get_env("PYTORCH_TUNABLEOP_BLAS_LOG") == "1";
if (!blaslog) {
untuned_file << op_signature << ',' << params_signature << std::endl;
untuned_file << op_signature << "," << params_signature << std::endl;
}
else {
untuned_file << op_signature << ',' << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl;
untuned_file << op_signature << "," << params_signature << ",BLAS_PARAMS: " << blas_signature << std::endl;
}
TUNABLE_LOG3("Untuned,", op_signature, ",", params_signature);
}
@ -201,7 +201,7 @@ void TuningResultsManager::InitRealtimeAppend(const std::string& filename, const
if(!file_exists || file_empty) {
for(const auto& [key, val] : validators) {
(*realtime_out_) << "Validator," << key << ',' << val << std::endl;
(*realtime_out_) << "Validator," << key << "," << val << std::endl;
realtime_out_->flush();
}
validators_written_ = true;
@ -219,7 +219,7 @@ void TuningResultsManager::AppendResultLine(const std::string& op_sig, const std
return;
}
(*realtime_out_) << op_sig << ',' << param_sig << ',' << result << std::endl;
(*realtime_out_) << op_sig << "," << param_sig << "," << result << std::endl;
realtime_out_->flush(); //ensure immediate write to disk
TUNABLE_LOG3("Realtime append: ", op_sig, "(", param_sig, ") -> ", result);

View File

@ -93,31 +93,31 @@ std::string cudnnTypeToString(cudnnDataType_t dtype) {
return "CUDNN_DATA_UINT8x4";
default:
std::ostringstream oss;
oss << "(unknown data-type " << static_cast<int>(dtype) << ')';
oss << "(unknown data-type " << static_cast<int>(dtype) << ")";
return oss.str();
}
}
std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << '\n';
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
int nbDims = 0;
int dimA[CUDNN_DIM_MAX];
int strideA[CUDNN_DIM_MAX];
cudnnDataType_t dtype{};
cudnnGetTensorNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &nbDims, dimA, strideA);
out << " type = " << cudnnTypeToString(dtype) << '\n';
out << " nbDims = " << nbDims << '\n';
out << " type = " << cudnnTypeToString(dtype) << "\n";
out << " nbDims = " << nbDims << "\n";
// Read out only nbDims of the arrays!
out << " dimA = ";
for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
out << i << ", ";
}
out << '\n';
out << "\n";
out << " strideA = ";
for (auto i : ArrayRef<int>{strideA, static_cast<size_t>(nbDims)}) {
out << i << ", ";
}
out << '\n';
out << "\n";
return out;
}
@ -168,27 +168,27 @@ std::string cudnnMemoryFormatToString(cudnnTensorFormat_t tformat) {
return "CUDNN_TENSOR_NHWC";
default:
std::ostringstream oss;
oss << "(unknown cudnn tensor format " << static_cast<int>(tformat) << ')';
oss << "(unknown cudnn tensor format " << static_cast<int>(tformat) << ")";
return oss.str();
}
}
std::ostream& operator<<(std::ostream & out, const FilterDescriptor& d) {
out << "FilterDescriptor " << static_cast<void*>(d.desc()) << '\n';
out << "FilterDescriptor " << static_cast<void*>(d.desc()) << "\n";
int nbDims = 0;
int dimA[CUDNN_DIM_MAX];
cudnnDataType_t dtype{};
cudnnTensorFormat_t tformat{};
cudnnGetFilterNdDescriptor(d.desc(), CUDNN_DIM_MAX, &dtype, &tformat, &nbDims, dimA);
out << " type = " << cudnnTypeToString(dtype) << '\n';
out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << '\n';
out << " nbDims = " << nbDims << '\n';
out << " type = " << cudnnTypeToString(dtype) << "\n";
out << " tensor_format = " << cudnnMemoryFormatToString(tformat) << "\n";
out << " nbDims = " << nbDims << "\n";
// Read out only nbDims of the arrays!
out << " dimA = ";
for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
out << i << ", ";
}
out << '\n';
out << "\n";
return out;
}

View File

@ -346,15 +346,15 @@ void foreachTensorInplaceWithFlag(std::vector<IValue>& args, int64_t begin, int6
}
std::ostream& operator<< (std::ostream& os, const DynamicLayer& layer) {
os << layer.layerId() << ':' << layer.key();
os << layer.layerId() << ":" << layer.key();
return os;
}
std::ostream& operator<< (std::ostream& os, const std::vector<DynamicLayer>& dls) {
os << "DynamicLayerStack[ ";
for (const auto& layer : dls) {
os << layer << ' ';
os << layer << " ";
}
os << ']';
os << "]";
return os;
}

View File

@ -22,7 +22,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) {
if (batched) {
ss << "Batched[lvl=" << batched->level() << " dim=" << batched->bdim() << ", ";
dumpTensor(ss, batched->value());
ss << ']';
ss << "]";
return;
}
ss << "Tensor" << tensor.sizes();
@ -36,7 +36,7 @@ void dumpTensor(std::ostream& ss, const Tensor& tensor) {
ss << "dead, ";
}
dumpTensor(ss, wrapped->value());
ss << ']';
ss << "]";
}
void TensorWrapper::refreshMetadata() {

View File

@ -73,32 +73,32 @@ std::string miopenTypeToString(miopenDataType_t dtype) {
return "miopenBFloat16";
default:
std::ostringstream oss;
oss << "(unknown data-type " << static_cast<int>(dtype) << ')';
oss << "(unknown data-type " << static_cast<int>(dtype) << ")";
return oss.str();
}
}
std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d) {
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << '\n';
out << "TensorDescriptor " << static_cast<void*>(d.desc()) << "\n";
int nbDims = 0;
int dimA[MIOPEN_DIM_MAX];
int strideA[MIOPEN_DIM_MAX];
miopenDataType_t dtype;
miopenGetTensorDescriptorSize(d.desc(), &nbDims);
miopenGetTensorDescriptor(d.desc(), &dtype, dimA, strideA);
out << " type = " << miopenTypeToString(dtype) << '\n';
out << " nbDims = " << nbDims << '\n';
out << " type = " << miopenTypeToString(dtype) << "\n";
out << " nbDims = " << nbDims << "\n";
// Read out only nbDims of the arrays!
out << " dimA = ";
for (auto i : ArrayRef<int>{dimA, static_cast<size_t>(nbDims)}) {
out << i << ", ";
}
out << '\n';
out << "\n";
out << " strideA = ";
for (auto i : ArrayRef<int>{strideA, static_cast<size_t>(nbDims)}) {
out << i << ", ";
}
out << '\n';
out << "\n";
return out;
}

View File

@ -91,7 +91,7 @@ struct OperationInfo : BaseInfo {
std::stringstream kernelStr;
kernelStr << kernelName;
for (const Tensor& tensor : tensors) {
kernelStr << ':' << BaseInfo::buildTensorString(tensor, includeBufferId);
kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId);
}
return kernelStr.str();
}

View File

@ -39,9 +39,9 @@ std::string BaseInfo::buildTensorString(const Tensor& tensor, bool includeBuffer
// see comments for INCLUDE_BUFFER_ID
if (includeBufferId && deviceType == at::kMPS) {
id<MTLBuffer> buffer = __builtin_bit_cast(id<MTLBuffer>, tensor.storage().data());
tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ':' << buffer.retainCount << ')';
tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) << ":" << buffer.retainCount << ")";
}
tensorStr << ':' << tensor.scalar_type() << tensor.sizes();
tensorStr << ":" << tensor.scalar_type() << tensor.sizes();
return tensorStr.str();
} else {
return "undefined";

View File

@ -167,7 +167,7 @@ static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, co
std::stringstream ss;
ss << arg_name << " should be greater than zero but got (";
std::copy(args.begin(), args.end() - 1, std::ostream_iterator<int>(ss,", "));
ss << args.back() << ")" << " (while checking arguments for " << c << ')';
ss << args.back() << ")" << " (while checking arguments for " << c << ")";
TORCH_CHECK(false, ss.str());
}
}

View File

@ -639,7 +639,7 @@ static std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params)
<< " deterministic = " << params.deterministic
<< " cudnn_enabled = " << params.cudnn_enabled
<< " allow_tf32 = " << params.allow_tf32
<< '}';
<< "}";
return out;
}

View File

@ -1936,7 +1936,7 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_o
// We order the tensors. t1 will be the larger tensor
// We can always transpose tensor2 as the dimensions are always >= 1 (precondition from matmul)
// and tensor1_larger iff tensor2.dim() > tensor1.dim(9
// and tensor1_larger iff tensor2.dim() > tensor1.dim()
const auto t1 = tensor1_larger ? MaybeOwned<Tensor>::borrowed(tensor1)
: MaybeOwned<Tensor>::owned(tensor2.mT());
const int64_t dim_t1 = t1->dim();
@ -1948,20 +1948,11 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_o
return false;
}
// In this case we *do* incur in an extra copy to avoid creating an unnecessary large tensor in the backward
// Suppose we don't fold here. Let t1.shape = [b, m, n] t2.shape = [n, k] like in a transformer
// t2 will be expanded to a tensor of shape [b, n, k] and then we do t1.bmm(t2_expanded)
// The issue appears in the backward.
// The output gradient g of this operation would have shape [b, m, k]
// The backward wrt. t2 of bmm would be given by t1.mH @ g, which has shape [b, n, k]
// Then, the backward of expand is simply `sum(0)`. As such, we are instantiating a tensor
// of shape [b, n, k] unnecessarily, which may cause a large memory footprint, and in the
// worst case, an OOM
bool t2_requires_grad = tensor1_larger ? tensor2.requires_grad() : tensor1.requires_grad();
if (t2_requires_grad && !has_out) {
// We should be checking !at::GradMode::is_enabled(), but apparently
// this regresses performance in some cases:
// https://github.com/pytorch/pytorch/issues/118548#issuecomment-1916022394
// If we require a gradient, we should fold to minimize backward memory usage - even if this
// leads to a copy in forward because is needed in backward,
// only time we avoid this strict pre-allocated memory usage (has_out = True)
bool requires_grad = tensor1.requires_grad() || tensor2.requires_grad();
if (requires_grad && !has_out) {
return true;
}

View File

@ -847,7 +847,7 @@ Tensor stft(const Tensor& self, const int64_t n_fft, const std::optional<int64_t
<< ", hop_length=" << hop_length << ", win_length=" << win_length \
<< ", window="; \
if (window.defined()) { \
SS << window.toString() << '{' << window.sizes() << '}'; \
SS << window.toString() << "{" << window.sizes() << "}"; \
} else { \
SS << "None"; \
} \
@ -1046,7 +1046,7 @@ Tensor istft(const Tensor& self, const int64_t n_fft, const std::optional<int64_
<< ", hop_length=" << hop_length << ", win_length=" << win_length \
<< ", window="; \
if (window.defined()) { \
SS << window.toString() << '{' << window.sizes() << '}'; \
SS << window.toString() << "{" << window.sizes() << "}"; \
} else { \
SS << "None"; \
} \

View File

@ -1087,8 +1087,7 @@ TORCH_IMPL_FUNC(index_copy_out)
result.copy_(self);
// See Note [Enabling Deterministic Operations]
if ((result.is_cuda() || result.is_xpu()) &&
globalContext().deterministicAlgorithms()) {
if (result.is_cuda() && globalContext().deterministicAlgorithms()) {
torch::List<std::optional<Tensor>> indices;
indices.resize(dim + 1);
indices.set(dim, index);

View File

@ -523,7 +523,7 @@ Tensor _functional_assert_async_msg_cpu(
}
void _print(std::string_view s) {
std::cout << s << '\n';
std::cout << s << "\n";
}
// Sorting-based algorithm for isin(); used when the number of test elements is

View File

@ -296,7 +296,7 @@ template <typename scalar_t, typename res_scalar_t = scalar_t>
bool launchGemmAndBiasCublasLt(
// args contains result which is modified
cublasCommonArgs& args,
const std::optional<Tensor>& self,
const Tensor& self,
const Scalar& alpha,
Activation activation = Activation::None
) {
@ -304,8 +304,12 @@ bool launchGemmAndBiasCublasLt(
// or when it can be squeezed to 1D.
// self_ptr == nullptr implies ignore bias epilogue
// and use standard gemm-like API.
const auto* self_ptr = self.has_value() ? self.value().const_data_ptr<scalar_t>() : static_cast<const scalar_t*>(nullptr);
const auto* self_ptr = [&]() -> auto {
if (self.dim() == 1 || self.squeeze().dim() == 1) {
return self.const_data_ptr<scalar_t>();
}
return static_cast<const scalar_t*>(nullptr);
}();
const auto tuning_ctx = at::cuda::tunable::getTuningContext();
if (tuning_ctx->IsTunableOpEnabled()) {
@ -388,30 +392,35 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
#ifdef USE_ROCM
// Conditioned on the device index, which is not persistent
disable_addmm_cuda_lt = disable_addmm_cuda_lt || isGloballyDisabledAddmmCudaLt(self.device());
disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
#endif
// Condition on the input
disable_addmm_cuda_lt = disable_addmm_cuda_lt || !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation);
disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
// }
at::ScalarType scalar_type = mat1.scalar_type();
bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;
#ifdef USE_ROCM
disable_addmm_cuda_lt = disable_addmm_cuda_lt || is_float_output_with_half_input;
#endif
bool use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
// for float output with half input cublasLT with bias produces wrong results
use_bias_ptr_lt &= !is_float_output_with_half_input;
// Handle result/self shapes
if (!result.is_same(self)) {
at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});
// We do not copy bias only when we need the bias ptr
// We use bias ptr in the Lt path only when bias is 1D
const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
if (!use_bias_ptr_lt) {
// We do expand self even before
// check for beta != 0.0 to make sure that
// test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
// runs green.
return expand_size(self, result.sizes(), "addmm");
}
return c10::MaybeOwned<Tensor>::borrowed(self);
}();
// We do not copy bias only when we need the bias ptr
if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
// NOTE: self should broadcast over result
at::native::copy_(result, *expand_size(self, result.sizes(), "addmm"));
at::native::copy_(result, *self_maybe_expanded);
}
}
@ -459,7 +468,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
scalar_type,
"addmm_cuda_lt",
[&] {
lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, self, alpha, activation);
}
);
#endif
@ -471,7 +480,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
scalar_type,
"addmm_cuda_lt",
[&] {
lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, use_bias_ptr_lt ? std::make_optional(self) : std::nullopt, alpha, activation);
lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, self, alpha, activation);
}
);
} // end is_float_output_with_half_input
@ -927,7 +936,7 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) {
return _int_mm_out_cuda(self, mat2, result);
}
static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
// ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm
TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor");
TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor");
@ -951,7 +960,7 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
(out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
if (self_baddbmm.has_value()) {
if (!is_bmm && self_baddbmm.has_value()) {
const auto& self = self_baddbmm.value();
TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor");
TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output");
@ -959,12 +968,15 @@ static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& bat
}
Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
IntArrayRef batch1_sizes = batch1.sizes();
IntArrayRef batch2_sizes = batch2.sizes();
Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype));
return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out);
}
Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) {
baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype);
baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true);
Scalar beta(0.0);
Scalar alpha(1.0);
{
@ -976,16 +988,14 @@ Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at
}
Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
TORCH_CHECK(self.scalar_type() == out_dtype || self.scalar_type() == batch1.dtype(),
"self dtype must match either out_dtype or batch1 dtype");
Tensor out = at::empty({batch1.size(0), batch1.size(1), batch2.size(2)}, batch1.options().dtype(out_dtype));
return _baddbmm_out_dtype_cuda(self, batch1, batch2, out_dtype, beta, alpha, out);
// We need to copy the tensor
Tensor out = self.clone().to(self.options().dtype(out_dtype));
return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out);
}
Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, out);
// We need to copy the tensor
out.copy_(self);
baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self);
{
NoNamesGuard guard;
baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
@ -1020,27 +1030,24 @@ Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::Sca
}
Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
Tensor result = at::empty({mat1.size(0), mat2.size(1)}, self.options().dtype(out_dtype));
Tensor result = at::empty(self.sizes(), self.options().dtype(out_dtype));
return _addmm_dtype_out_cuda(self, mat1, mat2, out_dtype, beta, alpha, result);
}
Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
// repeat dimensionality checks for direct calls to `out` overload
TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type());
TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
TORCH_CHECK(
mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
TORCH_CHECK(out_dtype == mat1.scalar_type() ||
(out_dtype == at::ScalarType::Float && (mat1.scalar_type() == at::ScalarType::Half || mat1.scalar_type() == at::ScalarType::BFloat16)),
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
TORCH_CHECK(out_dtype == self.scalar_type() || self.scalar_type() == mat1.scalar_type(),
"self dtype must match either out_dtype or mat1 dtype");
TORCH_CHECK(out_dtype == self.scalar_type() ||
(out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
addmm_out_cuda_impl(out, self, mat1, mat2, beta, alpha);

View File

@ -78,18 +78,9 @@ __global__ void EmbeddingBag_updateOutputKernel_max(
scalar_t weightFeatMax = 0;
int64_t bag_size_ = 0;
int64_t maxWord = -1;
// Separate validation loop reduces register pressure in the main loop below.
// No early exit (break) on invalid input as benchmarking shows it degrades performance.
bool has_invalid_index = false;
for (int64_t emb = begin; emb < end; emb++) {
index_t input_idx = input[emb];
has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows);
}
CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)");
for (int64_t emb = begin; emb < end; emb++) {
bool pad = (input[emb] == padding_idx);
CUDA_KERNEL_ASSERT(input[emb] < numRows);
const int64_t weightRow = input[emb] * weight_stride0;
scalar_t weightValue = weightFeat[weightRow];
if (bag_size_ == 0 || weightValue > weightFeatMax) {
@ -138,19 +129,10 @@ __global__ void EmbeddingBag_updateOutputKernel_sum_mean(
CUDA_KERNEL_ASSERT(end >= begin);
accscalar_t weightFeatSum = 0;
int64_t bag_size_ = 0;
// Separate validation loop reduces register pressure in the main loop below.
// No early exit (break) on invalid input as benchmarking shows it degrades performance.
bool has_invalid_index = false;
for (int64_t emb = begin; emb < end; emb++) {
index_t input_idx = input[emb];
has_invalid_index = has_invalid_index || (input_idx < 0 || input_idx >= numRows);
}
CUDA_KERNEL_ASSERT(!has_invalid_index && "Invalid input index in EmbeddingBag: index out of range [0, numRows)");
for (int64_t emb = begin; emb < end; emb++) {
index_t input_idx = input[emb];
bool pad = (input_idx == padding_idx);
CUDA_KERNEL_ASSERT(0 <= input_idx && input_idx < numRows);
const int64_t weightRow = input_idx * weight_stride0;
scalar_t weightValue = weightFeat[weightRow];
weightValue = pad ? static_cast<scalar_t>(0) : weightValue;

View File

@ -78,9 +78,9 @@ _mx8_mx8_bf16_grouped_mm_fbgemm(
const Tensor& mat_a,
const Tensor& mat_b,
const Tensor& scale_a,
const SwizzleType swizzle_a,
const SwizzleType& swizzle_a,
const Tensor& scale_b,
const SwizzleType swizzle_b,
const SwizzleType& swizzle_b,
const std::optional<at::Tensor>& offs,
Tensor& out) {
const bool a_is_2d = mat_a.dim() == 2;
@ -607,8 +607,6 @@ _scaled_grouped_mm_cuda_v2(
// scale shape checks
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
// swizze checks
TORCH_CHECK_VALUE(swizzle_a_enum.size() == 1 && swizzle_b_enum.size() == 1, "Expected single swizzle argument");
return _mx8_mx8_bf16_grouped_mm_fbgemm(
mat_a,
mat_b,

View File

@ -2,250 +2,18 @@
#include <ATen/Dispatch.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/cuda/Loops.cuh>
#include <ATen/native/cuda/JitLoops.cuh>
#include <ATen/native/cuda/jit_utils.h>
#include <ATen/native/cuda/ScanUtils.cuh>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/BinaryOps.h>
#include <ATen/OpMathType.h>
#include <c10/util/MathConstants.h>
#include <c10/util/complex.h>
#include <cmath>
#include <limits>
// NOTE: CUDA on Windows requires that the enclosing function
// of a __device__ lambda not have internal linkage.
namespace at::native {
// custom min and max to be used in logaddexp for complex arguments
template <typename scalar_t, bool min>
__host__ __device__ c10::complex<scalar_t> _logaddexp_minmax(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
scalar_t xr = std::real(x);
scalar_t yr = std::real(y);
if (::isnan(yr) || (::isnan(std::imag(y)))) {
return y;
} else if (::isnan(xr) || (::isnan(std::imag(x)))) {
return x;
} else if (min) { // min
return (xr < yr) ? x : y;
} else { // max
return (xr >= yr) ? x : y;
}
}
template <typename scalar_t>
__host__ __device__ scalar_t _log_add_exp_helper(const scalar_t& x, const scalar_t& y) {
// Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
// Using the original expression: `at::_isnan(y) ? y : std::min(x, y)` causes an error in ROCM
const auto isnan_x = at::_isnan(x);
const auto isnan_y = at::_isnan(y);
scalar_t min = isnan_y ? y : (isnan_x ? x : std::min(x, y));
scalar_t max = isnan_y ? y : (isnan_x ? x : std::max(x, y));
if (min != max || ::isfinite(min)) {
// nan will be propagated here
return ::log1p(std::exp(min - max)) + max;
} else {
// special case to correctly handle infinite cases
return x;
}
}
template <typename scalar_t>
__host__ __device__ c10::complex<scalar_t> _fast_build_exp(const c10::complex<scalar_t>& x) {
// complex exponential function, but implemented manually to get fast compilation time
// this function only handles the case where the x is finite (not inf nor nan)
const auto xreal = std::real(x);
const auto ximag = std::imag(x);
const auto exp_x_abs = std::exp(xreal);
auto exp_x_real = exp_x_abs * std::cos(ximag);
auto exp_x_imag = exp_x_abs * std::sin(ximag);
return {exp_x_real, exp_x_imag};
}
template <typename scalar_t>
__host__ __device__ c10::complex<scalar_t> _fast_build_exp_inf(const c10::complex<scalar_t>& x) {
// complex exponential function, but implemented manually to get fast compilation time
// this function only handles the case where the real part of x is infinite
const auto ximag = std::imag(x);
constexpr auto exp_x_abs = std::numeric_limits<scalar_t>::infinity();
if (!::isfinite(ximag)) { // add this to make consitent with std::exp(x+yi)
return {exp_x_abs, std::numeric_limits<scalar_t>::quiet_NaN()};
}
const auto sin = std::sin(ximag);
const auto cos = std::cos(ximag);
// special case if the angle is exactly the multiple of pi/2
auto exp_x_real = (cos == 0) ? (scalar_t)0.0 : exp_x_abs * cos;
auto exp_x_imag = (sin == 0) ? (scalar_t)0.0 : exp_x_abs * sin;
return {exp_x_real, exp_x_imag};
}
template <typename scalar_t>
__host__ __device__ c10::complex<scalar_t> _log_add_exp_helper(const c10::complex<scalar_t>& x, const c10::complex<scalar_t>& y) {
c10::complex<scalar_t> min = _logaddexp_minmax<scalar_t, /*min=*/true>(x, y);
c10::complex<scalar_t> max = _logaddexp_minmax<scalar_t, /*min=*/false>(x, y);
scalar_t min_real = std::real(min);
scalar_t max_real = std::real(max);
if (::isnan(min_real) || ::isnan(std::imag(min))) {
// handling the "infectious" NaNs
return {std::numeric_limits<scalar_t>::quiet_NaN(), std::numeric_limits<scalar_t>::quiet_NaN()};
}
else if ((!::isfinite(min_real)) && (min_real == max_real)) {
if (min_real < 0) {
// handle the -inf case, the imaginary part here does not really matter as the exp(value)
// will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined.
// It does not matter if we're taking the exp of this value
return min;
} else {
// handle the +inf case, we don't need the special precision for log1p for small values
// and to avoid producing nan in case of real(max) == real(min) == +inf
const auto exp_min = _fast_build_exp_inf(min);
const auto exp_max = _fast_build_exp_inf(max);
return ::log1p(exp_min + exp_max - 1); // log1p(x - 1) builds faster than log
}
} else {
const auto minmax = min - max;
c10::complex<scalar_t> exp_minmax;
if (!::isfinite(minmax.real())) {
exp_minmax = minmax.real() < 0 ? c10::complex<scalar_t>{0.0, 0.0} : _fast_build_exp_inf(minmax);
} else {
exp_minmax = _fast_build_exp(minmax);
}
return ::log1p(exp_minmax) + max;
}
}
// Complex logaddexp jiterator string
const auto logaddexp_complex_string = jiterator_stringify(
template<typename T>
std::complex<T> log1p(const std::complex<T>& z)
{
using complex_t = std::complex<T>;
T x = z.real();
T y = z.imag();
T zabs = abs(z);
T theta = atan2(y, x + T(1));
if (zabs < 0.5) {
T r = x * (T(2) + x) + y * y;
if (r == 0) { // handle underflow
return complex_t(x, theta);
}
return complex_t(T(0.5) * std::log1p(r), theta);
} else {
T z0 = std::hypot(x + 1, y);
return complex_t(log(z0), theta);
}
}
// separated _logaddexp_minmax into 2 different functions for jiterator_string
template <typename T>
std::complex<T> logaddexp_min(const std::complex<T>& x, const std::complex<T>& y) {
T xr = x.real();
T yr = y.real();
if (isnan(yr) || isnan(y.imag())) {
return y;
} else if (isnan(xr) || isnan(x.imag())) {
return x;
} else {
return (xr < yr) ? x : y;
}
}
template <typename T>
std::complex<T> logaddexp_max(const std::complex<T>& x, const std::complex<T>& y) {
T xr = x.real();
T yr = y.real();
if (isnan(yr) || isnan(y.imag())) {
return y;
} else if (isnan(xr) || isnan(x.imag())) {
return x;
} else {
return (xr >= yr) ? x : y;
}
}
template <typename T>
std::complex<T> fast_build_exp(const std::complex<T>& x) {
const auto xreal = x.real();
const auto ximag = x.imag();
const auto exp_x_abs = exp(xreal);
auto exp_x_real = exp_x_abs * cos(ximag);
auto exp_x_imag = exp_x_abs * sin(ximag);
return std::complex<T>(exp_x_real, exp_x_imag);
}
template <typename T>
std::complex<T> fast_build_exp_inf(const std::complex<T>& x) {
using complex_t = std::complex<T>;
const auto ximag = x.imag();
const T exp_x_abs = INFINITY;
if (!isfinite(ximag)) {
return complex_t(exp_x_abs, NAN);
}
const auto sin_val = sin(ximag);
const auto cos_val = cos(ximag);
auto exp_x_real = (cos_val == T(0)) ? T(0) : exp_x_abs * cos_val;
auto exp_x_imag = (sin_val == T(0)) ? T(0) : exp_x_abs * sin_val;
return complex_t(exp_x_real, exp_x_imag);
}
template <typename complex_t>
complex_t logaddexp_complex(complex_t x, complex_t y) {
using T = typename complex_t::value_type;
complex_t min_val = logaddexp_min(x, y);
complex_t max_val = logaddexp_max(x, y);
T min_real = min_val.real();
T max_real = max_val.real();
if (isnan(min_real) || isnan(min_val.imag())) {
return complex_t(NAN, NAN);
}
else if ((!isfinite(min_real)) && (min_real == max_real)) {
if (min_real < T(0)) {
return min_val;
} else {
const auto exp_min = fast_build_exp_inf<T>(min_val);
const auto exp_max = fast_build_exp_inf<T>(max_val);
return log1p(exp_min + exp_max - complex_t(1, 0));
}
} else {
const auto minmax = min_val - max_val;
complex_t exp_minmax;
if (!isfinite(minmax.real())) {
exp_minmax = (minmax.real() < T(0)) ? complex_t(0, 0) : fast_build_exp_inf<T>(minmax);
} else {
exp_minmax = fast_build_exp<T>(minmax);
}
return log1p(exp_minmax) + max_val;
}
}
);
constexpr char logaddexp_complex_name[] = "logaddexp_complex";
void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
if (at::isComplexType(iter.dtype())) {
#if AT_USE_JITERATOR()
AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
jitted_gpu_kernel<
/*name=*/logaddexp_complex_name,
/*return_dtype=*/scalar_t,
/*common_dtype=*/scalar_t,
/*arity=*/2>(iter, logaddexp_complex_string);
});
#else
AT_DISPATCH_COMPLEX_TYPES_AND(at::ScalarType::ComplexHalf, iter.dtype(), "logaddexp_cuda", [&]() {
using opmath_t = at::opmath_type<scalar_t>;
gpu_kernel(iter, [] GPU_LAMBDA (scalar_t a_, scalar_t b_) -> scalar_t {
const auto a = static_cast<opmath_t>(a_);
const auto b = static_cast<opmath_t>(b_);
return static_cast<scalar_t>(_log_add_exp_helper(a, b));
});
});
#endif
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(
AT_DISPATCH_FLOATING_TYPES_AND2(
ScalarType::BFloat16, ScalarType::Half,
iter.dtype(), "logaddexp_cuda",
[&]() {
@ -261,7 +29,6 @@ void logaddexp_kernel_cuda(TensorIteratorBase& iter) {
}
});
});
}
}
void logaddexp2_kernel_cuda(TensorIteratorBase& iter) {

View File

@ -11,7 +11,7 @@ static inline std::ostream& operator<<(std::ostream& out, dim3 dim) {
if (dim.y == 1 && dim.z == 1) {
out << dim.x;
} else {
out << '[' << dim.x << ',' << dim.y << ',' << dim.z << ']';
out << "[" << dim.x << "," << dim.y << "," << dim.z << "]";
}
return out;
}
@ -27,7 +27,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
out << "input_mult=[";
for (int i = 0; i < 3; i++) {
if (i != 0) {
out << ',';
out << ",";
}
out << config.input_mult[i];
}
@ -35,7 +35,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
out << "output_mult=[";
for (int i = 0; i < 2; i++) {
if (i != 0) {
out << ',';
out << ",";
}
out << config.output_mult[i];
}
@ -49,7 +49,7 @@ std::ostream& operator<<(std::ostream& out, const ReduceConfig& config) {
out << "block=" << config.block() << ", ";
out << "grid=" << config.grid() << ", ";
out << "global_memory_size=" << config.global_memory_size();
out << ')';
out << ")";
return out;
}

View File

@ -740,12 +740,7 @@ _scaled_rowwise_rowwise(
TORCH_CHECK_VALUE(scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat, "scale_a must have ", mat_a.size(0), " Float elements, got ", scale_a.numel())
TORCH_CHECK_VALUE(scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat, "scale_b must have ", mat_b.size(1), " Float elements, got ", scale_b.numel())
// if we have a scale of shape [256, 1] (say), then stride can be [1, 0] - handle this case
TORCH_CHECK_VALUE(
scale_a.stride(1) == 1 ||
scale_a.size(1) == 1,
"expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1)
);
TORCH_CHECK_VALUE(scale_a.stride(1) == 1, "expected scale_a.stride(1) to be 1, but got ", scale_a.stride(1));
TORCH_CHECK_VALUE(scale_b.stride(1) == 1, "expected scale_b.stride(1) to be 1, but got ", scale_b.stride(1));
auto scaling_choice_a = ScalingType::RowWise;
@ -1101,19 +1096,6 @@ _scaled_mxfp8_mxfp8(
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
}
void
_check_mxfp4_support() {
#ifndef USE_ROCM
auto dprops = at::cuda::getCurrentDeviceProperties();
// Only on B200 GPUs
TORCH_CHECK_NOT_IMPLEMENTED(
// B200 = 10.0, B300 = 10.3
dprops->major == 10,
"MXFP4 scaling only supported in CUDA for B200/B300"
);
#endif
}
Tensor&
_scaled_mxfp4_mxfp4(
@ -1126,7 +1108,6 @@ _scaled_mxfp4_mxfp4(
#if defined(_WIN32) || (!defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI))
TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only");
#else
_check_mxfp4_support();
// Restrictions:
// A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",

View File

@ -364,9 +364,9 @@ void f8f8bf16_grouped_gemm_impl_sm90(
// reinterpret_cast<ProblemShape::UnderlyingProblemShape*>(
// stride_output_h + group_count);
// std::cout << "PTRS " << mat_a.data_ptr() << ' ' << mat_b.data_ptr() << "
// std::cout << "PTRS " << mat_a.data_ptr() << " " << mat_b.data_ptr() << "
// "
// << out.data_ptr() << ' ' << scale_a.data_ptr() << ' '
// << out.data_ptr() << " " << scale_a.data_ptr() << " "
// << scale_b.data_ptr() << "\n";
// for (int i = 0; i < group_count; i++) {
// std::cout << "A " << (void*)inputA_ptrs_h[i] << "\n";

View File

@ -1057,14 +1057,14 @@ std::string generate_code(
// TODO these arrays are potentially of the different types, use function
// traits to determine the types
declare_load_arrays << f_inputs_type << " arg" << std::to_string(i)
<< '[' << std::to_string(thread_work_size) << "];\n";
<< "[" << std::to_string(thread_work_size) << "];\n";
}
env.s("declare_load_arrays", declare_load_arrays.str());
std::stringstream declare_store_arrays;
for (int i = 0; i < nOutputs; i++) {
declare_store_arrays << result_type << " out" << std::to_string(i)
<< '[' << std::to_string(thread_work_size) << "];\n";
<< "[" << std::to_string(thread_work_size) << "];\n";
}
env.s("declare_store_arrays", declare_store_arrays.str());
@ -1217,7 +1217,7 @@ std::string generate_code(
for (const auto i : c10::irange(nInputs)){
auto i_string = std::to_string(i);
vector_inputs << "auto * input" << i_string <<
" = reinterpret_cast<const scalar_t*>(data[" << i_string << '+' << nOutputs << "])" <<
" = reinterpret_cast<const scalar_t*>(data[" << i_string << "+" << nOutputs << "])" <<
" + block_work_size * idx;\n";
}
env.s("vector_inputs", vector_inputs.str());
@ -1543,17 +1543,17 @@ NvrtcFunction jit_pwise_function(
// Constructs file path by appending constructed cubin name to cache path
std::stringstream ss;
ss << *cache_dir << '/';
ss << *cache_dir << "/";
ss << kernel_name;
#ifdef USE_ROCM
ss << "_arch" << prop->gcnArchName;
#else
ss << "_arch" << cuda_major << '.' << cuda_minor;
ss << "_arch" << cuda_major << "." << cuda_minor;
#endif
ss << "_nvrtc" << nvrtc_major << '.' << nvrtc_minor;
ss << "_nvrtc" << nvrtc_major << "." << nvrtc_minor;
ss << (compile_to_sass ? "_sass" : "_ptx");
ss << '_' << code.length();
ss << '_' << hash_code;
ss << "_" << code.length();
ss << "_" << hash_code;
file_path = ss.str();
std::ifstream readin{file_path, std::ios::in | std::ifstream::binary};

View File

@ -82,15 +82,15 @@ namespace native {
std::ostream& operator<<(std::ostream& out, const ConvolutionParams& params) {
out << "ConvolutionParams \n"
<< " memory_format = " << params.memory_format << '\n'
<< " data_type = " << cudnnTypeToString(params.dataType) << '\n'
<< " padding = " << ArrayRef<int>{params.padding} << '\n'
<< " stride = " << ArrayRef<int>{params.stride} << '\n'
<< " dilation = " << ArrayRef<int>{params.dilation} << '\n'
<< " groups = " << params.groups << '\n'
<< " memory_format = " << params.memory_format << "\n"
<< " data_type = " << cudnnTypeToString(params.dataType) << "\n"
<< " padding = " << ArrayRef<int>{params.padding} << "\n"
<< " stride = " << ArrayRef<int>{params.stride} << "\n"
<< " dilation = " << ArrayRef<int>{params.dilation} << "\n"
<< " groups = " << params.groups << "\n"
<< " deterministic = " << (params.deterministic ? "true" : "false")
<< '\n'
<< " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << '\n';
<< "\n"
<< " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << "\n";
return out;
}
@ -173,16 +173,16 @@ std::string repro_from_args(const ConvolutionParams& params) {
at::globalContext().float32Precision(
at::Float32Backend::CUDA, at::Float32Op::MATMUL) ==
at::Float32Precision::TF32)
<< '\n';
<< "\n";
ss << "torch.backends.cudnn.benchmark = "
<< pybool(at::globalContext().benchmarkCuDNN()) << '\n';
<< pybool(at::globalContext().benchmarkCuDNN()) << "\n";
ss << "torch.backends.cudnn.deterministic = " << pybool(params.deterministic)
<< '\n';
<< "\n";
ss << "torch.backends.cudnn.allow_tf32 = " << pybool(params.allow_tf32)
<< '\n';
<< "\n";
ss << "data = torch.randn(" << ArrayRef<int>(params.input_size, dim)
<< ", dtype=" << full_dtype << ", ";
ss << "device='cuda', requires_grad=True)" << to_channels_last << '\n';
ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n";
ss << "net = torch.nn.Conv" << dim - 2 << "d(" << in_channels << ", "
<< out_channels << ", ";
ss << "kernel_size=" << ArrayRef<int>(&params.weight_size[2], dim - 2)
@ -192,7 +192,7 @@ std::string repro_from_args(const ConvolutionParams& params) {
ss << "dilation=" << ArrayRef<int>(params.dilation, dim - 2) << ", ";
ss << "groups=" << params.groups << ")\n";
ss << "net = net.cuda()." << partial_dtype << "()" << to_channels_last
<< '\n';
<< "\n";
ss << "out = net(data)\n";
ss << "out.backward(torch.randn_like(out))\n";
ss << "torch.cuda.synchronize()\n\n";

View File

@ -93,10 +93,11 @@ std::ostream& operator<<(std::ostream& out, const ConvolutionArgs& args) {
<< "input: " << args.idesc // already has a trailing newline
<< "output: " << args.odesc // already has a trailing newline
<< "weight: " << args.wdesc // already has a trailing newline
<< "Pointer addresses: " << '\n'
<< " input: " << args.input.const_data_ptr() << '\n'
<< " output: " << args.output.const_data_ptr() << '\n'
<< " weight: " << args.weight.const_data_ptr() << '\n';
<< "Pointer addresses: "
<< "\n"
<< " input: " << args.input.const_data_ptr() << "\n"
<< " output: " << args.output.const_data_ptr() << "\n"
<< " weight: " << args.weight.const_data_ptr() << "\n";
return out;
}

View File

@ -115,7 +115,7 @@ std::ostream& operator<<(
std::copy(
strides.begin(), strides.end() - 1, std::ostream_iterator<int>(oss, ","));
oss << sizes.back();
output << oss.str() << '}';
output << oss.str() << "}";
return output;
}

View File

@ -53,7 +53,7 @@ std::ostream& operator<<(std::ostream& out, const ConvParams& params) {
<< " transposed = " << params.transposed
<< " output_padding = " << IntArrayRef{params.output_padding}
<< " groups = " << params.groups << " benchmark = " << params.benchmark
<< " deterministic = " << params.deterministic << '}';
<< " deterministic = " << params.deterministic << "}";
return out;
}

View File

@ -5,7 +5,6 @@
#include <ATen/native/Resize.h>
#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
#include <ATen/native/xpu/Blas.h>
#include <ATen/xpu/XPUScaledBlas.h>
#include <torch/library.h>
#ifndef AT_PER_OPERATOR_HEADERS
@ -340,399 +339,4 @@ Tensor _scaled_mm_xpu(
out);
}
using acceptance_fn = std::function<bool(
c10::ScalarType,
std::vector<ScalingType>&,
ArrayRef<Tensor>&,
c10::ScalarType,
std::vector<ScalingType>&,
ArrayRef<Tensor>&)>;
using namespace std::placeholders;
namespace scaled_blas = at::native::onednn::scaled;
using scaled_blas::convert_int_to_enum;
using scaled_blas::ScaledGemmImplementation;
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2>
scale_kernel_dispatch = {{
{"tensorwise_tensorwise",
scaled_blas::check_tensorwise_recipe,
ScaledGemmImplementation::TENSORWISE_TENSORWISE},
{"rowwise_rowwise",
scaled_blas::check_rowwise_recipe,
ScaledGemmImplementation::ROWWISE_ROWWISE},
}};
Tensor& _scaled_tensorwise_tensorwise(
const Tensor& mat_a,
const Tensor& mat_b,
const Tensor& scale_a,
const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32
TORCH_CHECK_VALUE(
isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
"mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(),
mat_b.scalar_type());
TORCH_CHECK_VALUE(
scale_a.numel() == 1 && scale_a.scalar_type() == kFloat,
"scale_a must have 1 Float element")
TORCH_CHECK_VALUE(
scale_b.numel() == 1 && scale_b.scalar_type() == kFloat,
"scale_b must have 1 Float element")
auto scaling_choice_a = ScalingType::TensorWise;
auto scaling_choice_b = ScalingType::TensorWise;
_scaled_gemm(
mat_a,
mat_b,
scale_a,
scale_b,
scaling_choice_a,
scaling_choice_b,
bias,
use_fast_accum,
out);
return out;
}
Tensor& _scaled_rowwise_rowwise(
const Tensor& mat_a,
const Tensor& mat_b,
const Tensor& scale_a,
const Tensor& scale_b,
const std::optional<Tensor>& bias,
const c10::ScalarType out_dtype,
bool use_fast_accum,
Tensor& out) {
// Restrictions:
// A, B are FP8, scales are fp32, shape M/N for A/B
TORCH_CHECK_VALUE(
isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()),
"mat_a and mat_b must be fp8 types, got: ",
mat_a.scalar_type(),
mat_b.scalar_type());
TORCH_CHECK_VALUE(
scale_a.size(0) == mat_a.size(0) && scale_a.size(1) == 1,
"scale_a must have shape [",
mat_a.size(0),
", 1], got [",
scale_a.sizes(),
"]");
TORCH_CHECK_VALUE(
scale_a.numel() == mat_a.size(0) && scale_a.scalar_type() == kFloat,
"scale_a must have ",
mat_a.size(0),
" Float elements, got ",
scale_a.numel())
TORCH_CHECK_VALUE(
scale_b.numel() == mat_b.size(1) && scale_b.scalar_type() == kFloat,
"scale_b must have ",
mat_b.size(1),
" Float elements, got ",
scale_b.numel())
TORCH_CHECK_VALUE(
scale_a.stride(1) == 1,
"expected scale_a.stride(1) to be 1, but got ",
scale_a.stride(1));
TORCH_CHECK_VALUE(
scale_b.stride(1) == 1,
"expected scale_b.stride(1) to be 1, but got ",
scale_b.stride(1));
auto scaling_choice_a = ScalingType::RowWise;
auto scaling_choice_b = ScalingType::RowWise;
_scaled_gemm(
mat_a,
mat_b,
scale_a,
scale_b,
scaling_choice_a,
scaling_choice_b,
bias,
use_fast_accum,
out);
return out;
}
// V2: Computes matrix multiply + bias while applying scaling to input and
// output matrices Scales are only applicable when matrices are of Float8 type
// and assumed to be equal to 1.0 by default. If output matrix type is 16 or
// 32-bit type, scale_result is not applied. Known limitations:
// - Only works if mat1 is row-major and mat2 is column-major
// - Only works if matrices sizes are divisible by 32
// - If 1-dimensional tensors are used then scale_a should be size =
// mat1.size(0)
// and scale_b should have size = to mat2.size(1)
// Arguments:
// - `mat_a`: the first operand of the matrix multiply, can be type
// `torch.float8_e4m3fn` or `torch.float8_e5m2`
// - `mat_b`: the second operand of the matrix multiply, can be type
// `torch.float8_e4m3fn` or `torch.float8_e5m2`
// - `scale_a`: a tensor with the inverse scale of `mat1`, whose
// shape/strides/dtype depend on the scaling scheme
// - `scale_recipe_a`: An integer corresponding to an enum describing the
// scaling scheme used for `scale_a`
// - `swizzle_a`: An integer corresponding to a `SwizzleType` enum describing
// the swizzling scheme for `scale_a`.
// Not supported for XPU for now.
// - `scale_b`: a tensor with the inverse scale of `mat2`, whose
// shape/strides/dtype depend on the scaling scheme
// - `scale_recipe_b`: An integer corresponding to an enum describing the
// scaling scheme used for `scale_b`
// - `swizzle_b`: An integer corresponding to a `SwizzleType` enum describing
// the swizzling scheme for `scale_b`.
// Not supported for XPU for now.
// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
// - `out_dtype`: the output dtype, can either be a float8 or a higher
// precision floating point type
// - `contraction_dim`: describe which dimensions are `K` in the matmul.
// Not supported for XPU. Should always be empty.
// - `use_fast_accum`: Not supported for XPU, should always be false.
// - `out`: a reference to the output tensor
Tensor& _scaled_mm_xpu_v2_out(
const Tensor& mat_a,
const Tensor& mat_b,
ArrayRef<Tensor> scale_a,
IntArrayRef scale_recipe_a,
IntArrayRef swizzle_a,
ArrayRef<Tensor> scale_b,
IntArrayRef scale_recipe_b,
IntArrayRef swizzle_b,
const std::optional<Tensor>& bias,
const std::optional<c10::ScalarType> out_dtype,
IntArrayRef contraction_dim,
bool use_fast_accum,
Tensor& out) {
TORCH_CHECK_VALUE(mat_a.dim() == 2, "mat_a must be a matrix");
TORCH_CHECK_VALUE(mat_b.dim() == 2, "mat_b must be a matrix");
// If any of M, K, N is 0 - return early (the tensorwise/rowwise float8 gemm
// kernels do not support this case).
if (mat_a.size(0) == 0 || mat_a.size(1) == 0 || mat_b.size(1) == 0) {
// `out` was created with `at::empty`. In the case where we are multiplying
// MxK by KxN and K is the zero dim, we need to initialize here to properly
// return a tensor of zeros.
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
if (mat_a.size(1) == 0) {
out.zero_();
}
return out;
}
// Note: The `contraction_dim` is not actually used for now. We will need to
// align this code when upstreamed CUDA code is done. Currently, only keeps
// the code here for check.
// Check if the input matrix sizes can be multiplied
// - if optional contraction dims are provided, use those
// -- mostly for < 1B formats (i.e. nvfp4x2) where cheap .t() is not
// available.
if (contraction_dim.size() > 0) {
TORCH_CHECK_VALUE(
contraction_dim.size() == 2,
"contraction_dim must have exactly 2 elements");
auto mat_a_dim = contraction_dim[0];
auto mat_b_dim = contraction_dim[1];
TORCH_CHECK_VALUE(
mat_a.size(mat_a_dim) == mat_b.size(mat_b_dim),
"mat_a and mat_b shapes cannot be multiplied (",
mat_a.size(0),
"x",
mat_a.size(1),
" and ",
mat_b.size(0),
"x",
mat_b.size(1),
") ",
"with contraction dims mat_a: ",
mat_a_dim,
", mat_b: ",
mat_b_dim);
} else {
TORCH_CHECK_VALUE(
mat_a.size(1) == mat_b.size(0),
"mat_a and mat_b shapes cannot be multiplied (",
mat_a.size(0),
"x",
mat_a.size(1),
" and ",
mat_b.size(0),
"x",
mat_b.size(1),
")");
}
TORCH_CHECK_VALUE(
!bias || bias->numel() == mat_b.sizes()[1],
"Bias must be size ",
mat_b.sizes()[1],
" but got ",
bias->numel());
TORCH_CHECK_VALUE(
!out_dtype || *out_dtype == out.scalar_type(),
"out_dtype must match output matrix type");
if (bias) {
TORCH_CHECK_VALUE(
bias->scalar_type() == kFloat ||
bias->scalar_type() == c10::ScalarType::BFloat16 ||
bias->scalar_type() == c10::ScalarType::Half,
"Bias must be Float32 or BFloat16 or Half, but got ",
bias->scalar_type());
}
{
auto bias_ = bias.value_or(Tensor());
// NOLINTNEXTLINE(*c-array*)
TensorArg targs[]{
{out, "out", 0},
{mat_a, "mat_a", 1},
{mat_b, "mat_b", 2},
{bias_, "bias", 3},
{scale_a[0], "scale_a", 4},
{scale_b[0], "scale_b", 5}};
checkAllSameGPU(__func__, targs);
}
// Align with CUDA's default out to be bf16
auto out_dtype_ = out_dtype.value_or(c10::ScalarType::BFloat16);
// Conversion of implicitly-defined enums to explicit
auto scale_recipe_a_enum = convert_int_to_enum<ScalingType>(scale_recipe_a);
auto swizzle_a_enum = convert_int_to_enum<SwizzleType>(swizzle_a);
auto scale_recipe_b_enum = convert_int_to_enum<ScalingType>(scale_recipe_b);
auto swizzle_b_enum = convert_int_to_enum<SwizzleType>(swizzle_b);
// XPU does not support swizzle for now. So directly return false.
TORCH_CHECK_VALUE(
swizzle_a_enum[0] == at::blas::SwizzleType::NO_SWIZZLE &&
swizzle_b_enum[0] == at::blas::SwizzleType::NO_SWIZZLE,
"XPU does not support swizzle yet.");
// at this point we can start working out what we want to be doing
// Try to do as few steps as possible.
// NOTE: support is deliberately sparse, can explicitly enumerate all
// combinations allowed. Do this via a list of defined (name, acceptance,
// concrete_impl) tuples.
bool found_impl = false;
ScaledGemmImplementation gemm_impl = ScaledGemmImplementation::NONE;
for (const auto& fn_entry : scale_kernel_dispatch) {
const auto [name, accept_fn, scaled_gemm_impl] = fn_entry;
bool ok = accept_fn(
mat_a.scalar_type(),
scale_recipe_a_enum,
scale_a,
mat_b.scalar_type(),
scale_recipe_b_enum,
scale_b);
if (ok) {
gemm_impl = scaled_gemm_impl;
found_impl = true;
break;
}
}
TORCH_CHECK_VALUE(
found_impl,
"Invalid scaling configuration.\n"
"- For TensorWise scaling, a and b should be float8, scales should be float and singletons.\n"
"- For RowWise scaling, a and b should be float8, scales should be float, scale_a should be (",
mat_a.size(0),
", 1) and scale_b should be (1, ",
mat_b.size(1),
"), and both should be contiguous.\n"
"Got mat_a.dtype()=",
mat_a.scalar_type(),
", scale_a[0].dtype()=",
scale_a[0].scalar_type(),
", scale_a[0].size()=",
scale_a[0].sizes(),
", scale_a[0].stride()=",
scale_a[0].strides(),
", ",
"mat_b.dtype()=",
mat_b.scalar_type(),
", scale_b[0].dtype()=",
scale_b[0].scalar_type(),
", scale_b[0].size()=",
scale_b[0].sizes(),
" and scale_b[0].stride()=",
scale_b[0].strides());
at::native::resize_output(out, {mat_a.size(0), mat_b.size(1)});
auto bias_ = bias.value_or(Tensor());
// dispatch to appropriate lower-level calls for error checking & execution
if (gemm_impl == ScaledGemmImplementation::TENSORWISE_TENSORWISE) {
return _scaled_tensorwise_tensorwise(
mat_a,
mat_b,
scale_a[0],
scale_b[0],
bias,
out_dtype_,
use_fast_accum,
out);
} else if (gemm_impl == ScaledGemmImplementation::ROWWISE_ROWWISE) {
return _scaled_rowwise_rowwise(
mat_a,
mat_b,
scale_a[0],
scale_b[0],
bias,
out_dtype_,
use_fast_accum,
out);
} else {
TORCH_CHECK_VALUE(
false, "Invalid state - found an implementation, but not really");
}
}
Tensor _scaled_mm_xpu_v2(
const Tensor& mat_a,
const Tensor& mat_b,
ArrayRef<Tensor> scale_a,
IntArrayRef scale_recipe_a,
IntArrayRef swizzle_a,
ArrayRef<Tensor> scale_b,
IntArrayRef scale_recipe_b,
IntArrayRef swizzle_b,
const std::optional<Tensor>& bias,
const std::optional<c10::ScalarType> out_dtype,
IntArrayRef contraction_dim,
bool use_fast_accum) {
const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
return _scaled_mm_xpu_v2_out(
mat_a,
mat_b,
scale_a,
scale_recipe_a,
swizzle_a,
scale_b,
scale_recipe_b,
swizzle_b,
bias,
out_dtype,
contraction_dim,
use_fast_accum,
out);
}
} // namespace at::native

View File

@ -82,7 +82,6 @@ NSArray<NSNumber*>* getTensorAxes(const TensorBase& t);
NSArray<NSNumber*>* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim);
std::string getMPSShapeString(MPSShape* shape);
std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true, bool exclude_shape = false);
std::string to_hex_key(float);
std::string getArrayRefString(const IntArrayRef s);
// use has_storage() on the returned tensor to determine if src actually is a view
Tensor gatherViewTensor(const Tensor& src, Tensor& dst);

View File

@ -301,10 +301,6 @@ std::string getArrayRefString(const IntArrayRef s) {
return fmt::to_string(fmt::join(s, ","));
}
std::string to_hex_key(float f) {
return fmt::format("{:a}", f);
}
std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype, bool exclude_shape) {
fmt::basic_memory_buffer<char, 100> buffer;
auto buf_iterator = std::back_inserter(buffer);

View File

@ -91,30 +91,25 @@ static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#include <ATen/native/mps/Repeat_metallib.h>
#endif
Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
TORCH_CHECK(repeat.dim() == 1, "repeat_interleave only accept 1D vector as repeat");
template <typename index_t>
void computeRepeatIndices(const index_t* repeat_ptr,
const int64_t* cumsum_ptr,
index_t* result_ptr,
int64_t size,
int64_t result_size) {
id<MTLBuffer> repeatBuffer = reinterpret_cast<id<MTLBuffer>>(repeat_ptr);
id<MTLBuffer> cumsumBuffer = reinterpret_cast<id<MTLBuffer>>(cumsum_ptr);
id<MTLBuffer> resultBuffer = reinterpret_cast<id<MTLBuffer>>(result_ptr);
TORCH_CHECK(repeatBuffer && cumsumBuffer && resultBuffer);
std::string scalar_type;
if (repeat.scalar_type() == kInt) {
if constexpr (std::is_same_v<index_t, int32_t>) {
scalar_type = "int32_t";
} else if (repeat.scalar_type() == kLong) {
} else if constexpr (std::is_same_v<index_t, int64_t>) {
scalar_type = "int64_t";
} else {
TORCH_CHECK(false, "repeats has to be Long or Int tensor");
TORCH_CHECK(false, "repeat_interleave: unsupported indexing data type");
}
if (repeat.size(0) == 0) {
return at::empty_like(repeat, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
}
Tensor repeat_ = repeat.contiguous();
Tensor cumsum = repeat.cumsum(0);
int64_t total = 0;
if (output_size.has_value()) {
total = output_size.value();
} else {
total = cumsum[-1].item<int64_t>();
TORCH_CHECK((repeat >= 0).all().item<uint8_t>(), "repeats can not be negative");
}
auto result = at::empty({total}, repeat.options());
MPSStream* mpsStream = getCurrentMPSStream();
dispatch_sync(mpsStream->queue(), ^() {
@ -126,13 +121,20 @@ Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output
getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false);
[computeEncoder setComputePipelineState:pipelineState];
mps::mtl_setArgs(computeEncoder, repeat_, cumsum, result, repeat.size(0));
mps::mtl_dispatch1DJob(computeEncoder, pipelineState, repeat.size(0));
mps::mtl_setArgs(computeEncoder, repeatBuffer, cumsumBuffer, resultBuffer, size);
mps::mtl_dispatch1DJob(computeEncoder, pipelineState, size);
getMPSProfiler().endProfileKernel(pipelineState);
}
});
return result;
}
Tensor repeat_interleave_mps(const Tensor& repeat, std::optional<int64_t> output_size) {
Tensor output;
AT_DISPATCH_INDEX_TYPES(repeat.scalar_type(), "repeat_interleave_mps", [&]() {
output = repeat_interleave_common<index_t, computeRepeatIndices<index_t>>(repeat, output_size);
});
return output;
}
} // namespace at::native

View File

@ -5,7 +5,6 @@
#include <ATen/native/Resize.h>
#include <ATen/native/TensorCompare.h>
#include <ATen/native/mps/OperationUtils.h>
#include <algorithm>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -90,21 +89,13 @@ static void check_min_max_dims(const OptionalTensorRef clamp_opt, const Tensor&
auto clamp_shape = clamp_opt->sizes();
auto input_shape = input_t.sizes();
if (num_clamp_dims > num_input_dims) {
auto leading_dims = num_clamp_dims - num_input_dims;
for (int64_t i = 0; i < leading_dims; ++i) {
TORCH_CHECK(clamp_shape[i] == 1,
op_name + ": clamp tensor leading shape must be 1 to broadcast with input tensor");
}
}
TORCH_CHECK(num_clamp_dims <= num_input_dims,
op_name + ": clamp tensor number of dims must not be greater than that of input tensor")
auto clamp_idx = num_clamp_dims - 1;
auto input_idx = num_input_dims - 1;
auto common_dims = std::min(num_clamp_dims, num_input_dims);
for (int64_t i = 0; i < common_dims; ++i)
for (int i = 0; i < num_clamp_dims; i++)
// One of the indices is allowed to be 1; will be handled by broadcast
TORCH_CHECK(clamp_shape[clamp_idx - i] == input_shape[input_idx - i] || clamp_shape[clamp_idx - i] == 1 ||
input_shape[input_idx - i] == 1,
TORCH_CHECK(clamp_shape[num_clamp_dims - 1 - i] == input_shape[num_input_dims - 1 - i] ||
clamp_shape[num_clamp_dims - 1 - i] == 1 || input_shape[num_input_dims - 1 - i] == 1,
op_name + ": clamp tensor trailing shape must match input tensor")
}
}
@ -145,6 +136,9 @@ static void clamp_tensor_out_mps(const Tensor& input_t,
auto result_type = output_t.scalar_type();
IntArrayRef new_min_shape;
IntArrayRef new_max_shape;
auto num_min_dims = min_opt->dim();
auto num_max_dims = max_opt->dim();
auto num_input_dims = input_t.dim();
@ -152,32 +146,24 @@ static void clamp_tensor_out_mps(const Tensor& input_t,
std::vector<int64_t> new_min_arr(num_input_dims);
std::vector<int64_t> new_max_arr(num_input_dims);
if (has_min && num_min_dims < num_input_dims) {
fill_new_shape(num_input_dims, num_min_dims, new_min_arr.data(), min_opt->sizes());
new_min_shape = IntArrayRef(new_min_arr);
}
if (has_max && num_max_dims < num_input_dims) {
fill_new_shape(num_input_dims, num_max_dims, new_max_arr.data(), max_opt->sizes());
new_max_shape = IntArrayRef(new_max_arr);
}
Tensor min_opt_tensor;
Tensor max_opt_tensor;
auto reshape_clamp_tensor = [&](const OptionalTensorRef clamp_tensor_ref,
int64_t num_clamp_dims,
std::vector<int64_t>& new_shape_storage) -> Tensor {
IntArrayRef clamp_shape = clamp_tensor_ref->sizes();
bool requires_view = false;
if (num_clamp_dims > num_input_dims) {
clamp_shape = clamp_shape.slice(num_clamp_dims - num_input_dims);
requires_view = true;
} else if (num_clamp_dims < num_input_dims) {
fill_new_shape(num_input_dims, num_clamp_dims, new_shape_storage.data(), clamp_shape);
clamp_shape = IntArrayRef(new_shape_storage);
requires_view = true;
}
return requires_view ? (*clamp_tensor_ref).view(clamp_shape) : *clamp_tensor_ref;
};
if (has_min) {
min_opt_tensor = reshape_clamp_tensor(min_opt, num_min_dims, new_min_arr);
min_opt_tensor = (num_min_dims < num_input_dims) ? (*min_opt).view(new_min_shape) : *min_opt;
}
if (has_max) {
max_opt_tensor = reshape_clamp_tensor(max_opt, num_max_dims, new_max_arr);
max_opt_tensor = (num_max_dims < num_input_dims) ? (*max_opt).view(new_max_shape) : *max_opt;
}
@autoreleasepool {
@ -258,8 +244,8 @@ static void clamp_scalar_out_mps(const Tensor& input_t,
@autoreleasepool {
// the optional min/max refs could affect how we build the cached graph
std::string key = op_name + (has_min ? ("_min:" + to_hex_key(min_scalar)) : "") +
(has_max ? ("_max:" + to_hex_key(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t});
std::string key = op_name + (has_min ? ("_min:" + std::to_string(min_scalar)) : "") +
(has_max ? ("_max:" + std::to_string(max_scalar)) : "") + "_scalar:" + getTensorsStringKey({input_t});
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
if (has_min)
newCachedGraph->minTensor = [mpsGraph constantWithScalar:min_scalar

View File

@ -4225,7 +4225,7 @@
MTIA: mm_out_mtia
MPS: mm_out_mps
XPU: mm_out_xpu
SparseCPU, SparseCUDA, SparseMPS: _sparse_mm_out
SparseCPU, SparseCUDA: _sparse_mm_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out
- func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor

View File

@ -301,12 +301,12 @@ class AvgPoolMicrokernelTester {
ASSERT_NEAR(
float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f)
<< "at pixel " << i << ", channel " << k << ", n = " << n()
<< ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< ", ks = " << kh() << "x" << kw() << " (" << ks()
<< "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
ASSERT_EQ(
uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
<< "at pixel " << i << ", channel " << k << ", n = " << n()
<< ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< ", ks = " << kh() << "x" << kw() << " (" << ks()
<< "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
}
}
@ -396,12 +396,12 @@ class AvgPoolMicrokernelTester {
ASSERT_NEAR(
float(int32_t(y[i * yStride() + k])), yFP[i * kc() + k], 0.5001f)
<< "at pixel " << i << ", channel " << k << ", n = " << n()
<< ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< ", ks = " << kh() << "x" << kw() << " (" << ks()
<< "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
ASSERT_EQ(
uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
<< "at pixel " << i << ", channel " << k << ", n = " << n()
<< ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< ", ks = " << kh() << "x" << kw() << " (" << ks()
<< "), kc = " << kc() << ", acc = " << yAcc[i * kc() + k];
}
}

View File

@ -232,7 +232,7 @@ class MaxPoolMicrokernelTester {
ASSERT_EQ(
uint32_t(yRef[i * kc() + k]), uint32_t(y[i * yStride() + k]))
<< "at pixel " << i << ", channel " << k << ", n = " << n()
<< ", ks = " << kh() << 'x' << kw() << " (" << ks()
<< ", ks = " << kh() << "x" << kw() << " (" << ks()
<< "), kc = " << kc();
}
}

View File

@ -17,7 +17,7 @@ inline std::vector<T> _expand_param_if_needed(
std::ostringstream ss;
ss << "expected " << param_name << " to be a single integer value or a "
<< "list of " << expected_dim << " values to match the convolution "
<< "dimensions, but got " << param_name << '=' << list_param;
<< "dimensions, but got " << param_name << "=" << list_param;
TORCH_CHECK(false, ss.str());
} else {
return list_param.vec();

View File

@ -358,9 +358,9 @@ std::string Adapter::stringize() const {
std::string device_type = get_device_type_str(properties.deviceType);
VkPhysicalDeviceLimits limits = properties.limits;
ss << '{' << std::endl;
ss << "{" << std::endl;
ss << " Physical Device Info {" << std::endl;
ss << " apiVersion: " << v_major << '.' << v_minor << std::endl;
ss << " apiVersion: " << v_major << "." << v_minor << std::endl;
ss << " driverversion: " << properties.driverVersion << std::endl;
ss << " deviceType: " << device_type << std::endl;
ss << " deviceName: " << properties.deviceName << std::endl;
@ -371,7 +371,7 @@ std::string Adapter::stringize() const {
#define PRINT_LIMIT_PROP_VEC3(name) \
ss << " " << std::left << std::setw(36) << #name << limits.name[0] \
<< ',' << limits.name[1] << ',' << limits.name[2] << std::endl;
<< "," << limits.name[1] << "," << limits.name[2] << std::endl;
ss << " Physical Device Limits {" << std::endl;
PRINT_LIMIT_PROP(maxImageDimension1D);
@ -425,7 +425,7 @@ std::string Adapter::stringize() const {
;
}
ss << " ]" << std::endl;
ss << '}';
ss << "}";
return ss.str();
}

View File

@ -33,7 +33,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) {
VK_RESULT_CASE(VK_ERROR_FORMAT_NOT_SUPPORTED)
VK_RESULT_CASE(VK_ERROR_FRAGMENTED_POOL)
default:
out << "VK_ERROR_UNKNOWN (VkResult " << result << ')';
out << "VK_ERROR_UNKNOWN (VkResult " << result << ")";
break;
}
return out;
@ -46,7 +46,7 @@ std::ostream& operator<<(std::ostream& out, const VkResult result) {
//
std::ostream& operator<<(std::ostream& out, const SourceLocation& loc) {
out << loc.function << " at " << loc.file << ':' << loc.line;
out << loc.function << " at " << loc.file << ":" << loc.line;
return out;
}
@ -66,7 +66,7 @@ Error::Error(SourceLocation source_location, const char* cond, std::string msg)
: msg_(std::move(msg)), source_location_{source_location} {
std::ostringstream oss;
oss << "Exception raised from " << source_location_ << ": ";
oss << '(' << cond << ") is false! ";
oss << "(" << cond << ") is false! ";
oss << msg_;
what_ = oss.str();
}

View File

@ -173,8 +173,8 @@ void QueryPool::extract_results() {
static std::string stringize(const VkExtent3D& extents) {
std::stringstream ss;
ss << '{' << extents.width << ", " << extents.height << ", " << extents.depth
<< '}';
ss << "{" << extents.width << ", " << extents.height << ", " << extents.depth
<< "}";
return ss.str();
}

View File

@ -149,7 +149,7 @@ VKAPI_ATTR VkBool32 VKAPI_CALL debug_report_callback_fn(
(void)flags;
std::stringstream stream;
stream << layer_prefix << ' ' << message_code << ' ' << message << std::endl;
stream << layer_prefix << " " << message_code << " " << message << std::endl;
const std::string log = stream.str();
std::cout << log;

Some files were not shown because too many files have changed in this diff Show More