mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-23 06:34:55 +08:00)

Compare commits: perf_ops...install-to
12 Commits
SHA1
3b9b4065af
e1f586a43e
18dc2e03ac
d7c3d8a551
78b4d254aa
8d5240d846
135db45c9c
8139b6b1b1
24c95d83e6
21a34fa017
636d3aa00f
174f2faa8c
@@ -31,8 +31,7 @@ pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
@@ -46,6 +45,5 @@ else
export USE_NVIDIA_PYPI_LIBS=1
fi

#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi
@@ -13,49 +13,6 @@ def list_dir(path: str) -> list[str]:
return check_output(["ls", "-1", path]).decode().split("\n")


def build_ArmComputeLibrary() -> None:
"""
Using ArmComputeLibrary for aarch64 PyTorch
"""
print("Building Arm Compute Library")
acl_build_flags = [
"debug=0",
"neon=1",
"opencl=0",
"os=linux",
"openmp=1",
"cppthreads=0",
"arch=armv8a",
"multi_isa=1",
"fixed_format_kernels=1",
"build=native",
]
acl_install_dir = "/acl"
acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
if os.path.isdir(acl_install_dir):
shutil.rmtree(acl_install_dir)
if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):
check_call(
[
"git",
"clone",
"https://github.com/ARM-software/ComputeLibrary.git",
"-b",
"v25.02",
"--depth",
"1",
"--shallow-submodules",
]
)

check_call(
["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
cwd=acl_checkout_dir,
)
for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")


def replace_tag(filename) -> None:
with open(filename) as f:
lines = f.readlines()
@@ -317,7 +274,7 @@ if __name__ == "__main__":
).decode()

print("Building PyTorch wheel")
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
build_vars = ""
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
@@ -356,19 +313,13 @@ if __name__ == "__main__":
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "

if enable_mkldnn:
build_ArmComputeLibrary()
print("build pytorch with mkldnn+acl backend")
build_vars += (
"USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
"ACL_ROOT_DIR=/acl "
"LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH "
"ACL_INCLUDE_DIR=/acl/build "
"ACL_LIBRARY=/acl/build "
)
build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
build_vars += "ACL_ROOT_DIR=/acl "
if enable_cuda:
build_vars += "BLAS=NVPL "
else:
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS "
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
else:
print("build pytorch without mkldnn backend")
@@ -299,40 +299,6 @@ def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
)


def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None:
print("Building OpenBLAS")
host.run_cmd(
f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.28 {git_clone_flags}"
)
make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8"
host.run_cmd(
f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS"
)


def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None:
print("Building Arm Compute Library")
acl_build_flags = " ".join(
[
"debug=0",
"neon=1",
"opencl=0",
"os=linux",
"openmp=1",
"cppthreads=0",
"arch=armv8a",
"multi_isa=1",
"fixed_format_kernels=1",
"build=native",
]
)
host.run_cmd(
f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}"
)

host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")


def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
host.run_cmd("pip3 install auditwheel")
host.run_cmd(
@@ -700,7 +666,6 @@ def start_build(
configure_system(
host, compiler=compiler, use_conda=use_conda, python_version=python_version
)
build_OpenBLAS(host, git_clone_flags)

if host.using_docker():
print("Move libgfortant.a into a standard location")
@@ -723,6 +688,8 @@ def start_build(
f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
)

host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")

print("Building PyTorch wheel")
build_opts = ""
if pytorch_build_number is not None:
@@ -743,15 +710,18 @@ def start_build(
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
if enable_mkldnn:
build_ArmComputeLibrary(host, git_clone_flags)
host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
print("build pytorch with mkldnn+acl backend")
build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
build_vars += " BLAS=OpenBLAS"
build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
build_vars += " ACL_ROOT_DIR=/acl"
host.run_cmd(
f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}"
f"cd $HOME/pytorch && {build_vars} python3 setup.py bdist_wheel{build_opts}"
)
print("Repair the wheel")
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
ld_library_path = "$HOME/acl/build:$HOME/pytorch/build/lib"
ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
host.run_cmd(
f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
)
@@ -907,7 +877,7 @@ def terminate_instances(instance_type: str) -> None:
def parse_arguments():
from argparse import ArgumentParser

parser = ArgumentParser("Builid and test AARCH64 wheels using EC2")
parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
parser.add_argument("--key-name", type=str)
parser.add_argument("--debug", action="store_true")
parser.add_argument("--build-only", action="store_true")
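
The start_build() changes above fold every backend switch into a single build_vars prefix on the remote setup.py call. Spelled out as the shell command the new branch effectively runs (a rough sketch assembled from the variables visible above, not a verbatim transcript):

cd $HOME/pytorch && \
  USE_MKLDNN=ON USE_MKLDNN_ACL=ON \
  BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS ACL_ROOT_DIR=/acl \
  python3 setup.py bdist_wheel

Env-prefix assignments like these apply only to that one python3 invocation, which is why the old inline export of ACL_ROOT_DIR could be dropped.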
@@ -214,7 +214,8 @@ case "$tag" in
TRITON=yes
;;
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.10
# TODO (huydhn): Upgrade this to Python >= 3.10
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
VISION=yes
KATEX=yes

@@ -1 +1 @@
5ae38bdb0dc066c5823e34dc9797afb9de42c866
fccfc522864cf8bc172abe0cd58ae5581e2d44b9
27 .ci/docker/common/install_acl.sh (Normal file → Executable file)
@@ -1,16 +1,27 @@
set -euo pipefail
#!/bin/bash
# Script used only in CD pipeline

readonly version=v25.02
readonly src_host=https://github.com/ARM-software
readonly src_repo=ComputeLibrary
set -eux

ACL_VERSION=${ACL_VERSION:-"v25.02"}
ACL_INSTALL_DIR="/acl"

# Clone ACL
[[ ! -d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git
cd ${src_repo}

git checkout $version
git clone https://github.com/ARM-software/ComputeLibrary.git -b "${ACL_VERSION}" --depth 1 --shallow-submodules

ACL_CHECKOUT_DIR="ComputeLibrary"
# Build with scons
pushd $ACL_CHECKOUT_DIR
scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \
os=linux arch=armv8a build=native multi_isa=1 \
fixed_format_kernels=1 openmp=1 cppthreads=0
popd

# Install ACL
sudo mkdir -p ${ACL_INSTALL_DIR}
for d in arm_compute include utils support src build
do
sudo cp -r ${ACL_CHECKOUT_DIR}/${d} ${ACL_INSTALL_DIR}/${d}
done

rm -rf $ACL_CHECKOUT_DIR
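
The rewritten script pins the ACL tag through a shell default expansion, so CI images can override it without editing the file. A minimal sketch of that pattern (the alternate tag below is purely illustrative):

# inside install_acl.sh: the caller's ACL_VERSION wins, else the pinned default
ACL_VERSION=${ACL_VERSION:-"v25.02"}

# one-off build against a different release tag
ACL_VERSION=v24.09 bash .ci/docker/common/install_acl.sh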
12 .ci/docker/common/install_openblas.sh (Normal file → Executable file)
@@ -3,8 +3,10 @@

set -ex

cd /
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules
OPENBLAS_VERSION=${OPENBLAS_VERSION:-"v0.3.30"}

# Clone OpenBLAS
git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" --depth 1 --shallow-submodules

OPENBLAS_CHECKOUT_DIR="OpenBLAS"
OPENBLAS_BUILD_FLAGS="
@@ -17,5 +19,7 @@ CFLAGS=-O3
BUILD_BFLOAT16=1
"

make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}
make -j8 ${OPENBLAS_BUILD_FLAGS} -C $OPENBLAS_CHECKOUT_DIR
sudo make install -C $OPENBLAS_CHECKOUT_DIR

rm -rf $OPENBLAS_CHECKOUT_DIR
@@ -62,6 +62,13 @@ ARG OPENBLAS_VERSION
ADD ./common/install_openblas.sh install_openblas.sh
RUN bash ./install_openblas.sh && rm install_openblas.sh

# Install Arm Compute Library
FROM base as arm_compute
# use python3.9 to install scons
RUN python3.9 -m pip install scons==4.7.0
RUN ln -sf /opt/python/cp39-cp39/bin/scons /usr/local/bin
COPY ./common/install_acl.sh install_acl.sh
RUN bash ./install_acl.sh && rm install_acl.sh
FROM base as final

# remove unnecessary python versions
@@ -70,4 +77,5 @@ RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/
ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
COPY --from=arm_compute /acl /acl
ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:/acl/build/:$LD_LIBRARY_PATH
@@ -28,6 +28,7 @@ fi
MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
OPENBLAS_VERSION=${OPENBLAS_VERSION:-}
ACL_VERSION=${ACL_VERSION:-}

case ${image} in
manylinux2_28-builder:cpu)
@@ -41,7 +42,6 @@ case ${image} in
GPU_IMAGE=arm64v8/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
MANY_LINUX_VERSION="2_28_aarch64"
OPENBLAS_VERSION="v0.3.30"
;;
manylinuxcxx11-abi-builder:cpu-cxx11-abi)
TARGET=final
@@ -121,7 +121,8 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
DOCKER_BUILDKIT=1 docker build \
${DOCKER_GPU_BUILD_ARG} \
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
--build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \
--build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION:-}" \
--build-arg "ACL_VERSION=${ACL_VERSION:-}" \
--target "${TARGET}" \
-t "${tmp_tag}" \
$@ \
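
Note the :- in the two new build args: if the variable was never exported, ${OPENBLAS_VERSION:-} expands to an empty string instead of raising an unbound-variable error, so the build arg is simply passed empty. A quick illustration (hypothetical shell session, not part of the script):

set -u                                # strict mode: unset variables are fatal
unset ACL_VERSION
echo "ACL_VERSION=${ACL_VERSION:-}"   # prints "ACL_VERSION=" rather than aborting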
@@ -7,4 +7,4 @@ set -ex

SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
@@ -66,11 +66,6 @@ class VllmBuildParameters:
"DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
)

# the cleaning script to remove torch dependencies from pip
cleaning_script: Path = env_path_field(
"cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
)

# OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")

@@ -165,7 +160,6 @@ class VllmBuildRunner(BaseRunner):
logger.info("Running vllm build with inputs: %s", inputs)
vllm_commit = clone_vllm()

self.cp_torch_cleaning_script(inputs)
self.cp_dockerfile_if_exist(inputs)
# cp torch wheels from root direct to vllm workspace if exist
self.cp_torch_whls_if_exist(inputs)
@@ -211,11 +205,6 @@ class VllmBuildRunner(BaseRunner):
copy(inputs.torch_whls_path, tmp_dir)
return tmp_dir

def cp_torch_cleaning_script(self, inputs: VllmBuildParameters):
script = get_path(inputs.cleaning_script, resolve=True)
vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
copy(script, vllm_script)

def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters):
if not inputs.use_local_dockerfile:
logger.info("using vllm default dockerfile.torch_nightly for build")
@@ -11,7 +11,7 @@ from typing import Any

from cli.lib.common.cli_helper import BaseRunner
from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env
from cli.lib.common.path_helper import copy, get_path, remove_dir
from cli.lib.common.path_helper import copy, remove_dir
from cli.lib.common.pip_helper import (
pip_install_first_match,
pip_install_packages,
@@ -43,10 +43,6 @@ class VllmTestParameters:

torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")

cleaning_script: Path = env_path_field(
"cleaning_script", ".github/ci_configs/vllm/use_existing_torch.py"
)

def __post_init__(self):
if not self.torch_whls_path.exists():
raise ValueError("missing torch_whls_path")
@@ -96,13 +92,11 @@ class VllmTestRunner(BaseRunner):
self._set_envs(params)

clone_vllm(dst=self.work_directory)
self.cp_torch_cleaning_script(params)
with working_directory(self.work_directory):
remove_dir(Path("vllm"))
self._install_wheels(params)
self._install_dependencies()
# verify the torches are not overridden by test dependencies

check_versions()

def run(self):
@@ -131,11 +125,6 @@ class VllmTestRunner(BaseRunner):
# double check the torches are not overridden by other packages
check_versions()

def cp_torch_cleaning_script(self, params: VllmTestParameters):
script = get_path(params.cleaning_script, resolve=True)
vllm_script = Path(f"./{self.work_directory}/use_existing_torch.py")
copy(script, vllm_script)

def _install_wheels(self, params: VllmTestParameters):
logger.info("Running vllm test with inputs: %s", params)
if not pkg_exists("torch"):
@@ -89,7 +89,7 @@ fi
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
export USE_MKLDNN=1
export USE_MKLDNN_ACL=1
export ACL_ROOT_DIR=/ComputeLibrary
export ACL_ROOT_DIR=/acl
fi

if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then
@@ -258,19 +258,11 @@ function install_torchrec_and_fbgemm() {
git clone --recursive https://github.com/pytorch/fbgemm
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}" --recurse-submodules
# until the fbgemm_commit includes the tbb patch
patch <<'EOF'
--- a/FbgemmGpu.cmake
+++ b/FbgemmGpu.cmake
@@ -184,5 +184,6 @@ gpu_cpp_library(
fbgemm_gpu_tbe_cache
fbgemm_gpu_tbe_optimizers
fbgemm_gpu_tbe_utils
+ tbb
DESTINATION
fbgemm_gpu)
EOF
python setup.py bdist_wheel --build-variant=rocm
python setup.py bdist_wheel \
--build-variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
popd

# Save the wheel before cleaning up
@@ -35,10 +35,11 @@ fi

print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
# Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then
@@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd

python -mpip install -r requirements.txt

# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1

python -mpip install --no-input -r requirements.txt

setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to
@@ -181,9 +177,6 @@ checkout_install_torchbench() {
popd

pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt
# https://github.com/pytorch/pytorch/issues/160689 to remove torchao because
# its current version 0.12.0 doesn't work with transformers 4.54.0
pip uninstall -y torchao

echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
@@ -386,8 +386,8 @@ def smoke_test_compile(device: str = "cpu") -> None:


def smoke_test_nvshmem() -> None:
if not torch.cuda.is_available() or target_os == "windows":
print("Windows platform or CUDA is not available, skipping NVSHMEM test")
if not torch.cuda.is_available():
print("CUDA is not available, skipping NVSHMEM test")
return

# Check if NVSHMEM is compiled in current build
@@ -396,9 +396,7 @@ def smoke_test_nvshmem() -> None:
except ImportError:
# Not built with NVSHMEM support.
# torch is not compiled with NVSHMEM prior to 2.9
from torch.torch_version import TorchVersion

if TorchVersion(torch.__version__) < (2, 9):
if torch.__version__ < "2.9":
return
else:
# After 2.9: NVSHMEM is expected to be compiled in current build
|
||||
}
|
||||
|
||||
test_inductor_micro_benchmark() {
|
||||
# torchao requires cuda 8.0 or above for bfloat16 support
|
||||
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
|
||||
export TORCH_CUDA_ARCH_LIST="8.0;8.6"
|
||||
fi
|
||||
install_torchao
|
||||
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
||||
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
|
||||
test_inductor_set_cpu_affinity
|
||||
@ -1614,26 +1609,6 @@ test_operator_benchmark() {
|
||||
--expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
|
||||
}
|
||||
|
||||
test_operator_microbenchmark() {
|
||||
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
||||
mkdir -p "$TEST_REPORTS_DIR"
|
||||
TEST_DIR=$(pwd)
|
||||
|
||||
pip_uninstall torch torchvision torchaudio
|
||||
pip_install torch==2.8.0 torchvision torchaudio ninja --force-reinstall
|
||||
cd benchmarks/operator_benchmark/pt_extension
|
||||
python -m pip install .
|
||||
|
||||
cd "${TEST_DIR}"/benchmarks/operator_benchmark
|
||||
for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
|
||||
$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
|
||||
--output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
|
||||
--benchmark-name "PyTorch operator microbenchmark" --use-compile
|
||||
$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
|
||||
--output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \
|
||||
--benchmark-name "PyTorch operator microbenchmark"
|
||||
done
|
||||
}
|
||||
|
||||
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
|
||||
(cd test && python -c "import torch; print(torch.__config__.show())")
|
||||
@ -1684,39 +1659,50 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
|
||||
elif [[ "${TEST_CONFIG}" == *all* ]]; then
|
||||
TEST_MODE="all"
|
||||
fi
|
||||
|
||||
test_operator_benchmark cpu ${TEST_MODE}
|
||||
|
||||
fi
|
||||
elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
|
||||
test_operator_microbenchmark
|
||||
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
|
||||
install_torchao
|
||||
fi
|
||||
test_inductor_distributed
|
||||
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
|
||||
test_inductor_halide
|
||||
elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
|
||||
test_inductor_triton_cpu
|
||||
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
|
||||
install_torchao
|
||||
test_inductor_micro_benchmark
|
||||
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
|
||||
install_torchvision
|
||||
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
|
||||
install_torchao
|
||||
fi
|
||||
id=$((SHARD_NUMBER-1))
|
||||
test_dynamo_benchmark huggingface "$id"
|
||||
elif [[ "${TEST_CONFIG}" == *timm* ]]; then
|
||||
install_torchvision
|
||||
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
|
||||
install_torchao
|
||||
fi
|
||||
id=$((SHARD_NUMBER-1))
|
||||
test_dynamo_benchmark timm_models "$id"
|
||||
elif [[ "${TEST_CONFIG}" == cachebench ]]; then
|
||||
install_torchaudio
|
||||
install_torchvision
|
||||
install_torchao
|
||||
PYTHONPATH=/torchbench test_cachebench
|
||||
elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
|
||||
install_torchaudio
|
||||
install_torchvision
|
||||
install_torchao
|
||||
PYTHONPATH=/torchbench test_verify_cachebench
|
||||
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
||||
install_torchaudio
|
||||
install_torchvision
|
||||
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
|
||||
install_torchao
|
||||
fi
|
||||
id=$((SHARD_NUMBER-1))
|
||||
# https://github.com/opencv/opencv-python/issues/885
|
||||
pip_install opencv-python==4.8.0.74
|
||||
@ -1736,13 +1722,24 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
||||
fi
|
||||
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
|
||||
install_torchvision
|
||||
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
|
||||
install_torchao
|
||||
fi
|
||||
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
|
||||
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
|
||||
test_inductor_aoti
|
||||
fi
|
||||
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
|
||||
install_torchvision
|
||||
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
|
||||
install_torchao
|
||||
fi
|
||||
test_inductor_shard "${SHARD_NUMBER}"
|
||||
if [[ "${SHARD_NUMBER}" == 1 ]]; then
|
||||
if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then
|
||||
test_inductor_distributed
|
||||
fi
|
||||
fi
|
||||
elif [[ "${TEST_CONFIG}" == *einops* ]]; then
|
||||
test_einops
|
||||
elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
|
||||
|
@ -85,7 +85,7 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
|
||||
# Create an isolated directory to store this builds pytorch checkout and conda
|
||||
# installation
|
||||
if [[ -z "$MAC_PACKAGE_WORK_DIR" ]]; then
|
||||
MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_${DESIRED_PYTHON}_$(date +%H%M%S)"
|
||||
MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_conda_${DESIRED_PYTHON}_$(date +%H%M%S)"
|
||||
fi
|
||||
mkdir -p "$MAC_PACKAGE_WORK_DIR" || true
|
||||
if [[ -n ${GITHUB_ACTIONS} ]]; then
|
||||
@ -96,11 +96,11 @@ fi
|
||||
whl_tmp_dir="${MAC_PACKAGE_WORK_DIR}/dist"
|
||||
mkdir -p "$whl_tmp_dir"
|
||||
|
||||
mac_version='macosx-11_0-arm64'
|
||||
mac_version='macosx_11_0_arm64'
|
||||
libtorch_arch='arm64'
|
||||
|
||||
# Create a consistent wheel package name to rename the wheel to
|
||||
wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version//[-,]/_}.whl"
|
||||
wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version}.whl"
|
||||
|
||||
###########################################################
|
||||
|
||||
@ -125,6 +125,7 @@ popd
|
||||
export TH_BINARY_BUILD=1
|
||||
export INSTALL_TEST=0 # dont install test binaries into site-packages
|
||||
export MACOSX_DEPLOYMENT_TARGET=11.0
|
||||
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
@ -132,19 +133,25 @@ RENAME_WHEEL=true
|
||||
case $desired_python in
|
||||
3.14t)
|
||||
echo "Using 3.14 deps"
|
||||
mac_version='macosx-11.0-arm64'
|
||||
NUMPY_PINNED_VERSION="==2.1.0"
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
RENAME_WHEEL=false
|
||||
;;
|
||||
3.14)
|
||||
echo "Using 3.14t deps"
|
||||
mac_version='macosx-11.0-arm64'
|
||||
NUMPY_PINNED_VERSION="==2.1.0"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
RENAME_WHEEL=false
|
||||
;;
|
||||
3.13t)
|
||||
echo "Using 3.13 deps"
|
||||
NUMPY_PINNED_VERSION="==2.1.0"
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
RENAME_WHEEL=false
|
||||
;;
|
||||
3.13)
|
||||
@ -169,16 +176,20 @@ case $desired_python in
|
||||
;;
|
||||
esac
|
||||
|
||||
# Install into a fresh env
|
||||
tmp_env_name="wheel_py$python_nodot"
|
||||
conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS}
|
||||
source activate "$tmp_env_name"
|
||||
|
||||
PINNED_PACKAGES=(
|
||||
"numpy${NUMPY_PINNED_VERSION}"
|
||||
)
|
||||
python -mvenv ~/${desired_python}-build
|
||||
source ~/${desired_python}-build/bin/activate
|
||||
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
|
||||
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt"
|
||||
pip install requests ninja typing-extensions
|
||||
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
|
||||
retry brew install libomp
|
||||
|
||||
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
|
||||
# is build as part of tensorpipe submodule
|
||||
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
|
||||
export USE_DISTRIBUTED=1
|
||||
|
||||
export USE_MKLDNN=OFF
|
||||
@ -188,7 +199,7 @@ export BUILD_TEST=OFF
|
||||
pushd "$pytorch_rootdir"
|
||||
echo "Calling setup.py bdist_wheel at $(date)"
|
||||
|
||||
_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name "${mac_version//[-.]/_}"
|
||||
python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version}
|
||||
|
||||
echo "Finished setup.py bdist_wheel at $(date)"
|
||||
|
||||
|
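
The old branch above relied on bash pattern substitution to turn the dotted platform string into a wheel tag; ${mac_version//[-.]/_} replaces every '-' and '.' (the // makes the substitution global, [-.] is the character class). One illustrative line (not part of the script):

mac_version='macosx-11.0-arm64'
echo "${mac_version//[-.]/_}"   # -> macosx_11_0_arm64

The new branch sidesteps the substitution entirely by storing the already-underscored form.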
2 .flake8
@@ -73,7 +73,7 @@ exclude =
./docs/src,
./functorch/docs,
./functorch/examples,
./functorch/docs/source/tutorials,
./functorch/notebooks,
./scripts,
./test/generated_type_hints_smoketest.py,
./third_party,
1 .github/actionlint.yaml (vendored)
@@ -21,7 +21,6 @@ self-hosted-runner:
- linux.arm64.2xlarge.ephemeral
- linux.arm64.m7g.4xlarge
- linux.arm64.m7g.4xlarge.ephemeral
- linux.arm64.r7g.12xlarge.memory
- linux.4xlarge.nvidia.gpu
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu
2 .github/ci_commit_pins/audio.txt (vendored)
@@ -1 +1 @@
87ff22e49ed0e92576c4935ccb8c143daac4a3cd
fa5142928ee157aa65137c4ecff2fe9b1a9e0648
2 .github/ci_commit_pins/torchao.txt (vendored)
@@ -1 +1 @@
51c87b6ead6b7e098ada95d6a7609ee873b854cf
f32431e593d0e9db86c502d3872dd67ee40a005f
2 .github/ci_commit_pins/vllm.txt (vendored)
@@ -1 +1 @@
5bcc153d7bf69ef34bc5788a33f60f1792cf2861
cc99baf14dacc2497d0c5ed84e076ef2c37f6a4d
2 .github/ci_commit_pins/xla.txt (vendored)
@@ -1 +1 @@
c77852e117bdf056c8e9a087e51d6f65cf6ba53d
6c5478ff7c3d50dd1e3047d72ec5909bea474073
29 .github/ci_configs/vllm/Dockerfile.tmp_vllm (vendored)
@@ -82,10 +82,16 @@ RUN if command -v apt-get >/dev/null; then \
apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version >/dev/null 2>&1; then \
@@ -214,16 +220,11 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..."; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \
else \
SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \
fi; \
curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
@@ -284,7 +285,7 @@ RUN if command -v apt-get >/dev/null; then \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
else \
dnf install -y git curl wget sudo; \
dnf install -y git curl wget sudo vim; \
fi \
&& python3 --version && python3 -m pip --version

@@ -297,6 +298,12 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
echo "[INFO] Showing torch_build_versions.txt content:" && \
cat torch_build_versions.txt

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/

# Install uv for faster pip installs if not existed
RUN --mount=type=cache,target=/root/.cache/uv \
if ! python3 -m uv --version > /dev/null 2>&1; then \
17 .github/ci_configs/vllm/use_existing_torch.py (vendored)
@@ -1,17 +0,0 @@
import glob


requires_files = glob.glob("requirements/*.txt")
requires_files += ["pyproject.toml"]
for file in requires_files:
print(f">>> cleaning {file}")
with open(file) as f:
lines = f.readlines()
if "torch" in "".join(lines).lower():
print("removed:")
with open(file, "w") as f:
for line in lines:
if "torch" not in line.lower():
f.write(line)
print(f"<<< done cleaning {file}")
print()
@@ -15,7 +15,7 @@ optree==0.13.0
packaging==23.1
parameterized==0.8.1
pillow==10.3.0
protobuf==5.29.5
protobuf==5.29.4
psutil==5.9.8
pygments==2.15.0
pytest-cpp==2.3.0
@@ -26,7 +26,7 @@ pytest-xdist==3.3.1
pytest==7.3.2
pyyaml==6.0.2
scipy==1.12.0
setuptools==78.1.1
setuptools==72.1.0
sympy==1.13.3
tlparse==0.4.0
tensorboard==2.13.0
4 .github/scripts/docathon-label-sync.py (vendored)
@@ -39,9 +39,7 @@ def main() -> None:
pull_request_label_names = [label.name for label in pull_request_labels]
issue_label_names = [label.name for label in issue_labels]
labels_to_add = [
label
for label in issue_label_names
if label not in pull_request_label_names and label != "actionable"
label for label in issue_label_names if label not in pull_request_label_names
]
if not labels_to_add:
print("The pull request already has the same labels.")
3 .github/scripts/prepare_vllm_wheels.sh (vendored)
@@ -84,9 +84,6 @@ repackage_wheel() {
rm -rf $package
}

# Require to re-package the wheel
${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1

pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
repackage_wheel $package
@@ -22,16 +22,6 @@ name: !{{ build_environment }}
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
{%- endmacro %}

{%- macro setup_python(py_ver) -%}
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
{%- endmacro %}

on:
# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321
push:
@@ -71,13 +61,23 @@ jobs:
{%- endif %}
steps:
!{{ set_runner_specific_vars() }}
!{{ setup_python(config.get("python_version", "3.10")) }}
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -94,6 +94,8 @@ jobs:
{%- if config["package_type"] == "wheel" %}
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -104,9 +106,33 @@

SMOKE_TEST_PARAMS=""

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac

# shellcheck disable=SC2086
python -mvenv test_venv
source test_venv/bin/activate
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

# shellcheck disable=SC2086
2 .github/workflows/_linux-test.yml (vendored)
@@ -273,8 +273,6 @@ jobs:
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
EXTRA_FLAGS: ${{ matrix.extra_flags || '' }}
OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
33 .github/workflows/_rocm-test.yml (vendored)
@@ -62,11 +62,6 @@ on:
required: false
type: number
default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

@@ -81,9 +76,10 @@ jobs:
strategy:
matrix: ${{ fromJSON(inputs.test-matrix) }}
fail-fast: false
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
runs-on: ${{ matrix.runner }}
steps:
# [see note: pytorch repo ref]
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
@@ -135,9 +131,6 @@ jobs:

- name: Start monitoring script
id: monitor-script
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
env:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
@@ -145,6 +138,9 @@ jobs:
WORKFLOW_RUN_ID: ${{github.run_id}}
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
if: ${{ !inputs.disable-monitor }}
shell: bash
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.8 dataclasses_json==0.6.7
python3 -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
@@ -182,12 +178,6 @@ jobs:
run: |
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"

- name: Preserve github env variables for use in docker
shell: bash
run: |
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

- name: Test
id: test
env:
@@ -203,22 +193,20 @@ jobs:
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
run: |
set -x
@@ -248,7 +236,6 @@ jobs:
-e GITHUB_RUN_ATTEMPT \
-e JOB_ID \
-e JOB_NAME \
-e BASE_SHA \
-e BRANCH \
-e SHA1 \
-e AWS_DEFAULT_REGION \
@@ -266,12 +253,10 @@ jobs:
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e TESTS_TO_INCLUDE \
-e HUGGING_FACE_HUB_TOKEN \
-e DASHBOARD_TAG \
--env-file="${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \
--ulimit core=0 \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--shm-size="8g" \
43 .github/workflows/build-vllm-wheel.yml (vendored)
@@ -12,9 +12,6 @@ on:
paths:
- .github/workflows/build-vllm-wheel.yml
- .github/ci_commit_pins/vllm.txt
schedule:
# every morning at 01:30PM UTC, 9:30AM EST, 6:30AM PST
- cron: 30 13 * * *

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@@ -27,33 +24,21 @@ jobs:
fail-fast: false
matrix:
python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
# TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554
device: [ 'cu128', 'cu129' ]
runner: [ 'linux.12xlarge.memory' ]
include:
- platform: manylinux_2_28_x86_64
device: cu128
- device: cu128
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_x86_64
device: cu129
- device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu128
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
runner: linux.arm64.r7g.12xlarge.memory
- platform: manylinux_2_28_aarch64
device: cu129
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
runner: linux.arm64.r7g.12xlarge.memory
name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Build ${{ matrix.device }} vLLM wheel"
runs-on: ${{ matrix.runner }}
timeout-minutes: 480
env:
PY_VERS: ${{ matrix.python-version }}
MANYLINUX_IMAGE: ${{ matrix.manylinux-image }}
PLATFORM: ${{ matrix.platform }}
PLATFORM: 'manylinux_2_28_x86_64'
BUILD_DEVICE: ${{ matrix.device }}
steps:
- name: Setup SSH (Click me for login details)
@@ -151,7 +136,7 @@ jobs:

- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }}
name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }}
if-no-files-found: error
path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl

@@ -161,17 +146,15 @@ jobs:

# Copied from build-triton-wheel workflow (mostly)
upload-wheel:
name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
name: "Upload ${{ matrix.device }} vLLM wheel"
needs:
- build-wheel
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129' ]
env:
PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }}
permissions:
id-token: write
@@ -183,7 +166,7 @@ jobs:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Configure AWS credentials(PyTorch account) for main
if: ${{ (github.event_name == 'push' && github.event.ref == 'refs/heads/main') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels
@@ -207,15 +190,15 @@ jobs:
run: |
set -eux
mkdir -p "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/"
mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/"

- name: Set DRY_RUN
if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
- name: Set DRY_RUN (only for tagged pushes)
if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
shell: bash
run: |
echo "DRY_RUN=disabled" >> "$GITHUB_ENV"

- name: Set UPLOAD_CHANNEL
- name: Set UPLOAD_CHANNEL (only for tagged pushes)
if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
shell: bash
run: |
18 .github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml (generated, vendored)
@@ -60,13 +60,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@@ -81,9 +81,13 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
336 .github/workflows/generated-macos-arm64-binary-wheel-nightly.yml (generated, vendored)
@@ -56,13 +56,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@@ -77,9 +77,13 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -95,6 +99,8 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -105,9 +111,33 @@ jobs:

SMOKE_TEST_PARAMS=""

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac

# shellcheck disable=SC2086
python -mvenv test_venv
source test_venv/bin/activate
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

# shellcheck disable=SC2086
@@ -166,13 +196,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.11.4"
freethreaded: false
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@@ -187,9 +217,13 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -205,6 +239,8 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -215,9 +251,33 @@ jobs:

SMOKE_TEST_PARAMS=""

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac

# shellcheck disable=SC2086
python -mvenv test_venv
source test_venv/bin/activate
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

# shellcheck disable=SC2086
@@ -276,13 +336,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.12.4"
freethreaded: false
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@@ -297,9 +357,13 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -315,6 +379,8 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -325,9 +391,33 @@ jobs:

SMOKE_TEST_PARAMS=""

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.13t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
;;
|
||||
*)
|
||||
# shellcheck disable=SC2153
|
||||
desired_python=${DESIRED_PYTHON}
|
||||
;;
|
||||
esac
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
python -mvenv test_venv
|
||||
source test_venv/bin/activate
|
||||
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
|
||||
conda activate test_conda_env
|
||||
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
@ -386,13 +476,13 @@ jobs:
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
# shellcheck disable=SC2129
|
||||
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.13.4"
|
||||
freethreaded: false
|
||||
- name: Install conda and dependencies
|
||||
run: |
|
||||
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
|
||||
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
|
||||
chmod +x "${RUNNER_TEMP}/conda.sh"
|
||||
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
|
||||
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
@ -407,9 +497,13 @@ jobs:
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -425,6 +519,8 @@ jobs:
|
||||
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
|
||||
- name: Test PyTorch wheel
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -435,9 +531,33 @@ jobs:
|
||||
|
||||
SMOKE_TEST_PARAMS=""
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
# shellcheck disable=SC2153
|
||||
case $DESIRED_PYTHON in
|
||||
3.14t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.14)
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.13t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
;;
|
||||
*)
|
||||
# shellcheck disable=SC2153
|
||||
desired_python=${DESIRED_PYTHON}
|
||||
;;
|
||||
esac
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
python -mvenv test_venv
|
||||
source test_venv/bin/activate
|
||||
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
|
||||
conda activate test_conda_env
|
||||
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
@ -496,13 +616,13 @@ jobs:
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
# shellcheck disable=SC2129
|
||||
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.13.4"
|
||||
freethreaded: true
|
||||
- name: Install conda and dependencies
|
||||
run: |
|
||||
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
|
||||
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
|
||||
chmod +x "${RUNNER_TEMP}/conda.sh"
|
||||
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
|
||||
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
@ -517,9 +637,13 @@ jobs:
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -535,6 +659,8 @@ jobs:
|
||||
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
|
||||
- name: Test PyTorch wheel
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -545,9 +671,33 @@ jobs:
|
||||
|
||||
SMOKE_TEST_PARAMS=""
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
# shellcheck disable=SC2153
|
||||
case $DESIRED_PYTHON in
|
||||
3.14t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.14)
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.13t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
;;
|
||||
*)
|
||||
# shellcheck disable=SC2153
|
||||
desired_python=${DESIRED_PYTHON}
|
||||
;;
|
||||
esac
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
python -mvenv test_venv
|
||||
source test_venv/bin/activate
|
||||
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
|
||||
conda activate test_conda_env
|
||||
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
@ -606,13 +756,13 @@ jobs:
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
# shellcheck disable=SC2129
|
||||
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.14.0-rc.2"
|
||||
freethreaded: false
|
||||
- name: Install conda and dependencies
|
||||
run: |
|
||||
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
|
||||
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
|
||||
chmod +x "${RUNNER_TEMP}/conda.sh"
|
||||
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
|
||||
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
@ -627,9 +777,13 @@ jobs:
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -645,6 +799,8 @@ jobs:
|
||||
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
|
||||
- name: Test PyTorch wheel
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -655,9 +811,33 @@ jobs:
|
||||
|
||||
SMOKE_TEST_PARAMS=""
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
# shellcheck disable=SC2153
|
||||
case $DESIRED_PYTHON in
|
||||
3.14t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.14)
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.13t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
;;
|
||||
*)
|
||||
# shellcheck disable=SC2153
|
||||
desired_python=${DESIRED_PYTHON}
|
||||
;;
|
||||
esac
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
python -mvenv test_venv
|
||||
source test_venv/bin/activate
|
||||
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
|
||||
conda activate test_conda_env
|
||||
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
@ -716,13 +896,13 @@ jobs:
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
# shellcheck disable=SC2129
|
||||
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
# TODO: Removeme once 3.14 is out
|
||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||
python-version: "3.14.0-rc.2"
|
||||
freethreaded: true
|
||||
- name: Install conda and dependencies
|
||||
run: |
|
||||
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
|
||||
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
|
||||
chmod +x "${RUNNER_TEMP}/conda.sh"
|
||||
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
|
||||
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
@ -737,9 +917,13 @@ jobs:
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -755,6 +939,8 @@ jobs:
|
||||
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
|
||||
- name: Test PyTorch wheel
|
||||
run: |
|
||||
# shellcheck disable=SC1091
|
||||
source "${RUNNER_TEMP}/anaconda/bin/activate"
|
||||
set -eux -o pipefail
|
||||
# shellcheck disable=SC1090
|
||||
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
|
||||
@ -765,9 +951,33 @@ jobs:
|
||||
|
||||
SMOKE_TEST_PARAMS=""
|
||||
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
# shellcheck disable=SC2153
|
||||
case $DESIRED_PYTHON in
|
||||
3.14t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.14)
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
;;
|
||||
3.13t)
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
;;
|
||||
*)
|
||||
# shellcheck disable=SC2153
|
||||
desired_python=${DESIRED_PYTHON}
|
||||
;;
|
||||
esac
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
python -mvenv test_venv
|
||||
source test_venv/bin/activate
|
||||
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
|
||||
conda activate test_conda_env
|
||||
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
|
||||
|
||||
# shellcheck disable=SC2086
|
||||
|
@ -35,6 +35,8 @@ jobs:
needs:
- get-default-label-prefix
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
@ -43,6 +45,7 @@ jobs:
{ include: [
{ config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.aws.a100", owners: ["oncall:pt2"] },
]}
build-additional-packages: "vision audio fbgemm torchao"
secrets: inherit

test:
.github/workflows/inductor-nightly.yml
@ -37,7 +37,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
@ -56,7 +56,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: nightly-dynamo-benchmarks-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }}
timeout-minutes: 720
@ -137,7 +137,6 @@ jobs:
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests, next step is to enable it
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
@ -154,7 +153,6 @@ jobs:
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 1440
# disable monitor in perf tests, next step is to enable it
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
@ -173,7 +171,6 @@ jobs:
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720
# disable monitor in perf tests for more investigation
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
@ -43,11 +43,6 @@ on:
required: false
type: boolean
default: false
freezing:
description: Run freezing?
required: false
type: boolean
default: true
benchmark_configs:
description: The list of configs used the benchmark
required: false
@ -80,7 +75,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -106,8 +101,8 @@ jobs:
needs: inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720
@ -121,9 +116,10 @@ jobs:
name: inductor-test
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.10-gcc11-build
dashboard-tag: training-${{ inputs.training || 'false' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'true' }}-aotinductor-${{ inputs.aotinductor || 'true' }}-freezing-${{ inputs.freezing || 'true' }}
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
timeout-minutes: 720
@ -80,7 +80,7 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -107,7 +107,7 @@ jobs:
needs: inductor-build
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
@ -124,7 +124,7 @@ jobs:
needs: inductor-build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
.github/workflows/inductor-periodic.yml
@ -36,10 +36,12 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -62,7 +64,7 @@ jobs:
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -128,6 +130,8 @@ jobs:
needs:
- get-default-label-prefix
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
@ -154,7 +158,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-default-label-prefix
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
test-matrix: |
@ -200,7 +204,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-cpu-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }}
secrets: inherit
.github/workflows/inductor-unittest.yml
@ -33,6 +33,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
@ -45,6 +47,7 @@ jobs:
{ config: "inductor_cpp_wrapper", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_cpp_wrapper", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit

inductor-test:
@ -110,7 +113,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
@ -127,7 +130,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
secrets: inherit
.github/workflows/inductor.yml
@ -49,6 +49,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
@ -79,7 +81,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
@ -101,7 +103,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-cpu-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
secrets: inherit
.github/workflows/nightly.yml
@ -54,7 +54,7 @@ jobs:
- get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.docs-build.outputs.docker-image }}
push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }}
run-doxygen: true
.github/workflows/operator_benchmark.yml
@ -14,10 +14,6 @@ on:
schedule:
# Run at 07:00 UTC every Sunday
- cron: 0 7 * * 0
pull_request:
paths:
- benchmarks/operator_benchmark/**
- .github/workflows/operator_benchmark.yml

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -33,7 +29,7 @@ jobs:
name: opbenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -46,7 +42,7 @@ jobs:
name: opbenchmark-on-demand-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
@ -59,7 +55,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: opbenchmark-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
secrets: inherit
.github/workflows/operator_microbenchmark.yml
@ -1,46 +0,0 @@
name: operator_microbenchmark

on:
push:
tags:
- ciflow/op-benchmark/*
workflow_dispatch:
schedule:
# Run at 06:00 UTC everyday
- cron: 0 6 * * *

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions:
id-token: write
contents: read

jobs:
opmicrobenchmark-build:
if: github.repository_owner == 'pytorch'
name: opmicrobenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.0 9.0'
test-matrix: |
{ include: [
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" },
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
secrets: inherit

opmicrobenchmark-test:
name: opmicrobenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: opmicrobenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
secrets: inherit
@ -1,46 +0,0 @@
name: operator_microbenchmark_b200

on:
push:
tags:
- ciflow/op-benchmark/*
workflow_dispatch:
schedule:
# Run at 06:00 UTC everyday
- cron: 0 6 * * *

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions:
id-token: write
contents: read

jobs:
opmicrobenchmark-build:
if: github.repository_owner == 'pytorch'
name: opmicrobenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
secrets: inherit

opmicrobenchmark-test:
name: opmicrobenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: opmicrobenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit
.github/workflows/rocm-mi300.yml
@ -70,4 +70,5 @@ jobs:
build-environment: linux-noble-rocm-py3.12-mi300
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
tests-to-include: "inductor/test_ck_backend"
secrets: inherit
.github/workflows/trunk.yml
@ -239,13 +239,16 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build torchao
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: |
{ include: [
{ config: "verify_cachebench", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
build-additional-packages: "vision audio torchao"
secrets: inherit

verify-cachebench-cpu-test:
@ -255,7 +258,7 @@ jobs:
- verify-cachebench-cpu-build
- target-determination
with:
build-environment: linux-jammy-py3.10-gcc11
build-environment: linux-jammy-py3.9-gcc11
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
secrets: inherit
.gitignore
@ -259,6 +259,9 @@ gen
.pytest_cache
aten/build/*

# Linker scripts for prioritized text optimization
cmake/linker_script.ld

# Bram
plsdontbreak

@ -389,5 +392,3 @@ android/pytorch_android_torchvision/.cxx

# Claude Code local configuration
CLAUDE.local.md
/test_*.py
/debug_*.py
@ -13,7 +13,7 @@ exclude_patterns = [
'**/fb/**',
'functorch/docs/**',
'functorch/examples/**',
'functorch/docs/source/tutorials/**',
'functorch/notebooks/**',
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'scripts/**',
@ -1568,6 +1568,7 @@ include_patterns = [
exclude_patterns = [
'caffe2/**',
'functorch/docs/**',
'functorch/notebooks/**',
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'test/dynamo/cpython/**',
@ -22,6 +22,7 @@ COMMON_COPTS = [
"-DHAVE_SHM_UNLINK=1",
"-D_FILE_OFFSET_BITS=64",
"-DUSE_FBGEMM",
"-DUSE_DISTRIBUTED",
"-DAT_PER_OPERATOR_HEADERS",
"-DATEN_THREADING=NATIVE",
"-DNO_CUDNN_DESTROY_HANDLE",
@ -810,7 +811,7 @@ cc_library(
name = "torch_python",
srcs = libtorch_python_core_sources
+ if_cuda(libtorch_python_cuda_sources)
+ libtorch_python_distributed_sources
+ if_cuda(libtorch_python_distributed_sources)
+ GENERATED_AUTOGRAD_PYTHON,
hdrs = glob([
"torch/csrc/generic/*.cpp",
@ -181,9 +181,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
set(CPU_POWER ON)
endif()

# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
set(USE_DISTRIBUTED
OFF
@ -263,11 +262,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
"USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
"USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
"USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@ -380,6 +379,13 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF)
# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le.
set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
if(LINUX AND CPU_AARCH64)
set(USE_PRIORITIZED_TEXT_DEFAULT ON)
endif()
cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
"${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)

option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
@ -432,10 +438,11 @@ if(WIN32)
PATH_SUFFIXES lib
NO_DEFAULT_PATH)
if(NOT libuv_tmp_LIBRARY)
set(USE_DISTRIBUTED OFF)
set(USE_GLOO OFF)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
)
else()
@ -657,6 +664,11 @@ endif(MSVC)

string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")

# Set linker max-page-size to 64KiB on AArch64 Linux
if(LINUX AND CPU_AARCH64)
add_link_options_if_supported("-z,max-page-size=0x10000")
endif()

# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable. Setting
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@ -874,7 +886,7 @@ cmake_dependent_option(
"Whether to build the flash_attention kernel for scaled dot product attention.\
Will be disabled if not supported by the platform"
ON
"(USE_CUDA AND NOT MSVC) OR USE_ROCM"
"USE_CUDA OR USE_ROCM;NOT MSVC"
OFF)

cmake_dependent_option(
@ -909,7 +921,7 @@ cmake_dependent_option(
# USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake
#
if(USE_ROCM)
if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)
if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
include(cmake/External/aotriton.cmake)
endif()
endif()
@ -1421,3 +1433,57 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()

if(USE_PRIORITIZED_TEXT_FOR_LD)
add_compile_options(
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
)
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")

add_custom_command(
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
COMMENT "Generating prioritized text linker files"
VERBATIM
)

add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")

if(BUILD_PYTHON)
set(LINKER_OPT_TARGETS torch_python)
endif()

if(NOT BUILD_LIBTORCHLESS)
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
if(USE_CUDA)
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
endif()
if(USE_XPU)
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
endif()
if(USE_ROCM)
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
endif()
endif()

foreach(tgt IN LISTS LINKER_OPT_TARGETS)
if(TARGET ${tgt})
add_dependencies("${tgt}" generate_linker_script)
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
else()
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
endif()
endforeach()

else()
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()
@ -16,8 +16,6 @@ However, if you believe you have found a security vulnerability in PyTorch, we e

Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new

All reports submitted thru the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework.

Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:

https://www.facebook.com/whitehat
@ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) {
}

bool Context::allowTF32CuDNN(const std::string& op) const {
if (op.empty()){
if (op.size() == 0){
bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
TORCH_CHECK(
@ -281,6 +281,9 @@ bool Context::userEnabledOverrideableSDP() const {

static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
#ifdef USE_ROCM
static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32";
#endif

bool Context::checkCuBLASConfigDeterministic() {
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
@ -340,6 +343,12 @@ void Context::setImmediateMiopen(bool b) {
}

bool Context::allowTF32CuBLAS() const {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
if (allow_tf32 != true) {
return false;
}
#endif
bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
TORCH_CHECK(
@ -353,6 +362,14 @@ bool Context::allowTF32CuBLAS() const {
}

void Context::setAllowTF32CuBLAS(bool b) {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
if (allow_tf32 != true) {
C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. "
<< "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it.";
return;
}
#endif
float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
}
@ -426,7 +443,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string&
std::string msg;
auto iterp = _fp32_precisions.find(backend);
TORCH_CHECK(iterp != _fp32_precisions.end());
for (const auto& p : iterp->second) {
for (auto p : iterp->second) {
msg += p;
msg += " ";
}
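A note on the ROCm branches in the hunks above: TF32 support is gated behind the HIPBLASLT_ALLOW_TF32 environment variable, and the gate wins over the user-facing toggle. A minimal standalone sketch of that pattern, using std::getenv in place of c10::utils::check_env (the helper names here are hypothetical, and check_env itself accepts more spellings than just "1"):

```cpp
#include <cstdlib>
#include <cstring>

// Returns true only when the opt-in variable is set to "1"; any other state
// (unset, "0", garbage) keeps TF32 disabled, mirroring the conservative
// default enforced on ROCm above.
bool tf32_env_opt_in(const char* name) {
  const char* v = std::getenv(name);
  return v != nullptr && std::strcmp(v, "1") == 0;
}

bool allow_tf32_matmul(bool user_requested) {
  if (!tf32_env_opt_in("HIPBLASLT_ALLOW_TF32")) {
    return false; // environment gate overrides the user-facing toggle
  }
  return user_requested;
}
```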
@ -102,7 +102,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
// SparseTensorImpl has no storage, so we cannot query its nbytes.
// (original_storage_size is only used for storage resizing in fsdp anyway, which does not apply to sparse)
// Same for XLA
if (base.unsafeGetTensorImpl()->has_storage() && data_ptr().device().type() != c10::DeviceType::XLA) {
if (base.unsafeGetTensorImpl()->has_storage() && base.device().type() != c10::DeviceType::XLA) {
original_storage_size_ = base.unsafeGetTensorImpl()->unsafe_storage().unsafeGetStorageImpl()->sym_nbytes();
} else {
original_storage_size_ = -1;
@ -457,9 +457,24 @@ void gemm(
return;
}
#endif
// for the fallback path, first compute gemm with beta = 0,
// and then add c in full precision.
int64_t c_size = n * m;
std::vector<float> float_c(c_size, 0.f);
gemm_no_downcast_stub(
at::kCPU, at::kBFloat16,
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
for (const auto j : c10::irange(n)) {
for (const auto i : c10::irange(m)) {
auto offset = j * ldc + i;
// beta == 0 won't propagate NaN from C
if (beta == 0.f) {
c[offset] = float_c[j * m + i];
} else {
c[offset] = beta * c[offset] + float_c[j * m + i];
}
}
}
}

void gemm(
@ -478,9 +493,24 @@ void gemm(
return;
}
#endif
// for the fallback path, first compute gemm with beta = 0,
// and then add c in full precision.
int64_t c_size = n * m;
std::vector<float> float_c(c_size, 0.f);
gemm_no_downcast_stub(
at::kCPU, at::kHalf,
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
for (const auto j : c10::irange(n)) {
for (const auto i : c10::irange(m)) {
auto offset = j * ldc + i;
// beta == 0 won't propagate NaN from C
if (beta == 0.f) {
c[offset] = float_c[j * m + i];
} else {
c[offset] = beta * c[offset] + float_c[j * m + i];
}
}
}
}

void gemm(
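For context on the gemm hunks above: the fallback computes the product into a float buffer with beta = 0 and only afterwards blends in C, so a NaN already sitting in an uninitialized C cannot leak through when beta == 0. A minimal sketch of just the blend step (the float accumulator here stands in for the gemm_no_downcast_stub result):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Blend a freshly computed float accumulator into the output buffer. With
// beta == 0 the old contents of c are ignored entirely, so a stale NaN in c
// does not propagate; otherwise c is scaled and added in full precision.
void blend_into_c(float beta, const std::vector<float>& acc,
                  std::vector<float>& c, int m, int n, int ldc) {
  for (int j = 0; j < n; ++j) {
    for (int i = 0; i < m; ++i) {
      const int offset = j * ldc + i;
      c[offset] = (beta == 0.f) ? acc[j * m + i]
                                : beta * c[offset] + acc[j * m + i];
    }
  }
}

int main() {
  std::vector<float> c{NAN, NAN};   // uninitialized output, as after at::empty
  std::vector<float> acc{1.f, 2.f}; // product computed with beta = 0
  blend_into_c(0.f, acc, c, /*m=*/2, /*n=*/1, /*ldc=*/2);
  std::printf("%f %f\n", c[0], c[1]); // prints 1.0 2.0, no NaN leakage
}
```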
@ -67,13 +67,13 @@ TORCH_PRECOMPUTE_META_FUNC(fractional_max_pool3d)(
int64_t inputH = input_.size(heightDim);
int64_t inputW = input_.size(widthDim);

TORCH_CHECK((poolSizeT <= inputT) && (outputT + poolSizeT - 1 < inputT),
TORCH_CHECK(outputT + poolSizeT - 1 < inputT,
"fractional_max_pool3d_out(): pool time ", poolSizeT,
" too large relative to input time ", inputT);
TORCH_CHECK((poolSizeW <= inputW) && (outputW + poolSizeW - 1 < inputW),
TORCH_CHECK(outputW + poolSizeW - 1 < inputW,
"fractional_max_pool3d_out(): pool width ", poolSizeW,
" too large relative to input width ", inputW);
TORCH_CHECK((poolSizeH <= inputH) && (outputH + poolSizeH - 1 < inputH),
TORCH_CHECK(outputH + poolSizeH - 1 < inputH,
"fractional_max_pool3d_out(): pool height ", poolSizeH,
" too large relative to input height ", inputH);
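Worth noting for the hunk above: for any output size of at least 1, the retained condition output + pool - 1 < input already implies pool < input, so the extra pool <= input clause on one side of the diff is redundant for positive output sizes. A tiny sketch of the bound arithmetic:

```cpp
#include <cassert>
#include <cstdint>

// For output >= 1, (output + pool - 1 < input) implies
// pool <= input - output < input, so a separate pool <= input
// clause adds nothing for positive output sizes.
bool pool_fits(int64_t output, int64_t pool, int64_t input) {
  return output + pool - 1 < input;
}

int main() {
  assert(pool_fits(1, 3, 4));  // pool of 3 over input 4, single output
  assert(!pool_fits(2, 3, 4)); // two outputs need an input of at least 5
}
```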
@ -1360,8 +1360,7 @@ Tensor outer(const Tensor& self, const Tensor& vec2) {
#endif


#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
// Used by default on x86 platforms and on AArch64+ACL
#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
static inline int64_t get_mkldnn_matmul_min_dim() {
static auto value = [&] {
const int64_t default_min_dim = [&] {
@ -1396,6 +1395,8 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) {
return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size;
}
#endif


static void addmm_impl_cpu_(
Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) {
TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2);
@ -1771,8 +1772,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) ||
(strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1]));
};
#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
// Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL

#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]);
if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) {
try {
@ -1784,6 +1785,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
}
}
#endif

if (contraction_size * res_rows * res_cols < 400) {
if (is_bmm_out) {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, batch1.scalar_type(), "bmm", [&] {
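Both sides of the hunks above gate the oneDNN matmul path behind a size heuristic; the diff only changes on which platforms the heuristic compiles in. A minimal sketch of the shape of such a threshold check with an environment override, under the assumption that the variable name and default below are illustrative rather than the exact ones used by get_mkldnn_matmul_min_dim:

```cpp
#include <cstdint>
#include <cstdlib>

// Read a threshold once, allowing an environment override, and fall back to
// a compiled-in default. The env var name here is a hypothetical stand-in.
int64_t min_dim_threshold() {
  static const int64_t value = [] {
    const char* v = std::getenv("TORCH_MKLDNN_MATMUL_MIN_DIM");
    return v ? static_cast<int64_t>(std::atoll(v)) : int64_t{64};
  }();
  return value;
}

// Only hand the matmul to the backend when every dimension clears the
// threshold; tiny problems stay on the reference path.
bool use_backend_matmul(int64_t m, int64_t k, int64_t n) {
  const int64_t t = min_dim_threshold();
  return m > t && k > t && n > t;
}
```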
@ -624,9 +624,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
if (backend == BatchNormBackend::Miopen) {
return std::tuple_cat(
at::miopen_batch_norm(
input.contiguous(input.suggest_memory_format()),
weight.contiguous(),
bias.contiguous(),
input.contiguous(), weight.contiguous(), bias.contiguous(),
running_mean.defined() ? running_mean.contiguous() : running_mean,
running_var.defined() ? running_var.contiguous() : running_var,
training, momentum, eps),
@ -2174,7 +2174,7 @@ static void _scatter_via_index_put(
if (self.dim() == 1 || broadcast_index) {
Tensor squeezed = index;
if (broadcast_index && index.dim() > 1) {
for (int64_t d = index.dim() - 1; d >= 0; --d) {
for (const auto d : c10::irange(index.dim())) {
if (d == dim) {
continue;
}
@ -52,7 +52,6 @@ void apply_triu_tril_single(
int64_t self_col_stride,
bool upper) {
constexpr int64_t zero = 0;
k = std::clamp(k, -n, m); // Clamp k to [-n, m] to prevent i + k arithmetic overflow, especially if k approaches INT64_MAX/INT64_MIN.

if (upper) {
parallel_for(0, n, 0, [&](int64_t start, int64_t end) {
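The clamp being added or removed in the hunk above exists to keep the later i + k index arithmetic inside the int64 range: every diagonal offset outside [-n, m] behaves identically to the nearest value in that range, so clamping loses nothing and removes the signed-overflow hazard. A small self-contained sketch:

```cpp
#include <algorithm>
#include <cstdint>
#include <limits>

// Without the clamp, i + k can overflow (undefined behavior) when k is near
// INT64_MAX or INT64_MIN. After clamping to the only meaningful range
// [-n, m], the sum stays bounded by the matrix extents.
int64_t safe_diag_offset(int64_t k, int64_t n, int64_t m) {
  return std::clamp(k, -n, m);
}

int main() {
  const int64_t n = 4, m = 5;
  int64_t k = std::numeric_limits<int64_t>::max();
  k = safe_diag_offset(k, n, m); // k becomes 5; i + k is now safe
  return k == m ? 0 : 1;
}
```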
@ -36,7 +36,7 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) {
[zero, one_sixth, three, six] GPU_LAMBDA(
scalar_t self_val) -> scalar_t {
opmath_t x = static_cast<opmath_t>(self_val);
return std::min<opmath_t>(std::max<opmath_t>(x + three, zero), six) * one_sixth;
return std::min(std::max(x + three, zero), six) * one_sixth;
});
});
}
@ -317,17 +317,6 @@ void nonzero_static_cuda_out_impl(
out_temp =
Tensor(at::detail::empty_cuda({self.dim(), size}, out.options())).t();
}
// If input has zero elements, avoid kernel grid calculations (which can
// produce zero divisors) and just fill the output with fill_value.
if (self.numel() == 0) {
if (need_to_copy) {
out_temp.fill_(fill_value);
out.copy_(out_temp);
} else {
out.fill_(fill_value);
}
return;
}
int64_t* out_data_ptr = need_to_copy ? out_temp.mutable_data_ptr<int64_t>()
: out.mutable_data_ptr<int64_t>();

@ -416,7 +416,6 @@ struct ReduceOp {
if (config.should_block_y_reduce()) {
value = block_y_reduce<output_vec_size>(value, shared_memory);
}
__syncthreads();
if (config.should_block_x_reduce()) {
value = block_x_reduce<output_vec_size>(value, shared_memory);
}
@ -17,11 +17,12 @@ __global__ static void compute_cuda_kernel(
    index_t* result_ptr,
    int64_t size,
    int64_t result_size) {
  CUDA_KERNEL_ASSERT_PRINTF(
      result_size == cumsum_ptr[size - 1],
  if (C10_UNLIKELY((result_size != cumsum_ptr[size - 1]))) {
    printf("%s:%d:%s: block: [%d,%d,%d], thread: [%d,%d,%d] "
      "Invalid input! In `repeat_interleave`, the `output_size` argument (%ld) must be the same as the sum of the elements in the `repeats` tensor (%ld).\n",
      result_size,
      cumsum_ptr[size - 1]);
      __FILE__, __LINE__, __func__, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, result_size, cumsum_ptr[size - 1]);
    CUDA_KERNEL_ASSERT(result_size == cumsum_ptr[size - 1])
  }

  int64_t idx = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x;
  int64_t stride = (blockDim.x * gridDim.x) / C10_WARP_SIZE;
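The kernel asserts an invariant that can also be validated on the host before launch; a plain C++ sketch of the same condition (not PyTorch's code):

#include <cstdint>
#include <numeric>
#include <stdexcept>
#include <vector>

void check_repeat_interleave_size(const std::vector<int64_t>& repeats, int64_t output_size) {
  // output_size must equal the sum of the repeats tensor, the same
  // condition the device-side assert above enforces.
  const int64_t total = std::accumulate(repeats.begin(), repeats.end(), int64_t{0});
  if (total != output_size) {
    throw std::invalid_argument("output_size must equal repeats.sum()");
  }
}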
@ -7,7 +7,6 @@
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_like.h>
#include <ATen/ops/miopen_batch_norm_native.h>
#include <ATen/ops/miopen_batch_norm_backward_native.h>
#endif
@ -103,7 +102,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm(
  mode = miopenBNSpatial;
}

auto output_t = at::empty_like(input_t, input_t.options(), input_t.suggest_memory_format());
auto output_t = at::empty(input->sizes(), input->options());
TensorArg output{ output_t, "output", 0 };

auto handle = getMiopenHandle();
@ -171,15 +170,20 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
    const std::optional<Tensor>& save_var_t_opt,
    double epsilon) {
  // See [Note: hacky wrapper removal for optional tensor]
  const Tensor& save_mean_t = save_mean_t_opt.value_or(Tensor());
  const Tensor& save_var_t = save_var_t_opt.value_or(Tensor());
  const Tensor& running_mean =
      running_mean_opt.value_or(Tensor());
  const Tensor& running_var =
      running_var_opt.value_or(Tensor());
  const Tensor& save_mean_t =
      save_mean_t_opt.value_or(Tensor());
  const Tensor& save_var_t =
      save_var_t_opt.value_or(Tensor());

  auto grad_output_contig =
      grad_output_t.contiguous(input_t.suggest_memory_format());
  TensorArg input{input_t, "input", 1},
      grad_output{grad_output_contig, "grad_output", 2},
      weight{weight_t, "weight", 3}, save_mean{save_mean_t, "save_mean", 4},
      save_var{save_var_t, "save_var", 5};
  TensorArg input{ input_t, "input", 1 },
      grad_output{ grad_output_t, "grad_output", 2 },
      weight{ weight_t, "weight", 3 },
      save_mean{ save_mean_t, "save_mean", 4 },
      save_var{ save_var_t, "save_var", 5 };
  CheckedFrom c = "miopen_batch_norm_backward";

  checkAllDefined(c, {input, grad_output, weight, save_mean, save_var});
@ -191,11 +195,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
}
checkAllSameType(c, {input, grad_output});
checkAllSameType(c, {weight, save_mean, save_var});
// TODO: is weight required to be contiguous?
checkAllContiguous(c, {save_mean, save_var});
// TODO: TensorArg check should start handle memory format
TORCH_CHECK(input->is_contiguous(input->suggest_memory_format()));
TORCH_CHECK(grad_output->is_contiguous(input->suggest_memory_format()));
checkAllContiguous(c, {input, grad_output, save_mean, save_var});
checkDimRange(c, input, 2, 6 /* exclusive */);
checkSameSize(c, input, grad_output);
auto num_features = input->size(1);
@ -210,7 +210,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
  mode = miopenBNSpatial;
}

auto grad_input_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format());
auto grad_input_t = at::empty(input->sizes(), input->options());
auto grad_weight_t = at::empty(weight->sizes(), weight->options());
auto grad_bias_t = at::empty(weight->sizes(), weight->options());
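These hunks toggle between two allocation styles; a small ATen sketch of the difference:

#include <ATen/ATen.h>

at::Tensor alloc_like(const at::Tensor& input, bool preserve_layout) {
  if (preserve_layout) {
    // Memory-format-aware: a channels-last input yields a channels-last result.
    return at::empty(input.sizes(), input.options(), input.suggest_memory_format());
  }
  // Plain form: always a default-contiguous (e.g. NCHW) tensor.
  return at::empty(input.sizes(), input.options());
}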
@ -1770,12 +1770,10 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> miopen_depthwise_convolution_back
// fusions
// ---------------------------------------------------------------------

void raw_miopen_convolution_add_relu_out(
void raw_miopen_convolution_relu_out(
    const Tensor& output,
    const Tensor& input,
    const Tensor& weight,
    const Tensor& z,
    float alpha,
    const Tensor& bias,
    IntArrayRef stride,
    IntArrayRef padding,
@ -1783,20 +1781,68 @@ void raw_miopen_convolution_add_relu_out(
    int64_t groups,
    bool benchmark,
    bool deterministic) {
  raw_miopen_convolution_forward_out(
      output,
  auto dataType = getMiopenDataType(input);
  miopenConvolutionMode_t c_mode = miopenConvolution;
  ConvolutionArgs args{ input, output, weight };
  args.handle = getMiopenHandle();
  at::MemoryFormat memory_format = miopen_conv_suggest_memory_format(input, weight);
  setConvolutionParams(
      &args.params,
      args.handle,
      input,
      weight,
      padding,
      stride,
      dilation,
      groups,
      deterministic,
      memory_format);
  args.idesc.set(input, memory_format);
  args.wdesc.set(weight, memory_format, 0);
  args.odesc.set(output, memory_format);
  args.cdesc.set(
      dataType,
      c_mode,
      input.dim() - 2,
      args.params.padding,
      args.params.stride,
      args.params.dilation,
      args.params.groups,
      benchmark,
      deterministic);
  at::Tensor alpha_mul_z_add_bias =
      at::native::reshape_bias(input.dim(), bias).add(z, alpha);
  output.add_(alpha_mul_z_add_bias);
  output.relu_();

  TensorDescriptor bdesc;
  bdesc.set(bias.expand({1, bias.size(0)}), output.dim());

  // Create the fusion plan
  miopenFusionPlanDescriptor_t fusePlanDesc;
  miopenFusionOpDescriptor_t convoOp;
  miopenFusionOpDescriptor_t biasOp;
  miopenFusionOpDescriptor_t activOp;
  MIOPEN_CHECK(miopenCreateFusionPlan(&fusePlanDesc, miopenVerticalFusion, args.idesc.desc()));
  MIOPEN_CHECK(miopenCreateOpConvForward(fusePlanDesc, &convoOp, args.cdesc.desc(), args.wdesc.desc()));
  MIOPEN_CHECK(miopenCreateOpBiasForward(fusePlanDesc, &biasOp, bdesc.desc()));
  MIOPEN_CHECK(miopenCreateOpActivationForward(fusePlanDesc, &activOp, miopenActivationRELU));

  // compile fusion plan
  MIOPEN_CHECK(miopenCompileFusionPlan(args.handle, fusePlanDesc));

  // Set the Args
  float alpha = static_cast<float>(1);
  float beta = static_cast<float>(0);
  float activ_alpha = static_cast<float>(0);
  float activ_beta = static_cast<float>(0);
  float activ_gamma = static_cast<float>(0);
  miopenOperatorArgs_t fusionArgs;
  MIOPEN_CHECK(miopenCreateOperatorArgs(&fusionArgs));
  MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.const_data_ptr()));
  MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.const_data_ptr()));
  MIOPEN_CHECK(miopenSetOpArgsActivForward(fusionArgs, activOp, &alpha, &beta, activ_alpha, activ_beta, activ_gamma));

  miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.const_data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs);

  // Cleanup
  miopenDestroyFusionPlan(fusePlanDesc);
}
static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat memory_format) {
@ -1809,107 +1855,171 @@ static at::Tensor self_or_new_memory_format(at::Tensor& self, at::MemoryFormat m
Tensor miopen_convolution_add_relu(
    const Tensor& input_t,
    const Tensor& weight_t,
    const Tensor& z_t,
    const Tensor& z,
    const std::optional<Scalar>& alpha,
    const std::optional<Tensor>& bias_t,
    const std::optional<Tensor>& bias,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    int64_t groups) {
  auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
  const Tensor input = input_t.contiguous(memory_format);
  const Tensor weight = weight_t.contiguous(memory_format);
  Tensor z = z_t;
  if (z.suggest_memory_format() != memory_format) {
    z = z.to(memory_format);
  }
  z = z.contiguous(memory_format);

  // FuseFrozenConvAddRelu performs some tensor shape checking
  Tensor output_t = at::detail::empty_cuda(
      conv_output_size(
          input.sizes(), weight.sizes(), padding, stride, dilation),
      input.options().memory_format(memory_format));
  if (output_t.numel() == 0) {
    return output_t;
  }
  // MIOpen does not support fusing the add (the alpha2 * z term of the cuDNN formulation below):
  // y = act ( alpha1 * conv(x) + alpha2 * z + bias )

  auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);

  auto& ctx = at::globalContext();
  bool benchmark = ctx.benchmarkCuDNN();
  auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
  auto _bias = bias_t.has_value()
      ? bias_t.value()
      : at::zeros(
            {output_t.size(1)},
            optTypeMetaToScalarType(output_t.options().dtype_opt()),
            output_t.options().layout_opt(),
            output_t.options().device_opt(),
            output_t.options().pinned_memory_opt());

  raw_miopen_convolution_add_relu_out(
      output_t,
  TensorArg input { input_t, "input", 1 },
      weight { weight_t, "weight", 2 };

  Tensor output_t = at::detail::empty_cuda(
      conv_output_size(
          input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
      input_t.options().memory_format(memory_format));
  if (output_t.numel() == 0){
    return output_t;
  }
  // Avoid ambiguity of "output" when this is being used as backwards
  TensorArg output{output_t, "result", 0};
  miopen_convolution_forward_out(
      output,
      "miopen_convolution_add_relu",
      input,
      weight,
      z,
      _alpha,
      _bias,
      stride,
      padding,
      stride,
      dilation,
      groups,
      benchmark,
      true); // deterministic
      false // deterministic
  );

  return output_t;
  auto contig_output_t = self_or_new_memory_format(output_t, memory_format);

  if (!output_t.is_same(contig_output_t)) {
    contig_output_t.copy_(output_t);
  }

  auto _alpha = alpha.has_value() ? alpha.value().to<float>() : 1.0;
  auto _bias = bias.has_value()
      ? bias.value()
      : at::zeros(
            {contig_output_t.size(1)},
            optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
            contig_output_t.options().layout_opt(),
            contig_output_t.options().device_opt(),
            contig_output_t.options().pinned_memory_opt());

  at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input_t.dim(), _bias).add(z, _alpha);
  contig_output_t.add_(alpha_mul_z_add_bias);
  contig_output_t.relu_();

  return contig_output_t;
}

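Since MIOpen cannot fuse the skip-connection add, the function above applies it eagerly after the convolution; an equivalent plain-ATen sketch of that epilogue:

#include <ATen/ATen.h>

// y = relu(conv_out + alpha * z + bias), with bias of shape [C] broadcast
// over N, H, W. Mirrors the reshape_bias + add_ + relu_ sequence above.
at::Tensor conv_add_relu_epilogue(const at::Tensor& conv_out,
                                  const at::Tensor& z,
                                  const at::Tensor& bias,
                                  double alpha) {
  auto b = bias.reshape({1, bias.size(0), 1, 1});
  return (conv_out + alpha * z + b).relu_();
}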
Tensor miopen_convolution_relu(
    const Tensor& input_t,
    const Tensor& weight_t,
    const std::optional<Tensor>& bias_t,
    const std::optional<Tensor>& bias,
    IntArrayRef stride,
    IntArrayRef padding,
    IntArrayRef dilation,
    int64_t groups) {
  auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);
  const Tensor input = input_t.contiguous(memory_format);
  const Tensor weight = weight_t.contiguous(memory_format);

  // FuseFrozenConvAddRelu performs some tensor shape checking
  Tensor output_t = at::detail::empty_cuda(
      conv_output_size(
          input.sizes(), weight.sizes(), padding, stride, dilation),
      input.options().memory_format(memory_format));
  if (output_t.numel() == 0) {
    return output_t;
  }

  auto& ctx = at::globalContext();
  bool benchmark = ctx.benchmarkCuDNN();
  auto _bias = bias_t.has_value()
      ? bias_t.value()
      : at::zeros(
            {output_t.size(1)},
            optTypeMetaToScalarType(output_t.options().dtype_opt()),
            output_t.options().layout_opt(),
            output_t.options().device_opt(),
            output_t.options().pinned_memory_opt());

  raw_miopen_convolution_add_relu_out(
      output_t,
      input,
      weight,
      output_t, // use output_t as z to satisfy MIOpen API
      0, // alpha
      _bias,
      stride,
      padding,
      dilation,
      groups,
      benchmark, // benchmark
      true); // deterministic
  // MIOpen currently only supports MemoryFormat::Contiguous and fp32 and 2d
  if (input_t.suggest_memory_format() == at::MemoryFormat::Contiguous
      && input_t.scalar_type() == at::kFloat
      && input_t.ndimension() == 4) {

  return output_t;
    // FuseFrozenConvAddRelu performs some tensor shape checking
    Tensor output_t = at::detail::empty_cuda(
        conv_output_size(
            input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
        input_t.options().memory_format(input_t.suggest_memory_format()));
    if (output_t.numel() == 0) {
      return output_t;
    }

    auto _bias = bias.has_value()
        ? bias.value()
        : at::zeros(
              {output_t.size(1)},
              optTypeMetaToScalarType(output_t.options().dtype_opt()),
              output_t.options().layout_opt(),
              output_t.options().device_opt(),
              output_t.options().pinned_memory_opt());

    raw_miopen_convolution_relu_out(
        output_t,
        input_t,
        weight_t,
        _bias,
        stride,
        padding,
        dilation,
        groups,
        benchmark, // benchmark
        false // deterministic
    );

    return output_t;
  }
  else {
    // fallback

    auto memory_format = miopen_conv_suggest_memory_format(input_t, weight_t);

    TensorArg input { input_t, "input", 1 },
        weight { weight_t, "weight", 2 };

    Tensor output_t = at::detail::empty_cuda(
        conv_output_size(
            input_t.sizes(), weight_t.sizes(), padding, stride, dilation),
        input->options().memory_format(memory_format));
    if (output_t.numel() == 0){
      return output_t;
    }
    // Avoid ambiguity of "output" when this is being used as backwards
    TensorArg output{output_t, "result", 0};
    miopen_convolution_forward_out(
        output,
        "miopen_convolution_relu",
        input,
        weight,
        padding,
        stride,
        dilation,
        groups,
        benchmark,
        false // deterministic
    );

    auto contig_output_t = self_or_new_memory_format(output_t, memory_format);

    if (!output_t.is_same(contig_output_t)) {
      contig_output_t.copy_(output_t);
    }

    auto _bias = bias.has_value()
        ? bias.value()
        : at::zeros(
              {contig_output_t.size(1)},
              optTypeMetaToScalarType(contig_output_t.options().dtype_opt()),
              contig_output_t.options().layout_opt(),
              contig_output_t.options().device_opt(),
              contig_output_t.options().pinned_memory_opt());

    at::Tensor reshaped_bias = at::native::reshape_bias(input_t.dim(), _bias);
    contig_output_t.add_(reshaped_bias);
    contig_output_t.relu_();

    return contig_output_t;
  }
}

REGISTER_CUDA_DISPATCH(miopen_convolution_backward_stub, &miopen_convolution_backward)
aten/src/ATen/native/mps/MPSGraphSonomaOps.h (new file, 48 lines)
@ -0,0 +1,48 @@
#pragma once

#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>

#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))

typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) {
  MPSGraphFFTScalingModeNone = 0L,
  MPSGraphFFTScalingModeSize = 1L,
  MPSGraphFFTScalingModeUnitary = 2L,
};

@interface FakeMPSGraphFFTDescriptor : NSObject<NSCopying>
@property(readwrite, nonatomic) BOOL inverse;
@property(readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode;
@property(readwrite, nonatomic) BOOL roundToOddHermitean;
+ (nullable instancetype)descriptor;
@end

@compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor;

@interface MPSGraph (SonomaOps)
- (MPSGraphTensor* _Nonnull)conjugateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)realPartOfTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)fastFourierTransformWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                                      axes:(NSArray<NSNumber*>* _Nonnull)axes
                                                descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
                                                      name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)realToHermiteanFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                                    axes:(NSArray<NSNumber*>* _Nonnull)axes
                                              descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
                                                    name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)HermiteanToRealFFTWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                                    axes:(NSArray<NSNumber*>* _Nonnull)axes
                                              descriptor:(MPSGraphFFTDescriptor* _Nonnull)descriptor
                                                    name:(NSString* _Nullable)name;
@end

// define BFloat16 enums for MacOS13
#define MPSDataTypeBFloat16 ((MPSDataType)(MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16))

// define Metal version
#define MTLLanguageVersion3_1 ((MTLLanguageVersion)((3 << 16) + 1))
#endif
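The header above follows a standard back-deployment pattern: declare stand-ins only when the SDK predates the real API, then alias them to the real names. A condensed C++-flavored sketch of the idea (all identifiers here are illustrative, not SDK symbols):

// Compile the stand-in only on SDKs that lack the feature, so call sites
// can be written once against the aliased name.
#if !defined(MY_SDK_HAS_FFT_DESCRIPTOR)
struct FakeFFTDescriptor {
  bool inverse = false;  // mirrors the fields of the real descriptor
  int scalingMode = 0;
};
using FFTDescriptor = FakeFFTDescriptor;  // plays the role of @compatibility_alias
#endif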
aten/src/ATen/native/mps/MPSGraphVenturaOps.h (new file, 196 lines)
@ -0,0 +1,196 @@
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>

// TODO: Remove me when moved to MacOS 13
#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))

@interface FakeMPSGraphConvolution3DOpDescriptor : NSObject<NSCopying>

@property(readwrite, nonatomic) NSUInteger strideInX;
@property(readwrite, nonatomic) NSUInteger strideInY;
@property(readwrite, nonatomic) NSUInteger strideInZ;
@property(readwrite, nonatomic) NSUInteger dilationRateInX;
@property(readwrite, nonatomic) NSUInteger dilationRateInY;
@property(readwrite, nonatomic) NSUInteger dilationRateInZ;

@property(readwrite, nonatomic) NSUInteger paddingLeft;
@property(readwrite, nonatomic) NSUInteger paddingRight;
@property(readwrite, nonatomic) NSUInteger paddingTop;
@property(readwrite, nonatomic) NSUInteger paddingBottom;
@property(readwrite, nonatomic) NSUInteger paddingFront;
@property(readwrite, nonatomic) NSUInteger paddingBack;

@property(readwrite, nonatomic) MPSGraphPaddingStyle paddingStyle;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout dataLayout;
@property(readwrite, nonatomic) MPSGraphTensorNamedDataLayout weightsLayout;

@property(readwrite, nonatomic) NSUInteger groups;

@end

@compatibility_alias MPSGraphConvolution3DOpDescriptor FakeMPSGraphConvolution3DOpDescriptor;

#endif

@interface MPSGraph (VenturaOps)

#if !defined(__MAC_13_0) && (!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0))

typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode) {
  MPSGraphResizeNearestRoundingModeRoundPreferCeil = 0L,
  MPSGraphResizeNearestRoundingModeRoundPreferFloor = 1L,
  MPSGraphResizeNearestRoundingModeCeil = 2L,
  MPSGraphResizeNearestRoundingModeFloor = 3L,
  MPSGraphResizeNearestRoundingModeRoundToEven = 4L,
  MPSGraphResizeNearestRoundingModeRoundToOdd = 5L,
};

// Define complex enums for MacOS 12
#define MPSDataTypeComplexBit 0x01000000
#define MPSDataTypeComplexFloat32 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 64))
#define MPSDataTypeComplexFloat16 ((MPSDataType)(MPSDataTypeFloatBit | MPSDataTypeComplexBit | 32))
#endif

- (MPSGraphTensor* _Nonnull)convolution3DWithSourceTensor:(MPSGraphTensor* _Nonnull)source
                                            weightsTensor:(MPSGraphTensor* _Nonnull)weights
                                               descriptor:(MPSGraphConvolution3DOpDescriptor* _Nonnull)descriptor
                                                     name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)
    convolution3DDataGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
                                          weightsTensor:(MPSGraphTensor* _Nonnull)weights
                                            outputShape:(MPSShape* _Nonnull)outputShape
                           forwardConvolutionDescriptor:
                               (MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
                                                   name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)
    convolution3DWeightsGradientWithIncomingGradientTensor:(MPSGraphTensor* _Nonnull)incomingGradient
                                              sourceTensor:(MPSGraphTensor* _Nonnull)source
                                               outputShape:(MPSShape* _Nonnull)outputShape
                              forwardConvolutionDescriptor:
                                  (MPSGraphConvolution3DOpDescriptor* _Nonnull)forwardConvolutionDescriptor
                                                      name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                               axis:(NSInteger)axis
                                               name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                      axis:(NSInteger)axis
                                      name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                      axis:(NSInteger)axis
                                descending:(BOOL)descending
                                      name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
                                descending:(BOOL)descending
                                      name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
                                      name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                         axis:(NSInteger)axis
                                         name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                         axis:(NSInteger)axis
                                   descending:(BOOL)descending
                                         name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                   axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
                                   descending:(BOOL)descending
                                         name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)argSortWithTensor:(MPSGraphTensor* _Nonnull)tensor
                                   axisTensor:(MPSGraphTensor* _Nonnull)axisTensor
                                         name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)inverseOfTensor:(MPSGraphTensor* _Nonnull)inputTensor name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
                                         sizeTensor:(MPSGraphTensor* _Nonnull)size
                                nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
                                       centerResult:(BOOL)centerResult
                                       alignCorners:(BOOL)alignCorners
                                             layout:(MPSGraphTensorNamedDataLayout)layout
                                               name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeNearestWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
                                         sizeTensor:(MPSGraphTensor* _Nonnull)size
                                  scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
                                nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
                                             layout:(MPSGraphTensorNamedDataLayout)layout
                                               name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
                                          sizeTensor:(MPSGraphTensor* _Nonnull)size
                                        centerResult:(BOOL)centerResult
                                        alignCorners:(BOOL)alignCorners
                                              layout:(MPSGraphTensorNamedDataLayout)layout
                                                name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeBilinearWithTensor:(MPSGraphTensor* _Nonnull)imagesTensor
                                          sizeTensor:(MPSGraphTensor* _Nonnull)size
                                   scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
                                              layout:(MPSGraphTensorNamedDataLayout)layout
                                                name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
                                                      input:(MPSGraphTensor* _Nonnull)input
                                        nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
                                               centerResult:(BOOL)centerResult
                                               alignCorners:(BOOL)alignCorners
                                                     layout:(MPSGraphTensorNamedDataLayout)layout
                                                       name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeNearestWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
                                                      input:(MPSGraphTensor* _Nonnull)input
                                          scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
                                        nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
                                                     layout:(MPSGraphTensorNamedDataLayout)layout
                                                       name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
                                                       input:(MPSGraphTensor* _Nonnull)input
                                                centerResult:(BOOL)centerResult
                                                alignCorners:(BOOL)alignCorners
                                                      layout:(MPSGraphTensorNamedDataLayout)layout
                                                        name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)resizeBilinearWithGradientTensor:(MPSGraphTensor* _Nonnull)gradient
                                                       input:(MPSGraphTensor* _Nonnull)input
                                           scaleOffsetTensor:(MPSGraphTensor* _Nonnull)scaleOffset
                                                      layout:(MPSGraphTensorNamedDataLayout)layout
                                                        name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
                                      coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
                                                layout:(MPSGraphTensorNamedDataLayout)layout
                                  normalizeCoordinates:(BOOL)normalizeCoordinates
                                   relativeCoordinates:(BOOL)relativeCoordinates
                                          alignCorners:(BOOL)alignCorners
                                           paddingMode:(MPSGraphPaddingMode)paddingMode
                                          samplingMode:(MPSGraphResizeMode)samplingMode
                                         constantValue:(double)constantValue
                                                  name:(NSString* _Nullable)name;

- (MPSGraphTensor* _Nonnull)sampleGridWithSourceTensor:(MPSGraphTensor* _Nonnull)source
                                      coordinateTensor:(MPSGraphTensor* _Nonnull)coordinates
                                                layout:(MPSGraphTensorNamedDataLayout)layout
                                  normalizeCoordinates:(BOOL)normalizeCoordinates
                                   relativeCoordinates:(BOOL)relativeCoordinates
                                          alignCorners:(BOOL)alignCorners
                                           paddingMode:(MPSGraphPaddingMode)paddingMode
                                   nearestRoundingMode:(MPSGraphResizeNearestRoundingMode)nearestRoundingMode
                                         constantValue:(double)constantValue
                                                  name:(NSString* _Nullable)name;
- (MPSGraphTensor* _Nonnull)truncateWithTensor:(MPSGraphTensor* _Nonnull)tensor name:(NSString* _Nullable)name;

@end
@ -9,6 +9,8 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>
#include <fmt/ranges.h>
@ -568,7 +570,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
MPSShape* mpsStrides = getMPSShape(_tensor.strides());
check_mps_shape(mpsShape);

auto storage_numel = src.storage().nbytes() / src.element_size() - src.storage_offset();
auto storage_numel = src.storage().nbytes() / src.element_size();
TORCH_CHECK(storage_numel <= std::numeric_limits<int32_t>::max(),
            "MPSGraph does not support tensor dims larger than INT_MAX");
MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:dataType
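The changed line is plain offset arithmetic; a sketch of both variants:

#include <cstdint>

// A tensor view begins storage_offset elements into its storage. The old
// form counts only the elements the view can address; the new form checks
// the full storage against the int32 limit.
int64_t view_addressable_elems(int64_t storage_nbytes, int64_t element_size,
                               int64_t storage_offset) {
  return storage_nbytes / element_size - storage_offset;
}
int64_t full_storage_elems(int64_t storage_nbytes, int64_t element_size) {
  return storage_nbytes / element_size;
}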
@ -8,6 +8,8 @@
#include <ATen/native/TensorIterator.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/operations/BinaryKernel.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <fmt/format.h>

#ifndef AT_PER_OPERATOR_HEADERS
@ -1,12 +1,23 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/ConvUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_mps_convolution_native.h>
#include <ATen/ops/_mps_convolution_transpose_native.h>
#include <ATen/ops/mps_convolution_backward_native.h>
#include <ATen/ops/mps_convolution_transpose_backward_native.h>
#include <fmt/format.h>

#if !defined(__MAC_13_2) && (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))

@implementation FakeMPSGraphConvolution3DOpDescriptor
- (nonnull id)copyWithZone:(nullable NSZone*)zone {
  return self;
}

@end

#endif

namespace at::native {

@ -39,9 +50,11 @@ static void fill_conv3d_desc(MPSGraphConvolution3DOpDescriptor* descriptor_,
descriptor_.paddingFront = paddingDepth;
descriptor_.paddingBack = paddingDepth;

descriptor_.dataLayout = MPSGraphTensorNamedDataLayoutNCDHW;
// PyTorch always uses NCDHW memory layout for 3D tensors
descriptor_.dataLayout = (MPSGraphTensorNamedDataLayout)7L; // MPSGraphTensorNamedDataLayoutNCDHW;

descriptor_.weightsLayout = MPSGraphTensorNamedDataLayoutOIDHW;
// PyTorch always uses OIDHW memory layout for 3D weights
descriptor_.weightsLayout = (MPSGraphTensorNamedDataLayout)9L; // MPSGraphTensorNamedDataLayoutOIDHW;

descriptor_.groups = groups; // not yet tested in Xcode/C++
}
@ -173,6 +186,18 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
if (bias_defined)
  bias_shape = bias_opt.value().sizes();

std::string mem_format_key;
switch (memory_format) {
  case at::MemoryFormat::Contiguous:
    mem_format_key = "Contiguous";
    break;
  case at::MemoryFormat::ChannelsLast:
    mem_format_key = "ChannelsLast";
    break;
  default:
    assert(0 && "Check should have been done earlier\n");
}

std::string bias_shape_key;
if (bias_defined) {
  bias_shape_key = std::to_string(bias_shape[0]);
@ -180,16 +205,20 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
  bias_shape_key = "nobias";
}

std::string key = fmt::format("mps_{}convolution:{}:{}:{}:{}:{}:{}:{}:{}",
                              is3DConv ? "3d_" : "",
                              getArrayRefString(stride),
                              getArrayRefString(dilation),
                              getArrayRefString(padding),
                              groups,
                              is_channels_last,
                              mps::getTensorsStringKey({input_t, weight_t}),
                              bias_defined,
                              bias_shape_key);
std::string key;
if (is3DConv) {
  key = "mps_3d_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
      std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
      std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
      std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
      mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
} else {
  key = "mps_convolution:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
      std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
      std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
      mps::getTensorsStringKey({input_t, weight_t}) + ":" + std::to_string(bias_defined) + ":" + bias_shape_key;
}

MPSShape* inputShape = mps::getMPSShape(input_t, memory_format);
MPSShape* outputShape = mps::getMPSShape(output_t, memory_format);
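The new key construction leans on fmt instead of chained std::to_string; a self-contained sketch of the style (getArrayRefString replaced here by fmt/ranges formatting):

#include <fmt/format.h>
#include <fmt/ranges.h>
#include <cstdint>
#include <string>
#include <vector>

std::string conv_cache_key(bool is3d, const std::vector<int64_t>& stride,
                           const std::vector<int64_t>& dilation,
                           const std::vector<int64_t>& padding, int64_t groups) {
  // fmt/ranges.h renders a vector as "[1, 1]", so one call covers both
  // the 2D and 3D cases that previously needed separate concatenations.
  return fmt::format("mps_{}convolution:{}:{}:{}:{}",
                     is3d ? "3d_" : "", stride, dilation, padding, groups);
}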
@ -371,15 +400,33 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size,
@autoreleasepool {
  MPSStream* stream = getCurrentMPSStream();

  std::string mem_format_key;
  switch (memory_format) {
    case at::MemoryFormat::Contiguous:
      mem_format_key = "Contiguous";
      break;
    case at::MemoryFormat::ChannelsLast:
      mem_format_key = "ChannelsLast";
      break;
    default:
      assert(0 && "Check should have been done earlier\n");
  }

  MPSShape* mps_input_shape = getMPSShape(input_size);
  std::string key = fmt::format("mps_{}_convolution_backward_input:{}:{}:{}:{}:{}:{}",
                                is3DConv ? "3d_" : "",
                                getArrayRefString(stride),
                                getArrayRefString(dilation),
                                getArrayRefString(padding),
                                groups,
                                is_channels_last,
                                getTensorsStringKey({grad_output_t, weight_t}));
  std::string key;
  if (is3DConv) {
    key = "mps_3d_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
        ":" + std::to_string(stride[2]) + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
        std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
        std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" + mem_format_key +
        getTensorsStringKey({grad_output_t, weight_t});
  } else {
    key = "mps_convolution_backward_input:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
        std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
        std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" + mem_format_key +
        getTensorsStringKey({grad_output_t, weight_t});
  }
  auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
    auto gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output_t);
    auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_t);
@ -504,13 +551,19 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size,
  MPSStream* stream = getCurrentMPSStream();

  MPSShape* mps_weight_shape = getMPSShape(weight_size);
  std::string key = fmt::format("mps_{}convolution_backward_weights:{}:{}:{}:{}:{}",
                                is3DConv ? "3d_" : "",
                                getArrayRefString(stride),
                                getArrayRefString(dilation),
                                getArrayRefString(padding),
                                groups,
                                getTensorsStringKey({grad_output_t, input_t, grad_weight_t}));
  std::string key;
  if (is3DConv) {
    key = "mps_3d_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
        std::to_string(stride[2]) + ":" + std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" +
        std::to_string(dilation[2]) + ":" + std::to_string(padding[0]) + ":" + std::to_string(padding[1]) + ":" +
        std::to_string(padding[2]) + ":" + std::to_string(groups) + ":" +
        getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
  } else {
    key = "mps_convolution_backward_weights:" + std::to_string(stride[0]) + ":" + std::to_string(stride[1]) + ":" +
        std::to_string(dilation[0]) + ":" + std::to_string(dilation[1]) + ":" + std::to_string(padding[0]) + ":" +
        std::to_string(padding[1]) + ":" + std::to_string(groups) + ":" +
        getTensorsStringKey({grad_output_t, input_t, grad_weight_t});
  }
  auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
    MPSShape* inputShape = getMPSShape(input_t);
    bool isDepthwiseConv =
@ -2,6 +2,7 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/_copy_from_and_resize_native.h>
#include <ATen/ops/_copy_from_native.h>
@ -5,6 +5,8 @@
#include <ATen/native/DistributionTemplates.h>
#include <ATen/native/Distributions.h>
#include <ATen/native/TensorFactories.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS
@ -1,4 +1,6 @@
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS
@ -10,6 +12,20 @@
#include <ATen/ops/_fft_r2c_native.h>
#endif

#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0))
@implementation FakeMPSGraphFFTDescriptor
+ (nullable instancetype)descriptor {
  // Redispatch the constructor to the actual implementation
  id desc = NSClassFromString(@"MPSGraphFFTDescriptor");
  return (FakeMPSGraphFFTDescriptor*)[desc descriptor];
}

- (nonnull id)copyWithZone:(nullable NSZone*)zone {
  return self;
}
@end
#endif

namespace at::native {
namespace {
MPSGraphFFTScalingMode normalization_to_ScalingMode(int64_t normalization) {
@ -2,6 +2,7 @@
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/GridSamplerUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/GridSampler.h>

@ -17,6 +17,7 @@
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
#include <ATen/native/TensorAdvancedIndexing.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <c10/util/SmallVector.h>
#include <c10/util/irange.h>
#include <fmt/format.h>
@ -6,7 +6,9 @@
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSequoiaOps.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS
@ -4,6 +4,7 @@
#include <ATen/TensorUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <c10/util/irange.h>

@ -616,7 +617,6 @@ static Tensor median_common_mps(const Tensor& input_t, bool nanmedian) {
// we allocate 1 here due to MacOS13 bug for gather MPSGraph op, look below for the error
Tensor output_t = at::empty({1}, input_t.scalar_type(), std::nullopt, kMPS, std::nullopt, std::nullopt);
if (output_t.numel() == 0 || num_in_elements == 0) {
  output_t.fill_(std::numeric_limits<float>::quiet_NaN());
  return output_t;
}

@ -4,6 +4,7 @@
#include <ATen/WrapDimUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS
@ -5,6 +5,7 @@
#include <ATen/native/SortingUtils.h>
#include <ATen/native/TensorShape.h>
#include <ATen/native/TypeProperties.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS
@ -2,6 +2,8 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UnaryOps.h>
#include <ATen/native/mps/Copy.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS
@ -1,6 +1,7 @@
// Copyright © 2022 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/Resize.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>

#ifndef AT_PER_OPERATOR_HEADERS
@ -1,6 +1,7 @@
// Copyright © 2023 Apple Inc.
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/native/UpSample.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>

@ -4,6 +4,8 @@
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/Resize.h>
// For MTLLanguageVersion_3_1
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <fmt/format.h>
@ -1414,7 +1414,7 @@
- func: cat(Tensor[] tensors, int dim=0) -> Tensor
  structured_delegate: cat.out
  dispatch:
    SparseCPU, SparseCUDA, SparseMPS: cat_sparse
    SparseCPU, SparseCUDA: cat_sparse
    QuantizedCPU: cat_quantized_cpu
    NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: cat_nested
  tags: core
@ -1798,7 +1798,7 @@
  device_guard: False
  dispatch:
    MkldnnCPU: copy_mkldnn_
    SparseCPU, SparseCUDA, SparseMPS: copy_sparse_wrapper_
    SparseCPU, SparseCUDA: copy_sparse_wrapper_
    CompositeExplicitAutograd: copy_
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: copy_sparse_compressed_
    NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: copy_nested_
@ -2160,7 +2160,7 @@
  variants: function, method
  structured_delegate: div.out
  dispatch:
    SparseCPU, SparseCUDA, SparseMPS: div_sparse
    SparseCPU, SparseCUDA: div_sparse
    ZeroTensor: div_zerotensor
    NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Tensor
  tags: [core, pointwise]
@ -2170,7 +2170,7 @@
  variants: method
  structured_delegate: div.out
  dispatch:
    SparseCPU, SparseCUDA, SparseMPS: div_sparse_
    SparseCPU, SparseCUDA: div_sparse_
  tags: pointwise

- func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@ -2179,7 +2179,7 @@
  structured_inherits: TensorIteratorBase
  dispatch:
    CPU, CUDA, MPS, MTIA: div_out
    SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
    SparseCPU, SparseCUDA: div_out_sparse_zerodim
  tags: pointwise

- func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
@ -2187,7 +2187,7 @@
  variants: function, method
  structured_delegate: div.out_mode
  dispatch:
    SparseCPU, SparseCUDA, SparseMPS: div_sparse
    SparseCPU, SparseCUDA: div_sparse
  tags: [core, pointwise]

- func: div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
@ -2195,7 +2195,7 @@
  variants: method
  structured_delegate: div.out_mode
  dispatch:
    SparseCPU, SparseCUDA, SparseMPS: div_sparse_
    SparseCPU, SparseCUDA: div_sparse_
  tags: pointwise

- func: div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
@ -2204,7 +2204,7 @@
  structured_inherits: TensorIteratorBase
  dispatch:
    CPU, CUDA, MPS: div_out_mode
    SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
    SparseCPU, SparseCUDA: div_out_sparse_zerodim
  tags: pointwise

# For C++ only, until we have conversion from C++ numbers to Tensor
@ -2517,7 +2517,7 @@
  dispatch:
    CompositeExplicitAutograd: empty_like
    QuantizedCPU, QuantizedCUDA: empty_like_quantized
    SparseCPU, SparseCUDA, SparseMPS, SparseMeta: empty_like_sparse_coo
    SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr
    NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: empty_like_nested
  autogen: empty_like.out
@ -2768,20 +2768,20 @@
  variants: function, method
  dispatch:
    CPU, CUDA, MPS, MTIA: floor_divide
    SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse
    SparseCPU, SparseCUDA: floor_divide_sparse

- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
  variants: method
  dispatch:
    CPU, CUDA, MPS: floor_divide_
    SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse_
    SparseCPU, SparseCUDA: floor_divide_sparse_

- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck # TensorIterator
  dispatch:
    CPU, CUDA, MPS: floor_divide_out
    SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim
    SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim

- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
  device_check: NoCheck # TensorIterator
@ -4273,7 +4273,7 @@
  structured_delegate: mul.out
  variants: function, method
  dispatch:
    SparseCPU, SparseCUDA, SparseMPS: mul_sparse
    SparseCPU, SparseCUDA: mul_sparse
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr
    MkldnnCPU: mkldnn_mul
    ZeroTensor: mul_zerotensor
@ -4285,7 +4285,7 @@
  structured_delegate: mul.out
  variants: method
  dispatch:
    SparseCPU, SparseCUDA, SparseMPS: mul_sparse_
    SparseCPU, SparseCUDA: mul_sparse_
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr_
    MkldnnCPU: mkldnn_mul_
    NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Tensor
@ -4299,7 +4299,6 @@
    CPU, CUDA, MPS, MTIA: mul_out
    SparseCPU: mul_out_sparse_cpu
    SparseCUDA: mul_out_sparse_cuda
    SparseMPS: mul_out_sparse_mps
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_out_sparse_csr
    MkldnnCPU: mkldnn_mul_out
  tags: pointwise
@ -5849,7 +5848,7 @@
  variants: function, method
  dispatch:
    CompositeExplicitAutograd: sum
    SparseCPU, SparseCUDA, SparseMPS, SparseMeta: sum_coo
    SparseCPU, SparseCUDA, SparseMeta: sum_coo
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
  autogen: sum.out

@ -5860,7 +5859,7 @@
  variants: function, method
  dispatch:
    NestedTensorCPU: NestedTensor_sum_dim_CPU
    SparseCPU, SparseCUDA, SparseMPS: sum_sparse_coo
    SparseCPU, SparseCUDA: sum_sparse_coo
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed
  tags: core

@ -6492,7 +6491,7 @@
  device_guard: False
  dispatch:
    CompositeExplicitAutograd: unsqueeze
    SparseCPU, SparseCUDA, SparseMPS: unsqueeze_sparse
    SparseCPU, SparseCUDA: unsqueeze_sparse
    QuantizedCPU, QuantizedCUDA: unsqueeze_quantized
    NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: unsqueeze_nested
  tags: core
@ -6976,7 +6975,7 @@
    CPU, CUDA: sub_out
    MPS: sub_out_mps
    MTIA: sub_out_mtia
    SparseCPU, SparseCUDA, SparseMPS: sub_out_sparse
    SparseCPU, SparseCUDA: sub_out_sparse
  tags: pointwise

- func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
@ -6984,7 +6983,7 @@
  variants: function, method
  structured_delegate: sub.out
  dispatch:
    SparseCPU, SparseCUDA, SparseMPS: sub_sparse
    SparseCPU, SparseCUDA: sub_sparse
    ZeroTensor: sub_zerotensor
    NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sub_Tensor
  tags: [core, pointwise]
@ -6994,7 +6993,7 @@
  variants: method
  structured_delegate: sub.out
  dispatch:
    SparseCPU, SparseCUDA, SparseMPS: sub_sparse_
    SparseCPU, SparseCUDA: sub_sparse_
  tags: pointwise
# For C++ only, until we have conversion from C++ numbers to Tensor

@ -10259,7 +10258,7 @@
  structured_delegate: any.all_out
  variants: method, function
  dispatch:
    SparseCPU, SparseCUDA, SparseMPS: any_sparse
    SparseCPU, SparseCUDA: any_sparse
  tags: core

- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
@ -10343,7 +10342,7 @@
  structured_inherits: TensorIteratorBase
  dispatch:
    CPU, CUDA: pow_Tensor_Scalar_out
    SparseCPU, SparseCUDA, SparseMPS: pow_out_sparse_scalar
    SparseCPU, SparseCUDA: pow_out_sparse_scalar
    MPS: pow_tensor_scalar_out_mps
  tags: pointwise

@ -10352,7 +10351,7 @@
  structured_delegate: pow.Tensor_Scalar_out
  variants: function, method
  dispatch:
    SparseCPU, SparseCUDA, SparseMPS: pow_sparse_scalar
    SparseCPU, SparseCUDA: pow_sparse_scalar
  tags: [core, pointwise]

- func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
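Each dispatch row above binds a dispatch key to a kernel. A rough hand-written analogue (a sketch; these yaml entries are actually consumed by PyTorch's codegen, and the kernel body here is a placeholder):

#include <ATen/ATen.h>
#include <torch/library.h>

at::Tensor my_mul_sparse(const at::Tensor& self, const at::Tensor& other) {
  return self.mul(other);  // placeholder body
}

// Registers the kernel for the SparseMPS key these hunks add.
TORCH_LIBRARY_IMPL(aten, SparseMPS, m) {
  m.impl("mul.Tensor", my_mul_sparse);
}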
@ -2,7 +2,6 @@
#include <ATen/core/Tensor.h>
#include <ATen/Config.h>
#include <ATen/Dispatch.h>
#include <ATen/AccumulateType.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/native/sparse/ParamUtils.h>
#include <ATen/native/SparseTensorUtils.h>
@ -296,7 +295,6 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
   to exp functions as well as reuse of softmax implementation for
   log_softmax.
*/
using accscalar_t = at::acc_type<scalar_t, false>;
auto sparse_dim = input.sparse_dim();
auto indices = input._indices().contiguous();
auto values = input._values().contiguous();
@ -342,14 +340,14 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
  continue;

/* Prepare scratch space */
std::vector<accscalar_t> mx_row(nvalues, -std::numeric_limits<accscalar_t>::infinity());
std::vector<accscalar_t> exp_sums_row(nvalues, 0);
std::vector<scalar_t> mx_row(nvalues, -std::numeric_limits<scalar_t>::infinity());
std::vector<scalar_t> exp_sums_row(nvalues, 0);

/* Compute mx */
for (int64_t i : pool_indices) {
  auto values_row = values_accessor[i];
  for (const auto j : c10::irange(nvalues)) {
    mx_row[j] = std::max(mx_row[j], accscalar_t(values_row[j]));
    mx_row[j] = std::max(mx_row[j], values_row[j]);
  }
}

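The accscalar_t change widens the scratch buffers to the accumulation type that at::acc_type selects; a standalone sketch of why that matters for long sums:

#include <iostream>

int main() {
  float sum_narrow = 0.0f;  // storage-type accumulator
  double sum_wide = 0.0;    // stands in for accscalar_t
  for (int i = 0; i < 10000000; ++i) {
    sum_narrow += 1e-4f;    // low-order bits are dropped as the sum grows
    sum_wide += 1e-4;
  }
  // The exact sum is 1000; the float accumulator drifts visibly from it.
  std::cout << sum_narrow << " vs " << sum_wide << '\n';
}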
@ -10,7 +10,6 @@
|
||||
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
|
||||
#include <ATen/ops/cat.h>
|
||||
#include <ATen/ops/add_native.h>
|
||||
#include <ATen/ops/mul_native.h>
|
||||
#include <ATen/ops/empty_native.h>
|
||||
#include <ATen/ops/zeros_native.h>
|
||||
#include <ATen/ops/result_type.h>
|
||||
@ -21,265 +20,10 @@
|
||||
namespace at::native {
|
||||
|
||||
using namespace at::sparse;
|
||||
using namespace mps;
|
||||
|
||||
#ifndef PYTORCH_JIT_COMPILE_SHADERS
|
||||
static auto& lib = MetalShaderLibrary::getBundledLibrary();
|
||||
#else
|
||||
#include <ATen/native/mps/Mul_metallib.h>
|
||||
#endif
|
||||
Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha);
|
||||
|
||||
static SparseTensor& mul_out_dense_sparse_mps(
|
||||
const Tensor& dense,
|
||||
const Tensor& sparse,
|
||||
SparseTensor& out) {
|
||||
|
||||
TORCH_CHECK(sparse.is_sparse(), "mul: expected 'sparse' to be sparse COO");
|
||||
TORCH_CHECK(sparse.is_mps(), "mul: expected 'sparse' to be MPS, got ", sparse.device());
|
||||
TORCH_CHECK(out.is_mps(), "mul: expected 'out' to be MPS, got ", out.device());
|
||||
|
||||
const bool scalar_like = (dense.dim() == 0) || (dense.numel() == 1);
|
||||
TORCH_CHECK(dense.is_mps() || scalar_like,
|
||||
"mul: expected 'dense' to be MPS or scalar-like, got ", dense.device());
|
||||
|
||||
const int64_t nnz = sparse._nnz();
|
||||
out.resize_as_(sparse);
|
||||
|
||||
auto commonDtype = at::result_type(dense, sparse);
|
||||
TORCH_CHECK(canCast(commonDtype, out.scalar_type()),
|
||||
"Can't convert result type ", commonDtype, " to output ", out.scalar_type());
|
||||
|
||||
auto indices = sparse._indices().contiguous();
|
||||
auto values = sparse._values().to(commonDtype).contiguous();
|
||||
|
||||
if (nnz == 0) {
|
||||
auto empty_vals = values.narrow(0, 0, 0);
|
||||
alias_into_sparse(out,
|
||||
indices.narrow(1, 0, 0),
|
||||
(out.scalar_type() == commonDtype) ? empty_vals
|
||||
: empty_vals.to(out.scalar_type()));
|
||||
out._coalesced_(sparse.is_coalesced());
|
||||
return out;
|
||||
}
|
||||
|
||||
if (scalar_like) {
|
||||
auto scalar = dense;
|
||||
if (dense.numel() == 1 && dense.dim() > 0) {
|
||||
scalar = dense.view({});
|
||||
}
|
||||
scalar = scalar.to(values.options());
|
||||
auto out_vals = values.mul(scalar);
|
||||
if (out.scalar_type() != commonDtype) {
|
||||
out_vals = out_vals.to(out.scalar_type());
|
||||
}
|
||||
|
||||
alias_into_sparse(out, indices, out_vals);
|
||||
out._coalesced_(sparse.is_coalesced());
|
||||
return out;
|
||||
}
|
||||
|
||||
TORCH_CHECK(dense.sizes().equals(sparse.sizes()),
|
||||
"mul(dense, sparse): sizes must match exactly (no broadcasting): ",
|
||||
dense.sizes(), " vs ", sparse.sizes());
|
||||
|
||||
const int64_t ndim_i = sparse.sparse_dim();
|
||||
const int64_t ndim = dense.dim();
|
||||
TORCH_CHECK(
|
||||
ndim_i <= ndim,
|
||||
"mul(dense, sparse): sparse_dim=", ndim_i, " exceeds dense.dim()=", ndim);
|
||||
|
||||
// Prepare shapes
|
||||
int64_t view_rows = 1, view_cols = 1;
|
||||
for (int64_t i = 0; i < ndim_i; ++i) view_rows *= sparse.size(i);
|
||||
for (int64_t i = ndim_i; i < ndim; ++i) view_cols *= sparse.size(i);
|
||||
|
||||
auto dense_mps = dense.to(commonDtype).contiguous().reshape({view_rows, view_cols});
|
||||
auto out_vals = at::empty_like(values, values.options());
|
||||
|
||||
const uint32_t u_view_cols = static_cast<uint32_t>(view_cols);
|
||||
const uint32_t u_nnz = static_cast<uint32_t>(nnz);
|
||||
const uint32_t u_ndim_i = static_cast<uint32_t>(ndim_i);
|
||||
|
||||
auto stream = getCurrentMPSStream();
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
auto pso = lib.getPipelineStateForFunc("dense_sparse_mul_kernel_" + mps::scalarToMetalTypeString(values));
|
||||
auto computeEncoder = stream->commandEncoder();
|
||||
[computeEncoder setComputePipelineState:pso];
|
||||
|
||||
const uint32_t gridWidth = u_view_cols;
|
||||
const uint32_t gridDepth = u_nnz;
|
||||
MTLSize gridSize = MTLSizeMake(gridWidth, 1, gridDepth);
|
||||
|
||||
const uint32_t maxThreadsPerGroup = pso.maxTotalThreadsPerThreadgroup;
|
||||
const uint32_t tew = pso.threadExecutionWidth;
|
||||
uint32_t tgWidth = std::min(gridWidth, tew);
|
||||
MTLSize threadgroupSize = MTLSizeMake(tgWidth, 1, 1);
|
||||
|
||||
mtl_setArgs(
|
||||
computeEncoder,
|
||||
dense_mps,
|
||||
values,
|
||||
out_vals,
|
||||
indices,
|
||||
sparse.sizes(),
|
||||
std::array<uint32_t, 3>{u_nnz, u_ndim_i, u_view_cols}
|
||||
);
|
||||
|
||||
[computeEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadgroupSize];
|
||||
}
|
||||
});
|
||||
|
||||
Tensor final_vals = out_vals;
|
||||
if (out.scalar_type() != commonDtype) {
|
||||
final_vals = final_vals.to(out.scalar_type());
|
||||
}
|
||||
|
||||
alias_into_sparse(out, indices, final_vals);
|
||||
out._coalesced_(sparse.is_coalesced());
|
||||
return out;
|
||||
}


SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTensor& r_) {
  TORCH_CHECK(r_.is_mps(), "mul: expected 'out' to be MPS, but got ", r_.device());

  // Dense x sparse fallback (keep dense first)
  if (!t_.is_sparse() || !src_.is_sparse()) {
    const Tensor& dense = t_.is_sparse() ? src_ : t_;
    const Tensor& sparse = t_.is_sparse() ? t_ : src_;
    return mul_out_dense_sparse_mps(dense, sparse, r_);
  }

  TORCH_CHECK(t_.is_mps(), "mul: expected 'self' to be MPS, but got ", t_.device());
  TORCH_CHECK(src_.is_mps(), "mul: expected 'other' to be MPS, but got ", src_.device());
  TORCH_CHECK(t_.sparse_dim() == src_.sparse_dim(),
              "mul(sparse, sparse): must have same sparse_dim, got ",
              t_.sparse_dim(), " vs ", src_.sparse_dim());
  TORCH_CHECK(t_.sizes().equals(src_.sizes()),
              "mul(sparse, sparse): sizes must match exactly (no broadcasting).");

  // Coalesce and early-exit on structurally empty operands
  auto lhs = t_.coalesce();
  auto rhs = src_.coalesce();
  const int64_t lhs_nnz = lhs._nnz();
  const int64_t rhs_nnz = rhs._nnz();
  if (!lhs_nnz || !rhs_nnz) {
    r_.resize_as_(lhs);
    return r_.zero_();
  }

  // dtype checks and promotion
  auto commonDtype = at::result_type(lhs, rhs);
  TORCH_CHECK(canCast(commonDtype, r_.scalar_type()),
              "Can't convert result type ", commonDtype, " to output ", r_.scalar_type());

  const int64_t ndim_i = lhs.sparse_dim();

  // ndim_i == 0, at most one structural entry
  if (ndim_i == 0) {
    r_.resize_as_(lhs);
    const bool has = (lhs_nnz && rhs_nnz);

    auto out_indices = lhs._indices().narrow(1, 0, has ? 1 : 0);

    Tensor lhs_vals = lhs._values().to(commonDtype);
    Tensor rhs_vals = rhs._values().to(commonDtype);
    lhs_vals = lhs_vals.narrow(0, 0, has ? 1 : 0);
    rhs_vals = rhs_vals.narrow(0, 0, has ? 1 : 0);

    Tensor out_values = lhs_vals.mul(rhs_vals);
    if (r_.scalar_type() != commonDtype) {
      out_values = out_values.to(r_.scalar_type());
    }

    alias_into_sparse(r_, out_indices, out_values);
    r_._coalesced_(true);
    return r_;
  }

  // General path, intersect keys, then gather + multiply on GPU
  const auto device = r_.device();
  auto stream = getCurrentMPSStream();

  auto lhs_indices = lhs._indices();
  auto rhs_indices = rhs._indices();
  auto lhs_values = lhs._values().to(commonDtype);
  auto rhs_values = rhs._values().to(commonDtype);

  // Flatten sparse indices to keys
  auto lhs_keys = flatten_indices(lhs_indices, lhs.sizes());
  auto rhs_keys = flatten_indices(rhs_indices, rhs.sizes());

  // Intersect sorted keys (search the shorter in the longer)
  const bool A_is_lhs = (lhs_nnz <= rhs_nnz);
  const int64_t lenA = A_is_lhs ? lhs_nnz : rhs_nnz;
  const int64_t lenB = A_is_lhs ? rhs_nnz : lhs_nnz;
  auto A_keys = A_is_lhs ? lhs_keys : rhs_keys;
  auto B_keys = A_is_lhs ? rhs_keys : lhs_keys;

  auto outA_idx = at::empty({lenA}, at::device(device).dtype(kLong));
  auto outB_idx = at::empty({lenA}, at::device(device).dtype(kLong));
  auto counter = at::zeros({1}, at::device(device).dtype(kInt));

  dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      auto pso = lib.getPipelineStateForFunc("intersect_binary_search");
      auto enc = stream->commandEncoder();
      [enc setComputePipelineState:pso];
      mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter,
                  static_cast<uint32_t>(lenB), A_is_lhs);
      mtl_dispatch1DJob(enc, pso, static_cast<uint32_t>(lenA));
    }
  });

  const uint32_t M = counter.item<int32_t>(); // number of structural matches

  r_.resize_as_(lhs);

  auto out_indices = at::empty({ndim_i, static_cast<int64_t>(M)}, at::device(device).dtype(at::kLong));
  auto lhs_match = outA_idx.narrow(0, 0, M);
  auto rhs_match = outB_idx.narrow(0, 0, M);
  auto out_val_sizes = lhs_values.sizes().vec();
  out_val_sizes[0] = static_cast<int64_t>(M);
  auto out_values = at::empty(out_val_sizes, lhs_values.options());

  const uint32_t cols = static_cast<uint32_t>(
      lhs_values.numel() / std::max<int64_t>(1, lhs_nnz));

  dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      auto pso = lib.getPipelineStateForFunc(
          "fused_gather_mul_kernel_" + mps::scalarToMetalTypeString(lhs_values));
      auto enc = stream->commandEncoder();
      [enc setComputePipelineState:pso];

      const uint32_t tew = pso.threadExecutionWidth;
      uint32_t tgW = std::min(cols, tew);
      MTLSize grid = MTLSizeMake(cols, 1, M);
      MTLSize tgs = MTLSizeMake(tgW, 1, 1);

      mtl_setArgs(enc,
                  lhs_values, rhs_values,
                  lhs_match, rhs_match,
                  lhs_indices, out_indices,
                  out_values,
                  std::array<uint32_t, 2>{static_cast<uint32_t>(ndim_i), static_cast<uint32_t>(lhs_nnz)},
                  std::array<uint32_t, 2>{M, cols});
      [enc dispatchThreads:grid threadsPerThreadgroup:tgs];
    }
  });

  if (r_.scalar_type() != commonDtype) {
    out_values = out_values.to(r_.scalar_type());
  }

  alias_into_sparse(r_, out_indices, out_values);
  r_._coalesced_(true);
  return r_;
}
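
The general path is easier to follow against a CPU analogue: coalescing leaves the flattened keys sorted, torch.searchsorted plays the role of the intersect_binary_search kernel, and the final gather-and-multiply is what fused_gather_mul_kernel fuses on the GPU. A hedged sketch (hypothetical helper name; empty operands are covered by the early exit above):

import torch

def sparse_sparse_mul_reference(lhs: torch.Tensor, rhs: torch.Tensor) -> torch.Tensor:
    lhs, rhs = lhs.coalesce(), rhs.coalesce()   # coalesced => sorted keys
    lhs_keys = torch.zeros(lhs._nnz(), dtype=torch.long)
    rhs_keys = torch.zeros(rhs._nnz(), dtype=torch.long)
    for d in range(lhs.sparse_dim()):
        lhs_keys = lhs_keys * lhs.size(d) + lhs.indices()[d]
        rhs_keys = rhs_keys * rhs.size(d) + rhs.indices()[d]

    # lower_bound of every lhs key in rhs_keys, keeping only exact hits.
    pos = torch.searchsorted(rhs_keys, lhs_keys).clamp(max=rhs_keys.numel() - 1)
    hit = rhs_keys[pos] == lhs_keys
    lhs_match = hit.nonzero().squeeze(1)
    rhs_match = pos[lhs_match]

    out_vals = lhs.values()[lhs_match] * rhs.values()[rhs_match]
    return torch.sparse_coo_tensor(lhs.indices()[:, lhs_match], out_vals, lhs.shape)
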


static Tensor& add_out_dense_sparse_mps(
Tensor& add_out_dense_sparse_mps(
    Tensor& out,
    const Tensor& dense,
    const SparseTensor& sparse,

@ -1,150 +0,0 @@
#include <metal_stdlib>
#include <c10/metal/indexing.h>
using namespace metal;


template <typename T>
kernel void dense_sparse_mul_kernel(
    device const T* dense [[buffer(0)]],
    device const T* values [[buffer(1)]],
    device T* out_values [[buffer(2)]],
    device const long* indices [[buffer(3)]],
    device const long* sizes [[buffer(4)]],
    constant uint3& sparse_params [[buffer(5)]],
    uint3 gid [[thread_position_in_grid]])
{
  uint col = gid.x;
  uint i = gid.z;
  uint nnz = sparse_params.x;
  uint ndim_i = sparse_params.y;
  uint view_cols = sparse_params.z;

  long key = 0;
  for (uint d = 0; d < ndim_i; ++d) {
    long idx_d = indices[(ulong)d * (ulong)nnz + (ulong)i];
    const auto sz_d = sizes[d];
    key = key * sz_d + idx_d;
  }

  ulong dense_idx = (ulong)key * (ulong)view_cols + (ulong)col;
  ulong val_idx = (ulong)i * (ulong)view_cols + (ulong)col;

  const auto a = static_cast<float>(values[val_idx]);
  const auto b = static_cast<float>(dense[dense_idx]);
  out_values[val_idx] = static_cast<T>(a * b);
}

kernel void intersect_binary_search(
    device const long* keysA [[buffer(0)]],
    device const long* keysB [[buffer(1)]],
    device long* outA_idx [[buffer(2)]],
    device long* outB_idx [[buffer(3)]],
    device atomic_uint* counter [[buffer(4)]],
    constant uint& lenB [[buffer(5)]],
    constant bool& A_is_lhs [[buffer(6)]],
    uint3 tid_in_grid [[thread_position_in_grid]])
{
  uint gid = tid_in_grid.x;

  long key = keysA[gid];

  // lower_bound in B
  uint lo = 0;
  uint hi = lenB;
  while (lo < hi) {
    uint mid = (lo + hi) >> 1;
    long v = keysB[mid];
    if (v < key) lo = mid + 1;
    else hi = mid;
  }

  if (lo < lenB && keysB[lo] == key) {
    uint pos = atomic_fetch_add_explicit(counter, 1u, memory_order_relaxed);
    if (A_is_lhs) {
      outA_idx[pos] = (long)gid;
      outB_idx[pos] = (long)lo;
    } else {
      outA_idx[pos] = (long)lo;
      outB_idx[pos] = (long)gid;
    }
  }
}


template <typename T>
kernel void fused_gather_mul_kernel(
    device const T* lhs_vals [[buffer(0)]],
    device const T* rhs_vals [[buffer(1)]],
    device const long* lhs_sel [[buffer(2)]],
    device const long* rhs_sel [[buffer(3)]],
    device const long* lhs_indices [[buffer(4)]],
    device long* out_indices [[buffer(5)]],
    device T* out_vals [[buffer(6)]],
    constant uint2& dims_input [[buffer(7)]],
    constant uint2& dims_output [[buffer(8)]],
    uint3 gid [[thread_position_in_grid]])
{
  const uint col = gid.x;
  const uint k = gid.z;
  const uint n_dim_i = dims_input.x;
  const uint L = dims_input.y;
  const uint M = dims_output.x;
  const uint view_cols = dims_output.y;

  const long iL = lhs_sel[k];
  const long iR = rhs_sel[k];

  if (col < view_cols) {
    const ulong offL = (ulong)iL * (ulong)view_cols + (ulong)col;
    const ulong offR = (ulong)iR * (ulong)view_cols + (ulong)col;
    const ulong offO = (ulong)k * (ulong)view_cols + (ulong)col;

    const float a = (float)lhs_vals[offL];
    const float b = (float)rhs_vals[offR];
    out_vals[offO] = (T)(a * b);
  }

  // One thread per match copies the indices column
  if (col == 0) {
    const ulong uL = (ulong)L;
    const ulong uM = (ulong)M;
    const ulong src_col = (ulong)iL; // gather from lhs
    for (uint d = 0; d < n_dim_i; ++d) {
      const long v = lhs_indices[(ulong)d * uL + src_col];
      out_indices[(ulong)d * uM + (ulong)k] = v;
    }
  }
}

#define INSTANTIATE_DENSE_SPARSE_MUL(DTYPE)                               \
  template [[host_name("dense_sparse_mul_kernel_" #DTYPE)]] kernel void  \
  dense_sparse_mul_kernel<DTYPE>(                                        \
      device const DTYPE* dense [[buffer(0)]],                           \
      device const DTYPE* values [[buffer(1)]],                          \
      device DTYPE* out_values [[buffer(2)]],                            \
      device const long* indices [[buffer(3)]],                          \
      device const long* sizes [[buffer(4)]],                            \
      constant uint3& sparse_params [[buffer(5)]],                       \
      uint3 gid [[thread_position_in_grid]]);

INSTANTIATE_DENSE_SPARSE_MUL(float);
INSTANTIATE_DENSE_SPARSE_MUL(half);
INSTANTIATE_DENSE_SPARSE_MUL(bfloat);

#define INSTANTIATE_FUSED_GATHER_MUL(DTYPE)                               \
  template [[host_name("fused_gather_mul_kernel_" #DTYPE)]] kernel void  \
  fused_gather_mul_kernel<DTYPE>(                                        \
      device const DTYPE* lhs_vals [[buffer(0)]],                        \
      device const DTYPE* rhs_vals [[buffer(1)]],                        \
      device const long* lhs_sel [[buffer(2)]],                          \
      device const long* rhs_sel [[buffer(3)]],                          \
      device const long* lhs_indices [[buffer(4)]],                      \
      device long* out_indices [[buffer(5)]],                            \
      device DTYPE* out_vals [[buffer(6)]],                              \
      constant uint2& dims_input [[buffer(7)]],                          \
      constant uint2& dims_output [[buffer(8)]],                         \
      uint3 gid [[thread_position_in_grid]]);

INSTANTIATE_FUSED_GATHER_MUL(float);
INSTANTIATE_FUSED_GATHER_MUL(half);
INSTANTIATE_FUSED_GATHER_MUL(bfloat);
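
Written out as plain Python loops, the (col, k) grid of fused_gather_mul_kernel computes the following (a sketch of the thread mapping only: gid.z selects a matched pair, gid.x a value column, and the col == 0 thread copies that match's index column from the lhs):

def fused_gather_mul_reference(lhs_vals, rhs_vals, lhs_sel, rhs_sel,
                               lhs_indices, M, view_cols, ndim_i):
    out_vals = [[0.0] * view_cols for _ in range(M)]
    out_indices = [[0] * M for _ in range(ndim_i)]
    for k in range(M):                      # gid.z: one matched key pair
        iL, iR = lhs_sel[k], rhs_sel[k]
        for col in range(view_cols):        # gid.x: one value column
            out_vals[k][col] = lhs_vals[iL][col] * rhs_vals[iR][col]
        for d in range(ndim_i):             # the col == 0 branch
            out_indices[d][k] = lhs_indices[d][iL]
    return out_vals, out_indices
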

@ -95,72 +95,6 @@
#endif
#endif

#if defined(USE_ROCM) && (defined(USE_FLASH_ATTENTION) || defined(USE_MEM_EFF_ATTENTION))
namespace pytorch_flash
{
std::tuple<
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor>
mha_fwd(
    const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size
    const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size
    const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size
    std::optional<at::Tensor>&
        out_, // batch_size x seqlen_q x num_heads x head_size
    std::optional<at::Tensor>&
        alibi_slopes_, // num_heads or batch_size x num_heads
    const float p_dropout,
    const float softmax_scale,
    bool is_causal,
    std::optional<int64_t> window_size_left,
    std::optional<int64_t> window_size_right,
    const float softcap,
    const bool return_softmax,
    std::optional<at::Generator> gen_) {
#if defined(USE_ROCM_CK_SDPA)
  if (at::globalContext().getROCmFAPreferredBackend() ==
      at::ROCmFABackend::Ck) {
    const int non_null_window_left = window_size_left.value_or(-1);
    const int non_null_window_right = window_size_right.value_or(-1);
    std::optional<at::Tensor> dummy_attn_bias = std::nullopt;
    return mha_fwd_ck(
        q,
        k,
        v,
        out_,
        p_dropout,
        softmax_scale,
        is_causal,
        non_null_window_left,
        non_null_window_right,
        return_softmax,
        gen_,
        dummy_attn_bias); // Not used in flash attention
  }
#endif
  return mha_fwd_aot(
      q,
      k,
      v,
      out_,
      alibi_slopes_,
      p_dropout,
      softmax_scale,
      is_causal,
      window_size_left,
      window_size_right,
      return_softmax,
      gen_);
}
}
#endif
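
On a ROCm build, the usual user-level route into this mha_fwd wrapper is torch.nn.functional.scaled_dot_product_attention; whether the call lands in mha_fwd_ck or mha_fwd_aot is decided internally via getROCmFAPreferredBackend(). A hedged usage sketch (shapes and dtype are illustrative; ROCm devices are exposed under the "cuda" device type):

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

q = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.half)
k = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.half)
v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.half)

# Restrict SDPA to the flash-attention path implemented by mha_fwd.
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
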

namespace at {

namespace cuda::philox {

@ -270,7 +270,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> mha_varle
#endif

TORCH_API
std::tuple<
inline std::tuple<
    at::Tensor,
    at::Tensor,
    at::Tensor,
@ -294,7 +294,42 @@ mha_fwd(
    std::optional<int64_t> window_size_right,
    const float softcap,
    const bool return_softmax,
    std::optional<at::Generator> gen_);
    std::optional<at::Generator> gen_) {
#if defined(USE_ROCM_CK_SDPA)
  if (at::globalContext().getROCmFAPreferredBackend() ==
      at::ROCmFABackend::Ck) {
    const int non_null_window_left = window_size_left.value_or(-1);
    const int non_null_window_right = window_size_right.value_or(-1);
    std::optional<at::Tensor> dummy_attn_bias = std::nullopt;
    return mha_fwd_ck(
        q,
        k,
        v,
        out_,
        p_dropout,
        softmax_scale,
        is_causal,
        non_null_window_left,
        non_null_window_right,
        return_softmax,
        gen_,
        dummy_attn_bias); // Not used in flash attention
  }
#endif
  return mha_fwd_aot(
      q,
      k,
      v,
      out_,
      alibi_slopes_,
      p_dropout,
      softmax_scale,
      is_causal,
      window_size_left,
      window_size_right,
      return_softmax,
      gen_);
}

inline std::tuple<
    at::Tensor,

@ -72,12 +72,6 @@ def check_accuracy(actual_csv, expected_csv, expected_filename):
        "timm_vovnet",
        "torchrec_dlrm",
        "vgg16",
        # LLM
        "meta-llama/Llama-3.2-1B",
        "google/gemma-2-2b",
        "google/gemma-3-4b-it",
        "openai/whisper-tiny",
        "Qwen/Qwen3-0.6B",
    }
)
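
Models listed in this set are treated as flaky: a mismatch against the expected CSV is reported without failing the job. The function body is not part of this hunk, but the typical consultation pattern looks roughly like this (hypothetical sketch, not the actual check_accuracy implementation):

def is_blocking_failure(model_name, actual_status, expected_status, flaky_models):
    if actual_status == expected_status:
        return False
    # Known-flaky models may drift between runs; report but don't fail CI.
    return model_name not in flaky_models
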

@ -55,12 +55,6 @@ def check_graph_breaks(actual_csv, expected_csv, expected_filename):
        "timm_nfnet",
        "torchrec_dlrm",
        "vgg16",
        # LLM
        "meta-llama/Llama-3.2-1B",
        "google/gemma-2-2b",
        "google/gemma-3-4b-it",
        "openai/whisper-tiny",
        "Qwen/Qwen3-0.6B",
    }
)

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0

YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,pass,5

google/gemma-2-2b,pass,5

google/gemma-3-4b-it,pass_due_to_skip,0

openai/whisper-tiny,pass,6

Qwen/Qwen3-0.6B,pass,5

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5

YituTechConvBert,pass,5

meta-llama/Llama-3.2-1B,eager_fail_to_run,0

google/gemma-2-2b,eager_fail_to_run,0

google/gemma-3-4b-it,eager_fail_to_run,0

openai/whisper-tiny,eager_fail_to_run,0

Qwen/Qwen3-0.6B,eager_fail_to_run,0

@ -167,23 +167,3 @@ XLNetLMHeadModel,pass,0

YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,fail_accuracy,0

google/gemma-2-2b,fail_accuracy,0

google/gemma-3-4b-it,fail_accuracy,0

openai/whisper-tiny,fail_to_run,0

Qwen/Qwen3-0.6B,fail_accuracy,0

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,0

YituTechConvBert,pass,0

meta-llama/Llama-3.2-1B,pass_due_to_skip,0

google/gemma-2-2b,pass_due_to_skip,0

google/gemma-3-4b-it,pass_due_to_skip,0

openai/whisper-tiny,pass_due_to_skip,0

Qwen/Qwen3-0.6B,pass_due_to_skip,0

@ -98,11 +98,11 @@ dlrm,pass,0

doctr_det_predictor,pass,3
doctr_det_predictor,pass,5

doctr_reco_predictor,pass,1
doctr_reco_predictor,pass,4
Some files were not shown because too many files have changed in this diff.