mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-23 06:34:55 +08:00
Compare commits
3 Commits
codegen_tr
...
csl/test_i
Author | SHA1 | Date | |
---|---|---|---|
267a53ee1f | |||
76bd7cbe45 | |||
fcf794ac58 |
@ -15,8 +15,6 @@ fi
|
||||
# Compress the fatbin with -compress-mode=size for CUDA 13
|
||||
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
|
||||
export TORCH_NVCC_FLAGS="-compress-mode=size"
|
||||
# Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
|
||||
export BUILD_BUNDLE_PTXAS=1
|
||||
fi
|
||||
|
||||
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
|
||||
|
@ -372,7 +372,7 @@ if __name__ == "__main__":
|
||||
else:
|
||||
print("build pytorch without mkldnn backend")
|
||||
|
||||
os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
|
||||
os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel")
|
||||
if enable_cuda:
|
||||
print("Updating Cuda Dependency")
|
||||
filename = os.listdir("/pytorch/dist/")
|
||||
|
@ -442,7 +442,7 @@ def build_torchvision(
|
||||
if host.using_docker():
|
||||
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
|
||||
|
||||
host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
|
||||
host.run_cmd(f"cd vision && {build_vars} python3 setup.py bdist_wheel")
|
||||
vision_wheel_name = host.list_dir("vision/dist")[0]
|
||||
embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))
|
||||
|
||||
@ -497,7 +497,7 @@ def build_torchdata(
|
||||
if host.using_docker():
|
||||
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
|
||||
|
||||
host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")
|
||||
host.run_cmd(f"cd data && {build_vars} python3 setup.py bdist_wheel")
|
||||
wheel_name = host.list_dir("data/dist")[0]
|
||||
embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))
|
||||
|
||||
@ -553,7 +553,7 @@ def build_torchtext(
|
||||
if host.using_docker():
|
||||
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
|
||||
|
||||
host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")
|
||||
host.run_cmd(f"cd text && {build_vars} python3 setup.py bdist_wheel")
|
||||
wheel_name = host.list_dir("text/dist")[0]
|
||||
embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))
|
||||
|
||||
@ -614,7 +614,7 @@ def build_torchaudio(
|
||||
host.run_cmd(
|
||||
f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
|
||||
&& ./packaging/ffmpeg/build.sh \
|
||||
&& {build_vars} python3 -m build --wheel --no-isolation"
|
||||
&& {build_vars} python3 setup.py bdist_wheel"
|
||||
)
|
||||
|
||||
wheel_name = host.list_dir("audio/dist")[0]
|
||||
@ -726,7 +726,7 @@ def start_build(
|
||||
print("Building PyTorch wheel")
|
||||
build_opts = ""
|
||||
if pytorch_build_number is not None:
|
||||
build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
|
||||
build_opts += f" --build-number {pytorch_build_number}"
|
||||
# Breakpad build fails on aarch64
|
||||
build_vars = "USE_BREAKPAD=0 "
|
||||
if branch == "nightly":
|
||||
@ -747,8 +747,7 @@ def start_build(
|
||||
print("build pytorch with mkldnn+acl backend")
|
||||
build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
|
||||
host.run_cmd(
|
||||
f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && "
|
||||
f"{build_vars} python3 -m build --wheel --no-isolation{build_opts}"
|
||||
f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}"
|
||||
)
|
||||
print("Repair the wheel")
|
||||
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
|
||||
@ -764,7 +763,7 @@ def start_build(
|
||||
else:
|
||||
print("build pytorch without mkldnn backend")
|
||||
host.run_cmd(
|
||||
f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
|
||||
f"cd pytorch && {build_vars} python3 setup.py bdist_wheel{build_opts}"
|
||||
)
|
||||
|
||||
print("Deleting build folder")
|
||||
|
@ -84,8 +84,8 @@ fi
|
||||
_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
|
||||
_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
|
||||
if [[ "$image" == *rocm* ]]; then
|
||||
_UCX_COMMIT=29831d319e6be55cb8c768ca61de335c934ca39e
|
||||
_UCC_COMMIT=9f4b242cbbd8b1462cbc732eb29316cdfa124b77
|
||||
_UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
|
||||
_UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
|
||||
fi
|
||||
|
||||
tag=$(echo $image | awk -F':' '{print $2}')
|
||||
@ -175,6 +175,20 @@ case "$tag" in
|
||||
fi
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
ROCM_VERSION=6.4
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
if [[ $tag =~ "benchmarks" ]]; then
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
fi
|
||||
;;
|
||||
pytorch-linux-noble-rocm-alpha-py3)
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
ROCM_VERSION=7.0
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
@ -182,9 +196,6 @@ case "$tag" in
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
|
||||
if [[ $tag =~ "benchmarks" ]]; then
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
fi
|
||||
;;
|
||||
pytorch-linux-jammy-xpu-n-1-py3)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
@ -441,3 +452,12 @@ elif [ "$HAS_TRITON" = "yes" ]; then
|
||||
echo "expecting triton to not be installed, but it is"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Sanity check cmake version. Executorch reinstalls cmake and I'm not sure if
|
||||
# they support 4.0.0 yet, so exclude them from this check.
|
||||
CMAKE_VERSION=$(drun cmake --version)
|
||||
if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then
|
||||
echo "CMake version is not 4.0.0:"
|
||||
drun cmake --version
|
||||
exit 1
|
||||
fi
|
||||
|
@ -1 +1 @@
|
||||
v2.28.3-1
|
||||
v2.27.5-1
|
||||
|
@ -1 +1 @@
|
||||
v2.28.3-1
|
||||
v2.27.7-1
|
||||
|
@ -42,6 +42,12 @@ EOF
|
||||
rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
|
||||
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
|
||||
|
||||
# Special case for ROCM_VERSION == 7.0
|
||||
if [[ $(ver "$ROCM_VERSION") -eq $(ver 7.0) ]]; then
|
||||
rocm_baseurl="https://repo.radeon.com/rocm/apt/7.0_alpha2"
|
||||
amdgpu_baseurl="https://repo.radeon.com/amdgpu/30.10_alpha2/ubuntu"
|
||||
fi
|
||||
|
||||
# Add amdgpu repository
|
||||
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
|
||||
echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
|
||||
|
@ -12,8 +12,8 @@ function do_install() {
|
||||
|
||||
rocm_version_nodot=${rocm_version//./}
|
||||
|
||||
# https://github.com/icl-utk-edu/magma/pull/65
|
||||
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
|
||||
# Version 2.7.2 + ROCm related updates
|
||||
MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
|
||||
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
|
||||
|
||||
rocm_dir="/opt/rocm"
|
||||
|
@ -66,15 +66,15 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}"
|
||||
# Triton needs at least gcc-9 to build
|
||||
apt-get install -y g++-9
|
||||
|
||||
CXX=g++-9 conda_run python -m build --wheel --no-isolation
|
||||
CXX=g++-9 conda_run python setup.py bdist_wheel
|
||||
elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
|
||||
# Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
|
||||
add-apt-repository -y ppa:ubuntu-toolchain-r/test
|
||||
apt-get install -y g++-9
|
||||
|
||||
CXX=g++-9 conda_run python -m build --wheel --no-isolation
|
||||
CXX=g++-9 conda_run python setup.py bdist_wheel
|
||||
else
|
||||
conda_run python -m build --wheel --no-isolation
|
||||
conda_run python setup.py bdist_wheel
|
||||
fi
|
||||
|
||||
# Copy the wheel to /opt for multi stage docker builds
|
||||
|
71
.ci/docker/manywheel/Dockerfile_cxx11-abi
Normal file
71
.ci/docker/manywheel/Dockerfile_cxx11-abi
Normal file
@ -0,0 +1,71 @@
|
||||
FROM centos:8 as base
|
||||
|
||||
ENV LC_ALL en_US.UTF-8
|
||||
ENV LANG en_US.UTF-8
|
||||
ENV LANGUAGE en_US.UTF-8
|
||||
ENV PATH /opt/rh/gcc-toolset-11/root/bin/:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
|
||||
|
||||
# change to a valid repo
|
||||
RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Linux-*.repo
|
||||
# enable to install ninja-build
|
||||
RUN sed -i 's|enabled=0|enabled=1|g' /etc/yum.repos.d/CentOS-Linux-PowerTools.repo
|
||||
|
||||
RUN yum -y update
|
||||
RUN yum install -y wget curl perl util-linux xz bzip2 git patch which zlib-devel sudo
|
||||
RUN yum install -y autoconf automake make cmake gdb gcc-toolset-11-gcc-c++
|
||||
|
||||
|
||||
FROM base as openssl
|
||||
ADD ./common/install_openssl.sh install_openssl.sh
|
||||
RUN bash ./install_openssl.sh && rm install_openssl.sh
|
||||
|
||||
# Install python
|
||||
FROM base as python
|
||||
RUN yum install -y openssl-devel zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel
|
||||
ADD common/install_cpython.sh install_cpython.sh
|
||||
RUN bash ./install_cpython.sh && rm install_cpython.sh
|
||||
|
||||
FROM base as conda
|
||||
ADD ./common/install_conda_docker.sh install_conda.sh
|
||||
RUN bash ./install_conda.sh && rm install_conda.sh
|
||||
RUN /opt/conda/bin/conda install -y cmake
|
||||
|
||||
FROM base as intel
|
||||
# Install MKL
|
||||
COPY --from=python /opt/python /opt/python
|
||||
COPY --from=python /opt/_internal /opt/_internal
|
||||
COPY --from=conda /opt/conda /opt/conda
|
||||
ENV PATH=/opt/conda/bin:$PATH
|
||||
ADD ./common/install_mkl.sh install_mkl.sh
|
||||
RUN bash ./install_mkl.sh && rm install_mkl.sh
|
||||
|
||||
FROM base as patchelf
|
||||
ADD ./common/install_patchelf.sh install_patchelf.sh
|
||||
RUN bash ./install_patchelf.sh && rm install_patchelf.sh
|
||||
RUN cp $(which patchelf) /patchelf
|
||||
|
||||
FROM base as jni
|
||||
ADD ./common/install_jni.sh install_jni.sh
|
||||
ADD ./java/jni.h jni.h
|
||||
RUN bash ./install_jni.sh && rm install_jni.sh
|
||||
|
||||
FROM base as libpng
|
||||
ADD ./common/install_libpng.sh install_libpng.sh
|
||||
RUN bash ./install_libpng.sh && rm install_libpng.sh
|
||||
|
||||
FROM base as final
|
||||
COPY --from=openssl /opt/openssl /opt/openssl
|
||||
COPY --from=python /opt/python /opt/python
|
||||
COPY --from=python /opt/_internal /opt/_internal
|
||||
COPY --from=intel /opt/intel /opt/intel
|
||||
COPY --from=conda /opt/conda /opt/conda
|
||||
COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf
|
||||
COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h
|
||||
COPY --from=libpng /usr/local/bin/png* /usr/local/bin/
|
||||
COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/
|
||||
COPY --from=libpng /usr/local/include/png* /usr/local/include/
|
||||
COPY --from=libpng /usr/local/include/libpng* /usr/local/include/
|
||||
COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/
|
||||
COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig
|
||||
|
||||
RUN yum install -y ninja-build
|
@ -43,6 +43,12 @@ case ${image} in
|
||||
MANY_LINUX_VERSION="2_28_aarch64"
|
||||
OPENBLAS_VERSION="v0.3.30"
|
||||
;;
|
||||
manylinuxcxx11-abi-builder:cpu-cxx11-abi)
|
||||
TARGET=final
|
||||
GPU_IMAGE=""
|
||||
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
|
||||
MANY_LINUX_VERSION="cxx11-abi"
|
||||
;;
|
||||
manylinuxs390x-builder:cpu-s390x)
|
||||
TARGET=final
|
||||
GPU_IMAGE=s390x/almalinux:8
|
||||
|
@ -10,11 +10,6 @@ boto3==1.35.42
|
||||
#Pinned versions: 1.19.12, 1.16.34
|
||||
#test that import:
|
||||
|
||||
build==1.3.0
|
||||
#Description: A simple, correct Python build frontend.
|
||||
#Pinned versions: 1.3.0
|
||||
#test that import:
|
||||
|
||||
click
|
||||
#Description: Command Line Interface Creation Kit
|
||||
#Pinned versions:
|
||||
@ -111,10 +106,10 @@ networkx==2.8.8
|
||||
#Pinned versions: 2.8.8
|
||||
#test that import: functorch
|
||||
|
||||
ninja==1.11.1.4
|
||||
ninja==1.11.1.3
|
||||
#Description: build system. Used in some tests. Used in build to generate build
|
||||
#time tracing information
|
||||
#Pinned versions: 1.11.1.4
|
||||
#Pinned versions: 1.11.1.3
|
||||
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
|
||||
|
||||
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
|
||||
@ -172,9 +167,9 @@ pillow==11.0.0
|
||||
#Pinned versions: 10.3.0
|
||||
#test that import:
|
||||
|
||||
protobuf==5.29.5
|
||||
protobuf==5.29.4
|
||||
#Description: Google's data interchange format
|
||||
#Pinned versions: 5.29.5
|
||||
#Pinned versions: 5.29.4
|
||||
#test that import: test_tensorboard.py, test/onnx/*
|
||||
|
||||
psutil
|
||||
@ -378,7 +373,7 @@ dataclasses_json==0.6.7
|
||||
#Pinned versions: 0.6.7
|
||||
#test that import:
|
||||
|
||||
cmake==3.31.6
|
||||
cmake==4.0.0
|
||||
#Description: required for building
|
||||
|
||||
tlparse==0.4.0
|
||||
|
@ -1,15 +1,8 @@
|
||||
sphinx==5.3.0
|
||||
#Description: This is used to generate PyTorch docs
|
||||
#Pinned versions: 5.3.0
|
||||
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
|
||||
|
||||
standard-imghdr==3.13.0; python_version >= "3.13"
|
||||
#Description: This is needed by Sphinx, so it needs to be added here.
|
||||
# The reasons are as follows:
|
||||
# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
|
||||
# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
|
||||
# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
|
||||
|
||||
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
|
||||
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
|
||||
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
|
||||
# something related to Docker setup. We can investigate this later.
|
||||
|
@ -142,7 +142,7 @@ time CMAKE_ARGS=${CMAKE_ARGS[@]} \
|
||||
EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
|
||||
BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
|
||||
USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
|
||||
python -m build --wheel --no-isolation --outdir /tmp/$WHEELHOUSE_DIR
|
||||
python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
|
||||
echo "Finished setup.py bdist at $(date)"
|
||||
|
||||
# Build libtorch packages
|
||||
|
@ -104,7 +104,7 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
|
||||
export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
|
||||
fi
|
||||
|
||||
echo "Calling -m pip install . -v --no-build-isolation at $(date)"
|
||||
echo "Calling 'python -m pip install .' at $(date)"
|
||||
|
||||
if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
|
||||
STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
|
||||
|
@ -290,13 +290,13 @@ else
|
||||
|
||||
WERROR=1 python setup.py clean
|
||||
|
||||
WERROR=1 python -m build --wheel --no-isolation
|
||||
WERROR=1 python setup.py bdist_wheel
|
||||
else
|
||||
python setup.py clean
|
||||
if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
|
||||
source .ci/pytorch/install_cache_xla.sh
|
||||
fi
|
||||
python -m build --wheel --no-isolation
|
||||
python setup.py bdist_wheel
|
||||
fi
|
||||
pip_install_whl "$(echo dist/*.whl)"
|
||||
|
||||
|
@ -36,11 +36,11 @@ fi
|
||||
print_cmake_info
|
||||
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
|
||||
# Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
|
||||
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python -m build --wheel --no-isolation
|
||||
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
|
||||
else
|
||||
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
|
||||
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
|
||||
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python -m build --wheel --no-isolation -C--build-option=--plat-name=macosx_11_0_arm64
|
||||
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
|
||||
fi
|
||||
if which sccache > /dev/null; then
|
||||
print_sccache_stats
|
||||
|
@ -26,7 +26,6 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
|
||||
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
|
||||
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
|
||||
time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
|
||||
time python test/run_test.py --verbose -i distributed/test_aten_comm_compute_reordering
|
||||
time python test/run_test.py --verbose -i distributed/test_store
|
||||
time python test/run_test.py --verbose -i distributed/test_symmetric_memory
|
||||
time python test/run_test.py --verbose -i distributed/test_pg_wrapper
|
||||
|
@ -435,7 +435,7 @@ test_inductor_distributed() {
|
||||
|
||||
# this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
|
||||
# with if required # gpus aren't available
|
||||
python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_aten_comm_compute_reordering distributed/test_compute_comm_reordering --verbose
|
||||
python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_compute_comm_reordering --verbose
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
@ -1415,7 +1415,7 @@ EOF
|
||||
pip3 install -r requirements.txt
|
||||
# shellcheck source=./common-build.sh
|
||||
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
|
||||
python -m build --wheel --no-isolation -C--build-option=--bdist-dir="base_bdist_tmp" --outdir "base_dist"
|
||||
python setup.py bdist_wheel --bdist-dir="base_bdist_tmp" --dist-dir="base_dist"
|
||||
python -mpip install base_dist/*.whl
|
||||
echo "::endgroup::"
|
||||
|
||||
@ -1617,7 +1617,7 @@ test_operator_benchmark() {
|
||||
test_inductor_set_cpu_affinity
|
||||
|
||||
cd benchmarks/operator_benchmark/pt_extension
|
||||
python -m pip install . -v --no-build-isolation
|
||||
python -m pip install .
|
||||
|
||||
cd "${TEST_DIR}"/benchmarks/operator_benchmark
|
||||
$TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \
|
||||
@ -1630,25 +1630,6 @@ test_operator_benchmark() {
|
||||
--expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
|
||||
}
|
||||
|
||||
test_operator_microbenchmark() {
|
||||
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
||||
mkdir -p "$TEST_REPORTS_DIR"
|
||||
TEST_DIR=$(pwd)
|
||||
|
||||
cd benchmarks/operator_benchmark/pt_extension
|
||||
python -m pip install .
|
||||
|
||||
cd "${TEST_DIR}"/benchmarks/operator_benchmark
|
||||
|
||||
for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
|
||||
$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
|
||||
--output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
|
||||
--benchmark-name "PyTorch operator microbenchmark" --use-compile
|
||||
$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
|
||||
--output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \
|
||||
--benchmark-name "PyTorch operator microbenchmark"
|
||||
done
|
||||
}
|
||||
|
||||
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
|
||||
(cd test && python -c "import torch; print(torch.__config__.show())")
|
||||
@ -1705,8 +1686,6 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
|
||||
test_operator_benchmark cpu ${TEST_MODE}
|
||||
|
||||
fi
|
||||
elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
|
||||
test_operator_microbenchmark
|
||||
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
|
||||
test_inductor_distributed
|
||||
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
|
||||
@ -1815,8 +1794,6 @@ elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
|
||||
test_h100_distributed
|
||||
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
|
||||
test_h100_symm_mem
|
||||
elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then
|
||||
test_h100_symm_mem
|
||||
elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
|
||||
test_h100_cutlass_backend
|
||||
else
|
||||
|
@ -70,7 +70,7 @@ sccache --zero-stats
|
||||
sccache --show-stats
|
||||
|
||||
# Build the wheel
|
||||
python -m build --wheel --no-build-isolation
|
||||
python setup.py bdist_wheel
|
||||
if ($LASTEXITCODE -ne 0) { exit 1 }
|
||||
|
||||
# Install the wheel locally
|
||||
|
@ -130,7 +130,7 @@ if "%USE_CUDA%"=="1" (
|
||||
:: Print all existing environment variable for debugging
|
||||
set
|
||||
|
||||
python -m build --wheel --no-isolation
|
||||
python setup.py bdist_wheel
|
||||
if errorlevel 1 goto fail
|
||||
if not errorlevel 0 goto fail
|
||||
sccache --show-stats
|
||||
|
@ -48,7 +48,7 @@ sccache --zero-stats
|
||||
sccache --show-stats
|
||||
|
||||
:: Call PyTorch build script
|
||||
python -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%"
|
||||
python setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%"
|
||||
|
||||
:: show sccache stats
|
||||
sccache --show-stats
|
||||
|
@ -28,5 +28,5 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t
|
||||
if errorlevel 1 exit /b 1
|
||||
|
||||
set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%"
|
||||
%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel build
|
||||
%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel
|
||||
if errorlevel 1 exit /b 1
|
||||
|
@ -86,7 +86,7 @@ copy /Y "%LIBTORCH_PREFIX%-%PYTORCH_BUILD_VERSION%.zip" "%PYTORCH_FINAL_PACKAGE_
|
||||
goto build_end
|
||||
|
||||
:pytorch
|
||||
%PYTHON_EXEC% -m build --wheel --no-isolation --outdir "%PYTORCH_FINAL_PACKAGE_DIR%"
|
||||
%PYTHON_EXEC% setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%"
|
||||
|
||||
:build_end
|
||||
IF ERRORLEVEL 1 exit /b 1
|
||||
|
@ -63,7 +63,7 @@ if errorlevel 1 exit /b 1
|
||||
call %CONDA_HOME%\condabin\activate.bat testenv
|
||||
if errorlevel 1 exit /b 1
|
||||
|
||||
call conda install -y -q -c conda-forge libuv=1.51
|
||||
call conda install -y -q -c conda-forge libuv=1.39
|
||||
call conda install -y -q intel-openmp
|
||||
|
||||
echo "install and test libtorch"
|
||||
|
@ -18,7 +18,7 @@ if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
|
||||
|
||||
%PYTHON_EXEC% -m pip install pyyaml
|
||||
%PYTHON_EXEC% -m pip install mkl-include mkl-static
|
||||
%PYTHON_EXEC% -m pip install boto3 requests ninja typing_extensions setuptools==72.1.0
|
||||
%PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0
|
||||
|
||||
where cmake.exe
|
||||
|
||||
|
@ -143,8 +143,7 @@ case $desired_python in
|
||||
RENAME_WHEEL=false
|
||||
;;
|
||||
3.13t)
|
||||
echo "Using 3.13t deps"
|
||||
mac_version='macosx-11.0-arm64'
|
||||
echo "Using 3.13 deps"
|
||||
NUMPY_PINNED_VERSION="==2.1.0"
|
||||
RENAME_WHEEL=false
|
||||
;;
|
||||
@ -186,11 +185,11 @@ export USE_QNNPACK=OFF
|
||||
export BUILD_TEST=OFF
|
||||
|
||||
pushd "$pytorch_rootdir"
|
||||
echo "Calling -m build --wheel --no-isolation at $(date)"
|
||||
echo "Calling setup.py bdist_wheel at $(date)"
|
||||
|
||||
_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python -m build --wheel --no-isolation --outdir "$whl_tmp_dir" -C--plat-name="${mac_version//[-.]/_}"
|
||||
_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name "${mac_version//[-.]/_}"
|
||||
|
||||
echo "Finished -m build --wheel --no-isolation at $(date)"
|
||||
echo "Finished setup.py bdist_wheel at $(date)"
|
||||
|
||||
if [[ $package_type != 'libtorch' ]]; then
|
||||
echo "delocating wheel dependencies"
|
||||
|
47
.circleci/scripts/functorch_doc_push_script.sh
Executable file
47
.circleci/scripts/functorch_doc_push_script.sh
Executable file
@ -0,0 +1,47 @@
|
||||
#!/bin/bash
|
||||
# =================== The following code **should** be executed inside Docker container ===================
|
||||
|
||||
# Install dependencies
|
||||
sudo apt-get -y update
|
||||
sudo apt-get -y install expect-dev
|
||||
|
||||
# This is where the local pytorch install in the docker image is located
|
||||
pt_checkout="/var/lib/jenkins/workspace"
|
||||
source "$pt_checkout/.ci/pytorch/common_utils.sh"
|
||||
echo "functorch_doc_push_script.sh: Invoked with $*"
|
||||
|
||||
set -ex
|
||||
|
||||
version=${DOCS_VERSION:-nightly}
|
||||
echo "version: $version"
|
||||
|
||||
# Build functorch docs
|
||||
pushd $pt_checkout/functorch/docs
|
||||
pip -q install -r requirements.txt
|
||||
make html
|
||||
popd
|
||||
|
||||
git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
|
||||
pushd functorch_ghpages
|
||||
|
||||
if [ $version == "main" ]; then
|
||||
version=nightly
|
||||
fi
|
||||
|
||||
git rm -rf "$version" || true
|
||||
mv "$pt_checkout/functorch/docs/build/html" "$version"
|
||||
|
||||
git add "$version" || true
|
||||
git status
|
||||
git config user.email "soumith+bot@pytorch.org"
|
||||
git config user.name "pytorchbot"
|
||||
# If there aren't changes, don't make a commit; push is no-op
|
||||
git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
|
||||
git status
|
||||
|
||||
if [[ "${WITH_PUSH:-}" == true ]]; then
|
||||
git push -u origin gh-pages
|
||||
fi
|
||||
|
||||
popd
|
||||
# =================== The above code **should** be executed inside Docker container ===================
|
@ -69,8 +69,6 @@ readability-string-compare,
|
||||
'
|
||||
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
|
||||
WarningsAsErrors: '*'
|
||||
LineFilter:
|
||||
- name: '/usr/include/.*'
|
||||
CheckOptions:
|
||||
cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true
|
||||
cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true
|
||||
|
4
.github/ISSUE_TEMPLATE/ci-sev.md
vendored
4
.github/ISSUE_TEMPLATE/ci-sev.md
vendored
@ -1,10 +1,6 @@
|
||||
---
|
||||
name: "⚠️ CI SEV"
|
||||
about: Tracking incidents for PyTorch's CI infra.
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
> NOTE: Remember to label this issue with "`ci: sev`"
|
||||
|
18
.github/ISSUE_TEMPLATE/disable-autorevert.md
vendored
18
.github/ISSUE_TEMPLATE/disable-autorevert.md
vendored
@ -1,18 +0,0 @@
|
||||
---
|
||||
name: DISABLE AUTOREVERT
|
||||
about: Disables autorevert when open
|
||||
title: "❌\U0001F519 [DISABLE AUTOREVERT]"
|
||||
labels: 'ci: disable-autorevert'
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
This issue, while open, disables the autorevert functionality.
|
||||
|
||||
More details can be found [here](https://github.com/pytorch/test-infra/blob/main/aws/lambda/pytorch-auto-revert/README.md)
|
||||
|
||||
|
||||
## Why are you disabling autorevert?
|
||||
|
||||
|
||||
## Links to any issues/commits/errors that shows the source of problem
|
6
.github/ISSUE_TEMPLATE/disable-ci-jobs.md
vendored
6
.github/ISSUE_TEMPLATE/disable-ci-jobs.md
vendored
@ -1,10 +1,8 @@
|
||||
---
|
||||
name: Disable CI jobs (PyTorch Dev Infra only)
|
||||
about: Use this template to disable CI jobs
|
||||
title: DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]
|
||||
labels: 'module: ci'
|
||||
assignees: ''
|
||||
|
||||
title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]"
|
||||
labels: "module: ci"
|
||||
---
|
||||
|
||||
> For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once
|
||||
|
2
.github/ci_commit_pins/vllm.txt
vendored
2
.github/ci_commit_pins/vllm.txt
vendored
@ -1 +1 @@
|
||||
78a47f87ce259a48f0391fa9ae15add05ea7432b
|
||||
9fe4c2bdb9859c14ad7f7479e1db7e01083bada3
|
||||
|
2
.github/ci_commit_pins/xla.txt
vendored
2
.github/ci_commit_pins/xla.txt
vendored
@ -1 +1 @@
|
||||
0fc62aa26a30ed7ca419d285f285cb5ba02c4394
|
||||
c77852e117bdf056c8e9a087e51d6f65cf6ba53d
|
||||
|
39
.github/pytorch-probot.yml
vendored
39
.github/pytorch-probot.yml
vendored
@ -1,44 +1,43 @@
|
||||
tracking_issue: 24422
|
||||
ciflow_tracking_issue: 64124
|
||||
ciflow_push_tags:
|
||||
- ciflow/b200
|
||||
- ciflow/b200-symm-mem
|
||||
- ciflow/binaries
|
||||
- ciflow/binaries_libtorch
|
||||
- ciflow/binaries_wheel
|
||||
- ciflow/h100
|
||||
- ciflow/h100-cutlass-backend
|
||||
- ciflow/h100-distributed
|
||||
- ciflow/h100-symm-mem
|
||||
- ciflow/triton_binaries
|
||||
- ciflow/inductor
|
||||
- ciflow/inductor-cu126
|
||||
- ciflow/inductor-micro-benchmark
|
||||
- ciflow/inductor-micro-benchmark-cpu-x86
|
||||
- ciflow/inductor-perf-compare
|
||||
- ciflow/inductor-perf-test-nightly-rocm
|
||||
- ciflow/inductor-perf-test-nightly-x86-zen
|
||||
- ciflow/inductor-periodic
|
||||
- ciflow/inductor-rocm
|
||||
- ciflow/inductor-perf-test-nightly-rocm
|
||||
- ciflow/inductor-perf-compare
|
||||
- ciflow/inductor-micro-benchmark
|
||||
- ciflow/inductor-micro-benchmark-cpu-x86
|
||||
- ciflow/inductor-perf-test-nightly-x86-zen
|
||||
- ciflow/inductor-cu126
|
||||
- ciflow/linux-aarch64
|
||||
- ciflow/mps
|
||||
- ciflow/nightly
|
||||
- ciflow/op-benchmark
|
||||
- ciflow/periodic
|
||||
- ciflow/periodic-rocm-mi300
|
||||
- ciflow/pull
|
||||
- ciflow/quantization-periodic
|
||||
- ciflow/riscv64
|
||||
- ciflow/rocm
|
||||
- ciflow/rocm-mi300
|
||||
- ciflow/s390
|
||||
- ciflow/riscv64
|
||||
- ciflow/slow
|
||||
- ciflow/torchbench
|
||||
- ciflow/triton_binaries
|
||||
- ciflow/trunk
|
||||
- ciflow/unstable
|
||||
- ciflow/vllm
|
||||
- ciflow/win-arm64
|
||||
- ciflow/xpu
|
||||
- ciflow/vllm
|
||||
- ciflow/torchbench
|
||||
- ciflow/op-benchmark
|
||||
- ciflow/pull
|
||||
- ciflow/h100
|
||||
- ciflow/h100-distributed
|
||||
- ciflow/win-arm64
|
||||
- ciflow/h100-symm-mem
|
||||
- ciflow/h100-cutlass-backend
|
||||
- ciflow/b200
|
||||
retryable_workflows:
|
||||
- pull
|
||||
- trunk
|
||||
@ -47,4 +46,4 @@ retryable_workflows:
|
||||
- inductor-A100-perf-nightly
|
||||
labeler_config: labeler.yml
|
||||
label_to_label_config: label_to_label.yml
|
||||
mergebot: true
|
||||
mergebot: True
|
||||
|
@ -1,5 +1,4 @@
|
||||
boto3==1.35.42
|
||||
build==1.2.2.post1
|
||||
cmake==3.27.*
|
||||
expecttest==0.3.0
|
||||
fbscribelogger==0.1.7
|
||||
|
@ -30,7 +30,7 @@ CUDA_ARCHES_CUDNN_VERSION = {
|
||||
}
|
||||
|
||||
# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
|
||||
ROCM_ARCHES = ["6.4", "7.0"]
|
||||
ROCM_ARCHES = ["6.3", "6.4"]
|
||||
|
||||
XPU_ARCHES = ["xpu"]
|
||||
|
||||
@ -53,7 +53,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
|
||||
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
|
||||
@ -70,7 +70,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
|
||||
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
|
||||
@ -87,7 +87,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
|
||||
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
|
||||
|
93
.github/scripts/generate_ci_workflows.py
vendored
93
.github/scripts/generate_ci_workflows.py
vendored
@ -127,6 +127,53 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
|
||||
),
|
||||
]
|
||||
|
||||
ROCM_SMOKE_WORKFLOWS = [
|
||||
BinaryBuildWorkflow(
|
||||
os=OperatingSystem.LINUX,
|
||||
package_type="manywheel",
|
||||
build_variant="rocm",
|
||||
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
|
||||
OperatingSystem.LINUX,
|
||||
arches=["6.4"],
|
||||
python_versions=["3.10"],
|
||||
),
|
||||
ciflow_config=CIFlowConfig(
|
||||
labels={
|
||||
LABEL_CIFLOW_BINARIES,
|
||||
LABEL_CIFLOW_BINARIES_WHEEL,
|
||||
LABEL_CIFLOW_ROCM,
|
||||
},
|
||||
isolated_workflow=True,
|
||||
),
|
||||
branches="main",
|
||||
),
|
||||
]
|
||||
|
||||
LINUX_BINARY_SMOKE_WORKFLOWS = [
|
||||
BinaryBuildWorkflow(
|
||||
os=OperatingSystem.LINUX,
|
||||
package_type="manywheel",
|
||||
build_configs=generate_binary_build_matrix.generate_wheels_matrix(
|
||||
OperatingSystem.LINUX,
|
||||
arches=["13.0"],
|
||||
python_versions=["3.12"],
|
||||
),
|
||||
branches="main",
|
||||
),
|
||||
BinaryBuildWorkflow(
|
||||
os=OperatingSystem.LINUX,
|
||||
package_type="libtorch",
|
||||
build_variant=generate_binary_build_matrix.RELEASE,
|
||||
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
|
||||
OperatingSystem.LINUX,
|
||||
generate_binary_build_matrix.RELEASE,
|
||||
arches=["cpu"],
|
||||
libtorch_variants=["shared-with-deps"],
|
||||
),
|
||||
branches="main",
|
||||
),
|
||||
]
|
||||
|
||||
WINDOWS_BINARY_BUILD_WORKFLOWS = [
|
||||
BinaryBuildWorkflow(
|
||||
os=OperatingSystem.WINDOWS,
|
||||
@ -212,6 +259,39 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
|
||||
),
|
||||
]
|
||||
|
||||
WINDOWS_BINARY_SMOKE_WORKFLOWS = [
|
||||
BinaryBuildWorkflow(
|
||||
os=OperatingSystem.WINDOWS,
|
||||
package_type="libtorch",
|
||||
build_variant=generate_binary_build_matrix.RELEASE,
|
||||
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
|
||||
OperatingSystem.WINDOWS,
|
||||
generate_binary_build_matrix.RELEASE,
|
||||
arches=["cpu"],
|
||||
libtorch_variants=["shared-with-deps"],
|
||||
),
|
||||
branches="main",
|
||||
ciflow_config=CIFlowConfig(
|
||||
isolated_workflow=True,
|
||||
),
|
||||
),
|
||||
BinaryBuildWorkflow(
|
||||
os=OperatingSystem.WINDOWS,
|
||||
package_type="libtorch",
|
||||
build_variant=generate_binary_build_matrix.DEBUG,
|
||||
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
|
||||
OperatingSystem.WINDOWS,
|
||||
generate_binary_build_matrix.DEBUG,
|
||||
arches=["cpu"],
|
||||
libtorch_variants=["shared-with-deps"],
|
||||
),
|
||||
branches="main",
|
||||
ciflow_config=CIFlowConfig(
|
||||
isolated_workflow=True,
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
MACOS_BINARY_BUILD_WORKFLOWS = [
|
||||
BinaryBuildWorkflow(
|
||||
os=OperatingSystem.MACOS_ARM64,
|
||||
@ -292,10 +372,23 @@ def main() -> None:
|
||||
jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
|
||||
S390X_BINARY_BUILD_WORKFLOWS,
|
||||
),
|
||||
(
|
||||
# Give rocm it's own workflow file
|
||||
jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
|
||||
ROCM_SMOKE_WORKFLOWS,
|
||||
),
|
||||
(
|
||||
jinja_env.get_template("linux_binary_build_workflow.yml.j2"),
|
||||
LINUX_BINARY_SMOKE_WORKFLOWS,
|
||||
),
|
||||
(
|
||||
jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
|
||||
WINDOWS_BINARY_BUILD_WORKFLOWS,
|
||||
),
|
||||
(
|
||||
jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
|
||||
WINDOWS_BINARY_SMOKE_WORKFLOWS,
|
||||
),
|
||||
(
|
||||
jinja_env.get_template("macos_binary_build_workflow.yml.j2"),
|
||||
MACOS_BINARY_BUILD_WORKFLOWS,
|
||||
|
2
.github/workflows/_linux-test.yml
vendored
2
.github/workflows/_linux-test.yml
vendored
@ -273,8 +273,6 @@ jobs:
|
||||
TEST_CONFIG: ${{ matrix.config }}
|
||||
SHARD_NUMBER: ${{ matrix.shard }}
|
||||
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
|
||||
EXTRA_FLAGS: ${{ matrix.extra_flags || '' }}
|
||||
OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }}
|
||||
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
|
||||
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
|
||||
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
|
||||
|
60
.github/workflows/b200-symm-mem.yml
vendored
60
.github/workflows/b200-symm-mem.yml
vendored
@ -1,60 +0,0 @@
|
||||
name: Limited CI for symmetric memory tests on B200
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- .github/workflows/b200-symm-mem.yml
|
||||
workflow_dispatch:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/b200-symm-mem/*
|
||||
schedule:
|
||||
- cron: 22 8 * * * # about 1:22am PDT
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
|
||||
get-label-type:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runner: linux.12xlarge.memory
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '10.0'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "b200-symm-mem", shard: 1, num_shards: 1, runner: "linux.dgx.b200.8" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs:
|
||||
- linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
|
||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.test-matrix }}
|
||||
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
secrets: inherit
|
2
.github/workflows/build-almalinux-images.yml
vendored
2
.github/workflows/build-almalinux-images.yml
vendored
@ -36,7 +36,7 @@ jobs:
|
||||
runs-on: linux.9xlarge.ephemeral
|
||||
strategy:
|
||||
matrix:
|
||||
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
|
||||
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "rocm7.0", "cpu"]
|
||||
steps:
|
||||
- name: Build docker image
|
||||
uses: pytorch/pytorch/.github/actions/binary-docker-build@main
|
||||
|
1
.github/workflows/build-libtorch-images.yml
vendored
1
.github/workflows/build-libtorch-images.yml
vendored
@ -52,6 +52,7 @@ jobs:
|
||||
{ tag: "cuda12.9" },
|
||||
{ tag: "cuda12.8" },
|
||||
{ tag: "cuda12.6" },
|
||||
{ tag: "rocm6.3" },
|
||||
{ tag: "rocm6.4" },
|
||||
{ tag: "rocm7.0" },
|
||||
{ tag: "cpu" },
|
||||
|
2
.github/workflows/build-magma-rocm-linux.yml
vendored
2
.github/workflows/build-magma-rocm-linux.yml
vendored
@ -34,7 +34,7 @@ jobs:
|
||||
id-token: write
|
||||
strategy:
|
||||
matrix:
|
||||
rocm_version: ["70", "64"]
|
||||
rocm_version: ["70", "64", "63"]
|
||||
steps:
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
2
.github/workflows/build-manywheel-images.yml
vendored
2
.github/workflows/build-manywheel-images.yml
vendored
@ -52,10 +52,12 @@ jobs:
|
||||
{ name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" },
|
||||
{ name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
|
||||
{ name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
|
||||
{ name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", runner: "linux.9xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" },
|
||||
]
|
||||
runs-on: ${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}
|
||||
|
2
.github/workflows/build-triton-wheel.yml
vendored
2
.github/workflows/build-triton-wheel.yml
vendored
@ -55,7 +55,7 @@ jobs:
|
||||
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
|
||||
include:
|
||||
- device: "rocm"
|
||||
rocm_version: "7.0"
|
||||
rocm_version: "6.4"
|
||||
runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
|
||||
- device: "cuda"
|
||||
rocm_version: ""
|
||||
|
1
.github/workflows/docker-builds.yml
vendored
1
.github/workflows/docker-builds.yml
vendored
@ -59,6 +59,7 @@ jobs:
|
||||
pytorch-linux-jammy-py3.13-clang12,
|
||||
pytorch-linux-jammy-rocm-n-py3,
|
||||
pytorch-linux-noble-rocm-n-py3,
|
||||
pytorch-linux-noble-rocm-alpha-py3,
|
||||
pytorch-linux-jammy-rocm-n-py3-benchmarks,
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12,
|
||||
pytorch-linux-jammy-py3.10-gcc11,
|
||||
|
42
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
42
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
@ -132,7 +132,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -178,7 +178,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -224,7 +224,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -335,7 +335,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -381,7 +381,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -427,7 +427,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -538,7 +538,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -584,7 +584,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -630,7 +630,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -741,7 +741,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -787,7 +787,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -833,7 +833,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -944,7 +944,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -990,7 +990,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1036,7 +1036,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1147,7 +1147,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1193,7 +1193,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1239,7 +1239,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1350,7 +1350,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1396,7 +1396,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1442,7 +1442,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
230
.github/workflows/generated-linux-binary-libtorch-nightly.yml
generated
vendored
230
.github/workflows/generated-linux-binary-libtorch-nightly.yml
generated
vendored
@ -316,6 +316,121 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
libtorch-rocm6_3-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.3
|
||||
GPU_ARCH_VERSION: "6.3"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
timeout-minutes: 300
|
||||
build_name: libtorch-rocm6_3-shared-with-deps-release
|
||||
build_environment: linux-binary-libtorch
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
libtorch-rocm6_3-shared-with-deps-release-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- libtorch-rocm6_3-shared-with-deps-release-build
|
||||
- get-label-type
|
||||
runs-on: linux.rocm.gpu.mi250
|
||||
timeout-minutes: 240
|
||||
env:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.3
|
||||
GPU_ARCH_VERSION: "6.3"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
SKIP_ALL_TESTS: 1
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
steps:
|
||||
- name: Setup ROCm
|
||||
uses: ./.github/actions/setup-rocm
|
||||
- uses: actions/download-artifact@v4.1.7
|
||||
name: Download Build Artifacts
|
||||
with:
|
||||
name: libtorch-rocm6_3-shared-with-deps-release
|
||||
path: "${{ runner.temp }}/artifacts/"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
- name: ROCm set GPU_FLAG
|
||||
run: |
|
||||
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
|
||||
- name: configure aws credentials
|
||||
id: aws_creds
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
aws-region: us-east-1
|
||||
role-duration-seconds: 18000
|
||||
- name: Calculate docker image
|
||||
id: calculate-docker-image
|
||||
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||
with:
|
||||
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
|
||||
docker-image-name: libtorch-cxx11-builder
|
||||
custom-tag-prefix: rocm6.3
|
||||
docker-build-dir: .ci/docker
|
||||
working-directory: pytorch
|
||||
- name: Pull Docker image
|
||||
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
||||
with:
|
||||
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Test Pytorch binary
|
||||
uses: ./pytorch/.github/actions/test-pytorch-binary
|
||||
env:
|
||||
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Teardown ROCm
|
||||
uses: ./.github/actions/teardown-rocm
|
||||
libtorch-rocm6_3-shared-with-deps-release-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: libtorch-rocm6_3-shared-with-deps-release-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.3
|
||||
GPU_ARCH_VERSION: "6.3"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
build_name: libtorch-rocm6_3-shared-with-deps-release
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
libtorch-rocm6_4-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -430,118 +545,3 @@ jobs:
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
libtorch-rocm7_0-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm7.0
|
||||
GPU_ARCH_VERSION: "7.0"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
timeout-minutes: 300
|
||||
build_name: libtorch-rocm7_0-shared-with-deps-release
|
||||
build_environment: linux-binary-libtorch
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
libtorch-rocm7_0-shared-with-deps-release-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- libtorch-rocm7_0-shared-with-deps-release-build
|
||||
- get-label-type
|
||||
runs-on: linux.rocm.gpu.mi250
|
||||
timeout-minutes: 240
|
||||
env:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm7.0
|
||||
GPU_ARCH_VERSION: "7.0"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
SKIP_ALL_TESTS: 1
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
steps:
|
||||
- name: Setup ROCm
|
||||
uses: ./.github/actions/setup-rocm
|
||||
- uses: actions/download-artifact@v4.1.7
|
||||
name: Download Build Artifacts
|
||||
with:
|
||||
name: libtorch-rocm7_0-shared-with-deps-release
|
||||
path: "${{ runner.temp }}/artifacts/"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
- name: ROCm set GPU_FLAG
|
||||
run: |
|
||||
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
|
||||
- name: configure aws credentials
|
||||
id: aws_creds
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
aws-region: us-east-1
|
||||
role-duration-seconds: 18000
|
||||
- name: Calculate docker image
|
||||
id: calculate-docker-image
|
||||
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||
with:
|
||||
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
|
||||
docker-image-name: libtorch-cxx11-builder
|
||||
custom-tag-prefix: rocm7.0
|
||||
docker-build-dir: .ci/docker
|
||||
working-directory: pytorch
|
||||
- name: Pull Docker image
|
||||
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
||||
with:
|
||||
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Test Pytorch binary
|
||||
uses: ./pytorch/.github/actions/test-pytorch-binary
|
||||
env:
|
||||
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Teardown ROCm
|
||||
uses: ./.github/actions/teardown-rocm
|
||||
libtorch-rocm7_0-shared-with-deps-release-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: libtorch-rocm7_0-shared-with-deps-release-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm7.0
|
||||
GPU_ARCH_VERSION: "7.0"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
build_name: libtorch-rocm7_0-shared-with-deps-release
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
87
.github/workflows/generated-linux-binary-libtorch-release-main.yml
generated
vendored
Normal file
87
.github/workflows/generated-linux-binary-libtorch-release-main.yml
generated
vendored
Normal file
@ -0,0 +1,87 @@
|
||||
# @generated DO NOT EDIT MANUALLY
|
||||
|
||||
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
|
||||
# Generation script: .github/scripts/generate_ci_workflows.py
|
||||
name: linux-binary-libtorch-release
|
||||
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
tags:
|
||||
- 'ciflow/trunk/*'
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
|
||||
env:
|
||||
# Needed for conda builds
|
||||
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
BINARY_ENV_FILE: /tmp/env
|
||||
BUILD_ENVIRONMENT: linux-binary-libtorch-release
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
|
||||
PYTORCH_ROOT: /pytorch
|
||||
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
SKIP_ALL_TESTS: 0
|
||||
concurrency:
|
||||
group: linux-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
libtorch-cpu-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: libtorch-cpu-shared-with-deps-release
|
||||
build_environment: linux-binary-libtorch-release
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
libtorch-cpu-shared-with-deps-release-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- libtorch-cpu-shared-with-deps-release-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cpu
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
build_name: libtorch-cpu-shared-with-deps-release
|
||||
build_environment: linux-binary-libtorch-release
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.4xlarge
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
88
.github/workflows/generated-linux-binary-manywheel-main.yml
generated
vendored
Normal file
88
.github/workflows/generated-linux-binary-manywheel-main.yml
generated
vendored
Normal file
@ -0,0 +1,88 @@
|
||||
# @generated DO NOT EDIT MANUALLY
|
||||
|
||||
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
|
||||
# Generation script: .github/scripts/generate_ci_workflows.py
|
||||
name: linux-binary-manywheel
|
||||
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
tags:
|
||||
- 'ciflow/trunk/*'
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
|
||||
env:
|
||||
# Needed for conda builds
|
||||
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
BINARY_ENV_FILE: /tmp/env
|
||||
BUILD_ENVIRONMENT: linux-binary-manywheel
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
|
||||
PYTORCH_ROOT: /pytorch
|
||||
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
SKIP_ALL_TESTS: 0
|
||||
concurrency:
|
||||
group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
manywheel-py3_12-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda13_0-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_12-cuda13_0-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu130
|
||||
GPU_ARCH_VERSION: "13.0"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda13.0
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
1610
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
1610
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
File diff suppressed because it is too large
Load Diff
136
.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
generated
vendored
Normal file
136
.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
generated
vendored
Normal file
@ -0,0 +1,136 @@
|
||||
# @generated DO NOT EDIT MANUALLY
|
||||
|
||||
# Template is at: .github/templates/linux_binary_build_workflow.yml.j2
|
||||
# Generation script: .github/scripts/generate_ci_workflows.py
|
||||
name: linux-binary-manywheel-rocm
|
||||
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
tags:
|
||||
- 'ciflow/binaries/*'
|
||||
- 'ciflow/binaries_wheel/*'
|
||||
- 'ciflow/rocm/*'
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
|
||||
env:
|
||||
# Needed for conda builds
|
||||
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
BINARY_ENV_FILE: /tmp/env
|
||||
BUILD_ENVIRONMENT: linux-binary-manywheel-rocm
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
|
||||
PYTORCH_ROOT: /pytorch
|
||||
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
SKIP_ALL_TESTS: 0
|
||||
concurrency:
|
||||
group: linux-binary-manywheel-rocm-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
manywheel-py3_10-rocm6_4-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.4
|
||||
GPU_ARCH_VERSION: "6.4"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
timeout-minutes: 300
|
||||
build_name: manywheel-py3_10-rocm6_4
|
||||
build_environment: linux-binary-manywheel-rocm
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-rocm6_4-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_10-rocm6_4-build
|
||||
- get-label-type
|
||||
runs-on: linux.rocm.gpu.mi250
|
||||
timeout-minutes: 240
|
||||
env:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.4
|
||||
GPU_ARCH_VERSION: "6.4"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
SKIP_ALL_TESTS: 1
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
|
||||
DESIRED_PYTHON: "3.10"
|
||||
steps:
|
||||
- name: Setup ROCm
|
||||
uses: ./.github/actions/setup-rocm
|
||||
- uses: actions/download-artifact@v4.1.7
|
||||
name: Download Build Artifacts
|
||||
with:
|
||||
name: manywheel-py3_10-rocm6_4
|
||||
path: "${{ runner.temp }}/artifacts/"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
- name: ROCm set GPU_FLAG
|
||||
run: |
|
||||
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
|
||||
- name: configure aws credentials
|
||||
id: aws_creds
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
aws-region: us-east-1
|
||||
role-duration-seconds: 18000
|
||||
- name: Calculate docker image
|
||||
id: calculate-docker-image
|
||||
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||
with:
|
||||
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
|
||||
docker-image-name: manylinux2_28-builder
|
||||
custom-tag-prefix: rocm6.4
|
||||
docker-build-dir: .ci/docker
|
||||
working-directory: pytorch
|
||||
- name: Pull Docker image
|
||||
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
||||
with:
|
||||
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Test Pytorch binary
|
||||
uses: ./pytorch/.github/actions/test-pytorch-binary
|
||||
env:
|
||||
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Teardown ROCm
|
||||
uses: ./.github/actions/teardown-rocm
|
261
.github/workflows/generated-windows-binary-libtorch-debug-main.yml
generated
vendored
Normal file
261
.github/workflows/generated-windows-binary-libtorch-debug-main.yml
generated
vendored
Normal file
@ -0,0 +1,261 @@
|
||||
# @generated DO NOT EDIT MANUALLY
|
||||
|
||||
# Template is at: .github/templates/windows_binary_build_workflow.yml.j2
|
||||
# Generation script: .github/scripts/generate_ci_workflows.py
|
||||
name: windows-binary-libtorch-debug
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
# Needed for conda builds
|
||||
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
BUILD_ENVIRONMENT: windows-binary-libtorch-debug
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
SKIP_ALL_TESTS: 1
|
||||
OS: windows
|
||||
concurrency:
|
||||
group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
libtorch-cpu-shared-with-deps-debug-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu
|
||||
SKIP_ALL_TESTS: 1
|
||||
LIBTORCH_CONFIG: debug
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.10"
|
||||
steps:
|
||||
# NOTE: These environment variables are put here so that they can be applied on every job equally
|
||||
# They are also here because setting them at a workflow level doesn't give us access to the
|
||||
# runner.temp variable, which we need.
|
||||
- name: Populate binary env
|
||||
shell: bash
|
||||
run: |
|
||||
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
function get_ec2_metadata() {
|
||||
# Pulled from instance metadata endpoint for EC2
|
||||
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
|
||||
category=$1
|
||||
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
|
||||
}
|
||||
echo "ami-id: $(get_ec2_metadata ami-id)"
|
||||
echo "instance-id: $(get_ec2_metadata instance-id)"
|
||||
echo "instance-type: $(get_ec2_metadata instance-type)"
|
||||
echo "system info $(uname -a)"
|
||||
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
|
||||
uses: pytorch/test-infra/.github/actions/setup-ssh@main
|
||||
continue-on-error: true
|
||||
with:
|
||||
github-secret: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
|
||||
shell: bash
|
||||
run: |
|
||||
git config --global core.longpaths true
|
||||
git config --global core.symlinks true
|
||||
|
||||
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
|
||||
# the directory on Windows and prevent GHA from checking out as reported
|
||||
# in https://github.com/actions/checkout/issues/1018
|
||||
git config --global core.fsmonitor false
|
||||
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
|
||||
- name: Enable long paths on Windows
|
||||
shell: powershell
|
||||
run: |
|
||||
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
|
||||
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
|
||||
# removed once Windows Defender is removed from the AMI
|
||||
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
|
||||
continue-on-error: true
|
||||
shell: powershell
|
||||
run: |
|
||||
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
|
||||
# Let's both exclude the path and disable Windows Defender completely just to be sure
|
||||
# that it doesn't interfere
|
||||
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
shell: bash
|
||||
run: |
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
shell: bash
|
||||
run: |
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
|
||||
- uses: actions/upload-artifact@v4.4.0
|
||||
if: always()
|
||||
with:
|
||||
name: libtorch-cpu-shared-with-deps-debug
|
||||
retention-days: 14
|
||||
if-no-files-found: error
|
||||
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
|
||||
- name: Wait until all sessions have drained
|
||||
shell: powershell
|
||||
working-directory: pytorch
|
||||
if: always()
|
||||
timeout-minutes: 120
|
||||
run: |
|
||||
.github\scripts\wait_for_ssh_to_drain.ps1
|
||||
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
|
||||
shell: powershell
|
||||
working-directory: pytorch
|
||||
if: always()
|
||||
run: |
|
||||
.github\scripts\kill_active_ssh_sessions.ps1
|
||||
|
||||
libtorch-cpu-shared-with-deps-debug-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- libtorch-cpu-shared-with-deps-debug-build
|
||||
- get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu
|
||||
SKIP_ALL_TESTS: 1
|
||||
LIBTORCH_CONFIG: debug
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.10"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
function get_ec2_metadata() {
|
||||
# Pulled from instance metadata endpoint for EC2
|
||||
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
|
||||
category=$1
|
||||
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
|
||||
}
|
||||
echo "ami-id: $(get_ec2_metadata ami-id)"
|
||||
echo "instance-id: $(get_ec2_metadata instance-id)"
|
||||
echo "instance-type: $(get_ec2_metadata instance-type)"
|
||||
echo "system info $(uname -a)"
|
||||
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
|
||||
uses: pytorch/test-infra/.github/actions/setup-ssh@main
|
||||
continue-on-error: true
|
||||
with:
|
||||
github-secret: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
|
||||
shell: bash
|
||||
run: |
|
||||
git config --global core.longpaths true
|
||||
git config --global core.symlinks true
|
||||
|
||||
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
|
||||
# the directory on Windows and prevent GHA from checking out as reported
|
||||
# in https://github.com/actions/checkout/issues/1018
|
||||
git config --global core.fsmonitor false
|
||||
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
|
||||
- name: Enable long paths on Windows
|
||||
shell: powershell
|
||||
run: |
|
||||
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
|
||||
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
|
||||
# removed once Windows Defender is removed from the AMI
|
||||
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
|
||||
continue-on-error: true
|
||||
shell: powershell
|
||||
run: |
|
||||
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
|
||||
# Let's both exclude the path and disable Windows Defender completely just to be sure
|
||||
# that it doesn't interfere
|
||||
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
# NOTE: These environment variables are put here so that they can be applied on every job equally
|
||||
# They are also here because setting them at a workflow level doesn't give us access to the
|
||||
# runner.temp variable, which we need.
|
||||
- name: Populate binary env
|
||||
shell: bash
|
||||
run: |
|
||||
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
|
||||
- uses: actions/download-artifact@v4.1.7
|
||||
name: Download Build Artifacts
|
||||
with:
|
||||
name: libtorch-cpu-shared-with-deps-debug
|
||||
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
|
||||
- name: Populate binary env
|
||||
shell: bash
|
||||
run: |
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Test PyTorch binary
|
||||
shell: bash
|
||||
run: |
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
|
||||
- name: Wait until all sessions have drained
|
||||
shell: powershell
|
||||
working-directory: pytorch
|
||||
if: always()
|
||||
timeout-minutes: 120
|
||||
run: |
|
||||
.github\scripts\wait_for_ssh_to_drain.ps1
|
||||
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
|
||||
shell: powershell
|
||||
working-directory: pytorch
|
||||
if: always()
|
||||
run: |
|
||||
.github\scripts\kill_active_ssh_sessions.ps1
|
261
.github/workflows/generated-windows-binary-libtorch-release-main.yml
generated
vendored
Normal file
261
.github/workflows/generated-windows-binary-libtorch-release-main.yml
generated
vendored
Normal file
@ -0,0 +1,261 @@
|
||||
# @generated DO NOT EDIT MANUALLY
|
||||
|
||||
# Template is at: .github/templates/windows_binary_build_workflow.yml.j2
|
||||
# Generation script: .github/scripts/generate_ci_workflows.py
|
||||
name: windows-binary-libtorch-release
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
# Needed for conda builds
|
||||
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
|
||||
AWS_DEFAULT_REGION: us-east-1
|
||||
BUILD_ENVIRONMENT: windows-binary-libtorch-release
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
SKIP_ALL_TESTS: 1
|
||||
OS: windows
|
||||
concurrency:
|
||||
group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
libtorch-cpu-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu
|
||||
SKIP_ALL_TESTS: 1
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.10"
|
||||
steps:
|
||||
# NOTE: These environment variables are put here so that they can be applied on every job equally
|
||||
# They are also here because setting them at a workflow level doesn't give us access to the
|
||||
# runner.temp variable, which we need.
|
||||
- name: Populate binary env
|
||||
shell: bash
|
||||
run: |
|
||||
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
function get_ec2_metadata() {
|
||||
# Pulled from instance metadata endpoint for EC2
|
||||
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
|
||||
category=$1
|
||||
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
|
||||
}
|
||||
echo "ami-id: $(get_ec2_metadata ami-id)"
|
||||
echo "instance-id: $(get_ec2_metadata instance-id)"
|
||||
echo "instance-type: $(get_ec2_metadata instance-type)"
|
||||
echo "system info $(uname -a)"
|
||||
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
|
||||
uses: pytorch/test-infra/.github/actions/setup-ssh@main
|
||||
continue-on-error: true
|
||||
with:
|
||||
github-secret: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
|
||||
shell: bash
|
||||
run: |
|
||||
git config --global core.longpaths true
|
||||
git config --global core.symlinks true
|
||||
|
||||
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
|
||||
# the directory on Windows and prevent GHA from checking out as reported
|
||||
# in https://github.com/actions/checkout/issues/1018
|
||||
git config --global core.fsmonitor false
|
||||
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
|
||||
- name: Enable long paths on Windows
|
||||
shell: powershell
|
||||
run: |
|
||||
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
|
||||
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
|
||||
# removed once Windows Defender is removed from the AMI
|
||||
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
|
||||
continue-on-error: true
|
||||
shell: powershell
|
||||
run: |
|
||||
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
|
||||
# Let's both exclude the path and disable Windows Defender completely just to be sure
|
||||
# that it doesn't interfere
|
||||
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
- name: Populate binary env
|
||||
shell: bash
|
||||
run: |
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Build PyTorch binary
|
||||
shell: bash
|
||||
run: |
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
|
||||
- uses: actions/upload-artifact@v4.4.0
|
||||
if: always()
|
||||
with:
|
||||
name: libtorch-cpu-shared-with-deps-release
|
||||
retention-days: 14
|
||||
if-no-files-found: error
|
||||
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
|
||||
- name: Wait until all sessions have drained
|
||||
shell: powershell
|
||||
working-directory: pytorch
|
||||
if: always()
|
||||
timeout-minutes: 120
|
||||
run: |
|
||||
.github\scripts\wait_for_ssh_to_drain.ps1
|
||||
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
|
||||
shell: powershell
|
||||
working-directory: pytorch
|
||||
if: always()
|
||||
run: |
|
||||
.github\scripts\kill_active_ssh_sessions.ps1
|
||||
|
||||
libtorch-cpu-shared-with-deps-release-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- libtorch-cpu-shared-with-deps-release-build
|
||||
- get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cpu
|
||||
GPU_ARCH_TYPE: cpu
|
||||
SKIP_ALL_TESTS: 1
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
# This is a dummy value for libtorch to work correctly with our batch scripts
|
||||
# without this value pip does not get installed for some reason
|
||||
DESIRED_PYTHON: "3.10"
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
function get_ec2_metadata() {
|
||||
# Pulled from instance metadata endpoint for EC2
|
||||
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
|
||||
category=$1
|
||||
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
|
||||
}
|
||||
echo "ami-id: $(get_ec2_metadata ami-id)"
|
||||
echo "instance-id: $(get_ec2_metadata instance-id)"
|
||||
echo "instance-type: $(get_ec2_metadata instance-type)"
|
||||
echo "system info $(uname -a)"
|
||||
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
|
||||
uses: pytorch/test-infra/.github/actions/setup-ssh@main
|
||||
continue-on-error: true
|
||||
with:
|
||||
github-secret: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
|
||||
shell: bash
|
||||
run: |
|
||||
git config --global core.longpaths true
|
||||
git config --global core.symlinks true
|
||||
|
||||
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
|
||||
# the directory on Windows and prevent GHA from checking out as reported
|
||||
# in https://github.com/actions/checkout/issues/1018
|
||||
git config --global core.fsmonitor false
|
||||
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
|
||||
- name: Enable long paths on Windows
|
||||
shell: powershell
|
||||
run: |
|
||||
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
|
||||
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
|
||||
# removed once Windows Defender is removed from the AMI
|
||||
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
|
||||
continue-on-error: true
|
||||
shell: powershell
|
||||
run: |
|
||||
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
|
||||
# Let's both exclude the path and disable Windows Defender completely just to be sure
|
||||
# that it doesn't interfere
|
||||
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
# NOTE: These environment variables are put here so that they can be applied on every job equally
|
||||
# They are also here because setting them at a workflow level doesn't give us access to the
|
||||
# runner.temp variable, which we need.
|
||||
- name: Populate binary env
|
||||
shell: bash
|
||||
run: |
|
||||
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
|
||||
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
|
||||
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
|
||||
- uses: actions/download-artifact@v4.1.7
|
||||
name: Download Build Artifacts
|
||||
with:
|
||||
name: libtorch-cpu-shared-with-deps-release
|
||||
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
|
||||
- name: Populate binary env
|
||||
shell: bash
|
||||
run: |
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
|
||||
- name: Test PyTorch binary
|
||||
shell: bash
|
||||
run: |
|
||||
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
|
||||
- name: Wait until all sessions have drained
|
||||
shell: powershell
|
||||
working-directory: pytorch
|
||||
if: always()
|
||||
timeout-minutes: 120
|
||||
run: |
|
||||
.github\scripts\wait_for_ssh_to_drain.ps1
|
||||
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
|
||||
shell: powershell
|
||||
working-directory: pytorch
|
||||
if: always()
|
||||
run: |
|
||||
.github\scripts\kill_active_ssh_sessions.ps1
|
46
.github/workflows/operator_microbenchmark.yml
vendored
46
.github/workflows/operator_microbenchmark.yml
vendored
@ -1,46 +0,0 @@
|
||||
name: operator_microbenchmark
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/op-benchmark/*
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# Run at 06:00 UTC everyday
|
||||
- cron: 0 6 * * *
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
opmicrobenchmark-build:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: opmicrobenchmark-build
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
runner: linux.12xlarge.memory
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '8.0 9.0'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" },
|
||||
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
opmicrobenchmark-test:
|
||||
name: opmicrobenchmark-test
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: opmicrobenchmark-build
|
||||
with:
|
||||
timeout-minutes: 500
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
29
.github/workflows/periodic.yml
vendored
29
.github/workflows/periodic.yml
vendored
@ -59,14 +59,13 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.4-py3.10-gcc11
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
|
||||
cuda-arch-list: 7.5
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
|
||||
{ config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
|
||||
{ config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
|
||||
{ config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
|
||||
{ config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
|
||||
{ config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
{ config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
{ config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
{ config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
{ config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
@ -113,13 +112,13 @@ jobs:
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc9-build:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc9
|
||||
linux-jammy-cuda12_8-py3_9-gcc9-build:
|
||||
name: linux-jammy-cuda12.8-py3.9-gcc9
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9
|
||||
build-environment: linux-jammy-cuda12.8-py3.9-gcc9
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
|
||||
cuda-arch-list: 8.6
|
||||
test-matrix: |
|
||||
@ -129,14 +128,14 @@ jobs:
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc9-test:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc9
|
||||
linux-jammy-cuda12_8-py3_9-gcc9-test:
|
||||
name: linux-jammy-cuda12.8-py3.9-gcc9
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: linux-jammy-cuda12_8-py3_10-gcc9-build
|
||||
needs: linux-jammy-cuda12_8-py3_9-gcc9-build
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9
|
||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.test-matrix }}
|
||||
build-environment: linux-jammy-cuda12.8-py3.9-gcc9
|
||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_9-gcc9-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc9-debug-build:
|
||||
|
6
.github/workflows/pull.yml
vendored
6
.github/workflows/pull.yml
vendored
@ -343,14 +343,14 @@ jobs:
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-xpu-n-py3_10-build:
|
||||
name: linux-jammy-xpu-n-py3.10
|
||||
linux-jammy-xpu-n-py3_9-build:
|
||||
name: linux-jammy-xpu-n-py3.9
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
sync-tag: linux-xpu-n-build
|
||||
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
|
||||
build-environment: linux-jammy-xpu-n-py3.10
|
||||
build-environment: linux-jammy-xpu-n-py3.9
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
|
2
.github/workflows/rocm-mi355.yml
vendored
2
.github/workflows/rocm-mi355.yml
vendored
@ -38,7 +38,7 @@ jobs:
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-noble-rocm-py3.12-mi355
|
||||
docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3
|
||||
docker-image-name: ci-image:pytorch-linux-noble-rocm-alpha-py3
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
|
@ -442,7 +442,7 @@ if(WIN32)
|
||||
message(
|
||||
WARNING
|
||||
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
|
||||
"Please run command 'conda install -c conda-forge libuv=1.51' to install libuv."
|
||||
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
|
||||
)
|
||||
else()
|
||||
set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../)
|
||||
@ -888,28 +888,23 @@ cmake_dependent_option(
|
||||
"(USE_CUDA AND NOT MSVC) OR USE_ROCM"
|
||||
OFF)
|
||||
|
||||
|
||||
IF(USE_ROCM AND "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
|
||||
message(WARNING "Setting USE_FBGEMM_GENAI for gfx942 to ON by default, doing ROCM build")
|
||||
set(USE_FBGEMM_GENAI_DEFAULT ON)
|
||||
elseif(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
|
||||
message(STATUS "Setting USE_FBGEMM_GENAI to ON by default , doing CUDA build for SM100a")
|
||||
set(USE_FBGEMM_GENAI_DEFAULT ON)
|
||||
else()
|
||||
set(USE_FBGEMM_GENAI_DEFAULT OFF)
|
||||
endif()
|
||||
|
||||
cmake_dependent_option(
|
||||
USE_FBGEMM_GENAI
|
||||
"Whether to build FBGEMM GenAI quantized GEMM kernels.\
|
||||
Will be disabled if not supported by the platform"
|
||||
${USE_FBGEMM_GENAI_DEFAULT}
|
||||
"(USE_CUDA AND NOT MSVC) OR USE_ROCM"
|
||||
ON
|
||||
"USE_ROCM"
|
||||
OFF)
|
||||
|
||||
IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
|
||||
message(WARNING "Unsupported ROCM arch for FBGEMM GenAI, will set USE_FBGEMM_GENAI to OFF")
|
||||
set(USE_FBGEMM_GENAI off)
|
||||
endif()
|
||||
|
||||
# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
|
||||
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
|
||||
message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
|
||||
set(USE_FBGEMM_GENAI ON)
|
||||
endif()
|
||||
|
||||
# CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
|
||||
|
@ -275,7 +275,7 @@ conda install pkg-config libuv
|
||||
pip install mkl-static mkl-include
|
||||
# Add these packages if torch.distributed is needed.
|
||||
# Distributed package support on Windows is a prototype feature and is subject to changes.
|
||||
conda install -c conda-forge libuv=1.51
|
||||
conda install -c conda-forge libuv
|
||||
```
|
||||
|
||||
#### Install PyTorch
|
||||
|
@ -45,39 +45,7 @@ inline void infer_size_impl(
|
||||
}
|
||||
}
|
||||
|
||||
if (infer_dim) {
|
||||
// numel is the product of known sizes, it has to be divisible by newsize.
|
||||
// and newsize should be positive unless newsize == numel (we throw
|
||||
// different) error message in that case.
|
||||
if constexpr (std::is_same_v<NumelType, c10::SymInt>) {
|
||||
auto v = newsize.maybe_as_int();
|
||||
if (v and *v == 0) {
|
||||
// Avoid div by 0 when sym_eq(numel % newsize, 0) is constructed!
|
||||
// which may happen when newsize is not a symbol! if its a symbol
|
||||
// division won't happen anyway during compile.
|
||||
TORCH_MAYBE_SYM_CHECK(
|
||||
numel == newsize,
|
||||
"shape '",
|
||||
shape,
|
||||
"' is invalid for input of size ",
|
||||
numel);
|
||||
} else {
|
||||
auto cond = sym_gt(newsize, 0)
|
||||
.sym_and(sym_eq(numel % newsize, 0))
|
||||
.sym_or(sym_eq(numel, newsize));
|
||||
TORCH_MAYBE_SYM_CHECK(
|
||||
cond, "shape '", shape, "' is invalid for input of size ", numel);
|
||||
}
|
||||
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
(newsize > 0 && (numel % newsize == 0)) || numel == newsize,
|
||||
"shape '",
|
||||
shape,
|
||||
"' is invalid for input of size ",
|
||||
numel);
|
||||
}
|
||||
|
||||
auto set_infer_dim = [&]() {
|
||||
// We have a degree of freedom here to select the dimension size; follow
|
||||
// NumPy semantics and just bail. However, a nice error message is needed
|
||||
// because users often use `view` as a way to flatten & unflatten
|
||||
@ -86,15 +54,19 @@ inline void infer_size_impl(
|
||||
// works yet
|
||||
// empty_tensor.view(-1, 0)
|
||||
// doesn't.
|
||||
TORCH_MAYBE_SYM_CHECK(
|
||||
TORCH_CHECK(
|
||||
newsize != 0,
|
||||
"cannot reshape tensor of 0 elements into shape ",
|
||||
shape,
|
||||
" because the unspecified dimension size -1 can be any "
|
||||
"value and is ambiguous");
|
||||
|
||||
res[*infer_dim] = numel / newsize;
|
||||
return;
|
||||
};
|
||||
|
||||
if (infer_dim && newsize > 0 && numel % newsize == 0) {
|
||||
set_infer_dim();
|
||||
return;
|
||||
}
|
||||
|
||||
TORCH_MAYBE_SYM_CHECK(
|
||||
@ -103,6 +75,9 @@ inline void infer_size_impl(
|
||||
shape,
|
||||
"' is invalid for input of size ",
|
||||
numel);
|
||||
if (infer_dim) {
|
||||
set_infer_dim();
|
||||
}
|
||||
}
|
||||
|
||||
inline std::vector<int64_t> infer_size(IntArrayRef shape, int64_t numel) {
|
||||
|
@ -103,9 +103,7 @@ std::string get_cpu_capability() {
|
||||
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
|
||||
case native::CPUCapability::ZVECTOR:
|
||||
return "Z VECTOR";
|
||||
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
|
||||
case native::CPUCapability::SVE128:
|
||||
return "SVE128";
|
||||
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
|
||||
case native::CPUCapability::SVE256:
|
||||
return "SVE256";
|
||||
#else
|
||||
|
@ -1,32 +1,22 @@
|
||||
#include <ATen/core/PythonOpRegistrationTrampoline.h>
|
||||
#include <c10/core/impl/PyInterpreterHooks.h>
|
||||
|
||||
// TODO: delete this
|
||||
namespace at::impl {
|
||||
|
||||
// The strategy is that all python interpreters attempt to register themselves
|
||||
// as the main interpreter, but only one wins. Only that interpreter is
|
||||
// allowed to interact with the C++ dispatcher. Furthermore, when we execute
|
||||
// logic on that interpreter, we do so hermetically, never setting pyobj field
|
||||
// on Tensor.
|
||||
|
||||
std::atomic<c10::impl::PyInterpreter*>
|
||||
PythonOpRegistrationTrampoline::interpreter_{nullptr};
|
||||
c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::interpreter_ = nullptr;
|
||||
|
||||
c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() {
|
||||
return PythonOpRegistrationTrampoline::interpreter_.load();
|
||||
return c10::impl::getGlobalPyInterpreter();
|
||||
}
|
||||
|
||||
bool PythonOpRegistrationTrampoline::registerInterpreter(
|
||||
c10::impl::PyInterpreter* interp) {
|
||||
c10::impl::PyInterpreter* expected = nullptr;
|
||||
interpreter_.compare_exchange_strong(expected, interp);
|
||||
if (expected != nullptr) {
|
||||
// This is the second (or later) Python interpreter, which means we need
|
||||
// non-trivial hermetic PyObject TLS
|
||||
c10::impl::HermeticPyObjectTLS::init_state();
|
||||
if (interpreter_ != nullptr) {
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
interpreter_ = interp;
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace at::impl
|
||||
|
@ -2,19 +2,21 @@
|
||||
|
||||
#include <ATen/core/dispatch/Dispatcher.h>
|
||||
|
||||
// TODO: this can probably live in c10
|
||||
// TODO: We can get rid of this
|
||||
|
||||
|
||||
namespace at::impl {
|
||||
|
||||
// Manages the single Python interpreter instance for PyTorch.
|
||||
class TORCH_API PythonOpRegistrationTrampoline final {
|
||||
static std::atomic<c10::impl::PyInterpreter*> interpreter_;
|
||||
static c10::impl::PyInterpreter* interpreter_;
|
||||
|
||||
public:
|
||||
// Returns true if you successfully registered yourself (that means
|
||||
// you are in the hot seat for doing the operator registrations!)
|
||||
// Register the Python interpreter. Returns true on first registration,
|
||||
// false if an interpreter was already registered.
|
||||
static bool registerInterpreter(c10::impl::PyInterpreter*);
|
||||
|
||||
// Returns the registered interpreter via the global PyInterpreter hooks.
|
||||
// Returns nullptr if no interpreter has been registered yet.
|
||||
static c10::impl::PyInterpreter* getInterpreter();
|
||||
};
|
||||
|
@ -1234,7 +1234,7 @@ struct TORCH_API TupleType : public NamedType {
|
||||
std::shared_ptr<FunctionSchema> schema_;
|
||||
};
|
||||
|
||||
// the common supertype of all Enums, only used in operator registration.
|
||||
// the common supertype of all Enums, only used in operator registraion.
|
||||
// EnumType <: AnyEnumType for all Enums
|
||||
struct AnyEnumType;
|
||||
using AnyEnumTypePtr = SingletonTypePtr<AnyEnumType>;
|
||||
|
@ -102,31 +102,8 @@ struct VecReduceAllSIMD<float, Op> {
|
||||
#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
|
||||
// !defined(C10_MOBILE)
|
||||
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
|
||||
#if defined(CPU_CAPABILITY_SVE256)
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(
|
||||
const Op& vec_fun,
|
||||
const Vectorized<float>& acc_vec) {
|
||||
using Vec = Vectorized<float>;
|
||||
Vec v = acc_vec;
|
||||
// 128-bit shuffle
|
||||
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
|
||||
Vec v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 64-bit shuffle
|
||||
ind = svdupq_n_u32(2, 3, 0, 1);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 32-bit shuffle
|
||||
ind = svdupq_n_u32(1, 0, 2, 3);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
return svlasta(svpfalse(), v);
|
||||
}
|
||||
};
|
||||
#else
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE)
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(
|
||||
@ -163,8 +140,35 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
|
||||
return vaddvq_f32(acc_vec);
|
||||
}
|
||||
};
|
||||
#endif // defined(CPU_CAPABILITY_SVE256)
|
||||
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
|
||||
// && !defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
defined(CPU_CAPABILITY_SVE256)
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(
|
||||
const Op& vec_fun,
|
||||
const Vectorized<float>& acc_vec) {
|
||||
using Vec = Vectorized<float>;
|
||||
Vec v = acc_vec;
|
||||
// 128-bit shuffle
|
||||
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
|
||||
Vec v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 64-bit shuffle
|
||||
ind = svdupq_n_u32(2, 3, 0, 1);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 32-bit shuffle
|
||||
ind = svdupq_n_u32(1, 0, 2, 3);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
return svlasta(svpfalse(), v);
|
||||
}
|
||||
};
|
||||
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
|
||||
// && defined(CPU_CAPABILITY_SVE256)
|
||||
|
||||
template <typename scalar_t, typename Op>
|
||||
inline scalar_t vec_reduce_all(
|
||||
|
@ -1,21 +1,9 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <cstdint>
|
||||
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#if defined(__aarch64__) && \
|
||||
(defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) || \
|
||||
defined(AT_BUILD_ARM_VECSVE_WITH_SLEEF))
|
||||
#define SLEEF_STATIC_LIBS
|
||||
#include <sleef.h>
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
|
||||
#else
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
|
||||
#endif
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
// Define the data type of VLS(vector-length specific).
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/sve/vec_common_sve.h>
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/util/bit_cast.h>
|
||||
|
@ -8,48 +8,13 @@
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_qint.h>
|
||||
|
||||
#elif defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#include <ATen/cpu/vec/sve/vec_bfloat16.h>
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_double.h>
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
#include <ATen/cpu/vec/sve/vec_int.h>
|
||||
|
||||
#include <ATen/cpu/vec/sve/vec_qint.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec256/vec256_half.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec256/vec256_convert.h>
|
||||
|
||||
#else // NEON
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||
|
||||
#include <ATen/cpu/vec/vec256/vec256_qint.h>
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE128)
|
||||
|
||||
#include <ATen/cpu/vec/functional.h>
|
||||
#endif
|
||||
|
||||
namespace at::vec {
|
||||
// Note [CPU_CAPABILITY namespace]
|
||||
@ -83,6 +48,12 @@ DEFINE_SVE_CAST(int32_t, s32, float, f32)
|
||||
DEFINE_SVE_CAST(int16_t, s16, float, f32)
|
||||
DEFINE_SVE_CAST(float, f32, double, f64)
|
||||
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
DEFINE_SVE_CAST(int64_t, s64, c10::BFloat16, bf16)
|
||||
DEFINE_SVE_CAST(int32_t, s32, c10::BFloat16, bf16)
|
||||
DEFINE_SVE_CAST(int16_t, s16, c10::BFloat16, bf16)
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
template <int64_t scale = 1>
|
||||
@ -202,11 +173,9 @@ std::pair<
|
||||
// group cols crossing lanes:
|
||||
// return {a0, b0, a1, b1, a2, b2, a3, b3}
|
||||
// {a4, b4, a5, b5, a6, b6, a7, b7}
|
||||
svbfloat16_t aReg = a;
|
||||
svbfloat16_t bReg = b;
|
||||
Vectorized<c10::BFloat16> c = svzip1_bf16(aReg, bReg);
|
||||
Vectorized<c10::BFloat16> d = svzip2_bf16(aReg, bReg);
|
||||
return std::make_pair(c, d);
|
||||
return std::make_pair(
|
||||
Vectorized<c10::BFloat16>(svzip1_bf16(a, b)),
|
||||
Vectorized<c10::BFloat16>(svzip2_bf16(a, b)));
|
||||
}
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
@ -255,27 +224,12 @@ std::pair<
|
||||
// swap lanes:
|
||||
// return {a0, a1, a2, a3, a4, a5, a6, a7}
|
||||
// {b0, b1, b2, b3, b4, b5, b6, b7}
|
||||
svbfloat16_t aReg = a;
|
||||
svbfloat16_t bReg = b;
|
||||
Vectorized<c10::BFloat16> c = svuzp1_bf16(aReg, bReg);
|
||||
Vectorized<c10::BFloat16> d = svuzp2_bf16(aReg, bReg);
|
||||
return std::make_pair(c, d);
|
||||
return std::make_pair(
|
||||
Vectorized<c10::BFloat16>(svuzp1_bf16((svbfloat16_t)a, (svbfloat16_t)b)),
|
||||
Vectorized<c10::BFloat16>(svuzp2_bf16((svbfloat16_t)a, (svbfloat16_t)b)));
|
||||
}
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
#define DEFINE_FLIP_FUNC(type, sve_func) \
|
||||
inline Vectorized<type> flip(const Vectorized<type>& v) { \
|
||||
return Vectorized<type>(sve_func(v)); \
|
||||
}
|
||||
// Use the macro to define the flip functions
|
||||
DEFINE_FLIP_FUNC(float, svrev_f32)
|
||||
DEFINE_FLIP_FUNC(double, svrev_f64)
|
||||
DEFINE_FLIP_FUNC(int64_t, svrev_s64)
|
||||
DEFINE_FLIP_FUNC(int32_t, svrev_s32)
|
||||
DEFINE_FLIP_FUNC(int16_t, svrev_s16)
|
||||
DEFINE_FLIP_FUNC(int8_t, svrev_s8)
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
@ -1,8 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#if defined(__aarch64__)
|
||||
#include <ATen/cpu/vec/vec_common_aarch64.h>
|
||||
#elif defined(CPU_CAPABILITY_AVX512)
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
#include <ATen/cpu/vec/vec512/vec512.h>
|
||||
#else
|
||||
#include <ATen/cpu/vec/vec128/vec128.h>
|
||||
@ -13,34 +11,6 @@ namespace at::vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
|
||||
stream << val.val_;
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
|
||||
stream << static_cast<int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
|
||||
stream << static_cast<unsigned int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
T buf[Vectorized<T>::size()];
|
||||
vec.store(buf);
|
||||
stream << "vec[";
|
||||
for (int i = 0; i != Vectorized<T>::size(); i++) {
|
||||
if (i != 0) {
|
||||
stream << ", ";
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << "]";
|
||||
return stream;
|
||||
}
|
||||
|
||||
inline Vectorized<bool> convert_to_bool(Vectorized<int8_t> x) {
|
||||
__at_align__ bool buffer[x.size()];
|
||||
x.ne(Vectorized<int8_t>(0)).store(buffer);
|
||||
|
@ -2,7 +2,6 @@
|
||||
|
||||
// DO NOT DEFINE STATIC DATA IN THIS HEADER!
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
@ -263,13 +262,6 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
c10::bit_cast<at_bfloat16_t>(val6.x),
|
||||
c10::bit_cast<at_bfloat16_t>(val7.x)}) {}
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
Vectorized(svbfloat16_t v) : Vectorized16(svget_neonq(v)) {}
|
||||
operator svbfloat16_t() const {
|
||||
return svset_neonq(svundef_bf16(), values);
|
||||
}
|
||||
#endif
|
||||
|
||||
static Vectorized<c10::BFloat16> blendv(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
@ -382,23 +374,6 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
Vectorized ge(const Vectorized& other) const;
|
||||
Vectorized lt(const Vectorized& other) const;
|
||||
Vectorized le(const Vectorized& other) const;
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
|
||||
template <typename step_t>
|
||||
static Vectorized<BFloat16> arange(
|
||||
BFloat16 base = 0.f,
|
||||
step_t step = static_cast<step_t>(1)) {
|
||||
__at_align__ BFloat16 buffer[size()];
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
buffer[i] = base + i * step;
|
||||
}
|
||||
return svget_neonq(
|
||||
svld1_bf16(ptrue, reinterpret_cast<bfloat16_t*>(buffer)));
|
||||
}
|
||||
|
||||
#endif // CPU_CAPABILITY_SVE128
|
||||
|
||||
}; // Vectorized<c10::BFloat16>
|
||||
|
||||
inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
|
||||
@ -422,24 +397,6 @@ inline Vectorized<c10::BFloat16> convert_float_bfloat16(
|
||||
return Vectorized<c10::BFloat16>(at_vcombine_bf16(x1, x2));
|
||||
}
|
||||
|
||||
inline void load_fp32_from_bf16(const BFloat16* data, Vectorized<float>& out) {
|
||||
__at_align__ float values[Vectorized<float>::size()];
|
||||
for (const auto k : c10::irange(Vectorized<float>::size())) {
|
||||
values[k] = data[k];
|
||||
}
|
||||
out = Vectorized<float>::loadu(values);
|
||||
}
|
||||
|
||||
inline void load_fp32_from_bf16(
|
||||
const BFloat16* data,
|
||||
Vectorized<float>& out1,
|
||||
Vectorized<float>& out2) {
|
||||
Vectorized<BFloat16> bf16_vec = Vectorized<BFloat16>::loadu(data);
|
||||
auto floats = convert_bfloat16_float(bf16_vec);
|
||||
out1 = std::get<0>(floats);
|
||||
out2 = std::get<1>(floats);
|
||||
}
|
||||
|
||||
template <typename Op>
|
||||
Vectorized<c10::BFloat16> binary_operator_via_float(
|
||||
Op op,
|
||||
@ -622,12 +579,6 @@ Vectorized<c10::BFloat16> inline fnmsub(
|
||||
return -a * b - c;
|
||||
}
|
||||
|
||||
#else //
|
||||
|
||||
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
|
||||
|
||||
LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16)
|
||||
|
||||
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
namespace at::vec {
|
||||
inline namespace CPU_CAPABILITY {
|
||||
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)
|
||||
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
|
||||
template <typename src_t>
|
||||
struct VecConvert<
|
||||
float,
|
||||
@ -60,7 +60,6 @@ struct VecConvert<float, 1, BFloat16, 1> {
|
||||
}
|
||||
};
|
||||
|
||||
#endif // defined(__aarch64__) && (!defined(CPU_CAPABILITY_SVE) ||
|
||||
// defined(CPU_CAPABILITY_SVE128))
|
||||
#endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
@ -4,10 +4,13 @@
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
|
||||
#include <sleef.h>
|
||||
#endif
|
||||
|
||||
// Sleef offers vectorized versions of some transcedentals
|
||||
// such as sin, cos, tan etc..
|
||||
// However for now opting for STL, since we are not building
|
||||
@ -32,6 +35,12 @@ inline namespace CPU_CAPABILITY {
|
||||
#error "Big endian is not supported."
|
||||
#endif
|
||||
|
||||
#if defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
|
||||
#else
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
|
||||
#endif
|
||||
|
||||
template <int index, bool mask_val>
|
||||
struct BlendRegs {
|
||||
static float32x4_t impl(
|
||||
@ -85,12 +94,6 @@ class Vectorized<float> {
|
||||
operator float32x4_t() const {
|
||||
return values;
|
||||
}
|
||||
#ifdef CPU_CAPABILITY_SVE128
|
||||
Vectorized(svfloat32_t v) : values(svget_neonq(v)) {}
|
||||
operator svfloat32_t() const {
|
||||
return svset_neonq(svundef_f32(), values);
|
||||
}
|
||||
#endif
|
||||
template <int64_t mask>
|
||||
static Vectorized<float> blend(
|
||||
const Vectorized<float>& a,
|
||||
|
@ -4,6 +4,7 @@
|
||||
// See Note [Do not compile initializers with AVX]
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
@ -24,6 +25,7 @@ inline namespace CPU_CAPABILITY {
|
||||
// https://bugs.llvm.org/show_bug.cgi?id=45824
|
||||
// Most likely we will do aarch32 support with inline asm.
|
||||
#if !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
#ifdef __BIG_ENDIAN__
|
||||
#error "Big endian is not supported."
|
||||
#endif
|
||||
@ -419,24 +421,6 @@ Vectorized<c10::Half> inline operator+(
|
||||
#endif
|
||||
}
|
||||
|
||||
inline void load_fp32_from_fp16(const c10::Half* data, Vectorized<float>& out) {
|
||||
__at_align__ float values[Vectorized<float>::size()];
|
||||
for (const auto k : c10::irange(Vectorized<float>::size())) {
|
||||
values[k] = data[k];
|
||||
}
|
||||
out = Vectorized<float>::loadu(values);
|
||||
}
|
||||
|
||||
inline void load_fp32_from_fp16(
|
||||
const c10::Half* data,
|
||||
Vectorized<float>& out1,
|
||||
Vectorized<float>& out2) {
|
||||
Vectorized<c10::Half> f16_vec = Vectorized<c10::Half>::loadu(data);
|
||||
auto floats = convert_half_float(f16_vec);
|
||||
out1 = std::get<0>(floats);
|
||||
out2 = std::get<1>(floats);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::Half> inline operator-(
|
||||
const Vectorized<c10::Half>& a,
|
||||
@ -672,53 +656,6 @@ Vectorized<c10::Half> inline fnmsub(
|
||||
return -a * b - c;
|
||||
#endif
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define CONVERT_NON_VECTORIZED_INIT(type, name) \
|
||||
inline std::tuple<Vectorized<float>, Vectorized<float>> \
|
||||
convert_##name##_float(const Vectorized<type>& a) { \
|
||||
constexpr int64_t K = Vectorized<type>::size(); \
|
||||
__at_align__ float arr[K]; \
|
||||
__at_align__ type arr2[K]; \
|
||||
a.store(arr2); \
|
||||
convert(arr2, arr, K); \
|
||||
return std::make_tuple( \
|
||||
Vectorized<float>::loadu(arr), \
|
||||
Vectorized<float>::loadu(arr + Vectorized<float>::size())); \
|
||||
} \
|
||||
inline Vectorized<type> convert_float_##name( \
|
||||
const Vectorized<float>& a, const Vectorized<float>& b) { \
|
||||
constexpr int64_t K = Vectorized<type>::size(); \
|
||||
__at_align__ float arr[K]; \
|
||||
__at_align__ type arr2[K]; \
|
||||
a.store(arr); \
|
||||
b.store(arr + Vectorized<float>::size()); \
|
||||
convert(arr, arr2, K); \
|
||||
return Vectorized<type>::loadu(arr2); \
|
||||
}
|
||||
|
||||
#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
|
||||
inline void load_fp32_from_##name( \
|
||||
const type* data, Vectorized<float>& out) { \
|
||||
__at_align__ float values[Vectorized<float>::size()]; \
|
||||
for (const auto k : c10::irange(Vectorized<float>::size())) { \
|
||||
values[k] = data[k]; \
|
||||
} \
|
||||
out = Vectorized<float>::loadu(values); \
|
||||
} \
|
||||
\
|
||||
inline void load_fp32_from_##name( \
|
||||
const type* data, Vectorized<float>& out1, Vectorized<float>& out2) { \
|
||||
load_fp32_from_##name(data, out1); \
|
||||
data += Vectorized<float>::size(); \
|
||||
load_fp32_from_##name(data, out2); \
|
||||
}
|
||||
|
||||
CONVERT_NON_VECTORIZED_INIT(Half, half)
|
||||
|
||||
LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16)
|
||||
|
||||
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
|
@ -9,16 +9,21 @@
|
||||
#if !( \
|
||||
defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || \
|
||||
defined(CPU_CAPABILITY_ZVECTOR))
|
||||
#include <ATen/cpu/vec/vec256/vec256_double.h>
|
||||
#if defined(CPU_CAPABILITY_SVE256)
|
||||
#include <ATen/cpu/vec/sve/vec_common_sve.h>
|
||||
#else
|
||||
// clang-format off
|
||||
#include <ATen/cpu/vec/vec256/vec256_float.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_double.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_int.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_qint.h>
|
||||
#endif
|
||||
#if !defined(CPU_CAPABILITY_SVE256) || !defined(__ARM_FEATURE_BF16)
|
||||
#include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
|
||||
#endif
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_double.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_half.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
|
||||
#include <ATen/cpu/vec/vec256/vec256_complex_double.h>
|
||||
// clang-format on
|
||||
#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX)
|
||||
#include <ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h>
|
||||
@ -51,6 +56,34 @@ namespace at::vec {
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
|
||||
stream << val.val_;
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
|
||||
stream << static_cast<int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
|
||||
stream << static_cast<unsigned int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
T buf[Vectorized<T>::size()];
|
||||
vec.store(buf);
|
||||
stream << "vec[";
|
||||
for (int i = 0; i != Vectorized<T>::size(); i++) {
|
||||
if (i != 0) {
|
||||
stream << ", ";
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << "]";
|
||||
return stream;
|
||||
}
|
||||
|
||||
#if defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
@ -268,7 +268,9 @@ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16)
|
||||
|
||||
#else // defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if !(defined(__aarch64__))
|
||||
#if !( \
|
||||
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE256))
|
||||
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
|
||||
#endif
|
||||
|
||||
|
@ -268,7 +268,9 @@ LOAD_FP32_VECTORIZED_INIT(Half, fp16)
|
||||
|
||||
#else // defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if !defined(__aarch64__) || defined(CPU_CAPABILITY_SVE256)
|
||||
#if !( \
|
||||
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE256))
|
||||
CONVERT_NON_VECTORIZED_INIT(Half, half)
|
||||
#endif
|
||||
|
||||
|
@ -5,13 +5,6 @@
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#ifdef __aarch64__
|
||||
#if defined(CPU_CAPABILITY_SVE128) || !defined(CPU_CAPABILITY_SVE)
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <ATen/native/quantized/AffineQuantizerBase.h>
|
||||
|
||||
#include <c10/util/irange.h>
|
||||
@ -922,7 +915,7 @@ Vectorized<c10::quint8> inline maximum(
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
#else
|
||||
#elif !defined(CPU_CAPABILITY_SVE256)
|
||||
|
||||
// NOTE: These are low-performance implementations that we fall back on
|
||||
// if we are not building with AVX2. This may not be an issue, because
|
||||
@ -1379,18 +1372,12 @@ Vectorized<c10::quint8> inline maximum(
|
||||
return a.maximum(b);
|
||||
}
|
||||
|
||||
#if defined(__aarch64__) && \
|
||||
(defined(CPU_CAPABILITY_SVE128) || !defined(CPU_CAPABILITY_SVE))
|
||||
#endif // if defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
|
||||
std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
|
||||
at::vec::Vectorized<int8_t> src) {
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE
|
||||
svint8_t x = src;
|
||||
auto s8x8 = vget_low_s8(svget_neonq(x));
|
||||
#else
|
||||
auto s8x8 = vld1_s8(src.operator const int8_t*());
|
||||
#endif
|
||||
|
||||
auto s16x8 = vmovl_s8(s8x8);
|
||||
|
||||
auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
|
||||
@ -1415,14 +1402,7 @@ std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
|
||||
|
||||
Vectorized<float> inline convert_int8_half_register_to_float(
|
||||
at::vec::Vectorized<int8_t> src) {
|
||||
|
||||
#ifdef CPU_CAPABILITY_SVE
|
||||
svint8_t x = src;
|
||||
auto s8x8 = vget_low_s8(svget_neonq(x));
|
||||
#else
|
||||
auto s8x8 = vld1_s8(src.operator const int8_t*());
|
||||
#endif
|
||||
|
||||
auto s16x8 = vmovl_s8(s8x8);
|
||||
|
||||
auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));
|
||||
@ -1440,8 +1420,5 @@ Vectorized<float> inline convert_int8_half_register_to_float(
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif // if defined(CPU_CAPABILITY_AVX2)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
|
@ -31,6 +31,34 @@ namespace vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
|
||||
stream << val.val_;
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
|
||||
stream << static_cast<int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
|
||||
stream << static_cast<unsigned int>(val.val_);
|
||||
return stream;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
|
||||
T buf[Vectorized<T>::size()];
|
||||
vec.store(buf);
|
||||
stream << "vec[";
|
||||
for (int i = 0; i != Vectorized<T>::size(); i++) {
|
||||
if (i != 0) {
|
||||
stream << ", ";
|
||||
}
|
||||
stream << buf[i];
|
||||
}
|
||||
stream << "]";
|
||||
return stream;
|
||||
}
|
||||
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512)
|
||||
|
@ -67,7 +67,18 @@ Windows llvm will not have this definition.
|
||||
#endif
|
||||
#define VECTOR_WIDTH 64
|
||||
#define int_vector __m512i
|
||||
#elif defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_SVE256)
|
||||
#elif defined(__aarch64__) && \
|
||||
!defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512
|
||||
// SVE code expects 256-vectors; leave that set for SVE?
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(16)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(16))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 16
|
||||
#else // CPU_CAPABILITY_AVX512
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(32)))
|
||||
#elif defined(_WIN32)
|
||||
@ -77,27 +88,7 @@ Windows llvm will not have this definition.
|
||||
#endif
|
||||
#define VECTOR_WIDTH 32
|
||||
#define int_vector __m256i
|
||||
#elif defined(__aarch64__)
|
||||
// Define alignment and vector width for SVE128/Default (e.g., NEON)
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(16)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(16))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 16
|
||||
#else
|
||||
// Fallback: define default alignment and vector width
|
||||
#if defined(__GNUC__)
|
||||
#define __at_align__ __attribute__((aligned(32)))
|
||||
#elif defined(_WIN32)
|
||||
#define __at_align__ __declspec(align(32))
|
||||
#else
|
||||
#define __at_align__
|
||||
#endif
|
||||
#define VECTOR_WIDTH 32
|
||||
#endif
|
||||
#endif // CPU_CAPABILITY_AVX512
|
||||
|
||||
namespace at::vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
|
@ -151,11 +151,6 @@ struct CUDACachingHostAllocatorImpl
|
||||
}
|
||||
|
||||
bool query_event(EventPool::Event& event) override {
|
||||
// Do not call cudaEventQuery if capturing is underway
|
||||
if (at::cuda::currentStreamCaptureStatusMayInitCtx() !=
|
||||
at::cuda::CaptureStatus::None) {
|
||||
return false;
|
||||
}
|
||||
cudaError_t err = cudaEventQuery(*event);
|
||||
if (err == cudaErrorNotReady) {
|
||||
(void)cudaGetLastError(); // clear CUDA error
|
||||
|
@ -70,10 +70,7 @@ void MPSHooks::commitStream() const {
|
||||
}
|
||||
|
||||
void* MPSHooks::getCommandBuffer() const {
|
||||
auto stream = at::mps::getDefaultMPSStream();
|
||||
// Release pending computeCommandEncoder, as extensions is likely to allocate new one
|
||||
stream->endKernelCoalescing();
|
||||
return stream->commandBuffer();
|
||||
return at::mps::getDefaultMPSStream()->commandBuffer();
|
||||
}
|
||||
|
||||
void* MPSHooks::getDispatchQueue() const {
|
||||
|
@ -158,18 +158,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
|
||||
endKernelCoalescing();
|
||||
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
|
||||
|
||||
// For some reason fillBufferfor stopped working for lengh > 4Gb on MacOS 26
|
||||
// See https://github.com/pytorch/pytorch/issues/163962
|
||||
// Workaround by batching copy commands into 4Gb chunks
|
||||
constexpr size_t max_copy_size = 0x100000000; // 4GB
|
||||
size_t bytes_filled = 0;
|
||||
size_t bytes_remains = length;
|
||||
while (bytes_remains > 0) {
|
||||
NSUInteger bytes_to_copy = std::min(max_copy_size, bytes_remains);
|
||||
[blitEncoder fillBuffer:buffer range:NSMakeRange(offset + bytes_filled, bytes_to_copy) value:value];
|
||||
bytes_filled += bytes_to_copy;
|
||||
bytes_remains -= bytes_to_copy;
|
||||
}
|
||||
[blitEncoder fillBuffer:buffer range:NSMakeRange(offset, length) value:value];
|
||||
[blitEncoder endEncoding];
|
||||
synchronize(syncType);
|
||||
}
|
||||
|
@ -670,8 +670,6 @@ Tensor rrelu_with_noise_backward(
|
||||
}
|
||||
|
||||
Tensor rrelu(const Tensor & self, const Scalar& lower, const Scalar& upper, bool training, std::optional<Generator> generator) {
|
||||
TORCH_CHECK(std::isfinite(lower.to<double>()), "rrelu: lower bound must be finite, got ", lower.to<double>());
|
||||
TORCH_CHECK(std::isfinite(upper.to<double>()), "rrelu: upper bound must be finite, got ", upper.to<double>());
|
||||
TORCH_CHECK(lower.to<double>() <= upper.to<double>(), "Lower bound should be less than or equal to the upper bound")
|
||||
auto noise = at::empty_like(self, LEGACY_CONTIGUOUS_MEMORY_FORMAT);
|
||||
return at::rrelu_with_noise(self, noise, lower, upper, training, std::move(generator));
|
||||
|
@ -1157,103 +1157,103 @@ REGISTER_AVX512_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_AVX2_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_VSX_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_SVE_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
REGISTER_SVE256_DISPATCH(cholesky_stub, &cholesky_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(cholesky_inverse_stub, DEFAULT, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_AVX512_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_AVX2_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_VSX_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_ZVECTOR_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_SVE_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
REGISTER_SVE256_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(linalg_eig_stub, DEFAULT, &linalg_eig_kernel)
|
||||
REGISTER_AVX512_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_AVX2_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_VSX_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_SVE_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
REGISTER_SVE256_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(linalg_eigh_stub, DEFAULT, &linalg_eigh_kernel)
|
||||
REGISTER_AVX512_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_AVX2_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_VSX_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_SVE_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
REGISTER_SVE256_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(geqrf_stub, DEFAULT, &geqrf_kernel)
|
||||
REGISTER_AVX512_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_AVX2_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_VSX_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_SVE_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
REGISTER_SVE256_DISPATCH(geqrf_stub, &geqrf_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(orgqr_stub, DEFAULT, &orgqr_kernel_impl)
|
||||
REGISTER_AVX512_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_AVX2_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_VSX_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_ZVECTOR_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_SVE_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
REGISTER_SVE256_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ormqr_stub, DEFAULT, &ormqr_kernel)
|
||||
REGISTER_AVX512_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_AVX2_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_VSX_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_SVE_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
REGISTER_SVE256_DISPATCH(ormqr_stub, &ormqr_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lstsq_stub, DEFAULT, &lstsq_kernel)
|
||||
REGISTER_AVX512_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_AVX2_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_VSX_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_SVE_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
REGISTER_SVE256_DISPATCH(lstsq_stub, &lstsq_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(triangular_solve_stub, DEFAULT, &triangular_solve_kernel)
|
||||
REGISTER_AVX512_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_AVX2_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_VSX_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_SVE_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
REGISTER_SVE256_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lu_factor_stub, DEFAULT, &lu_factor_kernel)
|
||||
REGISTER_AVX512_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_AVX2_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_VSX_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_SVE_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
REGISTER_SVE256_DISPATCH(lu_factor_stub, &lu_factor_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ldl_factor_stub, DEFAULT, &ldl_factor_kernel)
|
||||
REGISTER_AVX512_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_AVX2_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_VSX_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_SVE_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
REGISTER_SVE256_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(ldl_solve_stub, DEFAULT, &ldl_solve_kernel)
|
||||
REGISTER_AVX512_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_AVX2_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_VSX_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_SVE_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
REGISTER_SVE256_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(lu_solve_stub, DEFAULT, &lu_solve_kernel)
|
||||
REGISTER_AVX512_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_AVX2_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_VSX_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_SVE_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
REGISTER_SVE256_DISPATCH(lu_solve_stub, &lu_solve_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(svd_stub, DEFAULT, &svd_kernel)
|
||||
REGISTER_AVX512_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_AVX2_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_VSX_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_SVE_DISPATCH(svd_stub, &svd_kernel)
|
||||
REGISTER_SVE256_DISPATCH(svd_stub, &svd_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(unpack_pivots_stub, DEFAULT, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_AVX512_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_AVX2_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
|
||||
} // namespace at::native
|
||||
|
@ -406,23 +406,11 @@ struct ConvParams {
|
||||
// cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
|
||||
// that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
|
||||
#if !defined(C10_MOBILE)
|
||||
if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
|
||||
if (!detail::getCUDAHooks().compiledWithCuDNN()) {
|
||||
return false;
|
||||
}
|
||||
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
|
||||
// broken on cuDNN 9.8
|
||||
if (cudnn_version >= 90800) {
|
||||
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
|
||||
(input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
|
||||
weight.dim() == 5) {
|
||||
for (int i = 2; i < weight.dim(); i++) {
|
||||
if (weight.size(i) != 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (needs_64bit_indexing_no_split(input, weight)) {
|
||||
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
|
||||
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
|
||||
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
|
||||
" if the V8 API is not enabled or before cuDNN version 9.3+."
|
||||
@ -430,6 +418,9 @@ struct ConvParams {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!input.is_cuda() || !cudnn_enabled) {
|
||||
return false;
|
||||
}
|
||||
if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) {
|
||||
if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) {
|
||||
return false;
|
||||
@ -448,19 +439,16 @@ struct ConvParams {
|
||||
|
||||
// Use cudnn for FP16 depthwise convolutions
|
||||
bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const {
|
||||
if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) {
|
||||
if (!detail::getCUDAHooks().compiledWithCuDNN()) {
|
||||
return false;
|
||||
}
|
||||
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) {
|
||||
// always use cudnn_depthwise for channels_last format
|
||||
return true;
|
||||
}
|
||||
// native kernel doesn't support 64-bit non-splittable case
|
||||
if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
|
||||
if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
|
||||
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
|
||||
// TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
|
||||
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
|
||||
if (cudnn_version < 0 || cudnn_version > 91000) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
|
||||
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
|
||||
" if the V8 API is not enabled or before cuDNN version 9.3+."
|
||||
@ -470,10 +458,6 @@ struct ConvParams {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
|
||||
// always use cudnn_depthwise for channels_last format
|
||||
return true;
|
||||
}
|
||||
if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) {
|
||||
bool kernel_cond = (use_cudnn(input, weight) &&
|
||||
input.scalar_type() == kHalf && // only for FP16
|
||||
|
@ -39,21 +39,19 @@ static CPUCapability compute_cpu_capability() {
|
||||
}
|
||||
#elif defined(HAVE_SVE_CPU_DEFINITION)
|
||||
int sve_vl = cpuinfo_get_max_arm_sve_length(); //Returns maximum SVE VL supported by your HW.
|
||||
if (envar == "sve") {
|
||||
// Select SVE capability based on the maximum SVE VL supported by the HW.
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
if (envar == "sve256") {
|
||||
if (sve_vl == 256) {
|
||||
#ifdef HAVE_ARM_BF16_CPU_DEFINITION
|
||||
if (cpuinfo_has_arm_bf16()) {
|
||||
return CPUCapability::SVE256;
|
||||
}
|
||||
} else if (sve_vl == 128) {
|
||||
if (cpuinfo_has_arm_bf16()) {
|
||||
return CPUCapability::SVE128;
|
||||
}
|
||||
} else {
|
||||
TORCH_WARN("SVE capability not available on hardware. Falling back to DEFAULT");
|
||||
return CPUCapability::DEFAULT;
|
||||
#endif
|
||||
}
|
||||
TORCH_WARN("SVE256 capability not available on hardware. Falling back to DEFAULT");
|
||||
return CPUCapability::DEFAULT;
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
#ifdef HAVE_AVX512_CPU_DEFINITION
|
||||
if (envar == "avx512") {
|
||||
@ -115,11 +113,6 @@ static CPUCapability compute_cpu_capability() {
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
if (sve_vl == 128) { // Check for SVE128
|
||||
return CPUCapability::SVE128;
|
||||
}
|
||||
#endif
|
||||
// Return the default CPU capability.
|
||||
return CPUCapability::DEFAULT;
|
||||
}
|
||||
@ -154,9 +147,6 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
) {
|
||||
constexpr auto supported_devices = c10::array_of<c10::DeviceType>(
|
||||
c10::DeviceType::CPU,
|
||||
@ -194,9 +184,6 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, SVE128
|
||||
#endif
|
||||
);
|
||||
if (!std::holds_alternative<ErrorType>(result)) {
|
||||
@ -255,9 +242,6 @@ void* DispatchStubImpl::get_call_ptr(
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
) {
|
||||
|
||||
auto result = try_get_call_ptr(
|
||||
@ -282,10 +266,6 @@ void* DispatchStubImpl::get_call_ptr(
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
,
|
||||
SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
,
|
||||
SVE128
|
||||
#endif
|
||||
);
|
||||
if (std::holds_alternative<ErrorType>(result)) {
|
||||
@ -320,9 +300,6 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
){
|
||||
|
||||
@ -365,16 +342,6 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
|
||||
return DispatchResult(SVE256);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
if (capability >= static_cast<int>(CPUCapability::SVE128)) {
|
||||
if (C10_UNLIKELY(!SVE128)) {
|
||||
// dispatch to DEFAULT, since the SVE kernel is missing
|
||||
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
|
||||
} else {
|
||||
return DispatchResult(SVE128);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
|
||||
}
|
||||
@ -396,9 +363,6 @@ void* DispatchStubImpl::choose_cpu_impl(
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
) {
|
||||
auto capability = static_cast<int>(get_cpu_capability());
|
||||
(void)capability;
|
||||
@ -444,17 +408,6 @@ void* DispatchStubImpl::choose_cpu_impl(
|
||||
return SVE256;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
if (capability >= static_cast<int>(CPUCapability::SVE128)) {
|
||||
if (C10_UNLIKELY(!SVE128)) {
|
||||
// dispatch to DEFAULT, since the SVE kernel is missing
|
||||
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
|
||||
return DEFAULT;
|
||||
} else {
|
||||
return SVE128;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
|
||||
return DEFAULT;
|
||||
|
@ -64,9 +64,8 @@ enum class CPUCapability {
|
||||
VSX = 1,
|
||||
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
|
||||
ZVECTOR = 1,
|
||||
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
|
||||
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
|
||||
SVE256 = 1,
|
||||
SVE128 = 2,
|
||||
#else
|
||||
AVX2 = 1,
|
||||
AVX512 = 2,
|
||||
@ -118,9 +117,6 @@ struct TORCH_API DispatchStubImpl {
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
);
|
||||
|
||||
@ -142,9 +138,6 @@ struct TORCH_API DispatchStubImpl {
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
);
|
||||
|
||||
@ -166,9 +159,6 @@ struct TORCH_API DispatchStubImpl {
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
);
|
||||
|
||||
@ -193,9 +183,6 @@ struct TORCH_API DispatchStubImpl {
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, void *SVE256
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, void *SVE128
|
||||
#endif
|
||||
);
|
||||
|
||||
@ -253,9 +240,6 @@ private:
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, reinterpret_cast<void*>(SVE256)
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, reinterpret_cast<void*>(SVE128)
|
||||
#endif
|
||||
)
|
||||
);
|
||||
@ -317,9 +301,6 @@ public:
|
||||
#endif
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
, reinterpret_cast<void*>(SVE256)
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
, reinterpret_cast<void*>(SVE128)
|
||||
#endif
|
||||
);
|
||||
if (std::holds_alternative<ErrorType>(result)){
|
||||
@ -344,9 +325,6 @@ public:
|
||||
#ifdef HAVE_SVE256_CPU_DEFINITION
|
||||
static TORCH_API FnPtr SVE256;
|
||||
#endif
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
static TORCH_API FnPtr SVE128;
|
||||
#endif
|
||||
private:
|
||||
DispatchStubImpl impl;
|
||||
};
|
||||
@ -454,12 +432,6 @@ struct RegisterPRIVATEUSE1Dispatch {
|
||||
#define REGISTER_SVE256_DISPATCH(name, fn)
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SVE128_CPU_DEFINITION
|
||||
#define REGISTER_SVE128_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, SVE128, fn)
|
||||
#else
|
||||
#define REGISTER_SVE128_DISPATCH(name, fn)
|
||||
#endif
|
||||
|
||||
// Macro to register the same kernel for all CPU arch types. This is useful
|
||||
// if a kernel does not benefit from being recompiled across different arch types.
|
||||
#define REGISTER_ALL_CPU_DISPATCH(name, fn) \
|
||||
@ -468,11 +440,6 @@ struct RegisterPRIVATEUSE1Dispatch {
|
||||
REGISTER_AVX2_DISPATCH(name, fn) \
|
||||
REGISTER_VSX_DISPATCH(name, fn) \
|
||||
REGISTER_ZVECTOR_DISPATCH(name, fn) \
|
||||
REGISTER_SVE256_DISPATCH(name, fn) \
|
||||
REGISTER_SVE128_DISPATCH(name, fn)
|
||||
|
||||
#define REGISTER_SVE_DISPATCH(name, fn) \
|
||||
REGISTER_SVE128_DISPATCH(name, fn) \
|
||||
REGISTER_SVE256_DISPATCH(name, fn)
|
||||
|
||||
#define REGISTER_NO_CPU_DISPATCH(name) \
|
||||
@ -515,7 +482,6 @@ struct RegisterPRIVATEUSE1Dispatch {
|
||||
// REGISTER_DISPATCH now dispatches an AVX512 kernel to nullptr but registers other dispatches.
|
||||
// ALSO_REGISTER_AVX512_DISPATCH should be used for ensuring AVX512 dispatch, among others.
|
||||
// ALSO_REGISTER_SVE256_DISPATCH should be used for ensuring SVE256 dispatch, among others.
|
||||
// ALSO_REGISTER_SVE128_DISPATCH should be used for ensuring SVE128 dispatch, among others.
|
||||
#ifdef CPU_CAPABILITY_AVX512
|
||||
#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, ((void*)(fn) ? nullptr : nullptr))
|
||||
#else
|
||||
@ -523,7 +489,6 @@ struct RegisterPRIVATEUSE1Dispatch {
|
||||
#endif
|
||||
#define ALSO_REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
|
||||
#define ALSO_REGISTER_SVE256_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
|
||||
#define ALSO_REGISTER_SVE128_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
|
||||
#endif
|
||||
} // namespace at::native
|
||||
|
||||
|
@ -466,7 +466,7 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cp
|
||||
REGISTER_AVX512_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
|
||||
// offsets dispatches
|
||||
REGISTER_ARCH_DISPATCH(
|
||||
@ -477,7 +477,7 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cp
|
||||
REGISTER_AVX512_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
|
||||
// Currently some computation is being duplicated across forward and backward.
|
||||
// TODO: Cache indices in forward pass to reuse in backward
|
||||
@ -548,7 +548,7 @@ REGISTER_VSX_DISPATCH(
|
||||
REGISTER_ZVECTOR_DISPATCH(
|
||||
_segment_reduce_lengths_backward_stub,
|
||||
&_segment_reduce_cpu_lengths_backward_kernel)
|
||||
REGISTER_SVE_DISPATCH(
|
||||
REGISTER_SVE256_DISPATCH(
|
||||
_segment_reduce_lengths_backward_stub,
|
||||
&_segment_reduce_cpu_lengths_backward_kernel)
|
||||
|
||||
@ -568,7 +568,7 @@ REGISTER_VSX_DISPATCH(
|
||||
REGISTER_ZVECTOR_DISPATCH(
|
||||
_segment_reduce_offsets_backward_stub,
|
||||
&_segment_reduce_cpu_offsets_backward_kernel)
|
||||
REGISTER_SVE_DISPATCH(
|
||||
REGISTER_SVE256_DISPATCH(
|
||||
_segment_reduce_offsets_backward_stub,
|
||||
&_segment_reduce_cpu_offsets_backward_kernel)
|
||||
|
||||
|
@ -212,7 +212,7 @@ std::pair<vec::Vectorized<float>, vec::Vectorized<float>> fmadd(
|
||||
const vec::Vectorized<c10::Half>& b,
|
||||
const vec::Vectorized<float>& acc_low,
|
||||
const vec::Vectorized<float>& acc_high) {
|
||||
#if defined(__aarch64__) && ((defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)) || (defined(CPU_CAPABILITY_SVE128)))
|
||||
#if defined(__ARM_FEATURE_FP16_FML) && !defined(CPU_CAPABILITY_SVE)
|
||||
return std::make_pair(vfmlalq_low_f16(acc_low, a, b), vfmlalq_high_f16(acc_high, a, b));
|
||||
#else
|
||||
const auto [a_float_low, a_float_high] = convert_half_float(a);
|
||||
@ -233,7 +233,7 @@ std::pair<vec::Vectorized<float>, vec::Vectorized<float>> fmadd(
|
||||
|
||||
// Return a + b_low * c_low + b_high * c_high
|
||||
vec::Vectorized<float> fmadd(vec::Vectorized<float> a, vec::Vectorized<Half> b, vec::Vectorized<Half> c) {
|
||||
#if defined(__aarch64__) && ((defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)) || (defined(CPU_CAPABILITY_SVE128)))
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)
|
||||
// NOTE: this instruction is an optional instruction in ARM v8.2 and
|
||||
// v8.3, but mandatory in v8.4 per
|
||||
// https://developer.arm.com/documentation/ddi0596/2021-03/SIMD-FP-Instructions/FMLAL--FMLAL2--vector---Floating-point-fused-Multiply-Add-Long-to-accumulator--vector--?lang=en
|
||||
|
@ -1124,17 +1124,6 @@ bool is_rowwise_scaling(const at::Tensor& t, const at::Tensor& scale) {
|
||||
&& scale.is_contiguous());
|
||||
}
|
||||
|
||||
bool check_size_stride(const at::Tensor& scale, int dim, int size, int stride) {
|
||||
// For Blockwise1x128 and Blockwise128x128,
|
||||
// when the scale tensor has a dimension of size 1, the stride is effectively
|
||||
// "meaningless", i.e. PyTorch decides to use a stride of 1. Thus, the regular
|
||||
// stride check fails. Here, we relax the stride check when the effective
|
||||
// stride is 1.
|
||||
|
||||
return (
|
||||
scale.size(dim) == size && (size <= 1 || scale.stride(dim) == stride));
|
||||
}
|
||||
|
||||
// 1x16 blocks for packed nvfp4 data and fp8_e4m3fn scales
|
||||
bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) {
|
||||
// Multiply t.size(1) by 2 to adjust for fp4x2 packing
|
||||
@ -1160,24 +1149,15 @@ bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) {
|
||||
}
|
||||
|
||||
bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) {
|
||||
return (
|
||||
isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat &&
|
||||
scale.dim() == 2 && check_size_stride(scale, 0, t.size(0), 1) &&
|
||||
check_size_stride(
|
||||
scale, 1, ceil_div<int64_t>(t.size(1), 128), t.size(0)));
|
||||
return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.dim() == 2
|
||||
&& scale.size(0) == t.size(0) && scale.size(1) == ceil_div<int64_t>(t.size(1), 128)
|
||||
&& scale.stride(0) == 1 && scale.stride(1) == t.size(0));
|
||||
}
|
||||
|
||||
bool is_blockwise_128x128_scaling(const at::Tensor& t, const at::Tensor& scale) {
|
||||
return (
|
||||
isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat &&
|
||||
scale.dim() == 2 &&
|
||||
check_size_stride(
|
||||
scale,
|
||||
0,
|
||||
ceil_div<int64_t>(t.size(0), 128),
|
||||
round_up<int64_t>(ceil_div<int64_t>(t.size(1), 128), 4)) &&
|
||||
check_size_stride(
|
||||
scale, 1, ceil_div<int64_t>(t.size(1), 128), 1));
|
||||
return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == kFloat && scale.dim() == 2
|
||||
&& scale.size(0) == ceil_div<int64_t>(t.size(0), 128) && scale.size(1) == ceil_div<int64_t>(t.size(1), 128)
|
||||
&& scale.stride(0) == round_up<int64_t>(ceil_div<int64_t>(t.size(1), 128), 4) && scale.stride(1) == 1);
|
||||
}
|
||||
|
||||
bool is_desired_scaling(const at::Tensor& t, const at::Tensor& scale, ScalingType desired_scaling) {
|
||||
@ -1831,37 +1811,6 @@ std::optional<c10::ScalarType> out_dtype) {
|
||||
return out;
|
||||
}
|
||||
|
||||
static void baddbmm_bmm_out_dtype_checks(const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, const at::ScalarType out_dtype, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
|
||||
// ref ATen/native/LinearAlgebra.cpp common_checks_baddbmm_bmm
|
||||
TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor");
|
||||
TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor");
|
||||
|
||||
const auto batch1_sizes = batch1.sizes();
|
||||
const auto batch2_sizes = batch2.sizes();
|
||||
|
||||
int64_t bs = batch1_sizes[0];
|
||||
int64_t contraction_size = batch1_sizes[2];
|
||||
int64_t res_rows = batch1_sizes[1];
|
||||
int64_t res_cols = batch2_sizes[2];
|
||||
std::vector<int64_t> output_size {bs, res_rows, res_cols};
|
||||
|
||||
TORCH_CHECK(batch2_sizes[0] == bs && batch2_sizes[1] == contraction_size,
|
||||
"Expected size for first two dimensions of batch2 tensor to be: [",
|
||||
bs, ", ", contraction_size, "] but got: [", batch2_sizes[0], ", ", batch2_sizes[1], "].");
|
||||
|
||||
TORCH_CHECK(batch1.scalar_type() == batch2.scalar_type(), "batch1 and batch2 must have the same dtype");
|
||||
|
||||
TORCH_CHECK(out_dtype == batch1.scalar_type() ||
|
||||
(out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
|
||||
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
|
||||
|
||||
if (!is_bmm && self_baddbmm.has_value()) {
|
||||
const auto& self = self_baddbmm.value();
|
||||
TORCH_CHECK(self.dim() == 3, "self must be a 3D tensor");
|
||||
TORCH_CHECK(self.sizes() == output_size, "self must have the same shape as the output");
|
||||
}
|
||||
}
|
||||
|
||||
Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
|
||||
IntArrayRef batch1_sizes = batch1.sizes();
|
||||
IntArrayRef batch2_sizes = batch2.sizes();
|
||||
@ -1871,7 +1820,12 @@ Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::Sca
|
||||
}
|
||||
|
||||
Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) {
|
||||
baddbmm_bmm_out_dtype_checks(batch1, batch2, 0.0, 1.0, out_dtype, true);
|
||||
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
|
||||
|
||||
TORCH_CHECK(out_dtype == batch1.scalar_type() ||
|
||||
(out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
|
||||
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
|
||||
|
||||
Scalar beta(0.0);
|
||||
Scalar alpha(1.0);
|
||||
{
|
||||
@ -1890,7 +1844,12 @@ Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tenso
|
||||
}
|
||||
|
||||
Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
|
||||
baddbmm_bmm_out_dtype_checks(batch1, batch2, beta, alpha, out_dtype, false, self);
|
||||
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
|
||||
|
||||
TORCH_CHECK(out_dtype == batch1.scalar_type() ||
|
||||
(out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
|
||||
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
|
||||
|
||||
{
|
||||
NoNamesGuard guard;
|
||||
baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
|
||||
@ -1905,12 +1864,6 @@ Tensor _mm_dtype_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarTy
|
||||
}
|
||||
|
||||
Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarType out_dtype, Tensor &out) {
|
||||
TORCH_CHECK(self.dim() == 2, "self must be a matrix, got ", self.dim(), "-D tensor");
|
||||
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
|
||||
TORCH_CHECK(
|
||||
self.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
|
||||
self.sizes()[0], "x", self.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
|
||||
|
||||
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
|
||||
TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "input dtypes must be the same");
|
||||
TORCH_CHECK(out_dtype == self.scalar_type() ||
|
||||
@ -1930,14 +1883,6 @@ Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& m
|
||||
}
|
||||
|
||||
Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
|
||||
TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "self and mat2 must have the same dtype, but got ", self.scalar_type(), " and ", mat2.scalar_type());
|
||||
TORCH_CHECK(mat1.scalar_type() == mat2.scalar_type(), "mat1 and mat2 must have the same dtype, but got ", mat1.scalar_type(), " and ", mat2.scalar_type());
|
||||
TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor");
|
||||
TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor");
|
||||
TORCH_CHECK(
|
||||
mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
|
||||
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
|
||||
|
||||
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
|
||||
TORCH_CHECK(out_dtype == self.scalar_type() ||
|
||||
(out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),
|
||||
|
@ -5,7 +5,6 @@
|
||||
#include <array>
|
||||
#include <type_traits>
|
||||
#include <ATen/core/TensorBase.h>
|
||||
#include <ATen/ceil_div.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
@ -84,17 +83,11 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
|
||||
auto ind_dim_size = index_size[0];
|
||||
auto inp_stride_bytes = index_stride[0];
|
||||
auto out_stride_bytes = iter.strides(0)[1];
|
||||
// avoid grid overflow in the fast kernel
|
||||
const int64_t vec_chunks = ceil_div(slice_size, alignment);
|
||||
const int64_t blocks_per_slice_upper = ceil_div(vec_chunks, (int64_t)launch_size_nd);
|
||||
const int max_grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
|
||||
// if it's an eligible grid we use the fast path, otherwise default to slower path
|
||||
if (blocks_per_slice_upper <= max_grid_y) {
|
||||
at::native::vectorized_gather_kernel_launch<alignment, int64_t>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind,
|
||||
slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (iter.numel() == 0) return;
|
||||
at::native::vectorized_gather_kernel_launch<alignment, int64_t>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind,
|
||||
slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
auto sizes = std::array<int64_t, MAX_DIMS>{};
|
||||
|
@ -1238,7 +1238,7 @@ Tensor _cholesky_solve_helper_cuda_magma(const Tensor& self, const Tensor& A, bo
|
||||
// Todo: cusolverDn<T>potrsBatched only supports nrhs == 1 and does not have good performance.
|
||||
// Batched cholesky_solve is dispatched to magma.
|
||||
Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upper) {
|
||||
#if defined(USE_LINALG_SOLVER)
|
||||
#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
|
||||
auto preferred_backend = at::globalContext().linalgPreferredBackend();
|
||||
switch (preferred_backend) {
|
||||
case at::LinalgBackend::Cusolver:
|
||||
@ -1352,7 +1352,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)
|
||||
}
|
||||
|
||||
static void cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) {
|
||||
#if defined(USE_LINALG_SOLVER)
|
||||
#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
|
||||
auto preferred_backend = at::globalContext().linalgPreferredBackend();
|
||||
switch (preferred_backend) {
|
||||
case at::LinalgBackend::Cusolver:
|
||||
@ -2709,7 +2709,7 @@ void linalg_lstsq_gels(const Tensor& A, const Tensor& B, const Tensor& /*infos*/
|
||||
}
|
||||
|
||||
void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) {
|
||||
#if defined(USE_LINALG_SOLVER)
|
||||
#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
|
||||
auto preferred_backend = at::globalContext().linalgPreferredBackend();
|
||||
switch (preferred_backend) {
|
||||
case at::LinalgBackend::Magma:
|
||||
@ -2733,7 +2733,7 @@ void lstsq_kernel(const Tensor& a, Tensor& b, Tensor& /*rank*/, Tensor& /*singul
|
||||
// first handle the underdetermined case (m < n)
|
||||
// this case is not supported by MAGMA or cuBLAS
|
||||
if (m < n) {
|
||||
#if defined(USE_LINALG_SOLVER)
|
||||
#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
|
||||
linalg_lstsq_gels(a, b, infos);
|
||||
#else
|
||||
TORCH_CHECK(
|
||||
|
@ -165,7 +165,7 @@ REGISTER_AVX2_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_co
|
||||
REGISTER_AVX512_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
REGISTER_ZVECTOR_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
REGISTER_VSX_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
REGISTER_SVE_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
REGISTER_SVE256_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
|
||||
// _out variants can be shared between PocketFFT and MKL
|
||||
Tensor& _fft_r2c_mkl_out(const Tensor& self, IntArrayRef dim, int64_t normalization,
|
||||
|
@ -14,8 +14,8 @@ template <typename T, int D, int V = D>
|
||||
device T* out [[buffer(3)]],
|
||||
const constant uint& gqa_factor [[buffer(4)]],
|
||||
const constant uint& N [[buffer(5)]],
|
||||
const constant uint3& qkv_head_strides [[buffer(6)]],
|
||||
const constant uint3& qkv_seq_strides [[buffer(7)]],
|
||||
const constant uint2& k_head_seq_stride [[buffer(6)]],
|
||||
const constant uint2& v_head_seq_stride [[buffer(7)]],
|
||||
const constant float& scale [[buffer(8)]],
|
||||
const device bool* mask [[buffer(9)]],
|
||||
const constant uint3& mask_strides [[buffer(10)]],
|
||||
@ -28,12 +28,10 @@ template <typename T, int D, int V = D>
|
||||
constexpr uint BD = 32;
|
||||
constexpr uint qk_per_thread = D / BD;
|
||||
constexpr uint v_per_thread = V / BD;
|
||||
const uint q_head_stride = qkv_head_strides.x;
|
||||
const uint q_seq_stride = qkv_seq_strides.x;
|
||||
const uint k_head_stride = qkv_head_strides.y;
|
||||
const uint k_seq_stride = qkv_seq_strides.y;
|
||||
const uint v_head_stride = qkv_head_strides.z;
|
||||
const uint v_seq_stride = qkv_seq_strides.z;
|
||||
const uint k_head_stride = k_head_seq_stride.x;
|
||||
const uint k_seq_stride = k_head_seq_stride.y;
|
||||
const uint v_head_stride = v_head_seq_stride.x;
|
||||
const uint v_seq_stride = v_head_seq_stride.y;
|
||||
const uint mask_head_stride = mask_strides.x;
|
||||
const uint mask_kv_seq_stride = mask_strides.y;
|
||||
const uint mask_q_seq_stride = mask_strides.z;
|
||||
@ -56,9 +54,9 @@ template <typename T, int D, int V = D>
|
||||
const int kv_head_idx = head_idx / gqa_factor;
|
||||
const int Q = tpg.y;
|
||||
const int group_offset = head_idx * Q + q_seq_idx;
|
||||
const int q_offset = group_offset;
|
||||
const int o_offset = group_offset;
|
||||
queries += head_idx * q_head_stride + q_seq_idx * q_seq_stride +
|
||||
simd_lid * qk_per_thread;
|
||||
queries += q_offset * D + simd_lid * qk_per_thread;
|
||||
keys += kv_head_idx * k_head_stride + simd_gid * k_seq_stride +
|
||||
simd_lid * qk_per_thread;
|
||||
values += kv_head_idx * v_head_stride + simd_gid * v_seq_stride +
|
||||
@ -158,8 +156,8 @@ template <typename T, int D, int V = D>
|
||||
device float* maxs [[buffer(5)]],
|
||||
const constant uint& gqa_factor [[buffer(6)]],
|
||||
const constant uint& N [[buffer(7)]],
|
||||
const constant uint3& qkv_head_strides [[buffer(8)]],
|
||||
const constant uint3& qkv_seq_strides [[buffer(9)]],
|
||||
const constant uint2& k_head_seq_stride [[buffer(8)]],
|
||||
const constant uint2& v_head_seq_stride [[buffer(9)]],
|
||||
const constant float& scale [[buffer(10)]],
|
||||
const device bool* mask [[buffer(11)]],
|
||||
const constant uint3& mask_strides [[buffer(12)]],
|
||||
@ -172,12 +170,10 @@ template <typename T, int D, int V = D>
|
||||
constexpr int BD = 32;
|
||||
constexpr int qk_per_thread = D / BD;
|
||||
constexpr int v_per_thread = V / BD;
|
||||
const int q_head_stride = qkv_head_strides.x;
|
||||
const int q_seq_stride = qkv_seq_strides.x;
|
||||
const int k_head_stride = qkv_head_strides.y;
|
||||
const int k_seq_stride = qkv_seq_strides.y;
|
||||
const int v_head_stride = qkv_head_strides.z;
|
||||
const int v_seq_stride = qkv_seq_strides.z;
|
||||
const int k_head_stride = k_head_seq_stride.x;
|
||||
const int k_seq_stride = k_head_seq_stride.y;
|
||||
const int v_head_stride = v_head_seq_stride.x;
|
||||
const int v_seq_stride = v_head_seq_stride.y;
|
||||
const int mask_kv_seq_stride = mask_strides.x;
|
||||
const int mask_q_seq_stride = mask_strides.y;
|
||||
const int mask_head_stride = mask_strides.z;
|
||||
@ -200,10 +196,10 @@ template <typename T, int D, int V = D>
|
||||
const int head_idx = tid.x;
|
||||
const int q_seq_idx = tid.y;
|
||||
const int o_offset = head_idx * tpg.y + q_seq_idx;
|
||||
const int q_offset = o_offset;
|
||||
const int kv_head_idx = head_idx / gqa_factor;
|
||||
|
||||
queries += head_idx * q_head_stride + q_seq_idx * q_seq_stride +
|
||||
simd_lid * qk_per_thread;
|
||||
queries += q_offset * D + simd_lid * qk_per_thread;
|
||||
keys += kv_head_idx * k_head_stride +
|
||||
(block_idx * BN + simd_gid) * k_seq_stride + simd_lid * qk_per_thread;
|
||||
values += kv_head_idx * v_head_stride +
|
||||
@ -524,25 +520,25 @@ kernel void attention(
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANTIATE_SDPA_VECTOR(DTYPE, QK_DIM, VALUE_DIM) \
|
||||
template [[host_name("sdpa_vector_" #DTYPE "_" #QK_DIM \
|
||||
"_" #VALUE_DIM)]] kernel void \
|
||||
sdpa_vector<DTYPE, QK_DIM, VALUE_DIM>( \
|
||||
const device DTYPE* queries [[buffer(0)]], \
|
||||
const device DTYPE* keys [[buffer(1)]], \
|
||||
const device DTYPE* values [[buffer(2)]], \
|
||||
device DTYPE* out [[buffer(3)]], \
|
||||
const constant uint& gqa_factor [[buffer(4)]], \
|
||||
const constant uint& N [[buffer(5)]], \
|
||||
const constant uint3& qkv_head_strides [[buffer(6)]], \
|
||||
const constant uint3& qkv_seq_strides [[buffer(7)]], \
|
||||
const constant float& scale [[buffer(8)]], \
|
||||
const device bool* mask [[buffer(9)]], \
|
||||
const constant uint3& mask_strides [[buffer(10)]], \
|
||||
const constant bool& has_mask [[buffer(11)]], \
|
||||
uint3 tid [[threadgroup_position_in_grid]], \
|
||||
uint3 tpg [[threadgroups_per_grid]], \
|
||||
uint simd_gid [[simdgroup_index_in_threadgroup]], \
|
||||
#define INSTANTIATE_SDPA_VECTOR(DTYPE, QK_DIM, VALUE_DIM) \
|
||||
template [[host_name("sdpa_vector_" #DTYPE "_" #QK_DIM \
|
||||
"_" #VALUE_DIM)]] kernel void \
|
||||
sdpa_vector<DTYPE, QK_DIM, VALUE_DIM>( \
|
||||
const device DTYPE* queries [[buffer(0)]], \
|
||||
const device DTYPE* keys [[buffer(1)]], \
|
||||
const device DTYPE* values [[buffer(2)]], \
|
||||
device DTYPE* out [[buffer(3)]], \
|
||||
const constant uint& gqa_factor [[buffer(4)]], \
|
||||
const constant uint& N [[buffer(5)]], \
|
||||
const constant uint2& k_head_seq_stride [[buffer(6)]], \
|
||||
const constant uint2& v_head_seq_stride [[buffer(7)]], \
|
||||
const constant float& scale [[buffer(8)]], \
|
||||
const device bool* mask [[buffer(9)]], \
|
||||
const constant uint3& mask_strides [[buffer(10)]], \
|
||||
const constant bool& has_mask [[buffer(11)]], \
|
||||
uint3 tid [[threadgroup_position_in_grid]], \
|
||||
uint3 tpg [[threadgroups_per_grid]], \
|
||||
uint simd_gid [[simdgroup_index_in_threadgroup]], \
|
||||
uint simd_lid [[thread_index_in_simdgroup]]);
|
||||
|
||||
#define INSTANTIATE_SDPA_VECTOR_2PASS_1(DTYPE, QK_DIM, VALUE_DIM) \
|
||||
@ -557,8 +553,8 @@ kernel void attention(
|
||||
device float* maxs [[buffer(5)]], \
|
||||
const constant uint& gqa_factor [[buffer(6)]], \
|
||||
const constant uint& N [[buffer(7)]], \
|
||||
const constant uint3& qkv_head_strides [[buffer(8)]], \
|
||||
const constant uint3& qkv_seq_strides [[buffer(9)]], \
|
||||
const constant uint2& k_head_seq_stride [[buffer(8)]], \
|
||||
const constant uint2& v_head_seq_stride [[buffer(9)]], \
|
||||
const constant float& scale [[buffer(10)]], \
|
||||
const device bool* mask [[buffer(11)]], \
|
||||
const constant uint3& mask_strides [[buffer(12)]], \
|
||||
|
@ -25,7 +25,7 @@ struct ReductionOp {
|
||||
inline opmath_t<T> operator()(
|
||||
opmath_t<T> weight_val,
|
||||
opmath_t<T> out_val,
|
||||
bool /*is_first*/) {
|
||||
bool is_first) {
|
||||
return weight_val + out_val;
|
||||
}
|
||||
};
|
||||
@ -44,9 +44,9 @@ template <EmbeddingBagMode M, typename T>
|
||||
struct MaybeApplyPerSampleWeight {
|
||||
inline opmath_t<T> operator()(
|
||||
opmath_t<T> weight_val,
|
||||
uint32_t /*per_sample_weights_index*/,
|
||||
constant T* /*per_sample_weights*/,
|
||||
uint32_t /*per_sample_weights_stride*/) {
|
||||
uint32_t per_sample_weights_index,
|
||||
constant T* per_sample_weights,
|
||||
uint32_t per_sample_weights_stride) {
|
||||
return weight_val;
|
||||
}
|
||||
};
|
||||
@ -71,12 +71,12 @@ struct MaybeApplyPerSampleWeight<EmbeddingBagMode::SUM, T> {
|
||||
template <EmbeddingBagMode M, typename T, typename I>
|
||||
struct MaybeCalcMaxIndex {
|
||||
inline void operator()(
|
||||
opmath_t<T> /*weight_val*/,
|
||||
opmath_t<T> /*out_val*/,
|
||||
bool /*is_first*/,
|
||||
thread I& /*max_idx*/,
|
||||
I /*weight_idx*/,
|
||||
bool /*pad*/) {}
|
||||
opmath_t<T> weight_val,
|
||||
opmath_t<T> out_val,
|
||||
bool is_first,
|
||||
thread I& max_idx,
|
||||
I weight_idx,
|
||||
bool pad) {}
|
||||
};
|
||||
|
||||
template <typename T, typename I>
|
||||
|
@ -223,6 +223,9 @@ void grid_sampler_single_element(
|
||||
auto input_size = input_sizes[input_dim];
|
||||
auto coord = static_cast<opmath_t<T>>(coords[coord_dim]);
|
||||
|
||||
// Interpret nan as -1
|
||||
coord = isnan(coord) ? -1 : coord;
|
||||
|
||||
if (!align_corners) {
|
||||
// Map unaligned grid space to aligned grid space
|
||||
auto corner_alignment_factor = static_cast<opmath_t<T>>(input_size) /
|
||||
|
@ -182,8 +182,6 @@ static std::tuple<Tensor, Tensor> sdpa_vector_fast_mps(const Tensor& q_,
|
||||
uint maxSeqLength = k_.size(2);
|
||||
uint N = k_.size(2);
|
||||
uint B = q_.size(0) * q_.size(1);
|
||||
uint q_head_stride = q_.stride(1);
|
||||
uint q_seq_stride = q_.stride(2);
|
||||
uint k_head_stride = k_.stride(1);
|
||||
uint k_seq_stride = k_.stride(2);
|
||||
uint v_head_stride = v_.stride(1);
|
||||
@ -211,8 +209,8 @@ static std::tuple<Tensor, Tensor> sdpa_vector_fast_mps(const Tensor& q_,
|
||||
out,
|
||||
1,
|
||||
N,
|
||||
std::array<uint32_t, 3>{q_head_stride, k_head_stride, v_head_stride},
|
||||
std::array<uint32_t, 3>{q_seq_stride, k_seq_stride, v_seq_stride},
|
||||
std::array<uint32_t, 2>{k_head_stride, k_seq_stride},
|
||||
std::array<uint32_t, 2>{v_head_stride, v_seq_stride},
|
||||
scale_factor);
|
||||
|
||||
if (has_mask) {
|
||||
@ -259,8 +257,6 @@ static std::tuple<Tensor, Tensor> sdpa_vector_2pass_mps(const Tensor& q_,
|
||||
uint B = batchSize * num_heads;
|
||||
uint gqa_factor = q_.size(1) / k_.size(1);
|
||||
|
||||
uint q_head_stride = q_.stride(1);
|
||||
uint q_seq_stride = q_.stride(2);
|
||||
uint k_head_stride = k_.stride(1);
|
||||
uint k_seq_stride = k_.stride(2);
|
||||
uint v_head_stride = v_.stride(1);
|
||||
@ -298,8 +294,8 @@ static std::tuple<Tensor, Tensor> sdpa_vector_2pass_mps(const Tensor& q_,
|
||||
maxs,
|
||||
gqa_factor,
|
||||
N,
|
||||
std::array<uint32_t, 3>{q_head_stride, k_head_stride, v_head_stride},
|
||||
std::array<uint32_t, 3>{q_seq_stride, k_seq_stride, v_seq_stride},
|
||||
std::array<uint32_t, 2>{k_head_stride, k_seq_stride},
|
||||
std::array<uint32_t, 2>{v_head_stride, v_seq_stride},
|
||||
scale_factor);
|
||||
|
||||
if (has_mask) {
|
||||
|
@ -9,22 +9,11 @@
|
||||
#else
|
||||
#include <ATen/ops/_unique2.h>
|
||||
#include <ATen/ops/_unique2_native.h>
|
||||
#include <ATen/ops/arange.h>
|
||||
#include <ATen/ops/argsort.h>
|
||||
#include <ATen/ops/cat.h>
|
||||
#include <ATen/ops/cumsum.h>
|
||||
#include <ATen/ops/full.h>
|
||||
#include <ATen/ops/masked_select.h>
|
||||
#include <ATen/ops/nonzero.h>
|
||||
#include <ATen/ops/ones.h>
|
||||
#include <ATen/ops/ones_like.h>
|
||||
#include <ATen/ops/slice.h>
|
||||
#include <ATen/ops/unique_consecutive.h>
|
||||
#include <ATen/ops/unique_consecutive_native.h>
|
||||
#include <ATen/ops/unique_dim_consecutive.h>
|
||||
#include <ATen/ops/unique_dim_consecutive_native.h>
|
||||
#include <ATen/ops/unique_dim_native.h>
|
||||
#include <ATen/ops/zeros.h>
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
@ -316,85 +305,4 @@ std::tuple<Tensor, Tensor, Tensor> _unique2_mps(const Tensor& self,
|
||||
return _unique_impl_mps(self, return_inverse, return_counts, false, std::nullopt);
|
||||
}
|
||||
|
||||
static Tensor lexsort_rows_perm_mps(const Tensor& mat_2d) {
|
||||
const auto rows = mat_2d.size(0), cols = mat_2d.size(1);
|
||||
if (rows <= 1 || cols == 0) {
|
||||
return arange(rows, mat_2d.options().dtype(kLong));
|
||||
}
|
||||
|
||||
auto perm = arange(rows, mat_2d.options().dtype(kLong));
|
||||
for (auto c = cols - 1; c >= 0; --c) {
|
||||
auto keys = mat_2d.select(1, c).index_select(0, perm);
|
||||
const auto idx = argsort(keys, /*dim=*/0, /*descending=*/false);
|
||||
perm = perm.index_select(0, idx);
|
||||
}
|
||||
return perm;
|
||||
}
|
||||
|
||||
static std::tuple<Tensor, Tensor, Tensor> unique_dim_sorted_mps_impl(const Tensor& self,
|
||||
int64_t dim,
|
||||
bool return_inverse,
|
||||
bool return_counts) {
|
||||
dim = maybe_wrap_dim(dim, self.dim());
|
||||
|
||||
auto sizes = self.sizes().vec();
|
||||
auto num_zero_dims = std::count(sizes.begin(), sizes.end(), (int64_t)0);
|
||||
if (self.size(dim) == 0) {
|
||||
auto output = at::empty(sizes, self.options());
|
||||
auto inverse_indices = at::empty({0}, self.options().dtype(kLong));
|
||||
auto counts = at::empty({0}, self.options().dtype(kLong));
|
||||
return {output, inverse_indices, counts};
|
||||
}
|
||||
|
||||
auto transposed = self.moveaxis(dim, 0);
|
||||
auto orig_sizes = transposed.sizes().vec();
|
||||
auto rows = transposed.size(0);
|
||||
auto input_flat = transposed.contiguous().view({rows, -1});
|
||||
|
||||
auto perm = lexsort_rows_perm_mps(input_flat);
|
||||
auto input_sorted = input_flat.index_select(0, perm);
|
||||
|
||||
Tensor is_unique = at::zeros({rows}, self.options().dtype(kBool));
|
||||
if (rows > 0) {
|
||||
is_unique.narrow(0, 0, 1).fill_(true);
|
||||
}
|
||||
if (rows > 1) {
|
||||
auto a = input_sorted.narrow(0, 1, rows - 1);
|
||||
auto b = input_sorted.narrow(0, 0, rows - 1);
|
||||
auto row_changed = a.ne(b).any(1);
|
||||
is_unique.narrow(0, 1, rows - 1).copy_(row_changed);
|
||||
}
|
||||
|
||||
auto unique_pos = nonzero(is_unique).squeeze(1);
|
||||
auto group_id = cumsum(is_unique.to(kLong), 0).sub(1);
|
||||
|
||||
auto unique_rows_2d = input_sorted.index_select(0, unique_pos);
|
||||
|
||||
Tensor inverse_indices = empty({0}, self.options().dtype(kLong));
|
||||
if (return_inverse) {
|
||||
inverse_indices = empty({rows}, self.options().dtype(kLong));
|
||||
inverse_indices.index_copy_(0, perm, group_id);
|
||||
}
|
||||
|
||||
Tensor counts = empty({0}, self.options().dtype(kLong));
|
||||
if (return_counts) {
|
||||
const auto num_unique = unique_pos.size(0);
|
||||
counts = zeros({num_unique}, self.options().dtype(kLong));
|
||||
counts.scatter_add_(0, group_id, ones_like(group_id, group_id.options().dtype(kLong)));
|
||||
}
|
||||
|
||||
orig_sizes[0] = unique_rows_2d.size(0);
|
||||
auto output = unique_rows_2d.view(orig_sizes).moveaxis(0, dim);
|
||||
|
||||
return std::make_tuple(std::move(output), std::move(inverse_indices), std::move(counts));
|
||||
}
|
||||
|
||||
std::tuple<Tensor, Tensor, Tensor> unique_dim_mps(const Tensor& self,
|
||||
int64_t dim,
|
||||
const bool /*sorted*/,
|
||||
const bool return_inverse,
|
||||
const bool return_counts) {
|
||||
return unique_dim_sorted_mps_impl(self, dim, return_inverse, return_counts);
|
||||
}
|
||||
|
||||
} // namespace at::native
|
||||
|
@ -1409,7 +1409,7 @@
|
||||
- func: _sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
|
||||
variants: function
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: sparse_broadcast_to
|
||||
SparseCPU, SparseCUDA: sparse_broadcast_to
|
||||
|
||||
- func: cat(Tensor[] tensors, int dim=0) -> Tensor
|
||||
structured_delegate: cat.out
|
||||
@ -6450,7 +6450,6 @@
|
||||
dispatch:
|
||||
CPU: unique_dim_cpu
|
||||
CUDA: unique_dim_cuda
|
||||
MPS: unique_dim_mps
|
||||
tags: dynamic_output_shape
|
||||
autogen: unique_dim.out
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user