Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-24 23:54:56 +08:00
Compare commits
173 Commits
| SHA1 | Author | Date | |
|---|---|---|---|
| 8ac81dba21 | |||
| dae9a71d99 | |||
| bf4b0e8c41 | |||
| 0384f48daa | |||
| 3b92a1adfe | |||
| 6ca9dc026d | |||
| a499828924 | |||
| e84eabd4f9 | |||
| 5e53e458b9 | |||
| ccb450b190 | |||
| ae97eb86f7 | |||
| 7a9c4d794c | |||
| 501e19137a | |||
| 4a757e1e17 | |||
| 563921619b | |||
| 84d8ec73f1 | |||
| a956066b4e | |||
| ff6870d134 | |||
| 92f9ed7ac3 | |||
| 8e217a9f6d | |||
| 429052f151 | |||
| a3f01f6418 | |||
| 62843c14bb | |||
| 082d3dd9d5 | |||
| 468c1f9e9d | |||
| 9614c2eb14 | |||
| 4c6a6c2db9 | |||
| 3ad3bfe11d | |||
| 1c6dfbe557 | |||
| 934f878883 | |||
| cef05b1202 | |||
| b500c166ef | |||
| d65ffdef3d | |||
| ac72f81c12 | |||
| 9cac1b9259 | |||
| 9bc648235d | |||
| 799471d92b | |||
| 43d9b5ecaa | |||
| 463fbc8ca0 | |||
| 2f53395943 | |||
| fccddf02b6 | |||
| 8be8b94793 | |||
| fe8cc619b8 | |||
| 2f5a24c2a2 | |||
| 24492cbab2 | |||
| 3f6d88f04c | |||
| 94db2ad51d | |||
| 9f783e172d | |||
| a8432bcaad | |||
| a3a40cb741 | |||
| c924c675d0 | |||
| c3f30eca9e | |||
| 1e710552c1 | |||
| 7c39b2ecbe | |||
| afdd4247a2 | |||
| 22df9332da | |||
| 6b9b7ce6fe | |||
| 1274297e06 | |||
| f68f76d8c7 | |||
| fa1d409e83 | |||
| 52d4660ae9 | |||
| 7345454e2e | |||
| 23170dfebc | |||
| 12e993f533 | |||
| 07d2531672 | |||
| 6944d4b639 | |||
| f654cff566 | |||
| f17c5e0789 | |||
| 435c18fb4a | |||
| 612cdc8f48 | |||
| da5069f289 | |||
| 4fd2a2b273 | |||
| bb1d53bc47 | |||
| 36338fc7f2 | |||
| e0c910149c | |||
| f4aeceaa9d | |||
| d8e6b2fddc | |||
| 31c25c7d01 | |||
| 5dbee5691c | |||
| 864ffe12d7 | |||
| 4e35594674 | |||
| 35d7b32159 | |||
| 0663bdb123 | |||
| 40ea6e418a | |||
| 348303ebd2 | |||
| 94755e81c4 | |||
| 6d65737aee | |||
| 053251b98d | |||
| 7e2e83cdbe | |||
| d033d11d26 | |||
| 80d4da893c | |||
| bf7f481144 | |||
| ab0694f1c6 | |||
| 5f630d28d7 | |||
| a67e798cb7 | |||
| 30191fcf03 | |||
| 623e623c82 | |||
| f08487aa86 | |||
| 1051c7dbc2 | |||
| 2dc2613180 | |||
| 582d278983 | |||
| b5e6e58050 | |||
| fefc406a3d | |||
| 3d32bb114b | |||
| de05dbc39c | |||
| fc1b09a52a | |||
| c2388201fc | |||
| a6f9e0e62a | |||
| 337fe1079d | |||
| b494547f0b | |||
| d9832d8425 | |||
| f0ae3a57f6 | |||
| 26b3ae5890 | |||
| be8095b07f | |||
| b2d8f6a6af | |||
| 98e22c8a69 | |||
| e1f0a69943 | |||
| 833997a6fd | |||
| b9a7d0e13b | |||
| 1c16c18a53 | |||
| 96ef26f71a | |||
| 5ac112b569 | |||
| dda071587f | |||
| 11acfed3ce | |||
| 5f40a8a9a3 | |||
| e64965300a | |||
| 00985970e3 | |||
| 484c4093a8 | |||
| 760c478a14 | |||
| dc4f97e9c1 | |||
| c66e58b7d0 | |||
| 878f59ef75 | |||
| e60ad4f628 | |||
| 2281d009e5 | |||
| 33589374b6 | |||
| 5539916fe1 | |||
| e4174b1fd7 | |||
| 0e7ccc09db | |||
| 87cc126457 | |||
| a3e26d1727 | |||
| d2393c2d7d | |||
| b498299953 | |||
| 4d66a3b894 | |||
| e2545487de | |||
| 8922bbcaab | |||
| 14744e1ab2 | |||
| b477fb106f | |||
| d22d916719 | |||
| 86d34a43f5 | |||
| 8508651477 | |||
| 723c27ed78 | |||
| bdbe931d58 | |||
| af60398c3a | |||
| 82f1eb9b03 | |||
| 4b2d297eec | |||
| 0ec723acd0 | |||
| e1be887870 | |||
| d91eecc9a5 | |||
| 24a4dae85b | |||
| d3c4cf838e | |||
| b1e99c8c7a | |||
| 5eb35d2ab8 | |||
| f03d635dc6 | |||
| 1f0b01d4b6 | |||
| c0142f5c06 | |||
| 3ea6868049 | |||
| be3b8d2ec9 | |||
| 5ccf3ca3ec | |||
| e38e953432 | |||
| 4dd73e659a | |||
| 0d9c95cd7e | |||
| dcc42e95f4 | |||
| 002e59440a |
@@ -5,9 +5,9 @@ GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

# Set CUDA architecture lists to match x86 build_cuda.sh
if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
export TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;8.0;9.0"
export TORCH_CUDA_ARCH_LIST="8.0;9.0"
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;10.0;12.0"
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
fi
@@ -42,9 +42,6 @@ else
echo "Bundling CUDA libraries with wheel for aarch64."
else
echo "Using nvidia libs from pypi for aarch64."
# Fix platform constraints in PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64
# Replace 'platform_machine == "x86_64"' with 'platform_machine == "aarch64"'
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS//platform_machine == \'x86_64\'/platform_machine == \'aarch64\'}"
echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
export USE_NVIDIA_PYPI_LIBS=1
fi
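For reference, the export in the hunk above leans on bash's global parameter-expansion substitution, `${VAR//pattern/replacement}`. A minimal standalone sketch with placeholder package pins (not the real requirement strings):

```bash
#!/usr/bin/env bash
# Standalone sketch of ${VAR//pattern/replacement}: a *global* substitution
# over the variable's value. Package names here are placeholders.
extra_reqs="pkg-a==1.0; platform_machine == 'x86_64' | pkg-b==2.0; platform_machine == 'x86_64'"

# Rewrite every x86_64 marker to aarch64 (the CI script additionally
# backslash-escapes the single quotes because its expansion sits inside
# double quotes).
echo "${extra_reqs//x86_64/aarch64}"
# pkg-a==1.0; platform_machine == 'aarch64' | pkg-b==2.0; platform_machine == 'aarch64'
```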
@@ -138,6 +138,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
folder = os.path.dirname(wheel_path)
os.mkdir(f"{folder}/tmp")
os.system(f"unzip {wheel_path} -d {folder}/tmp")
# Delete original wheel since it will be repackaged
os.system(f"rm {wheel_path}")

# Check if we should use PyPI NVIDIA libraries or bundle system libraries
use_nvidia_pypi_libs = os.getenv("USE_NVIDIA_PYPI_LIBS", "0") == "1"
@@ -211,7 +213,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
]

# CUDA version-specific libraries
if "130" in desired_cuda:
if "13" in desired_cuda:
minor_version = desired_cuda[-1]
version_specific_libs = [
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
"/usr/local/cuda/lib64/libcublas.so.13",
@@ -221,7 +224,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libcusolver.so.12",
"/usr/local/cuda/lib64/libnvJitLink.so.13",
"/usr/local/cuda/lib64/libnvrtc.so.13",
"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
]
elif "12" in desired_cuda:
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
@@ -237,6 +240,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libnvrtc.so.12",
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
]
else:
raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")

# Combine all libraries
libs_to_copy = common_libs + version_specific_libs
@@ -275,14 +280,7 @@ def complete_wheel(folder: str) -> str:
f"/{folder}/dist/{repaired_wheel_name}",
)
else:
repaired_wheel_name = wheel_name.replace(
"linux_aarch64", "manylinux_2_28_aarch64"
)
print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}")
os.rename(
f"/{folder}/dist/{wheel_name}",
f"/{folder}/dist/{repaired_wheel_name}",
)
repaired_wheel_name = list_dir(f"/{folder}/dist")[0]

print(f"Copying {repaired_wheel_name} to artifacts")
shutil.copy2(
@@ -56,9 +56,13 @@ ENV INSTALLED_VISION ${VISION}

# Install rocm
ARG ROCM_VERSION
RUN mkdir ci_commit_pins
COPY ./common/common_utils.sh common_utils.sh
COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
COPY ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh
RUN rm install_rocm.sh
RUN rm install_rocm.sh common_utils.sh
RUN rm -r ci_commit_pins
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
RUN rm install_rocm_magma.sh
.ci/docker/ci_commit_pins/rocm-composable-kernel.txt (new file, 1 line)
@@ -0,0 +1 @@
7fe50dc3da2069d6645d9deb8c017a876472a977
@@ -1 +1 @@
fccfc522864cf8bc172abe0cd58ae5581e2d44b9
70cbcaca84471df49e81ddc56873c9241b671f8d
@@ -2,6 +2,11 @@

set -ex

# for pip_install function
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

ROCM_COMPOSABLE_KERNEL_VERSION="$(cat $(dirname $0)/../ci_commit_pins/rocm-composable-kernel.txt)"

ver() {
printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
}
@@ -113,6 +118,8 @@ EOF
rm -rf HIP clr
fi

pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"

# Cleanup
apt-get autoclean && apt-get clean
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
@@ -176,6 +183,8 @@ install_centos() {
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done

pip_install "git+https://github.com/rocm/composable_kernel@$ROCM_COMPOSABLE_KERNEL_VERSION"

# Cleanup
yum clean all
rm -rf /var/cache/yum
@@ -52,9 +52,13 @@ ENV INSTALLED_VISION ${VISION}

# Install rocm
ARG ROCM_VERSION
RUN mkdir ci_commit_pins
COPY ./common/common_utils.sh common_utils.sh
COPY ./ci_commit_pins/rocm-composable-kernel.txt ci_commit_pins/rocm-composable-kernel.txt
COPY ./common/install_rocm.sh install_rocm.sh
RUN bash ./install_rocm.sh
RUN rm install_rocm.sh
RUN rm install_rocm.sh common_utils.sh
RUN rm -r ci_commit_pins
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
RUN rm install_rocm_magma.sh
@@ -258,11 +258,19 @@ function install_torchrec_and_fbgemm() {
git clone --recursive https://github.com/pytorch/fbgemm
pushd fbgemm/fbgemm_gpu
git checkout "${fbgemm_commit}" --recurse-submodules
python setup.py bdist_wheel \
--build-variant=rocm \
-DHIP_ROOT_DIR="${ROCM_PATH}" \
-DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
-DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
# until the fbgemm_commit includes the tbb patch
patch <<'EOF'
--- a/FbgemmGpu.cmake
+++ b/FbgemmGpu.cmake
@@ -184,5 +184,6 @@ gpu_cpp_library(
fbgemm_gpu_tbe_cache
fbgemm_gpu_tbe_optimizers
fbgemm_gpu_tbe_utils
+ tbb
DESTINATION
fbgemm_gpu)
EOF
python setup.py bdist_wheel --build-variant=rocm
popd

# Save the wheel before cleaning up
@@ -35,10 +35,11 @@ fi

print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
# Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# NB: we always build with distributed; USE_DISTRIBUTED turns off all
# backends (specifically the gloo backend), so test that this case works too
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi
if which sccache > /dev/null; then
@@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd

python -mpip install -r requirements.txt

# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1

python -mpip install --no-input -r requirements.txt

setup_test_python() {
# The CircleCI worker hostname doesn't resolve to an address.
# This environment variable makes ProcessGroupGloo default to
@@ -386,8 +386,8 @@ def smoke_test_compile(device: str = "cpu") -> None:


def smoke_test_nvshmem() -> None:
if not torch.cuda.is_available():
print("CUDA is not available, skipping NVSHMEM test")
if not torch.cuda.is_available() or target_os == "windows":
print("Windows platform or CUDA is not available, skipping NVSHMEM test")
return

# Check if NVSHMEM is compiled in current build
@@ -396,7 +396,9 @@ def smoke_test_nvshmem() -> None:
except ImportError:
# Not built with NVSHMEM support.
# torch is not compiled with NVSHMEM prior to 2.9
if torch.__version__ < "2.9":
from torch.torch_version import TorchVersion

if TorchVersion(torch.__version__) < (2, 9):
return
else:
# After 2.9: NVSHMEM is expected to be compiled in current build
@@ -1,9 +1,9 @@
set WIN_DRIVER_VN=528.89
set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe" & REM @lint-ignore
curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe
set WIN_DRIVER_VN=580.88
set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/%WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe" & REM @lint-ignore
curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe
if errorlevel 1 exit /b 1

start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe -s -noreboot
start /wait %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe -s -noreboot
if errorlevel 1 exit /b 1

del %WIN_DRIVER_VN%-data-center-tesla-desktop-winserver-2016-2019-2022-dch-international.exe || ver > NUL
del %WIN_DRIVER_VN%-data-center-tesla-desktop-win10-win11-64bit-dch-international.exe || ver > NUL
@@ -85,7 +85,7 @@ mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
# Create an isolated directory to store this builds pytorch checkout and conda
# installation
if [[ -z "$MAC_PACKAGE_WORK_DIR" ]]; then
MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_conda_${DESIRED_PYTHON}_$(date +%H%M%S)"
MAC_PACKAGE_WORK_DIR="$(pwd)/tmp_wheel_${DESIRED_PYTHON}_$(date +%H%M%S)"
fi
mkdir -p "$MAC_PACKAGE_WORK_DIR" || true
if [[ -n ${GITHUB_ACTIONS} ]]; then
@@ -96,11 +96,11 @@ fi
whl_tmp_dir="${MAC_PACKAGE_WORK_DIR}/dist"
mkdir -p "$whl_tmp_dir"

mac_version='macosx_11_0_arm64'
mac_version='macosx-11_0-arm64'
libtorch_arch='arm64'

# Create a consistent wheel package name to rename the wheel to
wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version}.whl"
wheel_filename_new="${TORCH_PACKAGE_NAME}-${build_version}${build_number_prefix}-cp${python_nodot}-none-${mac_version//[-,]/_}.whl"

###########################################################
@@ -125,7 +125,6 @@ popd
export TH_BINARY_BUILD=1
export INSTALL_TEST=0 # dont install test binaries into site-packages
export MACOSX_DEPLOYMENT_TARGET=11.0
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
@@ -133,25 +132,19 @@ RENAME_WHEEL=true
case $desired_python in
3.14t)
echo "Using 3.14 deps"
mac_version='macosx-11.0-arm64'
NUMPY_PINNED_VERSION="==2.1.0"
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.14)
echo "Using 3.14t deps"
mac_version='macosx-11.0-arm64'
NUMPY_PINNED_VERSION="==2.1.0"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.13t)
echo "Using 3.13 deps"
NUMPY_PINNED_VERSION="==2.1.0"
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
RENAME_WHEEL=false
;;
3.13)
@@ -176,21 +169,15 @@ case $desired_python in
;;
esac

# Install into a fresh env
tmp_env_name="wheel_py$python_nodot"
conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS}
source activate "$tmp_env_name"

PINNED_PACKAGES=(
"numpy${NUMPY_PINNED_VERSION}"
)
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt"
pip install requests ninja typing-extensions
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
python -mvenv ~/${desired_python}-build
source ~/${desired_python}-build/bin/activate
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
retry brew install libomp

# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is build as part of tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
export USE_DISTRIBUTED=1

export USE_MKLDNN=OFF
@@ -200,7 +187,7 @@ export BUILD_TEST=OFF
pushd "$pytorch_rootdir"
echo "Calling setup.py bdist_wheel at $(date)"

python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name ${mac_version}
_PYTHON_HOST_PLATFORM=${mac_version} ARCHFLAGS="-arch arm64" python setup.py bdist_wheel -d "$whl_tmp_dir" --plat-name "${mac_version//[-.]/_}"

echo "Finished setup.py bdist_wheel at $(date)"
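A small aside on the `${mac_version//[-.]/_}` expansion in the new bdist_wheel line above: the bracket class replaces every hyphen and dot with an underscore, which is the only separator a wheel platform tag may contain. A minimal sketch using one of the values assigned earlier in the same script:

```bash
#!/usr/bin/env bash
# Sketch only: convert the human-readable platform string into the
# underscore form expected by --plat-name and the wheel filename.
mac_version='macosx-11.0-arm64'   # set earlier in the script for 3.14/3.14t

echo "${mac_version//[-.]/_}"
# macosx_11_0_arm64
```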
.github/ci_commit_pins/audio.txt (vendored, 2 changed lines)
@@ -1 +1 @@
27fc2493d383354a008106f22f3be232badee9a1
fa5142928ee157aa65137c4ecff2fe9b1a9e0648
.github/ci_commit_pins/fbgemm_rocm.txt (vendored, 2 changed lines)
@@ -1 +1 @@
7f1de94a4c2d14f59ad4ca84538c36084ea6b2c8
08ae0af1395c8d8471f4025deb6af9aef90b342f
.github/ci_commit_pins/vllm.txt (vendored, 2 changed lines)
@@ -1 +1 @@
e10fef08838612b4560e9c72e5cb1414a5edfa13
cc99baf14dacc2497d0c5ed84e076ef2c37f6a4d
.github/scripts/generate_binary_build_matrix.py (vendored, 90 changed lines)
@@ -43,55 +43,55 @@ CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"]

PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"12.6": (
"nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
"nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | "
"nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | "
"nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
"nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | "
"nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | "
"nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | "
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
),
"12.8": (
"nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
"nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | "
"nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | "
"nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
"nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | "
"nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | "
"nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | "
"nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
),
"13.0": (
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
"nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
"nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
),
"xpu": (
"intel-cmplr-lib-rt==2025.2.1 | "
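The whole change above boils down to dropping the `and platform_machine == 'x86_64'` clause from each PEP 508 marker, so the same pins now resolve on aarch64 hosts too. A small sketch of how pip treats such a marker in a requirements file (hypothetical path, pin copied from the list above):

```bash
#!/usr/bin/env bash
# Sketch only: pip evaluates the environment marker at install time and
# skips the requirement when the marker does not match the environment,
# so this line installs on any Linux machine (x86_64 or aarch64) and is
# a no-op elsewhere.
cat > /tmp/extra-reqs.txt <<'EOF'
nvidia-nccl-cu12==2.27.5; platform_system == 'Linux'
EOF
pip install -r /tmp/extra-reqs.txt
```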
.github/scripts/prepare_vllm_wheels.sh (vendored, new executable file, 91 lines)
@@ -0,0 +1,91 @@
#!/usr/bin/env bash

set -eux

torch_version=$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
nightly=$(echo ${torch_version} | cut -d'.' -f4)

# Copied from .ci/manywheel/build_common.sh
make_wheel_record() {
fpath=$1
if echo $fpath | grep RECORD >/dev/null 2>&1; then
echo "$fpath,,"
else
fhash=$(openssl dgst -sha256 -binary $fpath | openssl base64 | sed -e 's/+/-/g' | sed -e 's/\//_/g' | sed -e 's/=//g')
fsize=$(ls -nl $fpath | awk '{print $5}')
echo "$fpath,sha256=$fhash,$fsize"
fi
}

change_wheel_version() {
local package=$1
local wheel=$2
local f_version=$3
local t_version=$4

# Extract the wheel
${PYTHON_EXECUTABLE} -mwheel unpack $wheel

mv "${package}-${f_version}" "${package}-${t_version}"
# Change the version from f_version to t_version in the dist-info dir
pushd "${package}-${t_version}"
mv "${package}-${f_version}.dist-info" "${package}-${t_version}.dist-info"

pushd "${package}-${t_version}.dist-info"
sed -i "s/${package}-${f_version}.dist-info/${package}-${t_version}.dist-info/g" RECORD

# Update the version in METADATA and its SHA256 hash
sed -i "s/Version: ${f_version}/Version: ${t_version}/g" METADATA
# then add PyTorch nightly dependency of vLLM
if [[ "${package}" == vllm ]] || [[ "${package}" == xformers ]]; then
sed -i "/License-File/a\Requires-Dist: torch==${torch_version}" METADATA
fi
sed -i '/METADATA,sha256/d' RECORD
popd

make_wheel_record "${package}-${t_version}.dist-info/METADATA" >> "${package}-${t_version}.dist-info/RECORD"
popd

# Repack the wheel
${PYTHON_EXECUTABLE} -mwheel pack "${package}-${t_version}"

# Clean up
rm -rf "${package}-${t_version}"
}

repackage_wheel() {
local package=$1
pushd $package

local orig_wheel=$(find . -name *${package//-/_}*)
local orig_version=$(unzip -p $orig_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)

local version=""
if [[ "${package}" == vllm ]]; then
# Copied from vllm/.buildkite/scripts/upload-wheels.sh
version=1.0.0
else
version=$(echo $orig_version | tr '.+' '.' | cut -d'.' -f1-3)
fi
local nightly_version=$version.$nightly

# Use nightly version
change_wheel_version ${package//-/_} $orig_wheel $orig_version $nightly_version
# Clean up
rm "${orig_wheel}"

auditwheel repair --plat $PLATFORM *.whl \
--exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv*
local repair_wheel=$(find wheelhouse -name *${PLATFORM}*)
local repair_wheel=$(basename ${repair_wheel})
popd

cp ${package}/wheelhouse/${repair_wheel} .
rm -rf $package
}

pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
repackage_wheel $package
done
popd
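A worked example (with made-up version strings) of the version surgery in `repackage_wheel` above: keep the first three dot-separated fields of the upstream version, then append the dev segment taken from the torch nightly wheel.

```bash
#!/usr/bin/env bash
# Sketch only; orig_version and nightly are invented values.
orig_version='0.0.32+abc1234'   # e.g. a local-version-tagged upstream wheel
nightly='dev20251024'           # 4th dot-separated field of the torch version

version=$(echo $orig_version | tr '.+' '.' | cut -d'.' -f1-3)
echo "$version"             # 0.0.32
echo "$version.$nightly"    # 0.0.32.dev20251024
```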
@@ -22,6 +22,16 @@ name: !{{ build_environment }}
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
{%- endmacro %}

{%- macro setup_python(py_ver) -%}
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
{%- endmacro %}

on:
# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321
push:
@@ -61,23 +71,13 @@ jobs:
{%- endif %}
steps:
!{{ set_runner_specific_vars() }}
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
!{{ setup_python(config.get("python_version", "3.10")) }}
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -94,8 +94,6 @@ jobs:
{%- if config["package_type"] == "wheel" %}
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -106,33 +104,9 @@ jobs:

SMOKE_TEST_PARAMS=""

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac

# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

# shellcheck disable=SC2086
.github/workflows/build-vllm-wheel.yml (vendored, 61 changed lines)
@@ -59,20 +59,6 @@ jobs:
run: |
set -eux

# Keep PyTorch nightly wheel here so that we can install it later during
# vLLM build process
mkdir -p "${RUNNER_TEMP}/artifacts/"

container_name=$(docker run \
--tty \
--detach \
-e PLATFORM \
-v "${GITHUB_WORKSPACE}:/pytorch" \
-v "${RUNNER_TEMP}/artifacts:/artifacts" \
-w /artifacts/ \
"${MANYLINUX_IMAGE}"
)

# Determine python executable for given version (copied from build-triton-wheel)
case $PY_VERS in
3.10)
@@ -102,6 +88,21 @@ jobs:
;;
esac

# Keep PyTorch nightly wheel here so that we can install it later during
# vLLM build process
mkdir -p "${RUNNER_TEMP}/artifacts/"

container_name=$(docker run \
--tty \
--detach \
-e PLATFORM \
-e PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" \
-v "${GITHUB_WORKSPACE}:/pytorch" \
-v "${RUNNER_TEMP}/artifacts:/artifacts" \
-w /artifacts/ \
"${MANYLINUX_IMAGE}"
)

docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}" -mpip install \
--pre torch torchvision torchaudio \
--index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"
@@ -113,7 +114,6 @@ jobs:
--index-url "https://download.pytorch.org/whl/nightly/${BUILD_DEVICE}"

# Save this for later
echo "PYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}" >> "$GITHUB_ENV"
echo "container_name=${container_name}" >> "$GITHUB_ENV"

- name: Build vLLM wheel
@@ -131,36 +131,7 @@ jobs:
set -eux

# Get these wheels ready, the vllm renaming logic is copied from its .buildkite/scripts/upload-wheels.sh
docker exec -t "${container_name}" bash -c "
set -eux

nightly=\$(unzip -p torch-* '**/METADATA' | grep '^Version: ' | cut -d' ' -f2 | cut -d'.' -f4)

pushd externals/vllm/wheels
for package in xformers flashinfer-python vllm; do
pushd \$package
auditwheel repair --plat \$PLATFORM *.whl \
--exclude libc10* --exclude libtorch* --exclude libcu* --exclude libnv*
repair_wheel=\$(find wheelhouse -name *\${PLATFORM}*)
repair_wheel=\$(basename \${repair_wheel})
popd

cp \${package}/wheelhouse/\${repair_wheel} .
version=\$(unzip -p \$repair_wheel '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)

if [[ \$package == vllm ]]; then
new_wheel=\${repair_wheel/\$version/1.0.0.\$nightly}
else
major_version=\$(echo \$version | tr '.+' '.' | cut -d'.' -f1-3)
new_wheel=\${repair_wheel/\$version/\$major_version.\$nightly}
fi

mv -- \$repair_wheel \$new_wheel
rm -rf \$package
done
popd
"

docker exec -t "${container_name}" bash -c /pytorch/.github/scripts/prepare_vllm_wheels.sh
docker exec -t "${container_name}" chown -R 1000:1000 /artifacts

- uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
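The inline renaming logic removed above (now living in `prepare_vllm_wheels.sh`) splices the nightly suffix into the repaired wheel's filename with a single-occurrence substitution, `${var/pattern/replacement}`. A worked example with a made-up filename:

```bash
#!/usr/bin/env bash
# Sketch only; the wheel name and versions are invented.
repair_wheel='vllm-0.6.3-cp312-cp312-manylinux_2_28_x86_64.whl'
version='0.6.3'
nightly='dev20251024'

echo "${repair_wheel/$version/1.0.0.$nightly}"
# vllm-1.0.0.dev20251024-cp312-cp312-manylinux_2_28_x86_64.whl
```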
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (generated, vendored, 42 changed lines)
@@ -132,7 +132,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -178,7 +178,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -224,7 +224,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -335,7 +335,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -381,7 +381,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -427,7 +427,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -538,7 +538,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -584,7 +584,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -630,7 +630,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -741,7 +741,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -787,7 +787,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -833,7 +833,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -944,7 +944,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -990,7 +990,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1036,7 +1036,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1147,7 +1147,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1193,7 +1193,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1239,7 +1239,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1350,7 +1350,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1396,7 +1396,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1442,7 +1442,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
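The change repeated in every hunk above is the same pair of edits: each per-dependency environment marker drops the `and platform_machine == 'x86_64'` clause, which can never be true for an aarch64 wheel, and `nvidia-nvshmem-cu12` is bumped from 3.3.20 to 3.3.24. As an illustration only (a sketch using the third-party `packaging` library that pip's marker handling is based on, not anything taken from the generated workflow itself), the snippet below shows how such PEP 508 markers evaluate on an aarch64 Linux host:

```python
# Sketch: evaluate the old and new PEP 508 markers against a hypothetical
# aarch64 Linux environment using the third-party `packaging` library.
from packaging.markers import Marker

old_marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
new_marker = Marker("platform_system == 'Linux'")

# Hypothetical environment; the dict overrides the interpreter's real values.
aarch64_env = {"platform_system": "Linux", "platform_machine": "aarch64"}

print(old_marker.evaluate(aarch64_env))  # False -> pip would skip the dependency
print(new_marker.evaluate(aarch64_env))  # True  -> pip installs the dependency
```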
.github/workflows/generated-linux-binary-manywheel-main.yml (2 changes; generated, vendored)

@@ -60,7 +60,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
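The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values above pack many requirement specifiers into a single string, separated by ` | `, each carrying its own environment marker. A minimal sketch of how such a value could be split and parsed (assuming the ` | ` separator visible in the workflow text; the actual build scripts that consume this variable are not shown here):

```python
# Sketch: split a PYTORCH_EXTRA_INSTALL_REQUIREMENTS-style value on " | " and
# parse each piece as a PEP 508 requirement with the `packaging` library.
from packaging.requirements import Requirement

value = (
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
    "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux'"
)

for spec in value.split(" | "):
    req = Requirement(spec)
    print(req.name, req.specifier, req.marker)
    # e.g. nvidia-nccl-cu12 ==2.27.5 platform_system == "Linux"
```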
.github/workflows/generated-linux-binary-manywheel-nightly.yml (42 changes; generated, vendored)

@@ -127,7 +127,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_6-test: # Testing
@@ -193,7 +193,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_8-test: # Testing
@@ -259,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda13_0-test: # Testing
@@ -719,7 +719,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-test: # Testing
@@ -785,7 +785,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-test: # Testing
@@ -851,7 +851,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda13_0-test: # Testing
@@ -1311,7 +1311,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_6-test: # Testing
@@ -1377,7 +1377,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
@@ -1443,7 +1443,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda13_0-test: # Testing
@@ -1903,7 +1903,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_6-test: # Testing
@@ -1969,7 +1969,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_8-test: # Testing
@@ -2035,7 +2035,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda13_0-test: # Testing
@@ -2495,7 +2495,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_6-test: # Testing
@@ -2561,7 +2561,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_8-test: # Testing
@@ -2627,7 +2627,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda13_0-test: # Testing
@@ -3087,7 +3087,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_6-test: # Testing
@@ -3153,7 +3153,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_8-test: # Testing
@@ -3219,7 +3219,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda13_0-test: # Testing
@@ -3679,7 +3679,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_6-test: # Testing
@@ -3745,7 +3745,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_8-test: # Testing
@@ -3811,7 +3811,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda13_0-test: # Testing
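Note on the PYTORCH_EXTRA_INSTALL_REQUIREMENTS pairs above: the only functional change in each pair is that the "platform_machine == 'x86_64'" guard is dropped, so the pinned NVIDIA wheels are also pulled in on aarch64 Linux, and nvidia-nvshmem moves from 3.3.20 to 3.3.24 on the CUDA 12 builds. A quick way to see what a PEP 508 marker matches on the current machine — a sketch only, assuming python3 with the packaging module is available; this is not part of the workflow:

    # Prints True only when run on x86_64 Linux (the old marker).
    python3 -c "from packaging.markers import Marker; print(Marker(\"platform_system == 'Linux' and platform_machine == 'x86_64'\").evaluate())"
    # Prints True on any Linux machine, aarch64 included (the new marker).
    python3 -c "from packaging.markers import Marker; print(Marker(\"platform_system == 'Linux'\").evaluate())"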
18  .github/workflows/generated-macos-arm64-binary-libtorch-release-nightly.yml  generated  vendored
@@ -60,13 +60,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@@ -81,13 +81,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
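The source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" lines above rely on the shell's ':-' default expansion: when BINARY_ENV_FILE is unset or empty, the CircleCI-era path is used instead. A minimal illustration, not part of the workflow (/tmp/binary_env is a made-up value):

    unset BINARY_ENV_FILE
    echo "${BINARY_ENV_FILE:-/Users/distiller/project/env}"   # falls back to the default path
    BINARY_ENV_FILE=/tmp/binary_env                           # hypothetical value
    echo "${BINARY_ENV_FILE:-/Users/distiller/project/env}"   # now prints /tmp/binary_env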
336  .github/workflows/generated-macos-arm64-binary-wheel-nightly.yml  generated  vendored
@@ -56,13 +56,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@@ -77,13 +77,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -99,8 +95,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -111,33 +105,9 @@ jobs:

SMOKE_TEST_PARAMS=""

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac

# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

# shellcheck disable=SC2086
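The test step is where this file actually changes: the hunks above drop the conda test environment (including the conda-forge release-candidate channel handling for 3.13t/3.14/3.14t) and instead test the wheel in a plain venv created from the interpreter that setup-python installed. Condensed, the retained flow is the sketch below; the final import check is illustrative and not part of the generated job:

    python -mvenv test_venv
    source test_venv/bin/activate
    pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
    python -c "import torch; print(torch.__version__)"   # illustrative smoke check

The same change repeats once per Python version in the hunks that follow.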
@@ -196,13 +166,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.11.4"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@@ -217,13 +187,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -239,8 +205,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -251,33 +215,9 @@ jobs:

SMOKE_TEST_PARAMS=""

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac

# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

# shellcheck disable=SC2086
@@ -336,13 +276,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.12.4"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@@ -357,13 +297,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -379,8 +315,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -391,33 +325,9 @@ jobs:

SMOKE_TEST_PARAMS=""

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac

# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

# shellcheck disable=SC2086
@@ -476,13 +386,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@@ -497,13 +407,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -519,8 +425,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -531,33 +435,9 @@ jobs:

SMOKE_TEST_PARAMS=""

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac

# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

# shellcheck disable=SC2086
@@ -616,13 +496,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: true
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@@ -637,13 +517,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -659,8 +535,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -671,33 +545,9 @@ jobs:

SMOKE_TEST_PARAMS=""

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac

# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

# shellcheck disable=SC2086
@@ -756,13 +606,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0-rc.2"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@@ -777,13 +627,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -799,8 +645,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -811,33 +655,9 @@ jobs:

SMOKE_TEST_PARAMS=""

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac

# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

# shellcheck disable=SC2086
@@ -896,13 +716,13 @@ jobs:
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
- name: Setup Python
uses: actions/setup-python@v6
with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.14.0-rc.2"
freethreaded: true
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@@ -917,13 +737,9 @@ jobs:
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -939,8 +755,6 @@ jobs:
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
@@ -951,33 +765,9 @@ jobs:

SMOKE_TEST_PARAMS=""

EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac

# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
python -mvenv test_venv
source test_venv/bin/activate
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

# shellcheck disable=SC2086
@@ -43,6 +43,11 @@ on:
      required: false
      type: boolean
      default: false
    freezing:
      description: Run freezing?
      required: false
      type: boolean
      default: true
    benchmark_configs:
      description: The list of configs used by the benchmark
      required: false
@@ -102,7 +107,7 @@ jobs:
    if: github.event.schedule == '0 7 * * *'
    with:
      build-environment: linux-jammy-py3.9-gcc11-build
      dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
      dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
      docker-image: ${{ needs.inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
      timeout-minutes: 720
@@ -116,10 +121,9 @@ jobs:
    name: inductor-test
    uses: ./.github/workflows/_linux-test.yml
    needs: inductor-build
    if: github.event_name == 'workflow_dispatch'
    with:
      build-environment: linux-jammy-py3.9-gcc11-build
      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
      dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
      docker-image: ${{ needs.inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
      timeout-minutes: 720

.github/workflows/nightly.yml
@@ -54,7 +54,7 @@ jobs:
      - get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.9-gcc11
      build-environment: linux-jammy-py3.10-gcc11
      docker-image: ${{ needs.docs-build.outputs.docker-image }}
      push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }}
      run-doxygen: true

@@ -22,6 +22,7 @@ COMMON_COPTS = [
    "-DHAVE_SHM_UNLINK=1",
    "-D_FILE_OFFSET_BITS=64",
    "-DUSE_FBGEMM",
    "-DUSE_DISTRIBUTED",
    "-DAT_PER_OPERATOR_HEADERS",
    "-DATEN_THREADING=NATIVE",
    "-DNO_CUDNN_DESTROY_HANDLE",

@@ -181,9 +181,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
  set(CPU_POWER ON)
endif()

# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
if(NOT LINUX AND NOT WIN32)
  set(USE_DISTRIBUTED
      OFF
@@ -234,6 +233,7 @@ cmake_dependent_option(INSTALL_TEST "Install test binaries if BUILD_TEST is on"
option(USE_CPP_CODE_COVERAGE "Compile C/C++ with code coverage flags" OFF)
option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON)
option(USE_ASAN "Use Address+Undefined Sanitizers" OFF)
option(USE_LSAN "Use Leak Sanitizer" OFF)
option(USE_TSAN "Use Thread Sanitizer" OFF)
option(USE_CUDA "Use CUDA" ON)
option(USE_XPU "Use XPU" ON)
@@ -262,11 +262,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
option(USE_DISTRIBUTED "Use distributed" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
    "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
    "USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
    "USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@@ -431,10 +431,11 @@ if(WIN32)
    PATH_SUFFIXES lib
    NO_DEFAULT_PATH)
  if(NOT libuv_tmp_LIBRARY)
    set(USE_DISTRIBUTED OFF)
    set(USE_GLOO OFF)
    message(
      WARNING
      "Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
      "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
      "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
    )
  else()
@@ -873,7 +874,7 @@ cmake_dependent_option(
  "Whether to build the flash_attention kernel for scaled dot product attention.\
Will be disabled if not supported by the platform"
  ON
  "USE_CUDA OR USE_ROCM;NOT MSVC"
  "USE_CUDA OR USE_ROCM"
  OFF)

cmake_dependent_option(
@@ -889,9 +890,9 @@ IF(USE_FBGEMM_GENAI AND USE_ROCM AND NOT "gfx942" IN_LIST PYTORCH_ROCM_ARCH)
  set(USE_FBGEMM_GENAI off)
endif()

# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0a")
  message(WARNING "Setting USE_FBGEMM_GENAI to ON for CUDA build on SM100")
# Set USE_FBGEMM_GENAI to ON for CUDA build on SM100.
if(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
  message(STATUS "Setting USE_FBGEMM_GENAI to ON, doing CUDA build for SM100a")
  set(USE_FBGEMM_GENAI ON)
endif()

@@ -908,7 +909,7 @@ cmake_dependent_option(
# USE_FLASH_ATTENTION -> USE_ROCM -> Dependencies.cmake -> aotriton.cmake
#
if(USE_ROCM)
  if(UNIX AND (USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION))
  if(USE_FLASH_ATTENTION OR USE_MEM_EFF_ATTENTION)
    include(cmake/External/aotriton.cmake)
  endif()
endif()

@@ -50,6 +50,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:

| PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm |
| --- | --- | --- | --- | --- | --- |
| 2.9 | >=3.10, <=(3.14, 3.14t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 13.0 (CUDNN 9.13.0.50) | ROCm 6.4 |
| 2.8 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 12.9 (CUDNN 9.10.2.21) | ROCm 6.4 |
| 2.7 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8 (CUDNN 9.1.0.70), CUDA 12.6 (CUDNN 9.5.1.17) | CUDA 12.8 (CUDNN 9.7.1.26) | ROCm 6.3 |
| 2.6 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8, CUDA 12.4 (CUDNN 9.1.0.70) | CUDA 12.6 (CUDNN 9.5.1.17) | ROCm 6.2.4 |

@@ -16,6 +16,8 @@ However, if you believe you have found a security vulnerability in PyTorch, we e

Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new

All reports submitted through the security advisories mechanism will **either be made public or dismissed by the team within 90 days of the submission**. If an advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create a [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework.

Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:

https://www.facebook.com/whitehat

@@ -265,6 +265,14 @@ IF(USE_FBGEMM_GENAI)
    "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
  list(FILTER fbgemm_genai_native_cuda_cu INCLUDE REGEX ${FBGEMM_CUTLASS_KERNELS_REGEX})

  # PyTorch is not built for 10.0a in CI, due to lack of portability,
  # so we need to explicitly build these files for 10.0a.
  foreach(cu_file ${fbgemm_genai_native_cuda_cu})
    _BUILD_FOR_ADDITIONAL_ARCHS(
      "${cu_file}"
      "100a")
  endforeach()

  file(GLOB_RECURSE fbgemm_genai_native_cuda_cpp
    "${FBGEMM_GENAI_SRCS}/common/*.cpp"
  )

@@ -133,12 +133,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
        "resize_ called on tensor with symbolic shape")
    TORCH_CHECK(
        sparse_dim + dense_dim == static_cast<int64_t>(size.size()),
        "number of dimensions must be sparse_dim (",
        "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ",
        size.size(),
        ", sparse_dim = ",
        sparse_dim,
        ") + dense_dim (",
        dense_dim,
        "), but got ",
        size.size());
        ", dense_dim = ",
        dense_dim);
    if (nnz() > 0) {
      [[maybe_unused]] auto constexpr alt_options_msg =
          "You could try the following options:\n\
@@ -254,12 +254,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
        "resize_and_clear_ called on tensor with symbolic shape")
    TORCH_CHECK(
        sparse_dim + dense_dim == static_cast<int64_t>(size.size()),
        "number of dimensions must be sparse_dim (",
        "'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ",
        size.size(),
        ", sparse_dim = ",
        sparse_dim,
        ") + dense_dim (",
        dense_dim,
        "), but got ",
        size.size());
        ", dense_dim = ",
        dense_dim);

    set_sizes_and_strides(size, std::vector<int64_t>(size.size()));
    sparse_dim_ = sparse_dim;

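Both hunks reword the same shape check; the invariant being enforced for a sparse COO tensor is simply

\[
\mathrm{len}(\texttt{size}) \;=\; \texttt{sparse\_dim} + \texttt{dense\_dim},
\]

that is, the first sparse_dim entries of `size` describe the indexed (sparse) dimensions and the remaining dense_dim entries describe the dense values stored at each index.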
@@ -10,6 +10,10 @@
#include <ideep.hpp>
#endif

#if !defined(__s390x__) && !defined(__powerpc__)
#include <cpuinfo.h>
#endif

#include <caffe2/core/common.h>

#include <ATen/native/DispatchStub.h>
@@ -103,7 +107,9 @@ std::string get_cpu_capability() {
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
    case native::CPUCapability::ZVECTOR:
      return "Z VECTOR";
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
    case native::CPUCapability::SVE:
      return "SVE";
    case native::CPUCapability::SVE256:
      return "SVE256";
#else
@@ -118,6 +124,12 @@ std::string get_cpu_capability() {
  return "";
}

int get_sve_len() {
  // It is possible that we override the cpu_capability with
  // environment variable
  return cpuinfo_get_max_arm_sve_length();
}

static std::string used_cpu_capability() {
  // It is possible that we override the cpu_capability with
  // environment variable

@@ -15,4 +15,6 @@ TORCH_API std::string get_cxx_flags();

TORCH_API std::string get_cpu_capability();

TORCH_API int get_sve_len();

} // namespace at

@@ -34,9 +34,9 @@ inline scalar_t vec_reduce_all(
  scalar_t acc_arr[Vec::size()];
  acc_vec.store(acc_arr);
  for (const auto i : c10::irange(1, size)) {
    std::array<scalar_t, Vec::size()> acc_arr_next = {0};
    scalar_t acc_arr_next[Vec::size()] = {0};
    acc_arr_next[0] = acc_arr[i];
    Vec acc_vec_next = Vec::loadu(acc_arr_next.data());
    Vec acc_vec_next = Vec::loadu(acc_arr_next);
    acc_vec = vec_fun(acc_vec, acc_vec_next);
  }
  acc_vec.store(acc_arr);
@ -102,8 +102,7 @@ struct VecReduceAllSIMD<float, Op> {
|
||||
#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
|
||||
// !defined(C10_MOBILE)
|
||||
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
!defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(
|
||||
@ -143,8 +142,7 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
|
||||
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
|
||||
// && !defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
|
||||
defined(CPU_CAPABILITY_SVE256)
|
||||
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE))
|
||||
template <typename Op>
|
||||
struct VecReduceAllSIMD<float, Op> {
|
||||
static inline float apply(
|
||||
@ -152,18 +150,28 @@ struct VecReduceAllSIMD<float, Op> {
|
||||
const Vectorized<float>& acc_vec) {
|
||||
using Vec = Vectorized<float>;
|
||||
Vec v = acc_vec;
|
||||
// 128-bit shuffle
|
||||
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
|
||||
Vec v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 64-bit shuffle
|
||||
ind = svdupq_n_u32(2, 3, 0, 1);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 32-bit shuffle
|
||||
ind = svdupq_n_u32(1, 0, 2, 3);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
if (Vec::size() == 8) {
|
||||
// 128-bit shuffle
|
||||
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
|
||||
Vec v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 64-bit shuffle
|
||||
ind = svdupq_n_u32(2, 3, 0, 1);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
// 32-bit shuffle
|
||||
ind = svdupq_n_u32(1, 0, 2, 3);
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
} else {
|
||||
svuint32_t ind = svdupq_n_u32(2, 3, 0, 1); // 64-bit stride-2
|
||||
Vec v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
|
||||
ind = svdupq_n_u32(1, 0, 2, 3); // 32-bit stride-1
|
||||
v1 = svtbl_f32(v, ind);
|
||||
v = vec_fun(v, v1);
|
||||
}
|
||||
return svlasta(svpfalse(), v);
|
||||
}
|
||||
};
|
||||
|
||||
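The SVE specialization above reduces a whole register to one lane by repeatedly combining the vector with a table-shuffled copy of itself (`svtbl_f32`), halving the number of lanes that still matter at each step: three steps for the 8-lane 256-bit case, two for the 4-lane 128-bit case, with `svlasta` extracting lane 0 at the end. A minimal scalar sketch of that halving idea, for illustration only (the helper and its names are hypothetical, not PyTorch code):

#include <cstddef>

// Tree-reduce 'n' lanes (n a power of two, n <= 64) with a binary op,
// mirroring the shuffle-and-combine steps of the SVE reduction above.
template <typename Op>
float reduce_lanes_sketch(const float* lanes, std::size_t n, Op op) {
  float buf[64];
  for (std::size_t i = 0; i < n; ++i) buf[i] = lanes[i];
  for (std::size_t half = n / 2; half >= 1; half /= 2) {
    for (std::size_t i = 0; i < half; ++i) {
      buf[i] = op(buf[i], buf[i + half]);  // combine with the "shuffled" upper half
    }
  }
  return buf[0];  // plays the role of svlasta(svpfalse(), v)
}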
@ -4,7 +4,7 @@
|
||||
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
// Define the data type of VLS(vector-length specific).
|
||||
typedef svbool_t vls_pred_t
|
||||
@ -77,4 +77,4 @@ typedef svfloat64_t vls_float64_t
|
||||
#define ALL_F64_TRUE_MASK svreinterpret_f64_s64(ALL_S64_TRUE_MASK)
|
||||
#define ALL_F64_FALSE_MASK svreinterpret_f64_s64(ALL_S64_FALSE_MASK)
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
#endif // defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
@ -19,7 +19,7 @@ namespace vec {
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
|
||||
#if (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)
|
||||
|
||||
template <>
|
||||
struct is_vec_specialized_for<BFloat16> : std::bool_constant<true> {};
|
||||
@ -230,8 +230,6 @@ __attribute__((optimize("no-tree-vectorize")))
|
||||
#endif
|
||||
inline std::tuple<Vectorized<float>, Vectorized<float>>
|
||||
convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
|
||||
static_assert(
|
||||
Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
|
||||
auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
|
||||
auto bf16_vec1 = svzip1_bf16(zero, a);
|
||||
auto bf16_vec2 = svzip2_bf16(zero, a);
|
||||
@ -243,19 +241,18 @@ convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
|
||||
inline Vectorized<c10::BFloat16> convert_float_bfloat16(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b) {
|
||||
static_assert(
|
||||
Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
|
||||
svbfloat16_t x1 = svcvt_bf16_f32_z(ptrue, a);
|
||||
svbfloat16_t x2 = svcvt_bf16_f32_z(ptrue, b);
|
||||
return Vectorized<c10::BFloat16>(svuzp1_bf16(x1, x2));
|
||||
}
|
||||
|
||||
inline void load_fp32_from_bf16(const BFloat16* data, Vectorized<float>& out) {
|
||||
__at_align__ float values[Vectorized<float>::size()];
|
||||
__at_align__ float * values = new float[Vectorized<float>::size()];
|
||||
for (const auto k : c10::irange(Vectorized<float>::size())) {
|
||||
values[k] = data[k];
|
||||
}
|
||||
out = Vectorized<float>::loadu(values);
|
||||
delete[] values;
|
||||
}
|
||||
|
||||
inline void load_fp32_from_bf16(
|
||||
@ -308,8 +305,8 @@ Vectorized<c10::BFloat16> inline operator/(
|
||||
}
|
||||
|
||||
inline Vectorized<BFloat16>::Vectorized() {
|
||||
const short zero = 0;
|
||||
values = svdup_n_bf16(c10::bit_cast<bfloat16_t>(zero));
|
||||
auto vals_f = svdup_n_f32(0);
|
||||
values = convert_float_bfloat16(vals_f, vals_f);
|
||||
}
|
||||
|
||||
inline Vectorized<BFloat16>::Vectorized(int val) {
|
||||
|
||||
@ -8,7 +8,7 @@
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
|
||||
#include <ATen/cpu/vec/sve/vec_bfloat16.h>
|
||||
#include <ATen/cpu/vec/sve/vec_double.h>
|
||||
#include <ATen/cpu/vec/sve/vec_float.h>
|
||||
@ -27,7 +27,7 @@ namespace at::vec {
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
#define DEFINE_SVE_CAST(t1_t, t1_prefix, t2_t, t2_prefix) \
|
||||
@ -231,6 +231,5 @@ std::pair<
|
||||
#endif // __ARM_FEATURE_BF16
|
||||
|
||||
#endif // defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
}
|
||||
|
||||
@ -22,7 +22,7 @@ namespace at::vec {
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)
|
||||
|
||||
template <>
|
||||
struct is_vec_specialized_for<double> : std::bool_constant<true> {};
|
||||
@ -55,10 +55,11 @@ class Vectorized<double> {
|
||||
operator svfloat64_t() const {
|
||||
return values;
|
||||
}
|
||||
template <uint64_t mask>
|
||||
static Vectorized<double> blend(
|
||||
const Vectorized<double>& a,
|
||||
const Vectorized<double>& b) {
|
||||
const Vectorized<double>& b,
|
||||
int64_t mask
|
||||
) {
|
||||
// Build an array of flags: each element is 1 if the corresponding bit in
|
||||
// 'mask' is set, 0 otherwise.
|
||||
__at_align__ int64_t flag_arr[size()];
|
||||
|
||||
@ -2,8 +2,10 @@
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/sve/sve_helper.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
|
||||
#include <sleef.h>
|
||||
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
|
||||
@ -22,7 +24,7 @@ namespace at::vec {
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
|
||||
|
||||
template <>
|
||||
struct is_vec_specialized_for<float> : std::bool_constant<true> {};
|
||||
@ -30,52 +32,77 @@ struct is_vec_specialized_for<float> : std::bool_constant<true> {};
|
||||
template <>
|
||||
class Vectorized<float> {
|
||||
private:
|
||||
vls_float32_t values;
|
||||
|
||||
__at_align__ float values[2048 / sizeof(float)];
|
||||
public:
|
||||
|
||||
using value_type = float;
|
||||
using size_type = int;
|
||||
static constexpr size_type size() {
|
||||
return VECTOR_WIDTH / sizeof(float);
|
||||
static inline size_type size() {
|
||||
return svcntw();
|
||||
}
|
||||
Vectorized() {
|
||||
values = svdup_n_f32(0);
|
||||
inline Vectorized() {svst1_f32(ptrue, values, svdup_n_f32(0));}
|
||||
inline Vectorized(const float val) {
|
||||
svst1_f32(ptrue, values, svdup_n_f32(val));
|
||||
}
|
||||
Vectorized(svfloat32_t v) : values(v) {}
|
||||
Vectorized(float val) {
|
||||
values = svdup_n_f32(val);
|
||||
inline Vectorized(const svfloat32_t val) {
|
||||
svst1_f32(ptrue, values, val);
|
||||
}
|
||||
template <
|
||||
typename... Args,
|
||||
typename = std::enable_if_t<(sizeof...(Args) == size())>>
|
||||
Vectorized(Args... vals) {
|
||||
__at_align__ float buffer[size()] = {vals...};
|
||||
values = svld1_f32(ptrue, buffer);
|
||||
template<typename T,
|
||||
typename = std::enable_if_t<std::is_pointer_v<T>>>
|
||||
inline Vectorized(float * val) {
|
||||
svst1_f32(ptrue, values, svld1_f32(ptrue, val));
|
||||
}
|
||||
operator svfloat32_t() const {
|
||||
return values;
|
||||
template<typename... Args,
|
||||
typename = std::enable_if_t<(sizeof...(Args) == size())>>
|
||||
inline Vectorized(Args... vals) {
|
||||
values = { vals... };
|
||||
}
|
||||
template <uint64_t mask>
|
||||
static Vectorized<float> blend(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b) {
|
||||
// Build an array of flags: each element is 1 if the corresponding bit in
|
||||
// 'mask' is set, 0 otherwise.
|
||||
__at_align__ int32_t flag_arr[size()];
|
||||
inline operator svfloat32_t() const {
|
||||
return svld1_f32(ptrue, values);
|
||||
}
|
||||
static inline Vectorized<float> from_ptr(const float * vs) {
|
||||
Vectorized<float> v;
|
||||
svst1_f32(ptrue, v.values, svld1_f32(ptrue, static_cast<const float *>(vs)));
|
||||
return v;
|
||||
}
|
||||
static inline Vectorized<float> from_ptr(const float * vs, int count) {
|
||||
Vectorized<float> v;
|
||||
svst1_f32(ptrue, v.values, svld1_f32(svwhilelt_b32_s32(0, count), static_cast<const float *>(vs)));
|
||||
return v;
|
||||
}
|
||||
inline void set_lane(int i, float value) {
|
||||
values[i] = value;
|
||||
}
|
||||
inline Vectorized<float> map(float (*fn)(float)) const {
|
||||
Vectorized<float> result;
|
||||
for (int64_t i = 0; i < size(); ++i) {
|
||||
result.set_lane(i, fn(values[i]));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
inline Vectorized<float> map2(float (*fn)(float, float), const Vectorized<float> &b) const {
|
||||
Vectorized<float> result;
|
||||
for (int64_t i = 0; i < size(); ++i) {
|
||||
result.set_lane(i, fn(values[i], b.values[i]));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
static inline Vectorized<float> blend(const Vectorized<float>& a, const Vectorized<float>& b, const uint64_t mask) {
|
||||
// Build an array of flags: each element is 1 if the corresponding bit in 'mask' is set, 0 otherwise.
|
||||
__at_align__ int32_t * flag_arr = new int32_t[size()];
|
||||
for (int i = 0; i < size(); i++) {
|
||||
flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0;
|
||||
}
|
||||
// Load the flag array into an SVE int32 vector.
|
||||
svint32_t int_mask = svld1_s32(svptrue_b32(), flag_arr);
|
||||
// Compare each lane of int_mask to 0; returns an svbool_t predicate where
|
||||
// true indicates a nonzero flag.
|
||||
svbool_t blend_mask = svcmpne_n_s32(svptrue_b32(), int_mask, 0);
|
||||
// Use svsel to select elements from b where the predicate is true, else
|
||||
// from a.
|
||||
svfloat32_t result = svsel_f32(blend_mask, b.values, a.values);
|
||||
return Vectorized<float>(result);
|
||||
svint32_t int_mask = svld1_s32(ptrue, flag_arr);
|
||||
delete[] flag_arr;
|
||||
// Compare each lane of int_mask to 0; returns an svbool_t predicate where true indicates a nonzero flag.
|
||||
svbool_t blend_mask = svcmpne_n_s32(ptrue, int_mask, 0);
|
||||
// Use svsel to select elements from b where the predicate is true, else from a.
|
||||
return svsel_f32(blend_mask, b, a);
|
||||
}
|
||||
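The rewritten `blend` expands the integer bitmask into a per-lane flag array, loads it as an SVE int32 vector, compares against zero to obtain an `svbool_t` predicate, and then selects between `b` and `a` with `svsel_f32`. A scalar sketch of the selection rule, purely for illustration (hypothetical helper, not part of the header):

#include <cstdint>
#include <cstddef>

// Lane i of the output comes from b when bit i of 'mask' is set, else from a.
void blend_sketch(const float* a, const float* b, float* out,
                  std::size_t lanes, std::uint64_t mask) {
  for (std::size_t i = 0; i < lanes; ++i) {
    out[i] = (mask & (1ULL << i)) ? b[i] : a[i];
  }
}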
static Vectorized<float> blendv(
|
||||
static inline Vectorized<float> blendv(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b,
|
||||
const Vectorized<float>& mask_) {
|
||||
@ -84,16 +111,18 @@ class Vectorized<float> {
|
||||
return svsel_f32(mask, b, a);
|
||||
}
|
||||
template <typename step_t>
|
||||
static Vectorized<float> arange(
|
||||
static inline Vectorized<float> arange(
|
||||
float base = 0.f,
|
||||
step_t step = static_cast<step_t>(1)) {
|
||||
__at_align__ float buffer[size()];
|
||||
__at_align__ float * buffer = new float[size()];
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
buffer[i] = base + i * step;
|
||||
}
|
||||
return svld1_f32(ptrue, buffer);
|
||||
auto tmp = Vectorized<float>::from_ptr(buffer);
|
||||
delete[] buffer;
|
||||
return tmp;
|
||||
}
|
||||
static Vectorized<float> set(
|
||||
static inline Vectorized<float> set(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b,
|
||||
int64_t count = size()) {
|
||||
@ -169,271 +198,213 @@ class Vectorized<float> {
|
||||
poly = svsel_f32(svcmpgt_f32(pg, x, max_input), inf, poly);
|
||||
return poly;
|
||||
}
|
||||
static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
|
||||
if (count == size())
|
||||
return svld1_f32(ptrue, reinterpret_cast<const float*>(ptr));
|
||||
svbool_t pg = svwhilelt_b32(0ull, count);
|
||||
return svld1_f32(pg, reinterpret_cast<const float*>(ptr));
|
||||
static inline Vectorized<float> loadu(const void* ptr) {
|
||||
return Vectorized<float>::from_ptr(reinterpret_cast<const float *>(ptr));
|
||||
}
|
||||
void store(void* ptr, int64_t count = size()) const {
|
||||
if (count == size()) {
|
||||
svst1_f32(ptrue, reinterpret_cast<float*>(ptr), values);
|
||||
} else {
|
||||
svbool_t pg = svwhilelt_b32(0ull, count);
|
||||
svst1_f32(pg, reinterpret_cast<float*>(ptr), values);
|
||||
}
|
||||
static inline Vectorized<float> loadu(const void* ptr, int64_t count) {
|
||||
return Vectorized<float>::from_ptr(reinterpret_cast<const float *>(ptr), count);
|
||||
}
|
||||
const float& operator[](int idx) const = delete;
|
||||
float& operator[](int idx) = delete;
|
||||
int64_t zero_mask() const {
|
||||
// returns an integer mask where all zero elements are translated to 1-bit
|
||||
// and others are translated to 0-bit
|
||||
inline void store(void* ptr) const {
|
||||
svst1_f32(ptrue, static_cast<float *>(ptr), svld1_f32(ptrue, values));
|
||||
}
|
||||
inline void store(void* ptr, int count) const {
|
||||
svst1_f32(svwhilelt_b32_s32(0, count), static_cast<float *>(ptr), svld1_f32(ptrue, values));
|
||||
}
|
||||
inline const float& operator[](int idx) const {
|
||||
return values[idx];
|
||||
};
|
||||
inline float& operator[](int idx) {
|
||||
return values[idx];
|
||||
};
|
||||
inline int64_t zero_mask() const {
|
||||
// returns an integer mask where all zero elements are translated to 1-bit and others are translated to 0-bit
|
||||
int64_t mask = 0;
|
||||
__at_align__ int32_t mask_array[size()];
|
||||
__at_align__ int32_t * mask_array = new int32_t[size()];
|
||||
|
||||
svbool_t svbool_mask = svcmpeq_f32(ptrue, values, ZERO_F32);
|
||||
svst1_s32(
|
||||
ptrue,
|
||||
mask_array,
|
||||
svsel_s32(svbool_mask, ALL_S32_TRUE_MASK, ALL_S32_FALSE_MASK));
|
||||
for (int64_t i = 0; i < size(); ++i) {
|
||||
if (mask_array[i])
|
||||
mask |= (1ull << i);
|
||||
svbool_t svbool_mask = svcmpeq_f32(ptrue, *this, ZERO_F32);
|
||||
svst1_s32(ptrue, mask_array, svsel_s32(svbool_mask,
|
||||
ALL_S32_TRUE_MASK,
|
||||
ALL_S32_FALSE_MASK));
|
||||
for (int64_t j = 0; j < size(); ++j) {
|
||||
if (mask_array[j]) mask |= (1ull << j);
|
||||
}
|
||||
delete[] mask_array;
|
||||
return mask;
|
||||
}
|
||||
Vectorized<float> isnan() const {
|
||||
inline Vectorized<float> isnan() const {
|
||||
// NaN check
|
||||
svbool_t mask = svcmpuo_f32(ptrue, values, ZERO_F32);
|
||||
auto mask = svcmpuo_f32(ptrue, *this, ZERO_F32);
|
||||
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
|
||||
}
|
||||
bool has_inf_nan() const {
|
||||
return svptest_any(
|
||||
ptrue,
|
||||
svcmpuo_f32(ptrue, svsub_f32_x(ptrue, values, values), ZERO_F32));
|
||||
inline bool has_inf_nan() const {
|
||||
return svptest_any(ptrue, svcmpuo_f32(ptrue, svsub_f32_x(ptrue, *this, *this), ZERO_F32));
|
||||
}
|
||||
Vectorized<float> map(float (*f)(float)) const {
|
||||
__at_align__ float tmp[size()];
|
||||
store(tmp);
|
||||
for (int64_t i = 0; i < size(); ++i) {
|
||||
tmp[i] = f(tmp[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
|
||||
inline Vectorized<float> abs() const {
|
||||
return svabs_f32_x(ptrue, *this);
|
||||
}
|
||||
Vectorized<float> abs() const {
|
||||
return svabs_f32_x(ptrue, values);
|
||||
}
|
||||
Vectorized<float> angle() const {
|
||||
inline Vectorized<float> angle() const {
|
||||
const auto nan_vec = svdup_n_f32(NAN);
|
||||
const auto nan_mask = svcmpuo_f32(ptrue, values, ZERO_F32);
|
||||
const auto nan_mask = svcmpuo_f32(ptrue, *this, ZERO_F32);
|
||||
const auto pi = svdup_n_f32(c10::pi<float>);
|
||||
|
||||
const auto neg_mask = svcmplt_f32(ptrue, values, ZERO_F32);
|
||||
const auto neg_mask = svcmplt_f32(ptrue, *this, ZERO_F32);
|
||||
auto angle = svsel_f32(neg_mask, pi, ZERO_F32);
|
||||
angle = svsel_f32(nan_mask, nan_vec, angle);
|
||||
return angle;
|
||||
return svsel_f32(nan_mask, nan_vec, angle);
|
||||
}
|
||||
Vectorized<float> real() const {
|
||||
return values;
|
||||
inline Vectorized<float> real() const {
|
||||
return *this;
|
||||
}
|
||||
Vectorized<float> imag() const {
|
||||
inline Vectorized<float> imag() const {
|
||||
return Vectorized<float>(0.f);
|
||||
}
|
||||
Vectorized<float> conj() const {
|
||||
return values;
|
||||
inline Vectorized<float> conj() const {
|
||||
return *this;
|
||||
}
|
||||
Vectorized<float> acos() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_acosfx_u10sve(values)), map(std::acos));
|
||||
inline Vectorized<float> acos() const {
|
||||
return USE_SLEEF(Sleef_acosfx_u10sve(*this), map(std::acos));
|
||||
}
|
||||
Vectorized<float> acosh() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_acoshfx_u10sve(values)), map(std::acosh));
|
||||
inline Vectorized<float> acosh() const {
|
||||
return USE_SLEEF(Sleef_acoshfx_u10sve(*this), map(std::acosh));
|
||||
}
|
||||
Vectorized<float> asin() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_asinfx_u10sve(values)), map(std::asin));
|
||||
inline Vectorized<float> asin() const {
|
||||
return USE_SLEEF(Sleef_asinfx_u10sve(*this), map(std::asin));
|
||||
}
|
||||
Vectorized<float> asinh() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_asinhfx_u10sve(values)), map(std::asinh));
|
||||
inline Vectorized<float> asinh() const {
|
||||
return USE_SLEEF(Sleef_asinhfx_u10sve(*this), map(std::asinh));
|
||||
}
|
||||
Vectorized<float> atan() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_atanfx_u10sve(values)), map(std::atan));
|
||||
inline Vectorized<float> atan() const {
|
||||
return USE_SLEEF(Sleef_atanfx_u10sve(*this), map(std::atan));
|
||||
}
|
||||
Vectorized<float> atanh() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_atanhfx_u10sve(values)), map(std::atanh));
|
||||
inline Vectorized<float> atanh() const {
|
||||
return USE_SLEEF(Sleef_atanhfx_u10sve(*this), map(std::atanh));
|
||||
}
|
||||
Vectorized<float> atan2(const Vectorized<float>& b) const {USE_SLEEF(
|
||||
{ return Vectorized<float>(Sleef_atan2fx_u10sve(values, b)); },
|
||||
{
|
||||
__at_align__ float tmp[size()];
|
||||
__at_align__ float tmp_b[size()];
|
||||
store(tmp);
|
||||
b.store(tmp_b);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = std::atan2(tmp[i], tmp_b[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
})} Vectorized<float> copysign(const Vectorized<float>& sign) const {
|
||||
|
||||
USE_SLEEF(
|
||||
{ return Vectorized<float>(Sleef_copysignfx_sve(values, sign)); },
|
||||
{
|
||||
__at_align__ float tmp[size()];
|
||||
__at_align__ float tmp_sign[size()];
|
||||
store(tmp);
|
||||
sign.store(tmp_sign);
|
||||
for (int64_t i = 0; i < size(); ++i) {
|
||||
tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
})} Vectorized<float> erf() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_erffx_u10sve(values)), map(std::erf));
|
||||
inline Vectorized<float> atan2(const Vectorized<float> &b) const {
|
||||
return USE_SLEEF(Sleef_atan2fx_u10sve(*this, b), map2(std::atan2, b));
|
||||
}
|
||||
Vectorized<float> erfc() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_erfcfx_u15sve(values)), map(std::erfc));
|
||||
inline Vectorized<float> copysign(const Vectorized<float> &sign) const {
|
||||
return USE_SLEEF(Sleef_copysignfx_sve(*this, sign), map2(std::copysign, sign));
|
||||
}
|
||||
Vectorized<float> erfinv() const {
|
||||
inline Vectorized<float> erf() const {
|
||||
return USE_SLEEF(Sleef_erffx_u10sve(*this), map(std::erf));
|
||||
}
|
||||
inline Vectorized<float> erfc() const {
|
||||
return USE_SLEEF(Sleef_erfcfx_u15sve(*this), map(std::erfc));
|
||||
}
|
||||
inline Vectorized<float> erfinv() const {
|
||||
return map(calc_erfinv);
|
||||
}
|
||||
Vectorized<float> exp() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_expfx_u10sve(values)), map(std::exp));
|
||||
inline Vectorized<float> exp() const {
|
||||
return USE_SLEEF(Sleef_expfx_u10sve(*this), map(std::exp));
|
||||
}
|
||||
Vectorized<float> exp2() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_exp2fx_u10sve(values)), map(std::exp2));
|
||||
inline Vectorized<float> exp2() const {
|
||||
return USE_SLEEF(Sleef_exp2fx_u10sve(*this), map(std::exp2));
|
||||
}
|
||||
Vectorized<float> expm1() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_expm1fx_u10sve(values)), map(std::expm1));
|
||||
inline Vectorized<float> expm1() const {
|
||||
return USE_SLEEF(Sleef_expm1fx_u10sve(*this), map(std::expm1));
|
||||
}
|
||||
// Implementation copied from Arm Optimized Routines:
|
||||
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/sve/expf.c
|
||||
Vectorized<float> exp_u20() const {
|
||||
return exp();
|
||||
// special case to handle special inputs that are too large or too small
|
||||
// i.e. where there's at least one element x, s.t. |x| >= 87.3...
|
||||
svbool_t is_special_case = svacgt (svptrue_b32(), *this, 0x1.5d5e2ap+6f);
|
||||
if (svptest_any (svptrue_b32(), is_special_case)) {
|
||||
return exp();
|
||||
}
|
||||
const svfloat32_t ln2_hi = svdup_n_f32(0x1.62e4p-1f);
|
||||
const svfloat32_t ln2_lo = svdup_n_f32(0x1.7f7d1cp-20f);
|
||||
const svfloat32_t c1 = svdup_n_f32(0.5f);
|
||||
const svfloat32_t inv_ln2 = svdup_n_f32(0x1.715476p+0f);
|
||||
|
||||
const float shift = 0x1.803f8p17f;
|
||||
|
||||
/* n = round(x/(ln2/N)). */
|
||||
svfloat32_t z = svmad_x (svptrue_b32(), inv_ln2, *this, shift);
|
||||
svfloat32_t n = svsub_x (svptrue_b32(), z, shift);
|
||||
|
||||
/* r = x - n*ln2/N. */
|
||||
svfloat32_t r = *this;
|
||||
r = svmls_x(svptrue_b32(), r, n, ln2_hi);
|
||||
r = svmls_x(svptrue_b32(), r, n, ln2_lo);
|
||||
|
||||
/* scale = 2^(n/N). */
|
||||
svfloat32_t scale = svexpa (svreinterpret_u32 (z));
|
||||
|
||||
/* poly(r) = exp(r) - 1 ~= r + 0.5 r^2. */
|
||||
svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
|
||||
svfloat32_t poly = svmla_x(svptrue_b32(), r, r2, c1);
|
||||
return svmla_x (svptrue_b32(), scale, scale, poly);
|
||||
}
|
||||
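The new `exp_u20` body is the FEXPA-based single-precision exponential from the Arm Optimized Routines. After bailing out to the accurate `exp()` path when any lane has |x| above roughly 87.3 (where float32 exp over/underflows), it performs the usual base-2 range reduction; roughly:

\[
e^{x} = 2^{n}\,e^{r}, \qquad n \approx \frac{x}{\ln 2}\ \text{(rounded via the shift trick)}, \qquad r = x - n\ln 2, \qquad e^{r} \approx 1 + r + \tfrac{1}{2}r^{2},
\]

so the final line returns scale + scale·(r + ½r²), with scale = 2ⁿ obtained from `svexpa` and ln 2 split into hi/lo parts to keep r accurate.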
Vectorized<float> fexp_u20() const {
|
||||
return exp();
|
||||
return exp_u20();
|
||||
}
|
||||
Vectorized<float> fmod(const Vectorized<float>& q) const {USE_SLEEF(
|
||||
{ return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
|
||||
{
|
||||
__at_align__ float tmp[size()];
|
||||
__at_align__ float tmp_q[size()];
|
||||
store(tmp);
|
||||
q.store(tmp_q);
|
||||
for (int64_t i = 0; i < size(); ++i) {
|
||||
tmp[i] = std::fmod(tmp[i], tmp_q[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
})} Vectorized<float> hypot(const Vectorized<float>& b) const {
|
||||
USE_SLEEF(
|
||||
{ return Vectorized<float>(Sleef_hypotfx_u05sve(values, b)); },
|
||||
{
|
||||
__at_align__ float tmp[size()];
|
||||
__at_align__ float tmp_b[size()];
|
||||
store(tmp);
|
||||
b.store(tmp_b);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = std::hypot(tmp[i], tmp_b[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
})} Vectorized<float> i0() const {
|
||||
inline Vectorized<float> fmod(const Vectorized<float>& q) const {
|
||||
return USE_SLEEF(Sleef_fmodfx_sve(*this, q), map2(std::fmod, q));
|
||||
}
|
||||
inline Vectorized<float> hypot(const Vectorized<float> &b) const {
|
||||
return USE_SLEEF(Sleef_hypotfx_u05sve(*this, b), map2(std::hypot, b));
|
||||
}
|
||||
inline Vectorized<float> i0() const {
|
||||
return map(calc_i0);
|
||||
}
|
||||
Vectorized<float> i0e() const {
|
||||
return map(calc_i0e);
|
||||
inline Vectorized<float> i0e() const {
|
||||
return map(calc_i0e<float>);
|
||||
}
|
||||
Vectorized<float> digamma() const {
|
||||
inline Vectorized<float> digamma() const {
|
||||
return map(calc_digamma);
|
||||
}
|
||||
Vectorized<float> igamma(const Vectorized<float>& x) const {
|
||||
__at_align__ float tmp[size()];
|
||||
__at_align__ float tmp_x[size()];
|
||||
store(tmp);
|
||||
x.store(tmp_x);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
inline Vectorized<float> igamma(const Vectorized<float> &x) const {
|
||||
return map2(calc_igamma<float>, x);
|
||||
}
|
||||
Vectorized<float> igammac(const Vectorized<float>& x) const {
|
||||
__at_align__ float tmp[size()];
|
||||
__at_align__ float tmp_x[size()];
|
||||
store(tmp);
|
||||
x.store(tmp_x);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
inline Vectorized<float> igammac(const Vectorized<float> &x) const {
|
||||
return map2(calc_igammac<float>, x);
|
||||
}
|
||||
Vectorized<float> nextafter(const Vectorized<float>& b) const {USE_SLEEF(
|
||||
{ return Vectorized<float>(Sleef_nextafterfx_sve(values, b)); },
|
||||
{
|
||||
__at_align__ float tmp[size()];
|
||||
__at_align__ float tmp_b[size()];
|
||||
store(tmp);
|
||||
b.store(tmp_b);
|
||||
for (int64_t i = 0; i < size(); ++i) {
|
||||
tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
})} Vectorized<float> log() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_logfx_u10sve(values)), map(std::log));
|
||||
inline Vectorized<float> nextafter(const Vectorized<float> &b) const {
|
||||
return USE_SLEEF(Sleef_nextafterfx_sve(*this, b), map2(std::nextafter, b));
|
||||
}
|
||||
Vectorized<float> log2() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_log2fx_u10sve(values)), map(std::log2));
|
||||
inline Vectorized<float> log() const {
|
||||
return USE_SLEEF(Sleef_logfx_u10sve(*this), map(std::log));
|
||||
}
|
||||
Vectorized<float> log10() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_log10fx_u10sve(values)), map(std::log10));
|
||||
inline Vectorized<float> log2() const {
|
||||
return USE_SLEEF(Sleef_log2fx_u10sve(*this), map(std::log2));
|
||||
}
|
||||
Vectorized<float> log1p() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_log1pfx_u10sve(values)), map(std::log1p));
|
||||
inline Vectorized<float> log10() const {
|
||||
return USE_SLEEF(Sleef_log10fx_u10sve(*this), map(std::log10));
|
||||
}
|
||||
Vectorized<float> frac() const;
|
||||
Vectorized<float> sin() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_sinfx_u10sve(values)), map(std::sin));
|
||||
inline Vectorized<float> log1p() const {
|
||||
return USE_SLEEF(Sleef_log1pfx_u10sve(*this), map(std::log1p));
|
||||
}
|
||||
Vectorized<float> sinh() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_sinhfx_u10sve(values)), map(std::sinh));
|
||||
inline Vectorized<float> frac() const;
|
||||
inline Vectorized<float> sin() const {
|
||||
return USE_SLEEF(Sleef_sinfx_u10sve(*this), map(std::sin));
|
||||
}
|
||||
Vectorized<float> cos() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_cosfx_u10sve(values)), map(std::cos));
|
||||
inline Vectorized<float> sinh() const {
|
||||
return USE_SLEEF(Sleef_sinhfx_u10sve(*this), map(std::sinh));
|
||||
}
|
||||
Vectorized<float> cosh() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_coshfx_u10sve(values)), map(std::cosh));
|
||||
inline Vectorized<float> cos() const {
|
||||
return USE_SLEEF(Sleef_cosfx_u10sve(*this), map(std::cos));
|
||||
}
|
||||
Vectorized<float> ceil() const {
|
||||
return svrintp_f32_x(ptrue, values);
|
||||
inline Vectorized<float> cosh() const {
|
||||
return USE_SLEEF(Sleef_coshfx_u10sve(*this), map(std::cosh));
|
||||
}
|
||||
Vectorized<float> floor() const {
|
||||
return svrintm_f32_x(ptrue, values);
|
||||
inline Vectorized<float> ceil() const {
|
||||
return svrintp_f32_x(ptrue, *this);
|
||||
}
|
||||
Vectorized<float> neg() const {
|
||||
return svneg_f32_x(ptrue, values);
|
||||
inline Vectorized<float> floor() const {
|
||||
return svrintm_f32_x(ptrue, *this);
|
||||
}
|
||||
Vectorized<float> round() const {
|
||||
return svrinti_f32_x(ptrue, values);
|
||||
inline Vectorized<float> neg() const {
|
||||
return svneg_f32_x(ptrue, *this);
|
||||
}
|
||||
Vectorized<float> tan() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_tanfx_u10sve(values)), map(std::tan));
|
||||
inline Vectorized<float> round() const {
|
||||
return svrinti_f32_x(ptrue, *this);
|
||||
}
|
||||
inline Vectorized<float> tan() const {
|
||||
return USE_SLEEF(Sleef_tanfx_u10sve(*this), map(std::tan));
|
||||
}
|
||||
// Implementation is picked from
|
||||
// https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L179
|
||||
Vectorized<float> tanh() const {
|
||||
inline Vectorized<float> tanh() const {
|
||||
// Constants used for the tanh calculation.
|
||||
const svfloat32_t CONST_1 =
|
||||
svdup_n_f32(1.f); // Constant 1.0f for the tanh formula.
|
||||
@ -450,7 +421,7 @@ class Vectorized<float> {
|
||||
// instability. svmax_f32_z ensures values are greater than -10, and
|
||||
// svmin_f32_z ensures they are less than 10.
|
||||
svfloat32_t x = svmin_f32_z(
|
||||
ptrue, svmax_f32_z(ptrue, values, CONST_MIN_TANH), CONST_MAX_TANH);
|
||||
ptrue, svmax_f32_z(ptrue, *this, CONST_MIN_TANH), CONST_MAX_TANH);
|
||||
|
||||
// Step 2: Calculate exp(2 * x), where x is the clamped value.
|
||||
// svmul_f32_z computes 2 * x, and svexp_f32_z computes the exponential of
|
||||
@ -472,104 +443,85 @@ class Vectorized<float> {
|
||||
// Return the calculated tanh values.
|
||||
return tanh;
|
||||
}
|
||||
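For reference, the SVE `tanh` above clamps the input to roughly [-10, 10] (outside of which tanh already saturates to ±1 in float32) and then evaluates the standard identity

\[
\tanh(x) = \frac{e^{2x} - 1}{e^{2x} + 1},
\]

using the vector exponential for the e^{2x} term.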
Vectorized<float> trunc() const {
|
||||
return svrintz_f32_x(ptrue, values);
|
||||
inline Vectorized<float> trunc() const {
|
||||
return svrintz_f32_x(ptrue, *this);
|
||||
}
|
||||
Vectorized<float> lgamma() const {
|
||||
return USE_SLEEF(
|
||||
Vectorized<float>(Sleef_lgammafx_u10sve(values)), map(std::lgamma));
|
||||
inline Vectorized<float> lgamma() const {
|
||||
return USE_SLEEF(Sleef_lgammafx_u10sve(*this), map(std::lgamma));
|
||||
}
|
||||
Vectorized<float> sqrt() const {
|
||||
return svsqrt_f32_x(ptrue, values);
|
||||
inline Vectorized<float> sqrt() const {
|
||||
return svsqrt_f32_x(ptrue, *this);
|
||||
}
|
||||
Vectorized<float> reciprocal() const {
|
||||
return svdivr_f32_x(ptrue, values, ONE_F32);
|
||||
inline Vectorized<float> reciprocal() const {
|
||||
return svdivr_f32_x(ptrue, *this, svdup_n_f32(1.f));
|
||||
}
|
||||
Vectorized<float> rsqrt() const {
|
||||
return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, values), ONE_F32);
|
||||
inline Vectorized<float> rsqrt() const {
|
||||
return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, *this), ONE_F32);
|
||||
}
|
||||
Vectorized<float> pow(const Vectorized<float>& b) const {USE_SLEEF(
|
||||
{ return Vectorized<float>(Sleef_powfx_u10sve(values, b)); },
|
||||
{
|
||||
__at_align__ float tmp[size()];
|
||||
__at_align__ float tmp_b[size()];
|
||||
store(tmp);
|
||||
b.store(tmp_b);
|
||||
for (int64_t i = 0; i < size(); i++) {
|
||||
tmp[i] = std::pow(tmp[i], tmp_b[i]);
|
||||
}
|
||||
return loadu(tmp);
|
||||
})} // Comparison using the _CMP_**_OQ predicate.
|
||||
// `O`: get false if an operand is NaN
|
||||
// `Q`: do not raise if an operand is NaN
|
||||
Vectorized<float> operator==(const Vectorized<float>& other) const {
|
||||
svbool_t mask = svcmpeq_f32(ptrue, values, other);
|
||||
inline Vectorized<float> pow(const Vectorized<float> &b) const {
|
||||
return USE_SLEEF(Sleef_powfx_u10sve(*this, b), map(std::pow, b));
|
||||
}
|
||||
// Comparison using the _CMP_**_OQ predicate.
|
||||
// `O`: get false if an operand is NaN
|
||||
// `Q`: do not raise if an operand is NaN
|
||||
inline Vectorized<float> operator==(const Vectorized<float>& other) const {
|
||||
svbool_t mask = svcmpeq_f32(ptrue, *this, other);
|
||||
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
|
||||
}
|
||||
inline Vectorized<float> operator!=(const Vectorized<float>& other) const {
|
||||
svbool_t mask = svcmpne_f32(ptrue, *this, other);
|
||||
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
|
||||
}
|
||||
inline Vectorized<float> operator<(const Vectorized<float>& other) const {
|
||||
svbool_t mask = svcmplt_f32(ptrue, *this, other);
|
||||
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
|
||||
}
|
||||
|
||||
Vectorized<float> operator!=(const Vectorized<float>& other) const {
|
||||
svbool_t mask = svcmpne_f32(ptrue, values, other);
|
||||
inline Vectorized<float> operator<=(const Vectorized<float>& other) const {
|
||||
svbool_t mask = svcmple_f32(ptrue, *this, other);
|
||||
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
|
||||
}
|
||||
|
||||
Vectorized<float> operator<(const Vectorized<float>& other) const {
|
||||
svbool_t mask = svcmplt_f32(ptrue, values, other);
|
||||
inline Vectorized<float> operator>(const Vectorized<float>& other) const {
|
||||
svbool_t mask = svcmpgt_f32(ptrue, *this, other);
|
||||
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
|
||||
}
|
||||
|
||||
Vectorized<float> operator<=(const Vectorized<float>& other) const {
|
||||
svbool_t mask = svcmple_f32(ptrue, values, other);
|
||||
inline Vectorized<float> operator>=(const Vectorized<float>& other) const {
|
||||
svbool_t mask = svcmpge_f32(ptrue, *this, other);
|
||||
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
|
||||
}
|
||||
|
||||
Vectorized<float> operator>(const Vectorized<float>& other) const {
|
||||
svbool_t mask = svcmpgt_f32(ptrue, values, other);
|
||||
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
|
||||
}
|
||||
|
||||
Vectorized<float> operator>=(const Vectorized<float>& other) const {
|
||||
svbool_t mask = svcmpge_f32(ptrue, values, other);
|
||||
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
|
||||
}
|
||||
|
||||
Vectorized<float> eq(const Vectorized<float>& other) const;
|
||||
Vectorized<float> ne(const Vectorized<float>& other) const;
|
||||
Vectorized<float> gt(const Vectorized<float>& other) const;
|
||||
Vectorized<float> ge(const Vectorized<float>& other) const;
|
||||
Vectorized<float> lt(const Vectorized<float>& other) const;
|
||||
Vectorized<float> le(const Vectorized<float>& other) const;
|
||||
inline Vectorized<float> eq(const Vectorized<float>& other) const;
|
||||
inline Vectorized<float> ne(const Vectorized<float>& other) const;
|
||||
inline Vectorized<float> gt(const Vectorized<float>& other) const;
|
||||
inline Vectorized<float> ge(const Vectorized<float>& other) const;
|
||||
inline Vectorized<float> lt(const Vectorized<float>& other) const;
|
||||
inline Vectorized<float> le(const Vectorized<float>& other) const;
|
||||
};
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline operator+(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b) {
|
||||
inline Vectorized<float> operator+(const Vectorized<float>& a, const Vectorized<float>& b) {
|
||||
return svadd_f32_x(ptrue, a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline operator-(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b) {
|
||||
inline Vectorized<float> operator-(const Vectorized<float>& a, const Vectorized<float>& b) {
|
||||
return svsub_f32_x(ptrue, a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline operator*(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b) {
|
||||
inline Vectorized<float> operator*(const Vectorized<float>& a, const Vectorized<float>& b) {
|
||||
return svmul_f32_x(ptrue, a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline operator/(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b) {
|
||||
inline Vectorized<float> operator/(const Vectorized<float>& a, const Vectorized<float>& b) {
|
||||
return svdiv_f32_x(ptrue, a, b);
|
||||
}
|
||||
|
||||
// frac. Implement this here so we can use subtraction
|
||||
Vectorized<float> inline Vectorized<float>::frac() const {
|
||||
inline Vectorized<float> Vectorized<float>::frac() const {
|
||||
return *this - this->trunc();
|
||||
}
|
||||
|
||||
@ -585,115 +537,91 @@ Vectorized<float> inline maximum(
|
||||
// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
|
||||
// either input is a NaN.
|
||||
template <>
|
||||
Vectorized<float> inline minimum(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& b) {
|
||||
inline Vectorized<float> minimum(const Vectorized<float>& a, const Vectorized<float>& b) {
|
||||
return svmin_f32_x(ptrue, a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline clamp(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& min,
|
||||
const Vectorized<float>& max) {
|
||||
inline Vectorized<float> clamp(const Vectorized<float>& a, const Vectorized<float>& min, const Vectorized<float>& max) {
|
||||
return svmin_f32_x(ptrue, max, svmax_f32_x(ptrue, min, a));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline clamp_max(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& max) {
|
||||
inline Vectorized<float> clamp_max(const Vectorized<float>& a, const Vectorized<float>& max) {
|
||||
return svmin_f32_x(ptrue, max, a);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<float> inline clamp_min(
|
||||
const Vectorized<float>& a,
|
||||
const Vectorized<float>& min) {
|
||||
inline Vectorized<float> clamp_min(const Vectorized<float>& a, const Vectorized<float>& min) {
return svmax_f32_x(ptrue, min, a);
}

template <>
Vectorized<float> inline operator&(
const Vectorized<float>& a,
const Vectorized<float>& b) {
return svreinterpret_f32_s32(
svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
inline Vectorized<float> operator&(const Vectorized<float>& a, const Vectorized<float>& b) {
return svreinterpret_f32_s32(svand_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
}

template <>
Vectorized<float> inline operator|(
const Vectorized<float>& a,
const Vectorized<float>& b) {
return svreinterpret_f32_s32(
svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
inline Vectorized<float> operator|(const Vectorized<float>& a, const Vectorized<float>& b) {
return svreinterpret_f32_s32(svorr_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
}

template <>
Vectorized<float> inline operator^(
const Vectorized<float>& a,
const Vectorized<float>& b) {
return svreinterpret_f32_s32(
sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
inline Vectorized<float> operator^(const Vectorized<float>& a, const Vectorized<float>& b) {
return svreinterpret_f32_s32(sveor_s32_x(ptrue, svreinterpret_s32_f32(a), svreinterpret_s32_f32(b)));
}

Vectorized<float> inline Vectorized<float>::eq(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::eq(const Vectorized<float>& other) const {
return (*this == other) & Vectorized<float>(1.0f);
}

Vectorized<float> inline Vectorized<float>::ne(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::ne(const Vectorized<float>& other) const {
return (*this != other) & Vectorized<float>(1.0f);
}

Vectorized<float> inline Vectorized<float>::gt(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::gt(const Vectorized<float>& other) const {
return (*this > other) & Vectorized<float>(1.0f);
}

Vectorized<float> inline Vectorized<float>::ge(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::ge(const Vectorized<float>& other) const {
return (*this >= other) & Vectorized<float>(1.0f);
}

Vectorized<float> inline Vectorized<float>::lt(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::lt(const Vectorized<float>& other) const {
return (*this < other) & Vectorized<float>(1.0f);
}

Vectorized<float> inline Vectorized<float>::le(
const Vectorized<float>& other) const {
inline Vectorized<float> Vectorized<float>::le(const Vectorized<float>& other) const {
return (*this <= other) & Vectorized<float>(1.0f);
}

template <>
inline void convert(const float* src, float* dst, int64_t n) {
const int64_t fraction = n % Vectorized<float>::size();
const int64_t fraction = n % svcntw();
#pragma unroll
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
svst1_f32(ptrue, dst + i, svldnt1_f32(ptrue, src + i));
}
#pragma unroll
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
for (int64_t i = n - fraction; i < n; i += svcntw()) {
svbool_t pg = svwhilelt_b32(i, n);
svst1_f32(pg, dst + i, svldnt1_f32(pg, src + i));
}
}

template <>
inline void convert(const float* src, at::Half* dst, int64_t n) {
const int64_t fraction = n % Vectorized<float>::size();
svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
inline void convert(const float *src, at::Half *dst, int64_t n) {
const int64_t fraction = n % svcntw();
svbool_t pg_16 = svwhilelt_b16(0ull, svcntw());
svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
#pragma unroll
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
svfloat16_t src_vec = svuzp1_f16(
svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)), ZERO_F16);
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
svfloat16_t src_vec = svuzp1_f16(svcvt_f16_f32_x(ptrue, svldnt1_f32(pg_32, src + i)),
ZERO_F16);
svst1_f16(pg_16, reinterpret_cast<float16_t*>(dst) + i, src_vec);
}
#pragma unroll
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
for (int64_t i = n - fraction; i < n; i += svcntw()) {
pg_16 = svwhilelt_b16(i, n);
pg_32 = svwhilelt_b32(i, n);
svfloat16_t src_vec = svuzp1_f16(
@@ -703,19 +631,18 @@ inline void convert(const float* src, at::Half* dst, int64_t n) {
}

template <>
inline void convert(const at::Half* src, float* dst, int64_t n) {
const int64_t fraction = n % Vectorized<float>::size();
svbool_t pg_16 = svwhilelt_b16(0ull, Vectorized<float>::size());
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
inline void convert(const at::Half *src, float *dst, int64_t n) {
const int64_t fraction = n % svcntw();
svbool_t pg_16 = svwhilelt_b16(0ull, svcntw());
svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
#pragma unroll
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
svfloat16_t src_vec = svzip1_f16(
svldnt1_f16(pg_16, reinterpret_cast<const float16_t*>(src) + i),
ZERO_F16);
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
svfloat16_t src_vec = svzip1_f16(svldnt1_f16(pg_16, reinterpret_cast<const float16_t*>(src) + i),
ZERO_F16);
svst1_f32(pg_32, dst + i, svcvt_f32_f16_x(ptrue, src_vec));
}
#pragma unroll
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
for (int64_t i = n - fraction; i < n; i += svcntw()) {
pg_16 = svwhilelt_b16(i, n);
pg_32 = svwhilelt_b32(i, n);
svfloat16_t src_vec = svzip1_f16(
@@ -726,20 +653,19 @@ inline void convert(const at::Half* src, float* dst, int64_t n) {
}

template <>
inline void convert(const bool* src, float* dst, int64_t n) {
const int64_t fraction = n % Vectorized<float>::size();
svbool_t pg_8 = svwhilelt_b8(0ull, Vectorized<float>::size());
svbool_t pg_32 = svwhilelt_b32(0ull, Vectorized<float>::size());
inline void convert(const bool *src, float *dst, int64_t n) {
const int64_t fraction = n % svcntw();
svbool_t pg_8 = svwhilelt_b8(0ull, svcntw());
svbool_t pg_32 = svwhilelt_b32(0ull, svcntw());
#pragma unroll
for (int64_t i = 0; i < n - fraction; i += Vectorized<float>::size()) {
svuint8_t src_vec_u8 =
svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
for (int64_t i = 0; i < n - fraction; i += svcntw()) {
svuint8_t src_vec_u8 = svldnt1_u8(pg_8, reinterpret_cast<const uint8_t*>(src) + i);
svuint32_t src_vec_u32 = svunpklo_u32(svunpklo_u16(src_vec_u8));
svbool_t mask = svcmpne_u32(pg_32, src_vec_u32, ZERO_U32);
svst1_f32(pg_32, dst + i, svsel_f32(mask, ONE_F32, ZERO_F32));
}
#pragma unroll
for (int64_t i = n - fraction; i < n; i += Vectorized<float>::size()) {
for (int64_t i = n - fraction; i < n; i += svcntw()) {
pg_8 = svwhilelt_b8(i, n);
pg_32 = svwhilelt_b32(i, n);
svuint8_t src_vec_u8 =
@@ -751,10 +677,7 @@ inline void convert(const bool* src, float* dst, int64_t n) {
}

template <>
Vectorized<float> inline fmadd(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& c) {
inline Vectorized<float> fmadd(const Vectorized<float>& a, const Vectorized<float>& b, const Vectorized<float>& c) {
return svmad_f32_x(ptrue, a, b, c);
}

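The convert() hunks above all move from the fixed Vectorized<float>::size() stride to the runtime vector length svcntw() and handle the remainder with a predicate. A minimal sketch of that predicated-tail pattern, written independently of the diff (plain svld1/svst1 rather than the non-temporal loads used above), looks like this:

#include <arm_sve.h>
#include <cstdint>

// Copy n floats with a single SVE loop. svwhilelt_b32 builds a predicate
// that is all-true for full vectors and partially true on the final tail,
// so no separate remainder loop is needed.
void copy_f32(const float* src, float* dst, int64_t n) {
  for (int64_t i = 0; i < n; i += svcntw()) {
    svbool_t pg = svwhilelt_b32(i, n);
    svst1_f32(pg, dst + i, svld1_f32(pg, src + i));
  }
}
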
@@ -15,7 +15,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)

#define VEC_INT_SVE_TEMPLATE(vl, bit) \
template <> \
@@ -49,10 +49,11 @@ inline namespace CPU_CAPABILITY {
operator svint##bit##_t() const { \
return values; \
} \
template <uint64_t mask> \
static Vectorized<int##bit##_t> blend( \
const Vectorized<int##bit##_t>& a, \
const Vectorized<int##bit##_t>& b) { \
const Vectorized<int##bit##_t>& b, \
uint64_t mask \
) { \
__at_align__ int##bit##_t flag_arr[size()]; \
for (int i = 0; i < size(); ++i) { \
flag_arr[i] = (i < 64 && (mask & (1ULL << i))) ? 1 : 0; \
@@ -493,7 +494,7 @@ Vectorized<int8_t> inline operator>>(
return svasr_s8_x(ptrue, a, svreinterpret_u8_s8(b));
}

#endif // defined(CPU_CAPABILITY_SVE)
#endif // defined(CPU_CAPABILITY_SVE256)

} // namespace CPU_CAPABILITY
} // namespace at::vec

@@ -46,7 +46,7 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {

#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)

// NOTE: These are low-performance implementations that we fall back on
// if we are not building with SVE. This may not be an issue, because
@@ -100,12 +100,12 @@ struct VectorizedQuantizedConverter {
Vectorized<float> zero_point,
Vectorized<float> scale_zp_premul) const {
float_vec_return_type rv;
float tmp_scale[Vectorized<float>::size()];
float tmp_zero_point[Vectorized<float>::size()];
float * tmp_scale = new float[Vectorized<float>::size()];
float * tmp_zero_point = new float[Vectorized<float>::size()];
scale.store(tmp_scale);
zero_point.store(tmp_zero_point);
for (int i = 0; i < float_num_vecs(); ++i) {
float tmp_vals[Vectorized<float>::size()];
float * tmp_vals = new float[Vectorized<float>::size()];
for (int j = 0; j < Vectorized<float>::size(); ++j) {
tmp_vals[j] = at::native::dequantize_val<T>(
tmp_scale[j],
@@ -113,7 +113,11 @@ struct VectorizedQuantizedConverter {
T(vals[Vectorized<float>::size() * i + j]));
}
rv[i] = Vectorized<float>::loadu(tmp_vals);

delete[] tmp_vals;
}
delete[] tmp_scale;
delete[] tmp_zero_point;
return rv;
}

@@ -121,12 +125,12 @@ struct VectorizedQuantizedConverter {
Vectorized<float> scale,
Vectorized<float> zero_point) const {
float_vec_return_type rv;
float tmp_scale[Vectorized<float>::size()];
float tmp_zero_point[Vectorized<float>::size()];
float * tmp_scale = new float[Vectorized<float>::size()];
float * tmp_zero_point = new float[Vectorized<float>::size()];
scale.store(tmp_scale);
zero_point.store(tmp_zero_point);
for (int i = 0; i < float_num_vecs(); ++i) {
float tmp_vals[Vectorized<float>::size()];
float * tmp_vals = new float[Vectorized<float>::size()];
for (int j = 0; j < Vectorized<float>::size(); ++j) {
tmp_vals[j] = at::native::dequantize_val<T>(
tmp_scale[j],
@@ -134,7 +138,10 @@ struct VectorizedQuantizedConverter {
T(vals[Vectorized<float>::size() * i + j]));
}
rv[i] = Vectorized<float>::loadu(tmp_vals);
delete[] tmp_vals;
}
delete[] tmp_scale;
delete[] tmp_zero_point;
return rv;
}

@@ -205,7 +212,7 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
int32_t zero_point,
float inverse_scale) {
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
float * float_vals = new float[float_num_vecs() * Vectorized<float>::size()];

for (int i = 0; i < float_num_vecs(); ++i) {
rhs[i].store(
@@ -216,10 +223,11 @@ struct Vectorized<c10::qint32> : public VectorizedQuantizedConverter<
at::native::quantize_vec<c10::qint32, /*precision=*/32>(
scale,
zero_point,
float_vals.data(),
float_vals,
(c10::qint32*)qvals.data(),
Vectorized<float>::size() * float_num_vecs());

delete[] float_vals;
return Vectorized<c10::qint32>::loadu(qvals.data());
}

@@ -359,7 +367,7 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
int32_t zero_point,
float inverse_scale) {
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
float * float_vals = new float[float_num_vecs() * Vectorized<float>::size()];

for (int i = 0; i < float_num_vecs(); ++i) {
rhs[i].store(
@@ -370,10 +378,11 @@ struct Vectorized<c10::qint8> : public VectorizedQuantizedConverter<
at::native::quantize_vec<c10::qint8>(
scale,
zero_point,
float_vals.data(),
float_vals,
(c10::qint8*)qvals.data(),
Vectorized<float>::size() * float_num_vecs());

delete[] float_vals;
return Vectorized<c10::qint8>::loadu(qvals.data());
}

@@ -511,7 +520,7 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
int32_t zero_point,
float inverse_scale) {
std::array<value_type, size()> qvals;
std::array<float, float_num_vecs() * Vectorized<float>::size()> float_vals;
float * float_vals = new float[float_num_vecs() * Vectorized<float>::size()];

for (int i = 0; i < float_num_vecs(); ++i) {
rhs[i].store(
@@ -522,10 +531,11 @@ struct Vectorized<c10::quint8> : public VectorizedQuantizedConverter<
at::native::quantize_vec<c10::quint8>(
scale,
zero_point,
float_vals.data(),
float_vals,
(c10::quint8*)qvals.data(),
Vectorized<float>::size() * float_num_vecs());

delete[] float_vals;
return Vectorized<c10::quint8>::loadu(qvals.data());
}

@@ -600,7 +610,7 @@ Vectorized<c10::quint8> inline maximum(
return a.maximum(b);
}

#endif // defined(CPU_CAPABILITY_SVE)
#endif // defined(CPU_CAPABILITY_SVE256)

} // namespace CPU_CAPABILITY
} // namespace at::vec

@@ -4,7 +4,9 @@
#include <ATen/cpu/vec/intrinsics.h>

#ifdef __aarch64__
#if !defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
#include <ATen/cpu/vec/sve/vec_common_sve.h>
#else
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>

@@ -241,7 +241,7 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
Vectorized() = default;

Vectorized(c10::BFloat16 val)
: Vectorized16(at_vdupq_n_bf16(c10::bit_cast<at_bfloat16_t>(val.x))) {}
: Vectorized16(at_vdupq_n_bf16(val.x)) {}
Vectorized(float val) : Vectorized(c10::BFloat16(val)) {}
Vectorized(
value_type val0,
@@ -253,14 +253,14 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
value_type val6,
value_type val7)
: Vectorized16(at_bfloat16x8_t{
c10::bit_cast<at_bfloat16_t>(val0.x),
c10::bit_cast<at_bfloat16_t>(val1.x),
c10::bit_cast<at_bfloat16_t>(val2.x),
c10::bit_cast<at_bfloat16_t>(val3.x),
c10::bit_cast<at_bfloat16_t>(val4.x),
c10::bit_cast<at_bfloat16_t>(val5.x),
c10::bit_cast<at_bfloat16_t>(val6.x),
c10::bit_cast<at_bfloat16_t>(val7.x)}) {}
val0.x,
val1.x,
val2.x,
val3.x,
val4.x,
val5.x,
val6.x,
val7.x}) {}

static Vectorized<c10::BFloat16> blendv(
const Vectorized<c10::BFloat16>& a,

@@ -4,7 +4,7 @@

namespace at::vec {
inline namespace CPU_CAPABILITY {
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE))
template <typename src_t>
struct VecConvert<
float,

@@ -41,32 +41,16 @@ inline namespace CPU_CAPABILITY {
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
#endif

template <int index, bool mask_val>
template <int index>
struct BlendRegs {
static float32x4_t impl(
const float32x4_t& a,
const float32x4_t& b,
float32x4_t& res);
};

template <int index>
struct BlendRegs<index, true> {
static float32x4_t impl(
const float32x4_t& a,
const float32x4_t& b,
float32x4_t& res) {
return vsetq_lane_f32(vgetq_lane_f32(b, index), res, index);
}
};

template <int index>
struct BlendRegs<index, false> {
static float32x4_t impl(
const float32x4_t& a,
const float32x4_t& b,
float32x4_t& res) {
return vsetq_lane_f32(vgetq_lane_f32(a, index), res, index);
}
float32x4_t& res,
bool mask_val
) {
return vsetq_lane_f32(vgetq_lane_f32(mask_val ? b : a, index), res, index);
}
};

template <>
@@ -94,19 +78,15 @@ class Vectorized<float> {
operator float32x4_t() const {
return values;
}
template <int64_t mask>
static Vectorized<float> blend(
const Vectorized<float>& a,
const Vectorized<float>& b) {
const Vectorized<float>& b,
int64_t mask) {
Vectorized<float> vec;
vec.values = BlendRegs < 0,
(mask & 0x01) != 0 > ::impl(a.values, b.values, vec.values);
vec.values = BlendRegs < 1,
(mask & 0x02) != 0 > ::impl(a.values, b.values, vec.values);
vec.values = BlendRegs < 2,
(mask & 0x04) != 0 > ::impl(a.values, b.values, vec.values);
vec.values = BlendRegs < 3,
(mask & 0x08) != 0 > ::impl(a.values, b.values, vec.values);
vec.values = BlendRegs <0>::impl(a.values, b.values, vec.values, (mask & 0x01) != 0);
vec.values = BlendRegs <1> ::impl(a.values, b.values, vec.values, (mask & 0x02) != 0);
vec.values = BlendRegs <2> ::impl(a.values, b.values, vec.values, (mask & 0x04) != 0);
vec.values = BlendRegs <3> ::impl(a.values, b.values, vec.values, (mask & 0x08) != 0);
return vec;
}
static Vectorized<float> blendv(
@@ -307,11 +287,48 @@ class Vectorized<float> {
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp)
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(exp2)
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
// Implementation copied from Arm Optimized Routine https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
Vectorized<float> exp_u20() const {
return exp();
// bail out to sleef if it's a special case:
// i.e. there's an input s.t. |input| > 87.3....
const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
uint32x4_t cmp = vcagtq_f32 (values, special_bound);
if (vpaddd_u64 (vreinterpretq_u64_u32 (cmp)) != 0) {
return exp();
}

const float32x4_t inv_ln2 = vdupq_n_f32(0x1.715476p+0f);
const float ln2_hi = 0x1.62e4p-1f;
const float ln2_lo = 0x1.7f7d1cp-20f;
const float c0 = 0x1.0e4020p-7f;
const float c2 = 0x1.555e66p-3f;
const float32x4_t ln2_c02 = {ln2_hi, ln2_lo, c0, c2};

const uint32x4_t exponent_bias = vdupq_n_u32(0x3f800000);
const float32x4_t c1 = vdupq_n_f32(0x1.573e2ep-5f);
const float32x4_t c3 = vdupq_n_f32(0x1.fffdb6p-2f);
const float32x4_t c4 = vdupq_n_f32(0x1.ffffecp-1f);

/* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
x = ln2*n + r, with r in [-ln2/2, ln2/2]. */

float32x4_t n = vrndaq_f32 (vmulq_f32 (values, inv_ln2));
float32x4_t r = vfmsq_laneq_f32 (values, n, ln2_c02, 0);
r = vfmsq_laneq_f32 (r, n, ln2_c02, 1);
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtq_s32_f32 (n)), 23);
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, exponent_bias));

float32x4_t r2 = vmulq_f32 (r, r);
float32x4_t p = vfmaq_laneq_f32 (c1, r, ln2_c02, 2);
float32x4_t q = vfmaq_laneq_f32 (c3, r, ln2_c02, 3);
q = vfmaq_f32 (q, p, r2);
p = vmulq_f32 (c4, r);
float32x4_t poly = vfmaq_f32 (p, q, r2);

return vfmaq_f32 (scale, poly, scale);
}
Vectorized<float> fexp_u20() const {
return exp();
return exp_u20();
}
DEFINE_SLEEF_COMPATIBLE_BINARY_ELEMENTWISE_FUNC_WITH_SLEEF_NAME(
fmod,

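The exp_u20 routine added above follows the range-reduction scheme spelled out in its comment: write x = n*ln2 + r with r in [-ln2/2, ln2/2], approximate exp(r) with a short polynomial, and rebuild exp(x) = 2^n * exp(r) by adjusting the exponent bits. A scalar illustration of the same idea (with plain Taylor coefficients rather than the tuned constants used in the vector kernel):

#include <cmath>

// Scalar sketch of the range reduction used by exp_u20; not the NEON code itself.
float exp_range_reduced(float x) {
  const float inv_ln2 = 1.4426950408889634f;   // 1 / ln(2)
  const float ln2     = 0.6931471805599453f;
  float n = std::nearbyint(x * inv_ln2);       // n = round(x / ln2)
  float r = x - n * ln2;                       // r in [-ln2/2, ln2/2]
  // exp(r) ~= 1 + r + r^2/2 + r^3/6 + r^4/24 on the reduced interval.
  float poly = 1.0f + r * (1.0f + r * (0.5f + r * (1.0f / 6.0f + r * (1.0f / 24.0f))));
  return std::ldexp(poly, static_cast<int>(n)); // scale by 2^n
}
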
@@ -813,11 +813,12 @@ static inline Vectorized<T> binary_op_as_fp32(
#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
inline void load_fp32_from_##name( \
const type* data, Vectorized<float>& out) { \
__at_align__ float values[Vectorized<float>::size()]; \
__at_align__ float * values = new float[Vectorized<float>::size()]; \
for (const auto k : c10::irange(Vectorized<float>::size())) { \
values[k] = data[k]; \
} \
out = Vectorized<float>::loadu(values); \
delete[] values; \
} \
\
inline void load_fp32_from_##name( \

@@ -269,12 +269,13 @@ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16)
#else // defined(CPU_CAPABILITY_AVX2)

#if !( \
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE256))
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__))
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
#endif

#if !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16)
#endif
#endif // defined(CPU_CAPABILITY_AVX2)
} // namespace CPU_CAPABILITY
} // namespace at::vec

@@ -294,7 +294,7 @@ struct VecConvert<
};
#endif

#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
#if (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)

template <>
struct VecConvert<float, 1, BFloat16, 1> {

@@ -270,7 +270,7 @@ LOAD_FP32_VECTORIZED_INIT(Half, fp16)

#if !( \
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE256))
!defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE))
CONVERT_NON_VECTORIZED_INIT(Half, half)
#endif

@@ -915,7 +915,7 @@ Vectorized<c10::quint8> inline maximum(
return a.maximum(b);
}

#elif !defined(CPU_CAPABILITY_SVE256)
#elif !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)

// NOTE: These are low-performance implementations that we fall back on
// if we are not building with AVX2. This may not be an issue, because
@@ -1374,11 +1374,11 @@ Vectorized<c10::quint8> inline maximum(

#endif // if defined(CPU_CAPABILITY_AVX2)

#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
at::vec::Vectorized<int8_t> src) {
auto s8x8 = vld1_s8(src.operator const int8_t*());
auto s16x8 = vmovl_s8(s8x8);
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
std::pair<Vectorized<float>, Vectorized<float>>
inline convert_int8_to_float(at::vec::Vectorized<int8_t> src) {
auto s8x8 = vld1_s8(src.operator const int8_t*());
auto s16x8 = vmovl_s8(s8x8);

auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));

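The AVX512 and generic hunks that follow make the same interface change already seen in the NEON and SVE files: blend() takes its selection mask as a runtime int64_t argument instead of a template parameter. The mask semantics are unchanged; bit i picks lane i of b over lane i of a, as in this generic sketch (the vec_base.h fallback further down does essentially the same loop):

#include <cstdint>

// Illustration only: scalar lane-by-lane blend with a runtime bit mask.
template <typename T, int N>
void blend_lanes(const T (&a)[N], const T (&b)[N], T (&out)[N], int64_t mask) {
  for (int i = 0; i < N; ++i) {
    out[i] = (mask & (int64_t{1} << i)) ? b[i] : a[i];
  }
}
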
@@ -292,8 +292,7 @@ class Vectorized16 {
_mm512_mask_storeu_epi16(ptr, mask, values);
}
}
template <int64_t mask>
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b, int64_t mask) {
return _mm512_mask_blend_epi16(mask, a.values, b.values);
}
static Vectorized<T> blendv(

@@ -69,10 +69,10 @@ class Vectorized<c10::complex<double>> {
operator __m512d() const {
return values;
}
template <int64_t mask>
static Vectorized<c10::complex<double>> blend(
const Vectorized<c10::complex<double>>& a,
const Vectorized<c10::complex<double>>& b) {
const Vectorized<c10::complex<double>>& b,
int64_t mask) {
// convert c10::complex<V> index mask to V index mask: xy -> xxyy
// NOLINTNEXTLINE(clang-diagnostic-warning)
switch (mask) {

@@ -89,10 +89,10 @@ class Vectorized<c10::complex<float>> {
operator __m512() const {
return values;
}
template <int64_t mask>
static Vectorized<c10::complex<float>> blend(
const Vectorized<c10::complex<float>>& a,
const Vectorized<c10::complex<float>>& b) {
const Vectorized<c10::complex<float>>& b,
int64_t mask) {
// convert c10::complex<V> index mask to V index mask: xy -> xxyy
static_assert(mask > -1 && mask < 256, "Unexpected mask value");
// The compiler would hopefully convert this switch condition

@@ -55,10 +55,10 @@ class Vectorized<double> {
operator __m512d() const {
return values;
}
template <int64_t mask>
static Vectorized<double> blend(
const Vectorized<double>& a,
const Vectorized<double>& b) {
const Vectorized<double>& b,
int64_t mask) {
return _mm512_mask_blend_pd(mask, a.values, b.values);
}
static Vectorized<double> blendv(

@@ -95,10 +95,10 @@ class Vectorized<float> {
operator __m512() const {
return values;
}
template <int64_t mask>
static Vectorized<float> blend(
const Vectorized<float>& a,
const Vectorized<float>& b) {
const Vectorized<float>& b,
int64_t mask) {
return _mm512_mask_blend_ps(mask, a.values, b.values);
}
static Vectorized<float> blendv(

@@ -528,10 +528,10 @@ class Vectorized<int16_t> : public Vectorizedi {
val2,
val1);
}
template <int64_t mask>
static Vectorized<int16_t> blend(
Vectorized<int16_t> a,
Vectorized<int16_t> b) {
Vectorized<int16_t> b,
int64_t mask) {
return _mm512_mask_blend_epi16(mask, a.values, b.values);
}
static Vectorized<int16_t> blendv(

@@ -68,7 +68,7 @@ Windows llvm will not have this definition.
#define VECTOR_WIDTH 64
#define int_vector __m512i
#elif defined(__aarch64__) && \
!defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512
!defined(CPU_CAPABILITY_SVE) && !defined(CPU_CAPABILITY_SVE256) // CPU_CAPABILITY_AVX512
// SVE code expects 256-vectors; leave that set for SVE?
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(16)))
@@ -79,6 +79,18 @@ Windows llvm will not have this definition.
#endif
#define VECTOR_WIDTH 16
#else // CPU_CAPABILITY_AVX512
#if defined(CPU_CAPABILITY_SVE)
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(16)))
#elif defined(_WIN32)
#define __at_align__ __declspec(align(16))
#else
#define __at_align__
#endif
#define VECTOR_WIDTH 16
#define int_vector __m256i
#else // CPU_CAPABILITY_SVE256 || CPU_CAPABILITY_SVE
#if defined(CPU_CAPABILITY_SVE256)
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(32)))
#elif defined(_WIN32)
@@ -88,6 +100,18 @@ Windows llvm will not have this definition.
#endif
#define VECTOR_WIDTH 32
#define int_vector __m256i
#else // CPU_CAPABILITY_SVE
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(16)))
#elif defined(_WIN32)
#define __at_align__ __declspec(align(16))
#else
#define __at_align__
#endif
#define VECTOR_WIDTH 16
#define int_vector __m256i
#endif // CPU_CAPABILITY_SVE256
#endif // CPU_CAPABILITY_SVE256 || CPU_CAPABILITY_SVE
#endif // CPU_CAPABILITY_AVX512

namespace at::vec {
@@ -210,8 +234,7 @@ struct Vectorized {
auto as_bytes() const -> const char* {
return reinterpret_cast<const char*>(values);
}
template <int64_t mask_>
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b) {
static Vectorized<T> blend(const Vectorized<T>& a, const Vectorized<T>& b, const int64_t mask_) {
int64_t mask = mask_;
Vectorized vector;
for (const auto i : c10::irange(size())) {
@@ -1312,7 +1335,7 @@ std::
T const* base_addr,
const Vectorized<int_same_size_t<T>>& vindex,
Vectorized<T>& mask) {
static constexpr int size = Vectorized<T>::size();
static const int size = Vectorized<T>::size();
T src_arr[size];
int_same_size_t<T> mask_arr[size]; // use int type so we can logical and
int_same_size_t<T> index_arr[size];
@@ -1405,7 +1428,7 @@ inline Vectorized<T> convert_to_fp_of_same_size(
// clang-format on
template <typename T>
inline std::enable_if_t<
Vectorized<T>::size() % 2 == 0,
true,
std::pair<Vectorized<T>, Vectorized<T>>>
deinterleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
static constexpr int size = Vectorized<T>::size();
@@ -1444,7 +1467,7 @@ VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(deinterleave2)
// clang-format on
template <typename T>
inline std::enable_if_t<
Vectorized<T>::size() % 2 == 0,
true,
std::pair<Vectorized<T>, Vectorized<T>>>
interleave2(const Vectorized<T>& a, const Vectorized<T>& b) {
static constexpr int size = Vectorized<T>::size();
@@ -1486,7 +1509,7 @@ inline void convert(const src_T* src, dst_T* dst, int64_t n) {

template <typename T>
inline Vectorized<T> flip(const Vectorized<T>& data) {
static constexpr int size = Vectorized<T>::size();
static const int size = Vectorized<T>::size();
T output[size];
T buffer[size];
data.store(static_cast<void*>(buffer));

@@ -15,7 +15,7 @@ template <
struct VecConvert {
static inline VectorizedN<dst_t, dst_n> apply(
const VectorizedN<src_t, src_n>& src) {
constexpr int count = std::min(
const int count = std::min(
VectorizedN<src_t, src_n>::size(), VectorizedN<dst_t, dst_n>::size());
__at_align__ src_t src_buf[VectorizedN<src_t, src_n>::size()];
src.store(src_buf);

@@ -2,6 +2,8 @@

#include <ATen/cpu/vec/vec_base.h>
#include <ATen/cpu/vec/vec_n.h>

#include <cassert>
namespace at::vec {
inline namespace CPU_CAPABILITY {

@@ -38,9 +40,9 @@ struct VecMaskLoad {
static inline VectorizedN<data_t, data_n> apply(
const data_t* ptr,
const VecMask<mask_t, mask_n>& vec_mask) {
constexpr typename VecMask<mask_t, mask_n>::size_type size =
const typename VecMask<mask_t, mask_n>::size_type size =
VecMask<mask_t, mask_n>::size();
static_assert(VectorizedN<data_t, data_n>::size() >= size);
assert((VectorizedN<data_t, data_n>::size() >= size));
__at_align__ data_t data[size];
__at_align__ mask_t mask[size];
auto mask_ = VectorizedN<mask_t, mask_n>(vec_mask);
@@ -134,7 +136,7 @@ class VecMask {
template <typename U, int L>
static VecMask<T, N> from(const VectorizedN<U, L>& b_vec) {
__at_align__ U b_buf[size()];
if constexpr (size() >= VectorizedN<U, L>::size()) {
if (size() >= VectorizedN<U, L>::size()) {
b_vec.store(b_buf);
for (int i = VectorizedN<U, L>::size(); i < size(); i++) {
b_buf[i] = static_cast<U>(0);
@@ -235,16 +237,18 @@ class VecMask {
template <
typename U,
int L,
std::enable_if_t<L >= 2 && VectorizedN<U, L>::size() >= size(), int> = 0>
std::enable_if_t<L >= 2, int> = 0>
VectorizedN<U, L> loadu(const U* ptr) const {
assert((VectorizedN<U, L>::size() >= size()));
return VecMaskLoad<U, L, T, N>::apply(ptr, *this);
}

template <
typename U,
int L,
std::enable_if_t<L == 1 && Vectorized<U>::size() >= size(), int> = 0>
std::enable_if_t<L == 1, int> = 0>
Vectorized<U> loadu(const U* ptr) const {
assert((Vectorized<U>::size() >= size()));
return VecMaskLoad<U, L, T, N>::apply(ptr, *this);
}
};

@@ -28,7 +28,7 @@ class VectorizedN {
using size_type = int;

static constexpr size_type size_T = sizeof(T);
static constexpr size_type size() {
static size_type size() {
return Vectorized<T>::size() * N;
}

@@ -644,6 +644,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP
void * beta_ptr = &fbeta;
#ifdef USE_ROCM
int flag = 0;
rocblas_datatype c_type = std::is_same<C_Dtype, float>::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r;
rocblas_datatype d_type = c_type;
#if USE_GEMM_FLAGS_FP16_ALT_IMPL
flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
#endif
@@ -652,8 +654,8 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP
hipOperationToRocOperation(opb), (int)m, (int)n, (int)k,
(void*)alpha_ptr, a, rocblas_datatype_f16_r, (int)lda, stridea,
b, rocblas_datatype_f16_r, (int)ldb, strideb,
(void*)beta_ptr, c, rocblas_datatype_f16_r, (int)ldc, stridec,
c, rocblas_datatype_f16_r, (int)ldc, stridec,
(void*)beta_ptr, c, c_type, (int)ldc, stridec,
c, d_type, (int)ldc, stridec,
(int) num_batches, rocblas_datatype_f32_r, rocblas_gemm_algo_standard,
0, flag)));
#else
@@ -1096,6 +1098,8 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
GEMM_CHECK_ARGVALUES(at::Half);
#ifdef USE_ROCM
int flag = 0;
rocblas_datatype c_type = std::is_same<C_Dtype, float>::value ? rocblas_datatype_f32_r : rocblas_datatype_f16_r;
rocblas_datatype d_type = c_type;
#if USE_GEMM_FLAGS_FP16_ALT_IMPL
flag = at::ROCmBackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
#endif
@@ -1115,10 +1119,10 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
ldb,
beta_ptr,
c,
rocblas_datatype_f16_r,
c_type,
ldc,
c,
rocblas_datatype_f16_r,
d_type,
ldc,
rocblas_datatype_f32_r,
rocblas_gemm_algo_standard,

@@ -45,6 +45,24 @@ struct OffsetCalculator {

C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
offset_type offsets;

#if defined(USE_ROCM)
if ((dims > 0) && (dims <= 2)) {
auto divmod = sizes_[0].divmod(linear_idx);
#pragma unroll
for (int arg = 0; arg < NARGS; arg++)
offsets[arg] = divmod.mod * strides_[0][arg];
if (dims >= 2) {
divmod = sizes_[1].divmod(divmod.div);
#pragma unroll
for (int arg = 0; arg < NARGS; arg++)
offsets[arg] += divmod.mod * strides_[1][arg];
}
// [...]
return offsets;
}
#endif

#pragma unroll
for (int arg = 0; arg < NARGS; arg++) {
offsets[arg] = 0;

@@ -1157,6 +1157,7 @@ REGISTER_AVX512_DISPATCH(cholesky_stub, &cholesky_kernel)
REGISTER_AVX2_DISPATCH(cholesky_stub, &cholesky_kernel)
REGISTER_VSX_DISPATCH(cholesky_stub, &cholesky_kernel)
REGISTER_ZVECTOR_DISPATCH(cholesky_stub, &cholesky_kernel)
REGISTER_SVE_DISPATCH(cholesky_stub, &cholesky_kernel)
REGISTER_SVE256_DISPATCH(cholesky_stub, &cholesky_kernel)

REGISTER_ARCH_DISPATCH(cholesky_inverse_stub, DEFAULT, &cholesky_inverse_kernel_impl)
@@ -1164,6 +1165,7 @@ REGISTER_AVX512_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
REGISTER_AVX2_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
REGISTER_VSX_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
REGISTER_ZVECTOR_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
REGISTER_SVE_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
REGISTER_SVE256_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)

REGISTER_ARCH_DISPATCH(linalg_eig_stub, DEFAULT, &linalg_eig_kernel)
@@ -1171,6 +1173,7 @@ REGISTER_AVX512_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
REGISTER_AVX2_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
REGISTER_VSX_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
REGISTER_ZVECTOR_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
REGISTER_SVE_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
REGISTER_SVE256_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)

REGISTER_ARCH_DISPATCH(linalg_eigh_stub, DEFAULT, &linalg_eigh_kernel)
@@ -1178,6 +1181,7 @@ REGISTER_AVX512_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
REGISTER_AVX2_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
REGISTER_VSX_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
REGISTER_ZVECTOR_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
REGISTER_SVE_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
REGISTER_SVE256_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)

REGISTER_ARCH_DISPATCH(geqrf_stub, DEFAULT, &geqrf_kernel)
@@ -1185,6 +1189,7 @@ REGISTER_AVX512_DISPATCH(geqrf_stub, &geqrf_kernel)
REGISTER_AVX2_DISPATCH(geqrf_stub, &geqrf_kernel)
REGISTER_VSX_DISPATCH(geqrf_stub, &geqrf_kernel)
REGISTER_ZVECTOR_DISPATCH(geqrf_stub, &geqrf_kernel)
REGISTER_SVE_DISPATCH(geqrf_stub, &geqrf_kernel)
REGISTER_SVE256_DISPATCH(geqrf_stub, &geqrf_kernel)

REGISTER_ARCH_DISPATCH(orgqr_stub, DEFAULT, &orgqr_kernel_impl)
@@ -1192,6 +1197,7 @@ REGISTER_AVX512_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
REGISTER_AVX2_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
REGISTER_VSX_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
REGISTER_ZVECTOR_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
REGISTER_SVE_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
REGISTER_SVE256_DISPATCH(orgqr_stub, &orgqr_kernel_impl)

REGISTER_ARCH_DISPATCH(ormqr_stub, DEFAULT, &ormqr_kernel)
@@ -1199,6 +1205,7 @@ REGISTER_AVX512_DISPATCH(ormqr_stub, &ormqr_kernel)
REGISTER_AVX2_DISPATCH(ormqr_stub, &ormqr_kernel)
REGISTER_VSX_DISPATCH(ormqr_stub, &ormqr_kernel)
REGISTER_ZVECTOR_DISPATCH(ormqr_stub, &ormqr_kernel)
REGISTER_SVE_DISPATCH(ormqr_stub, &ormqr_kernel)
REGISTER_SVE256_DISPATCH(ormqr_stub, &ormqr_kernel)

REGISTER_ARCH_DISPATCH(lstsq_stub, DEFAULT, &lstsq_kernel)
@@ -1206,6 +1213,7 @@ REGISTER_AVX512_DISPATCH(lstsq_stub, &lstsq_kernel)
REGISTER_AVX2_DISPATCH(lstsq_stub, &lstsq_kernel)
REGISTER_VSX_DISPATCH(lstsq_stub, &lstsq_kernel)
REGISTER_ZVECTOR_DISPATCH(lstsq_stub, &lstsq_kernel)
REGISTER_SVE_DISPATCH(lstsq_stub, &lstsq_kernel)
REGISTER_SVE256_DISPATCH(lstsq_stub, &lstsq_kernel)

REGISTER_ARCH_DISPATCH(triangular_solve_stub, DEFAULT, &triangular_solve_kernel)
@@ -1213,6 +1221,7 @@ REGISTER_AVX512_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
REGISTER_AVX2_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
REGISTER_VSX_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
REGISTER_ZVECTOR_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
REGISTER_SVE_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
REGISTER_SVE256_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)

REGISTER_ARCH_DISPATCH(lu_factor_stub, DEFAULT, &lu_factor_kernel)
@@ -1220,6 +1229,7 @@ REGISTER_AVX512_DISPATCH(lu_factor_stub, &lu_factor_kernel)
REGISTER_AVX2_DISPATCH(lu_factor_stub, &lu_factor_kernel)
REGISTER_VSX_DISPATCH(lu_factor_stub, &lu_factor_kernel)
REGISTER_ZVECTOR_DISPATCH(lu_factor_stub, &lu_factor_kernel)
REGISTER_SVE_DISPATCH(lu_factor_stub, &lu_factor_kernel)
REGISTER_SVE256_DISPATCH(lu_factor_stub, &lu_factor_kernel)

REGISTER_ARCH_DISPATCH(ldl_factor_stub, DEFAULT, &ldl_factor_kernel)
@@ -1227,6 +1237,7 @@ REGISTER_AVX512_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
REGISTER_AVX2_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
REGISTER_VSX_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
REGISTER_ZVECTOR_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
REGISTER_SVE_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
REGISTER_SVE256_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)

REGISTER_ARCH_DISPATCH(ldl_solve_stub, DEFAULT, &ldl_solve_kernel)
@@ -1234,6 +1245,7 @@ REGISTER_AVX512_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
REGISTER_AVX2_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
REGISTER_VSX_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
REGISTER_ZVECTOR_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
REGISTER_SVE_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
REGISTER_SVE256_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)

REGISTER_ARCH_DISPATCH(lu_solve_stub, DEFAULT, &lu_solve_kernel)
@@ -1241,6 +1253,7 @@ REGISTER_AVX512_DISPATCH(lu_solve_stub, &lu_solve_kernel)
REGISTER_AVX2_DISPATCH(lu_solve_stub, &lu_solve_kernel)
REGISTER_VSX_DISPATCH(lu_solve_stub, &lu_solve_kernel)
REGISTER_ZVECTOR_DISPATCH(lu_solve_stub, &lu_solve_kernel)
REGISTER_SVE_DISPATCH(lu_solve_stub, &lu_solve_kernel)
REGISTER_SVE256_DISPATCH(lu_solve_stub, &lu_solve_kernel)

REGISTER_ARCH_DISPATCH(svd_stub, DEFAULT, &svd_kernel)
@@ -1248,6 +1261,7 @@ REGISTER_AVX512_DISPATCH(svd_stub, &svd_kernel)
REGISTER_AVX2_DISPATCH(svd_stub, &svd_kernel)
REGISTER_VSX_DISPATCH(svd_stub, &svd_kernel)
REGISTER_ZVECTOR_DISPATCH(svd_stub, &svd_kernel)
REGISTER_SVE_DISPATCH(svd_stub, &svd_kernel)
REGISTER_SVE256_DISPATCH(svd_stub, &svd_kernel)

REGISTER_ARCH_DISPATCH(unpack_pivots_stub, DEFAULT, &unpack_pivots_cpu_kernel)
@@ -1255,5 +1269,6 @@ REGISTER_AVX512_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
REGISTER_AVX2_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
REGISTER_VSX_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
REGISTER_SVE_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
REGISTER_SVE256_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
} // namespace at::native

@@ -457,24 +457,9 @@ void gemm(
return;
}
#endif
// for the fallback path, first compute gemm with beta = 0,
// and then add c in full precision.
int64_t c_size = n * m;
std::vector<float> float_c(c_size, 0.f);
gemm_no_downcast_stub(
at::kCPU, at::kBFloat16,
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
for (const auto j : c10::irange(n)) {
for (const auto i : c10::irange(m)) {
auto offset = j * ldc + i;
// beta == 0 won't propagate NaN from C
if (beta == 0.f) {
c[offset] = float_c[j * m + i];
} else {
c[offset] = beta * c[offset] + float_c[j * m + i];
}
}
}
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

void gemm(
@@ -493,24 +478,9 @@ void gemm(
return;
}
#endif
// for the fallback path, first compute gemm with beta = 0,
// and then add c in full precision.
int64_t c_size = n * m;
std::vector<float> float_c(c_size, 0.f);
gemm_no_downcast_stub(
at::kCPU, at::kHalf,
transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float_c.data(), m);
for (const auto j : c10::irange(n)) {
for (const auto i : c10::irange(m)) {
auto offset = j * ldc + i;
// beta == 0 won't propagate NaN from C
if (beta == 0.f) {
c[offset] = float_c[j * m + i];
} else {
c[offset] = beta * c[offset] + float_c[j * m + i];
}
}
}
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
}

void gemm(

@@ -38,17 +38,27 @@ static CPUCapability compute_cpu_capability() {
return CPUCapability::ZVECTOR;
}
#elif defined(HAVE_SVE_CPU_DEFINITION)
int sve_vl = cpuinfo_get_max_arm_sve_length(); //Returns maximum SVE VL supported by your HW.
#ifdef HAVE_SVE256_CPU_DEFINITION
int sve_vl = cpuinfo_get_max_arm_sve_length(); // Returns maximum SVE VL supported by your HW.
#ifdef HAVE_SVE_CPU_DEFINITION
if (envar == "sve256") {
if (sve_vl == 256) {
#ifdef HAVE_ARM_BF16_CPU_DEFINITION
if (cpuinfo_has_arm_bf16()) {
if (cpuinfo_has_arm_bf16()) {
if (sve_vl == 256) {
return CPUCapability::SVE256;
} else if (sve_vl > 0) {
return CPUCapability::SVE;
}
#endif
}
TORCH_WARN("SVE256 capability not available on hardware. Falling back to DEFAULT");
#endif
TORCH_WARN("SVE capability not available on hardware. Falling back to DEFAULT");
return CPUCapability::DEFAULT;
} else if (envar == "sve") {
#ifdef HAVE_ARM_BF16_CPU_DEFINITION
if (cpuinfo_has_arm_bf16() && sve_vl > 0) {
return CPUCapability::SVE;
}
#endif
TORCH_WARN("SVE capability not available on hardware. Falling back to DEFAULT");
return CPUCapability::DEFAULT;
}
#endif
@@ -100,19 +110,15 @@ static CPUCapability compute_cpu_capability() {
#if defined(__linux__) && defined(HAVE_SVE_CPU_DEFINITION)
if (cpuinfo_initialize() && cpuinfo_has_arm_sve()) {
int sve_vl = cpuinfo_get_max_arm_sve_length(); //Returns maximum SVE VL supported by your HW.
if (sve_vl <= 0) {
// SVE is not supported on this system.
// Return the default CPU capability.
return CPUCapability::DEFAULT;
#ifdef HAVE_ARM_BF16_CPU_DEFINITION
if (cpuinfo_has_arm_bf16()) {
if (sve_vl == 256) { // Check for SVE256
return CPUCapability::SVE256;
} else if (sve_vl > 0) {
return CPUCapability::SVE;
}
}
#ifdef HAVE_SVE256_CPU_DEFINITION
if (sve_vl == 256) { // Check for SVE256
#ifdef HAVE_ARM_BF16_CPU_DEFINITION
if (cpuinfo_has_arm_bf16())
return CPUCapability::SVE256;
#endif
}
#endif
#endif
// Return the default CPU capability.
return CPUCapability::DEFAULT;
}
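Taken together, the compute_cpu_capability() hunks above implement a simple priority: an exact 256-bit vector length (with BF16 support) selects SVE256, any other positive vector length selects the new generic SVE level, and everything else falls back to DEFAULT. A condensed restatement of that decision (helper name and simplified enum are illustrative, not part of the patch):

enum class Capability { DEFAULT, SVE, SVE256 };

// Simplified restatement of the selection order shown in the diff above.
Capability pick_sve_capability(int sve_vl, bool has_bf16) {
  if (!has_bf16 || sve_vl <= 0) return Capability::DEFAULT;
  return (sve_vl == 256) ? Capability::SVE256 : Capability::SVE;
}
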
@@ -144,7 +150,8 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
) {
@@ -182,7 +189,8 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, SVE
, SVE256
#endif
);
@@ -239,7 +247,8 @@ void* DispatchStubImpl::get_call_ptr(
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
) {
@@ -263,7 +272,9 @@ void* DispatchStubImpl::get_call_ptr(
,
ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
,
SVE
,
SVE256
#endif
@@ -298,7 +309,8 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
){
@@ -333,7 +345,7 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
return ZVECTOR != nullptr ? DispatchResult(ZVECTOR) : ErrorType::MissingDeviceKernel;
}
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
if (capability >= static_cast<int>(CPUCapability::SVE256)) {
if (C10_UNLIKELY(!SVE256)) {
// dispatch to DEFAULT, since the SVE kernel is missing
@@ -342,6 +354,14 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
return DispatchResult(SVE256);
}
}
if (capability >= static_cast<int>(CPUCapability::SVE)) {
if (C10_UNLIKELY(!SVE)) {
// dispatch to DEFAULT, since the SVE kernel is missing
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
} else {
return DispatchResult(SVE);
}
}
#endif
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
}
@@ -360,7 +380,8 @@ void* DispatchStubImpl::choose_cpu_impl(
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
) {
@@ -398,7 +419,7 @@ void* DispatchStubImpl::choose_cpu_impl(
return ZVECTOR;
}
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
if (capability >= static_cast<int>(CPUCapability::SVE256)) {
if (C10_UNLIKELY(!SVE256)) {
// dispatch to DEFAULT, since the SVE kernel is missing
@@ -408,6 +429,15 @@ void* DispatchStubImpl::choose_cpu_impl(
return SVE256;
}
}
if (capability >= static_cast<int>(CPUCapability::SVE)) {
if (C10_UNLIKELY(!SVE)) {
// dispatch to DEFAULT, since the SVE kernel is missing
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
return DEFAULT;
} else {
return SVE;
}
}
#endif
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
return DEFAULT;

@@ -64,8 +64,9 @@ enum class CPUCapability {
VSX = 1,
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
ZVECTOR = 1,
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
SVE256 = 1,
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
SVE=1,
SVE256 = 2,
#else
AVX2 = 1,
AVX512 = 2,
@@ -115,7 +116,8 @@ struct TORCH_API DispatchStubImpl {
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
);
@@ -136,7 +138,8 @@ struct TORCH_API DispatchStubImpl {
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
);
@@ -157,7 +160,8 @@ struct TORCH_API DispatchStubImpl {
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
);
@@ -181,7 +185,8 @@ struct TORCH_API DispatchStubImpl {
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, void *ZVECTOR
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, void *SVE
, void *SVE256
#endif
);
@@ -238,7 +243,8 @@ private:
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, reinterpret_cast<void*>(ZVECTOR)
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, reinterpret_cast<void*>(SVE)
, reinterpret_cast<void*>(SVE256)
#endif
)
@@ -299,7 +305,8 @@ public:
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
, reinterpret_cast<void*>(ZVECTOR)
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
, reinterpret_cast<void*>(SVE)
, reinterpret_cast<void*>(SVE256)
#endif
);
@@ -322,7 +329,8 @@ public:
#ifdef HAVE_ZVECTOR_CPU_DEFINITION
static TORCH_API FnPtr ZVECTOR;
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
static TORCH_API FnPtr SVE;
static TORCH_API FnPtr SVE256;
#endif
private:
@@ -426,9 +434,11 @@ struct RegisterPRIVATEUSE1Dispatch {
#define REGISTER_ZVECTOR_DISPATCH(name, fn)
#endif

#ifdef HAVE_SVE256_CPU_DEFINITION
#ifdef HAVE_SVE_CPU_DEFINITION
#define REGISTER_SVE_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, SVE, fn)
#define REGISTER_SVE256_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, SVE256, fn)
#else
#define REGISTER_SVE_DISPATCH(name, fn)
#define REGISTER_SVE256_DISPATCH(name, fn)
#endif

@@ -440,6 +450,7 @@ struct RegisterPRIVATEUSE1Dispatch {
REGISTER_AVX2_DISPATCH(name, fn) \
REGISTER_VSX_DISPATCH(name, fn) \
REGISTER_ZVECTOR_DISPATCH(name, fn) \
REGISTER_SVE_DISPATCH(name, fn) \
REGISTER_SVE256_DISPATCH(name, fn)

#define REGISTER_NO_CPU_DISPATCH(name) \
@@ -488,6 +499,7 @@ struct RegisterPRIVATEUSE1Dispatch {
#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#endif
#define ALSO_REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#define ALSO_REGISTER_SVE_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#define ALSO_REGISTER_SVE256_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#endif
} // namespace at::native

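With the widened macro set above, every stub that previously registered only an SVE256 kernel now also needs a matching SVE entry, which is what the long runs of REGISTER_SVE_DISPATCH/REGISTER_SVE256_DISPATCH additions elsewhere in this diff are doing. A hypothetical registration (my_stub and my_kernel are placeholder names, not part of the patch) would look like:

REGISTER_ARCH_DISPATCH(my_stub, DEFAULT, &my_kernel)
REGISTER_SVE_DISPATCH(my_stub, &my_kernel)
REGISTER_SVE256_DISPATCH(my_stub, &my_kernel)
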
@ -1360,7 +1360,8 @@ Tensor outer(const Tensor& self, const Tensor& vec2) {
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
|
||||
#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
|
||||
// Used by default on x86 platforms and on AArch64+ACL
|
||||
static inline int64_t get_mkldnn_matmul_min_dim() {
|
||||
static auto value = [&] {
|
||||
const int64_t default_min_dim = [&] {
|
||||
@ -1395,8 +1396,6 @@ static inline bool apply_mkldnn_matmul_heur(int64_t m, int64_t k, int64_t n) {
|
||||
return at::globalContext().userEnabledMkldnn() && m > min_dim && k > min_dim && n > min_dim && m * k * n > min_size;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
static void addmm_impl_cpu_(
|
||||
Tensor &result, const Tensor &self, Tensor m1, Tensor m2, const Scalar& beta, const Scalar& alpha) {
|
||||
TORCH_INTERNAL_ASSERT(self.dim() == 2 && m1.dim() == 2 && m2.dim() == 2);
|
||||
@ -1772,8 +1771,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
|
||||
return (strides[2] == 1 && (sizes[1] == 1 || strides[1] >= sizes[2])) ||
|
||||
(strides[1] == 1 && (sizes[2] == 1 || strides[2] >= sizes[1]));
|
||||
};
|
||||
|
||||
#if defined(__aarch64__) && AT_MKLDNN_ACL_ENABLED()
|
||||
#if !defined(__aarch64__) || AT_MKLDNN_ACL_ENABLED()
|
||||
// Always apply mkldnn heuristic on x86 platform, but on ARM only if compiled with ACL
|
||||
bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]);
|
||||
if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) {
|
||||
try {
|
||||
@ -1785,7 +1784,6 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (contraction_size * res_rows * res_cols < 400) {
|
||||
if (is_bmm_out) {
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, batch1.scalar_type(), "bmm", [&] {
|
||||
|
||||
@ -624,7 +624,9 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
|
||||
if (backend == BatchNormBackend::Miopen) {
|
||||
return std::tuple_cat(
|
||||
at::miopen_batch_norm(
|
||||
input.contiguous(), weight.contiguous(), bias.contiguous(),
|
||||
input.contiguous(input.suggest_memory_format()),
|
||||
weight.contiguous(),
|
||||
bias.contiguous(),
|
||||
running_mean.defined() ? running_mean.contiguous() : running_mean,
|
||||
running_var.defined() ? running_var.contiguous() : running_var,
|
||||
training, momentum, eps),
|
||||
|
||||
@ -466,6 +466,7 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cp
|
||||
REGISTER_AVX512_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
|
||||
|
||||
// offsets dispatches
|
||||
@ -477,6 +478,7 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cp
|
||||
REGISTER_AVX512_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
|
||||
|
||||
// Currently some computation is being duplicated across forward and backward.
|
||||
@ -548,6 +550,9 @@ REGISTER_VSX_DISPATCH(
|
||||
REGISTER_ZVECTOR_DISPATCH(
|
||||
_segment_reduce_lengths_backward_stub,
|
||||
&_segment_reduce_cpu_lengths_backward_kernel)
|
||||
REGISTER_SVE_DISPATCH(
|
||||
_segment_reduce_lengths_backward_stub,
|
||||
&_segment_reduce_cpu_lengths_backward_kernel)
|
||||
REGISTER_SVE256_DISPATCH(
|
||||
_segment_reduce_lengths_backward_stub,
|
||||
&_segment_reduce_cpu_lengths_backward_kernel)
|
||||
@ -568,6 +573,9 @@ REGISTER_VSX_DISPATCH(
|
||||
REGISTER_ZVECTOR_DISPATCH(
|
||||
_segment_reduce_offsets_backward_stub,
|
||||
&_segment_reduce_cpu_offsets_backward_kernel)
|
||||
REGISTER_SVE_DISPATCH(
|
||||
_segment_reduce_offsets_backward_stub,
|
||||
&_segment_reduce_cpu_offsets_backward_kernel)
|
||||
REGISTER_SVE256_DISPATCH(
|
||||
_segment_reduce_offsets_backward_stub,
|
||||
&_segment_reduce_cpu_offsets_backward_kernel)
|
||||
|
||||
@ -274,7 +274,7 @@ inline Vectorized<scalar_t> div_floor_floating_vec(
|
||||
return floordiv;
|
||||
}
|
||||
|
||||
#if defined(CPU_CAPABILITY_SVE256) && defined(__ARM_FEATURE_BF16)
|
||||
#if (defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)
|
||||
|
||||
// Since sve lacks sufficient bf16 intrinsics, do the calculations in f32 to
|
||||
// avoid rounding errors. This should not cause performance issues as
|
||||
|
||||
@@ -11,6 +11,7 @@
#include <ATen/native/transformers/attention.h>
#include <ATen/native/transformers/sdp_utils_cpp.h>
#include <c10/util/irange.h>
#include <variant>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@@ -44,13 +45,23 @@ inline void _scale_attn_mask_fusion_kernel(
#endif
  const auto vec_size1 = at::vec::Vectorized<T1>::size();
  const auto vec_size2 = at::vec::Vectorized<T2>::size();
  constexpr int64_t T1_n =
  const int64_t T1_n =
      (vec_size2 == vec_size1 * 2 && is_reduced_floating_point_v<T2>) ? 2 : 1;
  constexpr int64_t T2_n = 1;
  auto vec_scale = at::vec::VectorizedN<T1, T1_n>(val);
  std::variant<at::vec::VectorizedN<T1, 2>, at::vec::VectorizedN<T1, 1>> vec_scale;
  if (T1_n == 2)
    vec_scale = at::vec::VectorizedN<T1, 2>(val);
  else if (T1_n == 1)
    vec_scale = at::vec::VectorizedN<T1, 1>(val);

  int64_t i = 0;
  for (; i < size - (size % vec_size2); i += vec_size2) {
    auto a_n = at::vec::VectorizedN<T1, T1_n>::loadu(a + i);
    std::variant<at::vec::VectorizedN<T1, 2>, at::vec::VectorizedN<T1, 1>> a_n;
    if (T1_n == 2)
      a_n = at::vec::VectorizedN<T1, 2>::loadu(a + i);
    else if (T1_n == 1)
      a_n = at::vec::VectorizedN<T1, 1>::loadu(a + i);

    at::vec::VectorizedN<T2, T2_n> b_n;
#if __GNUC__ == 11 && defined(__ARM_FEATURE_SVE)
    if (is_b_stride_zero) {
@@ -61,9 +72,16 @@ inline void _scale_attn_mask_fusion_kernel(
    } else {
      b_n = at::vec::VectorizedN<T2, T2_n>::loadu(b + i);
    }
    auto b_n_convert = at::vec::convert<T1, T1_n, T2, T2_n, true>(b_n);
    auto res = a_n * vec_scale + b_n_convert;
    res.store(out + i);
    std::variant<at::vec::VectorizedN<T1, 2>, at::vec::VectorizedN<T1, 1>> b_n_convert;
    if (T1_n == 2) {
      auto b_n_convert = at::vec::convert<T1, 2, T2, T2_n, true>(b_n);
      auto res = std::get<at::vec::VectorizedN<T1, 2>>(a_n) * std::get<at::vec::VectorizedN<T1, 2>>(vec_scale) + b_n_convert;
      res.store(out + i);
    } else if (T1_n == 1) {
      auto b_n_convert = at::vec::convert<T1, 1, T2, T2_n, true>(b_n);
      auto res = std::get<at::vec::VectorizedN<T1, 1>>(a_n) * std::get<at::vec::VectorizedN<T1, 1>>(vec_scale) + b_n_convert;
      res.store(out + i);
    }
  }
  for (; i < size; i++) {
    auto tmp0 = a[i];
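The hunk above replaces compile-time constexpr vector widths with a runtime choice, because with SVE the register width is only known at run time; the two possible VectorizedN widths are then carried in a std::variant and branched on explicitly. A self-contained sketch of that pattern, using plain structs instead of the at::vec types so it compiles on its own (FixedVec and all names below are invented for illustration):

// Minimal sketch of runtime-width dispatch via std::variant; FixedVec<N>
// stands in for at::vec::VectorizedN<float, N> and is not an ATen type.
#include <array>
#include <variant>

template <int N>
struct FixedVec {
  std::array<float, 4 * N> lanes{};  // pretend each "register" holds 4 floats
  static FixedVec splat(float v) { FixedVec r; r.lanes.fill(v); return r; }
};

using AnyVec = std::variant<FixedVec<1>, FixedVec<2>>;

AnyVec make_scale(float val, int width) {
  // Same shape as the kernel above: pick the wide or narrow representation
  // based on a value that is only known at run time.
  if (width == 2) {
    return FixedVec<2>::splat(val);
  }
  return FixedVec<1>::splat(val);
}

float first_lane(const AnyVec& v) {
  // std::visit handles both alternatives without repeating the branch.
  return std::visit([](const auto& vec) { return vec.lanes[0]; }, v);
}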
@ -694,7 +694,7 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bilinear,
|
||||
gx = gx * gx_mult;
|
||||
gy = gy * gy_mult;
|
||||
|
||||
constexpr int64_t step = Vec::size();
|
||||
const int64_t step = Vec::size();
|
||||
auto interleaved_gGrid = interleave2(gx, gy);
|
||||
auto gGrid_ptr = gGrid_slice.data() + offset * 2;
|
||||
std::get<0>(interleaved_gGrid).store(gGrid_ptr,
|
||||
@ -1010,7 +1010,7 @@ struct ApplyGridSample<scalar_t, 2, GridSamplerInterpolation::Bicubic,
|
||||
gx = gx * gx_mult;
|
||||
gy = gy * gy_mult;
|
||||
|
||||
constexpr int64_t step = Vec::size();
|
||||
const int64_t step = Vec::size();
|
||||
auto interleaved_gGrid = interleave2(gx, gy);
|
||||
auto gGrid_ptr = gGrid_slice.data() + offset * 2;
|
||||
std::get<0>(interleaved_gGrid).store(gGrid_ptr,
|
||||
@ -1041,7 +1041,7 @@ static inline void grid_sample_2d_grid_slice_iterator(
|
||||
|
||||
using Vec = Vectorized<scalar_t>;
|
||||
using iVec = Vectorized<int_same_size_t<scalar_t>>;
|
||||
constexpr int64_t step = Vec::size();
|
||||
const int64_t step = Vec::size();
|
||||
|
||||
// Loop over each output pixel in grid.
|
||||
// We consider the following three cases (after slicing out the batch
|
||||
|
||||
@ -19,7 +19,7 @@ Vectorized<scalar_t> is_lerp_weight_small(Vectorized<scalar_t> weight) {
|
||||
// is_lerp_weight_small doesn't work for complex because z.abs() returns a
|
||||
// complex vector which can't be compared. Either implement it with z.abs_2_(),
|
||||
// or fallback to the scalar function.
|
||||
#if !(defined(CPU_CAPABILITY_DEFAULT) || defined(_MSC_VER) || defined(CPU_CAPABILITY_SVE))
|
||||
#if !(defined(CPU_CAPABILITY_DEFAULT) || defined(_MSC_VER) || defined(CPU_CAPABILITY_SVE256) || defined(CPU_CAPABILITY_SVE))
|
||||
template <typename value_t>
|
||||
Vectorized<c10::complex<value_t>> is_lerp_weight_small(Vectorized<c10::complex<value_t>> weight) {
|
||||
using vec_reg_t = decltype(weight.abs_2_());
|
||||
|
||||
@@ -210,13 +210,22 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve

  Vec opt_scalar = Vec(S > 0 ? c10::load((scalar_t*)data[S]) : scalar_t(0));
  int64_t i = 0;
  for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) {
  int size = Vec::size();
#if !defined(CPU_CAPABILITY_SVE) && !defined(CPU_CAPABILITY_SVE256)
  // Loop unrolling prevents compiler from optimizing the SVE classes
  for (; i <= n - 2 * size; i += 2 * size) {
    auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
    auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + Vec::size());
    auto args2 = dereference_vec<traits>(&data[1], opt_scalar, S, i + size);
    auto out1 = std::apply(vop, std::move(args1));
    auto out2 = std::apply(vop, std::move(args2));
    out1.store(data[0] + i * sizeof(scalar_t));
    out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t));
    out2.store(data[0] + (i + size) * sizeof(scalar_t));
  }
#endif
  for (; i <= n - size; i += size) {
    auto args1 = dereference_vec<traits>(&data[1], opt_scalar, S, i);
    auto out1 = c10::guts::apply(vop, std::move(args1));
    out1.store(data[0] + i * sizeof(scalar_t));
  }
  if (i < n) {
    int64_t strides[ntensors];
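The structure being adjusted above is a classic one: process two vector registers per iteration to expose instruction-level parallelism, then fall back to a single-vector loop and finally a scalar tail for the remainder. A stripped-down, ATen-free sketch of that shape is below; VEC_WIDTH, apply_scale, and the fixed width of 8 elements are assumptions for illustration only.

// Sketch of a 2x-unrolled elementwise loop with a single-vector loop and a
// scalar tail; not PyTorch code, just the loop structure in isolation.
#include <cstdint>

constexpr int64_t VEC_WIDTH = 8;  // assumed SIMD width in elements

void apply_scale(const float* in, float* out, int64_t n, float scale) {
  int64_t i = 0;
  // Main loop: two "registers" per iteration to keep more work in flight.
  for (; i + 2 * VEC_WIDTH <= n; i += 2 * VEC_WIDTH) {
    for (int64_t j = 0; j < VEC_WIDTH; ++j) out[i + j] = in[i + j] * scale;
    for (int64_t j = 0; j < VEC_WIDTH; ++j) out[i + VEC_WIDTH + j] = in[i + VEC_WIDTH + j] * scale;
  }
  // Single-vector loop for what is left after unrolling.
  for (; i + VEC_WIDTH <= n; i += VEC_WIDTH) {
    for (int64_t j = 0; j < VEC_WIDTH; ++j) out[i + j] = in[i + j] * scale;
  }
  // Scalar tail.
  for (; i < n; ++i) out[i] = in[i] * scale;
}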
@ -80,7 +80,7 @@ inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int64_t n,
|
||||
template <typename func_t, typename vec_func_t>
|
||||
inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) {
|
||||
VEC_LOOP_HEADER(func_t, data)
|
||||
constexpr int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
|
||||
const int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
|
||||
int64_t count = n / (4 * Vec::size());
|
||||
if (count > 0) {
|
||||
vectorized_reduction(data, count, vector_stride, op, vop, /*reduce=*/true);
|
||||
@ -96,7 +96,7 @@ inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_
|
||||
VEC_LOOP_HEADER(func_t, data)
|
||||
|
||||
// reduce down each column of 4 * Vec::size() elements.
|
||||
constexpr int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
|
||||
const int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
|
||||
int64_t outer_stride[2] = { vector_stride, vector_stride };
|
||||
UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] {
|
||||
vectorized_reduction(data, size0, inner_stride, op, vop, /*reduce=*/false);
|
||||
|
||||
@ -154,8 +154,8 @@ inline void map_acc(
|
||||
using Vec = vec::Vectorized<scalar_t>;
|
||||
using aVec = vec::Vectorized<accumut>;
|
||||
int64_t d = 0;
|
||||
constexpr int64_t kVecSize = Vec::size();
|
||||
constexpr int64_t kaVecSize = aVec::size();
|
||||
const int64_t kVecSize = Vec::size();
|
||||
const int64_t kaVecSize = aVec::size();
|
||||
for (d = 0; d < size - (size % kVecSize); d += kVecSize) {
|
||||
Vec data2_vec = Vec::loadu(input_data2 + d);
|
||||
auto [data2_avec0, data2_avec1] = convert_to_float<scalar_t>(data2_vec);
|
||||
|
||||
@ -22,8 +22,8 @@ inline namespace CPU_CAPABILITY {
|
||||
|
||||
constexpr auto kF32RegisterPairsPerIteration = 4;
|
||||
constexpr auto kF32RegistersPerIteration = kF32RegisterPairsPerIteration * 2;
|
||||
constexpr auto kF32ElementsPerRegister = vec::Vectorized<float>::size();
|
||||
constexpr auto kF32ElementsPerIteration = kF32RegistersPerIteration * kF32ElementsPerRegister;
|
||||
const auto kF32ElementsPerRegister = vec::Vectorized<float>::size();
|
||||
const auto kF32ElementsPerIteration = kF32RegistersPerIteration * kF32ElementsPerRegister;
|
||||
|
||||
namespace {
|
||||
template <typename T>
|
||||
@ -150,16 +150,16 @@ float reduce(vec::VectorizedN<float, kF32RegistersPerIteration>& x) {
|
||||
// BFDOT. Deferring that for now to get the NEON/ASIMD BFDOT path
|
||||
// working.
|
||||
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
|
||||
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
|
||||
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && !defined(CPU_CAPABILITY_SVE256) && defined(__clang__) && __clang_major__ > 15
|
||||
// https://godbolt.org/z/z8P4Yncra
|
||||
#define COMPILER_SUPPORTS_BF16_TARGET 1
|
||||
#elif defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 10
|
||||
#elif defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE) && !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 10
|
||||
// https://gcc.gnu.org/gcc-10/changes.html
|
||||
// https://godbolt.org/z/cdGG7vn8o
|
||||
#define COMPILER_SUPPORTS_BF16_TARGET 1
|
||||
#else // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
|
||||
#else // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
|
||||
#define COMPILER_SUPPORTS_BF16_TARGET 0
|
||||
#endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
|
||||
#endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE) && !defined(CPU_CAPABILITY_SVE) && defined(__clang__) && __clang_major__ > 15
|
||||
#else // __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
|
||||
#define COMPILER_SUPPORTS_BF16_TARGET 0
|
||||
#endif // __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
|
||||
@ -212,7 +212,7 @@ std::pair<vec::Vectorized<float>, vec::Vectorized<float>> fmadd(
|
||||
const vec::Vectorized<c10::Half>& b,
|
||||
const vec::Vectorized<float>& acc_low,
|
||||
const vec::Vectorized<float>& acc_high) {
|
||||
#if defined(__ARM_FEATURE_FP16_FML) && !defined(CPU_CAPABILITY_SVE)
|
||||
#if defined(__ARM_FEATURE_FP16_FML) && !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
|
||||
return std::make_pair(vfmlalq_low_f16(acc_low, a, b), vfmlalq_high_f16(acc_high, a, b));
|
||||
#else
|
||||
const auto [a_float_low, a_float_high] = convert_half_float(a);
|
||||
|
||||
@ -28,8 +28,8 @@ inline void _update(at::opmath_type<scalar_t>* out_ptr, int64_t e, int64_t c, co
|
||||
using opmath_t = at::opmath_type<scalar_t>;
|
||||
using Vec = vec::Vectorized<scalar_t>;
|
||||
using aVec = VecType<scalar_t>;
|
||||
constexpr int64_t kVecSize = Vec::size();
|
||||
constexpr int64_t kVLEN = kVecSize * 4;
|
||||
const int64_t kVecSize = Vec::size();
|
||||
const int64_t kVLEN = kVecSize * 4;
|
||||
|
||||
int64_t k = 0;
|
||||
aVec val_vec = aVec((opmath_t)val);
|
||||
|
||||
@ -21,11 +21,11 @@ Vectorized<acc_t> load_reduce_vec(const scalar_t* data, F reduce, acc_t ident) {
|
||||
using vacc_t = Vectorized<acc_t>;
|
||||
static_assert(vacc_t::size() <= vec_t::size());
|
||||
const auto val = vec_t::loadu(data);
|
||||
alignas(64) std::array<scalar_t, vec_t::size()> values;
|
||||
val.store(values.data());
|
||||
alignas(64) scalar_t values[vec_t::size()];
|
||||
val.store(values);
|
||||
|
||||
constexpr int vstride = vec_t::size() / vacc_t::size();
|
||||
alignas(64) std::array<acc_t, vacc_t::size()> acc;
|
||||
alignas(64) acc_t acc[vacc_t::size()];
|
||||
acc.fill(ident);
|
||||
for (const auto k : c10::irange(vstride)) {
|
||||
for (const auto i : c10::irange(vacc_t::size())) {
|
||||
@ -33,7 +33,7 @@ Vectorized<acc_t> load_reduce_vec(const scalar_t* data, F reduce, acc_t ident) {
|
||||
}
|
||||
}
|
||||
|
||||
return vacc_t::loadu(acc.data());
|
||||
return vacc_t::loadu(acc);
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
@ -138,7 +138,7 @@ struct OuterSumCastLoadPolicy <vec_t, vacc_t,
|
||||
using scalar_t = vechold_type<vec_t>;
|
||||
using acc_t = vechold_type<vacc_t>;
|
||||
|
||||
static constexpr int64_t memsize() {
|
||||
static int64_t memsize() {
|
||||
return sizeof(scalar_t) * vacc_t::size();
|
||||
}
|
||||
|
||||
@ -161,7 +161,7 @@ template <typename vec_t, typename vacc_t>
|
||||
struct OuterSumCastLoadPolicy <vec_t, vacc_t, std::enable_if_t<is_reduced_floating_point_v<vechold_type<vec_t>>>> {
|
||||
using scalar_t = vechold_type<vec_t>;
|
||||
|
||||
static constexpr int64_t memsize() {
|
||||
static int64_t memsize() {
|
||||
return sizeof(scalar_t) * vacc_t::size();
|
||||
}
|
||||
|
||||
@ -198,7 +198,7 @@ template <typename scalar_t>
|
||||
struct NanSumLoadPolicy<Vectorized<scalar_t>> {
|
||||
using vec_t = Vectorized<scalar_t>;
|
||||
|
||||
static constexpr int64_t memsize() {
|
||||
static int64_t memsize() {
|
||||
return LoadPolicy<vec_t>::memsize();
|
||||
}
|
||||
|
||||
@ -267,7 +267,7 @@ struct InnerNanSumCastLoadPolicy <vec_t, vacc_t, std::enable_if_t<is_reduced_flo
|
||||
|
||||
template <typename vec_t, typename vacc_t>
|
||||
struct OuterNanSumCastLoadPolicy {
|
||||
static constexpr int64_t memsize() {
|
||||
static int64_t memsize() {
|
||||
return OuterSumCastLoadPolicy<vec_t, vacc_t>::memsize();
|
||||
}
|
||||
|
||||
@ -300,13 +300,23 @@ static void store(char * C10_RESTRICT data, int64_t stride, int64_t index,
|
||||
}
|
||||
}
|
||||
|
||||
template <typename StorePolicy, typename scalar_t>
|
||||
static void store(char * C10_RESTRICT data, int64_t stride, int64_t index,
|
||||
const scalar_t *values, size_t numel) {
|
||||
auto *base_ptr = data + stride * index;
|
||||
for (const auto k : c10::irange(numel)) {
|
||||
auto val = values[k];
|
||||
StorePolicy::store(base_ptr, stride, k, val);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename StorePolicy, typename scalar_t>
|
||||
static void store(char * C10_RESTRICT data, int64_t stride, int64_t index,
|
||||
const Vectorized<scalar_t> &values) {
|
||||
using vec_t = Vectorized<scalar_t>;
|
||||
alignas(64) std::array<scalar_t, vec_t::size()> array_values{};
|
||||
values.store(array_values.data());
|
||||
store<StorePolicy>(data, stride, index, array_values);
|
||||
alignas(64) scalar_t array_values[vec_t::size()] = {};
|
||||
values.store(array_values);
|
||||
store<StorePolicy, scalar_t>(data, stride, index, array_values, vec_t::size());
|
||||
}
|
||||
|
||||
/** Simultaneously sum over n rows at once
|
||||
@ -436,9 +446,9 @@ void vectorized_inner_sum(
|
||||
char * C10_RESTRICT data[2], int64_t outer_stride, int64_t out_stride,
|
||||
int64_t size0, int64_t size1) {
|
||||
using vacc_t = Vectorized<acc_t>;
|
||||
constexpr int64_t vec_stride = VecLoadPolicy::memsize();
|
||||
constexpr int64_t scalar_stride = ScalarLoadPolicy::memsize();
|
||||
constexpr int64_t vec_numel = vec_stride / scalar_stride;
|
||||
const int64_t vec_stride = VecLoadPolicy::memsize();
|
||||
const int64_t scalar_stride = ScalarLoadPolicy::memsize();
|
||||
const int64_t vec_numel = vec_stride / scalar_stride;
|
||||
const int64_t vec_size = size0 / vec_numel;
|
||||
|
||||
// Input is contiguous over the first (reduced) dimension
|
||||
@ -451,9 +461,9 @@ void vectorized_inner_sum(
|
||||
final_acc += ScalarLoadPolicy::load(row_in, scalar_stride, k);
|
||||
}
|
||||
|
||||
alignas(64) std::array<acc_t, vacc_t::size()> partials{};
|
||||
vec_acc.store(partials.data());
|
||||
for (const auto k : c10::irange(partials.size())) {
|
||||
alignas(64) acc_t partials[vacc_t::size()] = {};
|
||||
vec_acc.store(partials);
|
||||
for (const auto k : c10::irange(vacc_t::size())) {
|
||||
final_acc += partials[k];
|
||||
}
|
||||
store<StorePolicy>(data[0], out_stride, j, final_acc);
|
||||
@ -479,7 +489,7 @@ void vectorized_outer_sum(
|
||||
int64_t size0, int64_t size1) {
|
||||
using vacc_t = Vectorized<acc_t>;
|
||||
constexpr int64_t scalar_stride = ScalarLoadPolicy::memsize();
|
||||
constexpr int64_t vec_stride = VecLoadPolicy::memsize();
|
||||
const int64_t vec_stride = VecLoadPolicy::memsize();
|
||||
constexpr int64_t nrows = 4;
|
||||
|
||||
// Input is contiguous over the second (non-reduced) dimension
|
||||
|
||||
@ -93,7 +93,7 @@ ColumnwiseMoments(
|
||||
int64_t C,
|
||||
int64_t D) {
|
||||
using Vec = vec::Vectorized<T>;
|
||||
constexpr int64_t K = Vec::size();
|
||||
const int64_t K = Vec::size();
|
||||
const int64_t inner_size = D / K * K;
|
||||
Vec acc0_vec{0}, acc1_vec{0};
|
||||
for (const auto m : c10::irange(HxW)) {
|
||||
@ -668,20 +668,20 @@ void GroupNormInputBackward(
|
||||
const opmath_t s = opmath_t(1) / static_cast<opmath_t>(D * HxW);
|
||||
const bool gamma_null = (gamma == nullptr);
|
||||
at::parallel_for(0, N * G, 1, [=](int64_t start, int64_t end) {
|
||||
constexpr int64_t K = vec::Vectorized<PT>::size();
|
||||
const int64_t K = vec::Vectorized<PT>::size();
|
||||
const int64_t d = D / K * K;
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
|
||||
std::array<opmath_t, at::vec::Vectorized<opmath_t>::size()> ds_arr;
|
||||
opmath_t ds_arr[at::vec::Vectorized<opmath_t>::size()];
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
|
||||
std::array<opmath_t, at::vec::Vectorized<opmath_t>::size()> db_arr;
|
||||
opmath_t db_arr[at::vec::Vectorized<opmath_t>::size()];
|
||||
for (const auto i : c10::irange(start, end)) {
|
||||
const int64_t g = i % G;
|
||||
const opmath_t* ds_ptr = ds + i * D;
|
||||
const opmath_t* db_ptr = db + i * D;
|
||||
const PT* gamma_ptr = gamma_null ? nullptr : (gamma + g * D);
|
||||
CalcDsDb(ds_ptr, db_ptr, gamma_ptr, d, K, ds_arr.data(), db_arr.data());
|
||||
opmath_t ds_val = std::accumulate(ds_arr.cbegin(), ds_arr.cend(), opmath_t(0));
|
||||
opmath_t db_val = std::accumulate(db_arr.cbegin(), db_arr.cend(), opmath_t(0));
|
||||
CalcDsDb(ds_ptr, db_ptr, gamma_ptr, d, K, ds_arr, db_arr);
|
||||
opmath_t ds_val = std::accumulate(&ds_arr[0], &ds_arr[at::vec::Vectorized<opmath_t>::size()], opmath_t(0));
|
||||
opmath_t db_val = std::accumulate(&db_arr[0], &db_arr[at::vec::Vectorized<opmath_t>::size()], opmath_t(0));
|
||||
for (const auto j : c10::irange(d, D)) {
|
||||
const opmath_t gamma_v = gamma_null ? opmath_t(1) : opmath_t(gamma[g * D + j]);
|
||||
ds_val += ds_ptr[j] * gamma_v;
|
||||
@ -718,7 +718,7 @@ GammaBackward(
|
||||
PT* dgamma) {
|
||||
const int64_t G = group;
|
||||
const int64_t D = C / G;
|
||||
constexpr int64_t K = at::vec::Vectorized<PT>::size();
|
||||
const int64_t K = at::vec::Vectorized<PT>::size();
|
||||
using Vec = at::vec::Vectorized<PT>;
|
||||
const int64_t inner_size = D / K * K;
|
||||
for (const auto g : c10::irange(G)) {
|
||||
@ -818,7 +818,7 @@ template <typename PT, typename opmath_t>
|
||||
std::enable_if_t<std::is_same_v<PT, opmath_t>, void>
|
||||
BetaBackward(int64_t N, int64_t C, const opmath_t* db, PT* dbeta) {
|
||||
using Vec = at::vec::Vectorized<PT>;
|
||||
constexpr int64_t K = Vec::size();
|
||||
const int64_t K = Vec::size();
|
||||
Vec acc_vec{0}, zero{0};
|
||||
const int64_t inner_size = C / K * K;
|
||||
int64_t i = 0;
|
||||
@ -943,7 +943,7 @@ DsDbRowwiseMomentsChannelsLast(
|
||||
opmath_t* db_ptr,
|
||||
int64_t C) {
|
||||
using Vec = vec::Vectorized<T>;
|
||||
constexpr int64_t K = vec::Vectorized<T>::size();
|
||||
const int64_t K = vec::Vectorized<T>::size();
|
||||
const int64_t inner_size = C / K * K;
|
||||
int64_t d = 0;
|
||||
for (; d < inner_size; d += K) {
|
||||
@ -1247,7 +1247,7 @@ inline typename std::
|
||||
int64_t D) {
|
||||
using Vec = vec::Vectorized<T>;
|
||||
const bool gamma_null = (gamma_ptr == nullptr);
|
||||
constexpr int64_t K = Vec::size();
|
||||
const int64_t K = Vec::size();
|
||||
const int64_t inner_size = D / K * K;
|
||||
int64_t d = 0;
|
||||
opmath_t ds_gamma{0}, db_gamma{0};
|
||||
|
||||
@ -625,7 +625,7 @@ void weight_to_int4pack_kernel(
|
||||
int K = weight.size(1);
|
||||
|
||||
// 64 for avx512 and 32 for avx2/non-vectorized
|
||||
constexpr int BLOCK_N = vec::Vectorized<float>::size() * 4;
|
||||
const int BLOCK_N = vec::Vectorized<float>::size() * 4;
|
||||
const int NB = (N + BLOCK_N - 1) / BLOCK_N;
|
||||
|
||||
// parallel on NB blocks
|
||||
@ -713,7 +713,7 @@ void int4pack_mm_kernel_(
|
||||
|
||||
constexpr int BLOCK_M = 4;
|
||||
// 64 for avx512 and 32 for avx2/non-vectorized
|
||||
constexpr int BLOCK_N = vec::Vectorized<float>::size() * 4;
|
||||
const int BLOCK_N = vec::Vectorized<float>::size() * 4;
|
||||
// 32, 64, 128, 256
|
||||
const int BLOCK_K = qGroupSize;
|
||||
|
||||
|
||||
@ -109,8 +109,8 @@ template <typename T, int64_t kMaxDepth>
|
||||
std::pair<opmath_t<T>, opmath_t<T>> RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) {
|
||||
using math_t = opmath_t<T>;
|
||||
|
||||
constexpr int64_t kVecSize = vec::Vectorized<T>::size();
|
||||
constexpr int64_t kAccVecSize = vec::Vectorized<math_t>::size();
|
||||
const int64_t kVecSize = vec::Vectorized<T>::size();
|
||||
const int64_t kAccVecSize = vec::Vectorized<math_t>::size();
|
||||
const int64_t n = N / kVecSize;
|
||||
const int64_t m = divup(n, kChunkSize);
|
||||
const int64_t depth = utils::CeilLog2(m);
|
||||
@ -155,10 +155,10 @@ std::pair<opmath_t<T>, opmath_t<T>> RowwiseMomentsImpl(const T* X, int64_t N, in
|
||||
m0_stk[i], m1_stk[i], m2_stk[i], m0_stk[0], m1_stk[0], m2_stk[0]);
|
||||
}
|
||||
|
||||
std::array<math_t, kAccVecSize> m1_arr{};
|
||||
std::array<math_t, kAccVecSize> m2_arr{};
|
||||
m1_stk[0].store(m1_arr.data());
|
||||
m2_stk[0].store(m2_arr.data());
|
||||
math_t m1_arr[kAccVecSize] = {};
|
||||
math_t m2_arr[kAccVecSize] = {};
|
||||
m1_stk[0].store(m1_arr);
|
||||
m2_stk[0].store(m2_arr);
|
||||
|
||||
int64_t m0 = 0;
|
||||
math_t m1 = 0;
|
||||
@ -182,7 +182,7 @@ std::pair<opmath_t<T>, opmath_t<T>> RowwiseMomentsImpl(const T* X, int64_t N, in
|
||||
template <typename T>
|
||||
std::pair<opmath_t<T>, opmath_t<T>> RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) {
|
||||
using Vec = vec::Vectorized<T>;
|
||||
constexpr int64_t kVecSize = Vec::size();
|
||||
const int64_t kVecSize = Vec::size();
|
||||
const int64_t n = N / kVecSize;
|
||||
const int64_t m = divup(n, kChunkSize);
|
||||
const int64_t depth = utils::CeilLog2(m);
|
||||
|
||||
@@ -1080,16 +1080,6 @@ static bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=fals
#endif
}

static bool _grouped_mm_allowed_device() {
#ifdef USE_ROCM
  return false;
#else
  auto dprops = at::cuda::getCurrentDeviceProperties();
  // CUDA capability 8.0 and greater
  return dprops->major >= 8;
#endif
}

#ifdef USE_ROCM
static bool _scaled_mm_is_fnuz() {
  return at::detail::getCUDAHooks().isGPUArch({"gfx942"});
@@ -1786,14 +1776,19 @@ Tensor _grouped_mm_cuda(const Tensor& mat_a, const Tensor& mat_b,
    const std::optional<at::Tensor>& offs,
    const std::optional<at::Tensor>& bias,
    std::optional<c10::ScalarType> out_dtype) {
#ifndef USE_ROCM
  _grouped_mm_validate_inputs(mat_a, mat_b, offs, bias, out_dtype);
  bool a_b_and_out_are_bf16 = (
    mat_a.dtype() == at::kBFloat16 &&
    mat_b.dtype() == at::kBFloat16 &&
    out_dtype.value_or(at::kBFloat16) == at::kBFloat16
  );
#ifndef USE_ROCM
  bool use_fast_path = _scaled_mm_allowed_device(/*sm90_only*/true, /*sm100_only*/true) && a_b_and_out_are_bf16;
#else
  // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used.
  // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm
  bool use_fast_path = false;
#endif
  const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
  Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
  if (use_fast_path) {
@@ -1803,9 +1798,6 @@ std::optional<c10::ScalarType> out_dtype) {
    _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
  }
  return out;
#else
  TORCH_CHECK(false, "grouped gemm is not supported on ROCM")
#endif
}

Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
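The grouped-GEMM hunk above follows a common dispatch shape: validate inputs, compute a boolean capability gate (here, all operands and the output being bf16 on a sufficiently capable device), then either take the fused fast path or a generic fallback. A toy, framework-free sketch of that control flow; every name below is invented and only mirrors the branching, not any real PyTorch API.

// Toy sketch of fast-path-or-fallback dispatch (illustrative names only).
#include <iostream>

struct Problem {
  bool all_bf16;     // operands and output are bfloat16
  int device_major;  // e.g. SM major version
};

bool device_supports_fast_path(const Problem& p) {
  // assumed requirement for illustration only
  return p.device_major >= 9;
}

void run_grouped_gemm(const Problem& p) {
  const bool use_fast_path = device_supports_fast_path(p) && p.all_bf16;
  if (use_fast_path) {
    std::cout << "dispatching to the fused grouped-GEMM kernel\n";
  } else {
    std::cout << "falling back to a loop of ordinary mm/bmm calls\n";
  }
}

int main() {
  run_grouped_gemm({/*all_bf16=*/true, /*device_major=*/9});
  run_grouped_gemm({/*all_bf16=*/false, /*device_major=*/8});
}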
@ -7,6 +7,7 @@
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
#include <ATen/ops/empty.h>
|
||||
#include <ATen/ops/empty_like.h>
|
||||
#include <ATen/ops/miopen_batch_norm_native.h>
|
||||
#include <ATen/ops/miopen_batch_norm_backward_native.h>
|
||||
#endif
|
||||
@ -102,7 +103,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm(
|
||||
mode = miopenBNSpatial;
|
||||
}
|
||||
|
||||
auto output_t = at::empty(input->sizes(), input->options());
|
||||
auto output_t = at::empty_like(input_t, input_t.options(), input_t.suggest_memory_format());
|
||||
TensorArg output{ output_t, "output", 0 };
|
||||
|
||||
auto handle = getMiopenHandle();
|
||||
@ -170,20 +171,15 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
|
||||
const std::optional<Tensor>& save_var_t_opt,
|
||||
double epsilon) {
|
||||
// See [Note: hacky wrapper removal for optional tensor]
|
||||
const Tensor& running_mean =
|
||||
running_mean_opt.value_or(Tensor());
|
||||
const Tensor& running_var =
|
||||
running_var_opt.value_or(Tensor());
|
||||
const Tensor& save_mean_t =
|
||||
save_mean_t_opt.value_or(Tensor());
|
||||
const Tensor& save_var_t =
|
||||
save_var_t_opt.value_or(Tensor());
|
||||
const Tensor& save_mean_t = save_mean_t_opt.value_or(Tensor());
|
||||
const Tensor& save_var_t = save_var_t_opt.value_or(Tensor());
|
||||
|
||||
TensorArg input{ input_t, "input", 1 },
|
||||
grad_output{ grad_output_t, "grad_output", 2 },
|
||||
weight{ weight_t, "weight", 3 },
|
||||
save_mean{ save_mean_t, "save_mean", 4 },
|
||||
save_var{ save_var_t, "save_var", 5 };
|
||||
auto grad_output_contig =
|
||||
grad_output_t.contiguous(input_t.suggest_memory_format());
|
||||
TensorArg input{input_t, "input", 1},
|
||||
grad_output{grad_output_contig, "grad_output", 2},
|
||||
weight{weight_t, "weight", 3}, save_mean{save_mean_t, "save_mean", 4},
|
||||
save_var{save_var_t, "save_var", 5};
|
||||
CheckedFrom c = "miopen_batch_norm_backward";
|
||||
|
||||
checkAllDefined(c, {input, grad_output, weight, save_mean, save_var});
|
||||
@ -195,7 +191,11 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
|
||||
}
|
||||
checkAllSameType(c, {input, grad_output});
|
||||
checkAllSameType(c, {weight, save_mean, save_var});
|
||||
checkAllContiguous(c, {input, grad_output, save_mean, save_var});
|
||||
// TODO: is weight required to be contiguous?
|
||||
checkAllContiguous(c, {save_mean, save_var});
|
||||
// TODO: TensorArg check should start handle memory format
|
||||
TORCH_CHECK(input->is_contiguous(input->suggest_memory_format()));
|
||||
TORCH_CHECK(grad_output->is_contiguous(input->suggest_memory_format()));
|
||||
checkDimRange(c, input, 2, 6 /* exclusive */);
|
||||
checkSameSize(c, input, grad_output);
|
||||
auto num_features = input->size(1);
|
||||
@ -210,7 +210,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
|
||||
mode = miopenBNSpatial;
|
||||
}
|
||||
|
||||
auto grad_input_t = at::empty(input->sizes(), input->options());
|
||||
auto grad_input_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format());
|
||||
auto grad_weight_t = at::empty(weight->sizes(), weight->options());
|
||||
auto grad_bias_t = at::empty(weight->sizes(), weight->options());
|
||||
|
||||
|
||||
@ -165,6 +165,7 @@ REGISTER_AVX2_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_co
|
||||
REGISTER_AVX512_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
REGISTER_ZVECTOR_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
REGISTER_VSX_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
REGISTER_SVE_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
REGISTER_SVE256_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
|
||||
|
||||
// _out variants can be shared between PocketFFT and MKL
|
||||
|
||||
@ -1798,7 +1798,7 @@
|
||||
device_guard: False
|
||||
dispatch:
|
||||
MkldnnCPU: copy_mkldnn_
|
||||
SparseCPU, SparseCUDA: copy_sparse_wrapper_
|
||||
SparseCPU, SparseCUDA, SparseMPS: copy_sparse_wrapper_
|
||||
CompositeExplicitAutograd: copy_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: copy_sparse_compressed_
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: copy_nested_
|
||||
@ -2160,7 +2160,7 @@
|
||||
variants: function, method
|
||||
structured_delegate: div.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: div_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: div_sparse
|
||||
ZeroTensor: div_zerotensor
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_div_Tensor
|
||||
tags: [core, pointwise]
|
||||
@ -2170,7 +2170,7 @@
|
||||
variants: method
|
||||
structured_delegate: div.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: div_sparse_
|
||||
SparseCPU, SparseCUDA, SparseMPS: div_sparse_
|
||||
tags: pointwise
|
||||
|
||||
- func: div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
|
||||
@ -2179,7 +2179,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS, MTIA: div_out
|
||||
SparseCPU, SparseCUDA: div_out_sparse_zerodim
|
||||
SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
|
||||
tags: pointwise
|
||||
|
||||
- func: div.Tensor_mode(Tensor self, Tensor other, *, str? rounding_mode) -> Tensor
|
||||
@ -2187,7 +2187,7 @@
|
||||
variants: function, method
|
||||
structured_delegate: div.out_mode
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: div_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: div_sparse
|
||||
tags: [core, pointwise]
|
||||
|
||||
- func: div_.Tensor_mode(Tensor(a!) self, Tensor other, *, str? rounding_mode) -> Tensor(a!)
|
||||
@ -2195,7 +2195,7 @@
|
||||
variants: method
|
||||
structured_delegate: div.out_mode
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: div_sparse_
|
||||
SparseCPU, SparseCUDA, SparseMPS: div_sparse_
|
||||
tags: pointwise
|
||||
|
||||
- func: div.out_mode(Tensor self, Tensor other, *, str? rounding_mode, Tensor(a!) out) -> Tensor(a!)
|
||||
@ -2204,7 +2204,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: div_out_mode
|
||||
SparseCPU, SparseCUDA: div_out_sparse_zerodim
|
||||
SparseCPU, SparseCUDA, SparseMPS: div_out_sparse_zerodim
|
||||
tags: pointwise
|
||||
|
||||
# For C++ only, until we have conversion from C++ numbers to Tensor
|
||||
@ -2768,20 +2768,20 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CPU, CUDA, MPS, MTIA: floor_divide
|
||||
SparseCPU, SparseCUDA: floor_divide_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse
|
||||
|
||||
- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: method
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: floor_divide_
|
||||
SparseCPU, SparseCUDA: floor_divide_sparse_
|
||||
SparseCPU, SparseCUDA, SparseMPS: floor_divide_sparse_
|
||||
|
||||
- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: floor_divide_out
|
||||
SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim
|
||||
SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim
|
||||
|
||||
- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -4273,7 +4273,7 @@
|
||||
structured_delegate: mul.out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: mul_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: mul_sparse
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr
|
||||
MkldnnCPU: mkldnn_mul
|
||||
ZeroTensor: mul_zerotensor
|
||||
@ -4285,7 +4285,7 @@
|
||||
structured_delegate: mul.out
|
||||
variants: method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: mul_sparse_
|
||||
SparseCPU, SparseCUDA, SparseMPS: mul_sparse_
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_sparse_csr_
|
||||
MkldnnCPU: mkldnn_mul_
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_mul__Tensor
|
||||
@ -4299,6 +4299,7 @@
|
||||
CPU, CUDA, MPS, MTIA: mul_out
|
||||
SparseCPU: mul_out_sparse_cpu
|
||||
SparseCUDA: mul_out_sparse_cuda
|
||||
SparseMPS: mul_out_sparse_mps
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: mul_out_sparse_csr
|
||||
MkldnnCPU: mkldnn_mul_out
|
||||
tags: pointwise
|
||||
@ -5848,7 +5849,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: sum
|
||||
SparseCPU, SparseCUDA, SparseMeta: sum_coo
|
||||
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: sum_coo
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
|
||||
autogen: sum.out
|
||||
|
||||
@ -5859,7 +5860,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
NestedTensorCPU: NestedTensor_sum_dim_CPU
|
||||
SparseCPU, SparseCUDA: sum_sparse_coo
|
||||
SparseCPU, SparseCUDA, SparseMPS: sum_sparse_coo
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed
|
||||
tags: core
|
||||
|
||||
@ -6975,7 +6976,7 @@
|
||||
CPU, CUDA: sub_out
|
||||
MPS: sub_out_mps
|
||||
MTIA: sub_out_mtia
|
||||
SparseCPU, SparseCUDA: sub_out_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: sub_out_sparse
|
||||
tags: pointwise
|
||||
|
||||
- func: sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor
|
||||
@ -6983,7 +6984,7 @@
|
||||
variants: function, method
|
||||
structured_delegate: sub.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: sub_sparse
|
||||
SparseCPU, SparseCUDA, SparseMPS: sub_sparse
|
||||
ZeroTensor: sub_zerotensor
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_sub_Tensor
|
||||
tags: [core, pointwise]
|
||||
@ -6993,7 +6994,7 @@
|
||||
variants: method
|
||||
structured_delegate: sub.out
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: sub_sparse_
|
||||
SparseCPU, SparseCUDA, SparseMPS: sub_sparse_
|
||||
tags: pointwise
|
||||
# For C++ only, until we have conversion from C++ numbers to Tensor
|
||||
|
||||
@ -10342,7 +10343,7 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA: pow_Tensor_Scalar_out
|
||||
SparseCPU, SparseCUDA: pow_out_sparse_scalar
|
||||
SparseCPU, SparseCUDA, SparseMPS: pow_out_sparse_scalar
|
||||
MPS: pow_tensor_scalar_out_mps
|
||||
tags: pointwise
|
||||
|
||||
@ -10351,7 +10352,7 @@
|
||||
structured_delegate: pow.Tensor_Scalar_out
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA: pow_sparse_scalar
|
||||
SparseCPU, SparseCUDA, SparseMPS: pow_sparse_scalar
|
||||
tags: [core, pointwise]
|
||||
|
||||
- func: pow_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)
|
||||
@ -10698,6 +10699,7 @@
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow
|
||||
CUDA: foreach_tensor_div_list_kernel_cuda
|
||||
MTIA: foreach_tensor_div_list_kernel_mtia
|
||||
|
||||
- func: _foreach_div_.List(Tensor(a!)[] self, Tensor[] other) -> ()
|
||||
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
|
||||
@ -10705,6 +10707,7 @@
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: foreach_tensor_div_list_kernel_slow_
|
||||
CUDA: foreach_tensor_div_list_kernel_cuda_
|
||||
MTIA: foreach_tensor_div_list_kernel_mtia_
|
||||
autogen: _foreach_div.List_out
|
||||
|
||||
- func: _foreach_div.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
|
||||
@ -10728,6 +10731,7 @@
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow
|
||||
CUDA: foreach_tensor_div_tensor_kernel_cuda
|
||||
MTIA: foreach_tensor_div_tensor_kernel_mtia
|
||||
|
||||
- func: _foreach_div_.Tensor(Tensor(a!)[] self, Tensor other) -> ()
|
||||
device_check: NoCheck # foreach kernels fall back to slow path when tensor are on different devices
|
||||
@ -10735,6 +10739,7 @@
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: foreach_tensor_div_tensor_kernel_slow_
|
||||
CUDA: foreach_tensor_div_tensor_kernel_cuda_
|
||||
MTIA: foreach_tensor_div_tensor_kernel_mtia_
|
||||
autogen: _foreach_div.Tensor_out
|
||||
|
||||
- func: _foreach_clamp_max.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
|
||||
|
||||
@ -142,7 +142,7 @@ Tensor qcat_nhwc_kernel(
|
||||
continue;
|
||||
}
|
||||
|
||||
constexpr auto VLEN = Vec::size();
|
||||
const auto VLEN = Vec::size();
|
||||
int64_t c = 0;
|
||||
|
||||
// Vectorized loop
|
||||
@ -170,16 +170,16 @@ Tensor qcat_nhwc_kernel(
|
||||
}
|
||||
|
||||
// Vectorized loop for channel between 8 and 32 (avx2)
|
||||
constexpr auto kVLEN = Vectorized<float>::size();
|
||||
const auto kVLEN = Vectorized<float>::size();
|
||||
int64_t elem_size = curr_C - c;
|
||||
if ((VLEN == 4 * kVLEN) && elem_size >= kVLEN) {
|
||||
auto curr_scale_vec = Vectorized<float>(curr_scale);
|
||||
auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
|
||||
auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
|
||||
int64_t vec_num = elem_size / kVLEN;
|
||||
std::array<typename scalar_t::underlying, VLEN> buf_in{};
|
||||
memcpy(buf_in.data(), iptr + c, vec_num * kVLEN);
|
||||
auto inp_vec = Vec::loadu(buf_in.data());
|
||||
typename scalar_t::underlying buf_in[VLEN] = {};
|
||||
memcpy(buf_in, iptr + c, vec_num * kVLEN);
|
||||
auto inp_vec = Vec::loadu(buf_in);
|
||||
auto float_values = inp_vec.dequantize(
|
||||
curr_scale_vec, curr_zero_pt_vec, scale_neg_zp_premul);
|
||||
Vec::float_vec_return_type retvals;
|
||||
@ -1487,7 +1487,7 @@ void _qmaxpool_2d_nhwc_kernel(
|
||||
int64_t c = 0;
|
||||
|
||||
// Interleaved vector loop 4x
|
||||
constexpr auto vec_width = Vectorized<scalar_t>::size();
|
||||
const auto vec_width = Vectorized<scalar_t>::size();
|
||||
for (; c + 4 * vec_width <= iC; c += 4 * vec_width) {
|
||||
Vectorized<scalar_t> acc{
|
||||
scalar_t(std::numeric_limits<scalar_t_underlying>::lowest())};
|
||||
@ -1623,7 +1623,7 @@ void qmaxpool_3d_nthwc_kernel(
|
||||
w_start += dW;
|
||||
|
||||
int64_t c = 0;
|
||||
constexpr auto vec_width = Vectorized<scalar_t>::size();
|
||||
const auto vec_width = Vectorized<scalar_t>::size();
|
||||
// Vector loop
|
||||
for (; c + vec_width <= iC; c += vec_width) {
|
||||
Vectorized<scalar_t> acc{
|
||||
@ -2449,7 +2449,7 @@ void q_batch_norm_kernel(
|
||||
reinterpret_cast<scalar_t::underlying*>(input.data_ptr());
|
||||
scalar_t::underlying* Y = reinterpret_cast<scalar_t::underlying*>(output.data_ptr());
|
||||
|
||||
constexpr int kVLen = Vectorized<float>::size();
|
||||
const int kVLen = Vectorized<float>::size();
|
||||
const int64_t outer_size = N * HxW;
|
||||
using Vec = Vectorized<scalar_t>;
|
||||
// Hoisted variables
|
||||
@ -2975,7 +2975,7 @@ void quantized_normalize_kernel(
|
||||
float y_scale = Y->q_scale();
|
||||
float y_inv_scale = 1.0f / y_scale;
|
||||
|
||||
constexpr int kFloatVLen = fVec::size();
|
||||
const int kFloatVLen = fVec::size();
|
||||
int64_t kIntVLen = kFloatVLen * qVec::float_num_vecs();
|
||||
int64_t kNumIntVecInLayer = N / kIntVLen;
|
||||
int64_t kNonVecRemInLayer = N % kIntVLen;
|
||||
@ -3263,7 +3263,7 @@ void quantized_groupnorm_nhwc_kernel(
|
||||
float y_scale = Y->q_scale();
|
||||
float y_inv_scale = 1.0f / y_scale;
|
||||
|
||||
constexpr int kFloatVLen = fVec::size();
|
||||
const int kFloatVLen = fVec::size();
|
||||
int64_t kIntVLen = kFloatVLen * qVec::float_num_vecs();
|
||||
int64_t channels_per_group = C / G;
|
||||
int64_t HxW = N / channels_per_group;
|
||||
|
||||
@ -27,6 +27,7 @@ REGISTER_AVX512_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
|
||||
REGISTER_AVX2_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/Config.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/AccumulateType.h>
|
||||
#include <ATen/NamedTensorUtils.h>
|
||||
#include <ATen/native/sparse/ParamUtils.h>
|
||||
#include <ATen/native/SparseTensorUtils.h>
|
||||
@ -295,6 +296,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
to exp functions as well as reuse of softmax implementation for
|
||||
log_softmax.
|
||||
*/
|
||||
using accscalar_t = at::acc_type<scalar_t, false>;
|
||||
auto sparse_dim = input.sparse_dim();
|
||||
auto indices = input._indices().contiguous();
|
||||
auto values = input._values().contiguous();
|
||||
@ -340,14 +342,14 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di
|
||||
continue;
|
||||
|
||||
/* Prepare scratch space */
|
||||
std::vector<scalar_t> mx_row(nvalues, -std::numeric_limits<scalar_t>::infinity());
|
||||
std::vector<scalar_t> exp_sums_row(nvalues, 0);
|
||||
std::vector<accscalar_t> mx_row(nvalues, -std::numeric_limits<accscalar_t>::infinity());
|
||||
std::vector<accscalar_t> exp_sums_row(nvalues, 0);
|
||||
|
||||
/* Compute mx */
|
||||
for (int64_t i : pool_indices) {
|
||||
auto values_row = values_accessor[i];
|
||||
for (const auto j : c10::irange(nvalues)) {
|
||||
mx_row[j] = std::max(mx_row[j], values_row[j]);
|
||||
mx_row[j] = std::max(mx_row[j], accscalar_t(values_row[j]));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -161,6 +161,7 @@ REGISTER_AVX512_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_
|
||||
REGISTER_AVX2_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(sparse_mask_intersection_out_stub, DEFAULT, &sparse_mask_intersection_out_cpu_kernel)
|
||||
@ -168,6 +169,7 @@ REGISTER_AVX512_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_interse
|
||||
REGISTER_AVX2_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
|
||||
|
||||
REGISTER_ARCH_DISPATCH(sparse_mask_projection_out_stub, DEFAULT, &sparse_mask_projection_out_cpu_kernel)
|
||||
@ -175,5 +177,6 @@ REGISTER_AVX512_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projectio
|
||||
REGISTER_AVX2_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
|
||||
REGISTER_VSX_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
|
||||
REGISTER_ZVECTOR_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
|
||||
REGISTER_SVE_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
|
||||
REGISTER_SVE256_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
|
||||
}
|
||||
|
||||
@ -391,13 +391,13 @@ void _validate_sparse_coo_tensor_args(
|
||||
int64_t sparse_dim = indices.size(0);
|
||||
int64_t dense_dim = values.dim() - 1;
|
||||
TORCH_CHECK(
|
||||
static_cast<int64_t>(size.size()) == sparse_dim + dense_dim,
|
||||
"number of dimensions must be sparse_dim (",
|
||||
sparse_dim,
|
||||
") + dense_dim (",
|
||||
dense_dim,
|
||||
"), but got ",
|
||||
size.size());
|
||||
sparse_dim + dense_dim == static_cast<int64_t>(size.size()),
|
||||
"'len(size) == sparse_dim + dense_dim' is not satisfied: len(size) = ",
|
||||
size.size(),
|
||||
", sparse_dim = ",
|
||||
sparse_dim,
|
||||
", dense_dim = ",
|
||||
dense_dim);
|
||||
|
||||
if (check_pinning) {
|
||||
TORCH_CHECK(
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
|
||||
#include <ATen/ops/cat.h>
|
||||
#include <ATen/ops/add_native.h>
|
||||
#include <ATen/ops/mul_native.h>
|
||||
#include <ATen/ops/empty_native.h>
|
||||
#include <ATen/ops/zeros_native.h>
|
||||
#include <ATen/ops/result_type.h>
|
||||
@ -20,10 +21,265 @@
|
||||
namespace at::native {
|
||||
|
||||
using namespace at::sparse;
|
||||
using namespace mps;
|
||||
|
||||
Tensor& add_out_dense_sparse_mps(Tensor& out, const Tensor& dense, const SparseTensor& sparse, const Scalar& alpha);
|
||||
#ifndef PYTORCH_JIT_COMPILE_SHADERS
|
||||
static auto& lib = MetalShaderLibrary::getBundledLibrary();
|
||||
#else
|
||||
#include <ATen/native/mps/Mul_metallib.h>
|
||||
#endif
|
||||
|
||||
Tensor& add_out_dense_sparse_mps(
|
||||
static SparseTensor& mul_out_dense_sparse_mps(
|
||||
const Tensor& dense,
|
||||
const Tensor& sparse,
|
||||
SparseTensor& out) {
|
||||
|
||||
TORCH_CHECK(sparse.is_sparse(), "mul: expected 'sparse' to be sparse COO");
|
||||
TORCH_CHECK(sparse.is_mps(), "mul: expected 'sparse' to be MPS, got ", sparse.device());
|
||||
TORCH_CHECK(out.is_mps(), "mul: expected 'out' to be MPS, got ", out.device());
|
||||
|
||||
const bool scalar_like = (dense.dim() == 0) || (dense.numel() == 1);
|
||||
TORCH_CHECK(dense.is_mps() || scalar_like,
|
||||
"mul: expected 'dense' to be MPS or scalar-like, got ", dense.device());
|
||||
|
||||
const int64_t nnz = sparse._nnz();
|
||||
out.resize_as_(sparse);
|
||||
|
||||
auto commonDtype = at::result_type(dense, sparse);
|
||||
TORCH_CHECK(canCast(commonDtype, out.scalar_type()),
|
||||
"Can't convert result type ", commonDtype, " to output ", out.scalar_type());
|
||||
|
||||
auto indices = sparse._indices().contiguous();
|
||||
auto values = sparse._values().to(commonDtype).contiguous();
|
||||
|
||||
if (nnz == 0) {
|
||||
auto empty_vals = values.narrow(0, 0, 0);
|
||||
alias_into_sparse(out,
|
||||
indices.narrow(1, 0, 0),
|
||||
(out.scalar_type() == commonDtype) ? empty_vals
|
||||
: empty_vals.to(out.scalar_type()));
|
||||
out._coalesced_(sparse.is_coalesced());
|
||||
return out;
|
||||
}
|
||||
|
||||
if (scalar_like) {
|
||||
auto scalar = dense;
|
||||
if (dense.numel() == 1 && dense.dim() > 0) {
|
||||
scalar = dense.view({});
|
||||
}
|
||||
scalar = scalar.to(values.options());
|
||||
auto out_vals = values.mul(scalar);
|
||||
if (out.scalar_type() != commonDtype) {
|
||||
out_vals = out_vals.to(out.scalar_type());
|
||||
}
|
||||
|
||||
alias_into_sparse(out, indices, out_vals);
|
||||
out._coalesced_(sparse.is_coalesced());
|
||||
return out;
|
||||
}
|
||||
|
||||
TORCH_CHECK(dense.sizes().equals(sparse.sizes()),
|
||||
"mul(dense, sparse): sizes must match exactly (no broadcasting): ",
|
||||
dense.sizes(), " vs ", sparse.sizes());
|
||||
|
||||
const int64_t ndim_i = sparse.sparse_dim();
|
||||
const int64_t ndim = dense.dim();
|
||||
TORCH_CHECK(
|
||||
ndim_i <= ndim,
|
||||
"mul(dense, sparse): sparse_dim=", ndim_i, " exceeds dense.dim()=", ndim);
|
||||
|
||||
// Prepare shapes
|
||||
int64_t view_rows = 1, view_cols = 1;
|
||||
for (int64_t i = 0; i < ndim_i; ++i) view_rows *= sparse.size(i);
|
||||
for (int64_t i = ndim_i; i < ndim; ++i) view_cols *= sparse.size(i);
|
||||
|
||||
auto dense_mps = dense.to(commonDtype).contiguous().reshape({view_rows, view_cols});
|
||||
auto out_vals = at::empty_like(values, values.options());
|
||||
|
||||
const uint32_t u_view_cols = static_cast<uint32_t>(view_cols);
|
||||
const uint32_t u_nnz = static_cast<uint32_t>(nnz);
|
||||
const uint32_t u_ndim_i = static_cast<uint32_t>(ndim_i);
|
||||
|
||||
auto stream = getCurrentMPSStream();
|
||||
dispatch_sync_with_rethrow(stream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
auto pso = lib.getPipelineStateForFunc("dense_sparse_mul_kernel_" + mps::scalarToMetalTypeString(values));
|
||||
auto computeEncoder = stream->commandEncoder();
|
||||
[computeEncoder setComputePipelineState:pso];
|
||||
|
||||
const uint32_t gridWidth = u_view_cols;
|
||||
const uint32_t gridDepth = u_nnz;
|
||||
MTLSize gridSize = MTLSizeMake(gridWidth, 1, gridDepth);
|
||||
|
||||
const uint32_t maxThreadsPerGroup = pso.maxTotalThreadsPerThreadgroup;
|
||||
const uint32_t tew = pso.threadExecutionWidth;
|
||||
uint32_t tgWidth = std::min(gridWidth, tew);
|
||||
MTLSize threadgroupSize = MTLSizeMake(tgWidth, 1, 1);
|
||||
|
||||
mtl_setArgs(
|
||||
computeEncoder,
|
||||
dense_mps,
|
||||
values,
|
||||
out_vals,
|
||||
indices,
|
||||
sparse.sizes(),
|
||||
std::array<uint32_t, 3>{u_nnz, u_ndim_i, u_view_cols}
|
||||
);
|
||||
|
||||
[computeEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadgroupSize];
|
||||
}
|
||||
});
|
||||
|
||||
Tensor final_vals = out_vals;
|
||||
if (out.scalar_type() != commonDtype) {
|
||||
final_vals = final_vals.to(out.scalar_type());
|
||||
}
|
||||
|
||||
alias_into_sparse(out, indices, final_vals);
|
||||
out._coalesced_(sparse.is_coalesced());
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
SparseTensor& mul_out_sparse_mps(const Tensor& t_, const Tensor& src_, SparseTensor& r_) {
|
||||
TORCH_CHECK(r_.is_mps(), "mul: expected 'out' to be MPS, but got ", r_.device());
|
||||
|
||||
// Dense x sparse fallback (keep dense first)
|
||||
if (!t_.is_sparse() || !src_.is_sparse()) {
|
||||
const Tensor& dense = t_.is_sparse() ? src_ : t_;
|
||||
const Tensor& sparse = t_.is_sparse() ? t_ : src_;
|
||||
return mul_out_dense_sparse_mps(dense, sparse, r_);
|
||||
}
|
||||
|
||||
TORCH_CHECK(t_.is_mps(), "mul: expected 'self' to be MPS, but got ", t_.device());
|
||||
TORCH_CHECK(src_.is_mps(), "mul: expected 'other' to be MPS, but got ", src_.device());
|
||||
TORCH_CHECK(t_.sparse_dim() == src_.sparse_dim(),
|
||||
"mul(sparse, sparse): must have same sparse_dim, got ",
|
||||
t_.sparse_dim(), " vs ", src_.sparse_dim());
|
||||
TORCH_CHECK(t_.sizes().equals(src_.sizes()),
|
||||
"mul(sparse, sparse): sizes must match exactly (no broadcasting).");
|
||||
|
||||
// Coalesce and early-exit on structurally empty operands
|
||||
auto lhs = t_.coalesce();
|
||||
auto rhs = src_.coalesce();
|
||||
const int64_t lhs_nnz = lhs._nnz();
|
||||
const int64_t rhs_nnz = rhs._nnz();
|
||||
if (!lhs_nnz || !rhs_nnz) {
|
||||
r_.resize_as_(lhs);
|
||||
return r_.zero_();
|
||||
}
|
||||
|
||||
// dtype checks and promotion
|
||||
auto commonDtype = at::result_type(lhs, rhs);
|
||||
TORCH_CHECK(canCast(commonDtype, r_.scalar_type()),
|
||||
"Can't convert result type ", commonDtype, " to output ", r_.scalar_type());
|
||||
|
||||
const int64_t ndim_i = lhs.sparse_dim();
|
||||
|
||||
// ndim_i == 0, at most one structural entry
|
||||
if (ndim_i == 0) {
|
||||
r_.resize_as_(lhs);
|
||||
const bool has = (lhs_nnz && rhs_nnz);
|
||||
|
||||
auto out_indices = lhs._indices().narrow(1, 0, has ? 1 : 0);
|
||||
|
||||
Tensor lhs_vals = lhs._values().to(commonDtype);
|
||||
Tensor rhs_vals = rhs._values().to(commonDtype);
|
||||
lhs_vals = lhs_vals.narrow(0, 0, has ? 1 : 0);
|
||||
rhs_vals = rhs_vals.narrow(0, 0, has ? 1 : 0);
|
||||
|
||||
Tensor out_values = lhs_vals.mul(rhs_vals);
|
||||
if (r_.scalar_type() != commonDtype) {
|
||||
      out_values = out_values.to(r_.scalar_type());
    }

    alias_into_sparse(r_, out_indices, out_values);
    r_._coalesced_(true);
    return r_;
  }

  // General path, intersect keys, then gather + multiply on GPU
  const auto device = r_.device();
  auto stream = getCurrentMPSStream();

  auto lhs_indices = lhs._indices();
  auto rhs_indices = rhs._indices();
  auto lhs_values = lhs._values().to(commonDtype);
  auto rhs_values = rhs._values().to(commonDtype);

  // Flatten sparse indices to keys
  auto lhs_keys = flatten_indices(lhs_indices, lhs.sizes());
  auto rhs_keys = flatten_indices(rhs_indices, rhs.sizes());

  // Intersect sorted keys (search the shorter in the longer)
  const bool A_is_lhs = (lhs_nnz <= rhs_nnz);
  const int64_t lenA = A_is_lhs ? lhs_nnz : rhs_nnz;
  const int64_t lenB = A_is_lhs ? rhs_nnz : lhs_nnz;
  auto A_keys = A_is_lhs ? lhs_keys : rhs_keys;
  auto B_keys = A_is_lhs ? rhs_keys : lhs_keys;

  auto outA_idx = at::empty({lenA}, at::device(device).dtype(kLong));
  auto outB_idx = at::empty({lenA}, at::device(device).dtype(kLong));
  auto counter = at::zeros({1}, at::device(device).dtype(kInt));

  dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      auto pso = lib.getPipelineStateForFunc("intersect_binary_search");
      auto enc = stream->commandEncoder();
      [enc setComputePipelineState:pso];
      mtl_setArgs(enc, A_keys, B_keys, outA_idx, outB_idx, counter,
                  static_cast<uint32_t>(lenB), A_is_lhs);
      mtl_dispatch1DJob(enc, pso, static_cast<uint32_t>(lenA));
    }
  });

  const uint32_t M = counter.item<int32_t>(); // number of structural matches

  r_.resize_as_(lhs);

  auto out_indices = at::empty({ndim_i, static_cast<int64_t>(M)}, at::device(device).dtype(at::kLong));
  auto lhs_match = outA_idx.narrow(0, 0, M);
  auto rhs_match = outB_idx.narrow(0, 0, M);
  auto out_val_sizes = lhs_values.sizes().vec();
  out_val_sizes[0] = static_cast<int64_t>(M);
  auto out_values = at::empty(out_val_sizes, lhs_values.options());

  const uint32_t cols = static_cast<uint32_t>(
      lhs_values.numel() / std::max<int64_t>(1, lhs_nnz));

  dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      auto pso = lib.getPipelineStateForFunc(
          "fused_gather_mul_kernel_" + mps::scalarToMetalTypeString(lhs_values));
      auto enc = stream->commandEncoder();
      [enc setComputePipelineState:pso];

      const uint32_t tew = pso.threadExecutionWidth;
      uint32_t tgW = std::min(cols, tew);
      MTLSize grid = MTLSizeMake(cols, 1, M);
      MTLSize tgs = MTLSizeMake(tgW, 1, 1);

      mtl_setArgs(enc,
                  lhs_values, rhs_values,
                  lhs_match, rhs_match,
                  lhs_indices, out_indices,
                  out_values,
                  std::array<uint32_t, 2>{static_cast<uint32_t>(ndim_i), static_cast<uint32_t>(lhs_nnz)},
                  std::array<uint32_t, 2>{M, cols});
      [enc dispatchThreads:grid threadsPerThreadgroup:tgs];
    }
  });

  if (r_.scalar_type() != commonDtype) {
    out_values = out_values.to(r_.scalar_type());
  }

  alias_into_sparse(r_, out_indices, out_values);
  r_._coalesced_(true);
  return r_;
}

static Tensor& add_out_dense_sparse_mps(
    Tensor& out,
    const Tensor& dense,
    const SparseTensor& sparse,
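Note: the following is a hedged CPU sketch (not part of the diff) of the sorted-key intersection that the host path above runs on the GPU: flatten each nnz column of indices to a scalar key, then for every key of the shorter side binary-search the longer side and record the matching positions. The names `Match` and `intersect_sorted_keys` are illustrative, and coalesced (sorted, duplicate-free) key arrays are assumed.

#include <algorithm>
#include <cstdint>
#include <vector>

struct Match {
  int64_t a_pos;  // position in the shorter (searched-from) array
  int64_t b_pos;  // position in the longer (binary-searched) array
};

// For every key in A, binary-search it in B; record positions of structural matches.
std::vector<Match> intersect_sorted_keys(const std::vector<int64_t>& A,
                                         const std::vector<int64_t>& B) {
  std::vector<Match> out;
  for (int64_t i = 0; i < static_cast<int64_t>(A.size()); ++i) {
    auto it = std::lower_bound(B.begin(), B.end(), A[i]);
    if (it != B.end() && *it == A[i]) {
      out.push_back({i, static_cast<int64_t>(it - B.begin())});
    }
  }
  return out;  // one entry per key present in both inputs, in key order
}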
aten/src/ATen/native/sparse/mps/kernels/Mul.metal (new file, 150 lines)
@@ -0,0 +1,150 @@
#include <metal_stdlib>
#include <c10/metal/indexing.h>
using namespace metal;


template <typename T>
kernel void dense_sparse_mul_kernel(
    device const T* dense [[buffer(0)]],
    device const T* values [[buffer(1)]],
    device T* out_values [[buffer(2)]],
    device const long* indices [[buffer(3)]],
    device const long* sizes [[buffer(4)]],
    constant uint3& sparse_params [[buffer(5)]],
    uint3 gid [[thread_position_in_grid]])
{
  uint col = gid.x;
  uint i = gid.z;
  uint nnz = sparse_params.x;
  uint ndim_i = sparse_params.y;
  uint view_cols = sparse_params.z;

  long key = 0;
  for (uint d = 0; d < ndim_i; ++d) {
    long idx_d = indices[(ulong)d * (ulong)nnz + (ulong)i];
    const auto sz_d = sizes[d];
    key = key * sz_d + idx_d;
  }

  ulong dense_idx = (ulong)key * (ulong)view_cols + (ulong)col;
  ulong val_idx = (ulong)i * (ulong)view_cols + (ulong)col;

  const auto a = static_cast<float>(values[val_idx]);
  const auto b = static_cast<float>(dense[dense_idx]);
  out_values[val_idx] = static_cast<T>(a * b);
}

kernel void intersect_binary_search(
    device const long* keysA [[buffer(0)]],
    device const long* keysB [[buffer(1)]],
    device long* outA_idx [[buffer(2)]],
    device long* outB_idx [[buffer(3)]],
    device atomic_uint* counter [[buffer(4)]],
    constant uint& lenB [[buffer(5)]],
    constant bool& A_is_lhs [[buffer(6)]],
    uint3 tid_in_grid [[thread_position_in_grid]])
{
  uint gid = tid_in_grid.x;

  long key = keysA[gid];

  // lower_bound in B
  uint lo = 0;
  uint hi = lenB;
  while (lo < hi) {
    uint mid = (lo + hi) >> 1;
    long v = keysB[mid];
    if (v < key) lo = mid + 1;
    else hi = mid;
  }

  if (lo < lenB && keysB[lo] == key) {
    uint pos = atomic_fetch_add_explicit(counter, 1u, memory_order_relaxed);
    if (A_is_lhs) {
      outA_idx[pos] = (long)gid;
      outB_idx[pos] = (long)lo;
    } else {
      outA_idx[pos] = (long)lo;
      outB_idx[pos] = (long)gid;
    }
  }
}


template <typename T>
kernel void fused_gather_mul_kernel(
    device const T* lhs_vals [[buffer(0)]],
    device const T* rhs_vals [[buffer(1)]],
    device const long* lhs_sel [[buffer(2)]],
    device const long* rhs_sel [[buffer(3)]],
    device const long* lhs_indices [[buffer(4)]],
    device long* out_indices [[buffer(5)]],
    device T* out_vals [[buffer(6)]],
    constant uint2& dims_input [[buffer(7)]],
    constant uint2& dims_output [[buffer(8)]],
    uint3 gid [[thread_position_in_grid]])
{
  const uint col = gid.x;
  const uint k = gid.z;
  const uint n_dim_i = dims_input.x;
  const uint L = dims_input.y;
  const uint M = dims_output.x;
  const uint view_cols = dims_output.y;

  const long iL = lhs_sel[k];
  const long iR = rhs_sel[k];

  if (col < view_cols) {
    const ulong offL = (ulong)iL * (ulong)view_cols + (ulong)col;
    const ulong offR = (ulong)iR * (ulong)view_cols + (ulong)col;
    const ulong offO = (ulong)k * (ulong)view_cols + (ulong)col;

    const float a = (float)lhs_vals[offL];
    const float b = (float)rhs_vals[offR];
    out_vals[offO] = (T)(a * b);
  }

  // One thread per match copies the indices column
  if (col == 0) {
    const ulong uL = (ulong)L;
    const ulong uM = (ulong)M;
    const ulong src_col = (ulong)iL; // gather from lhs
    for (uint d = 0; d < n_dim_i; ++d) {
      const long v = lhs_indices[(ulong)d * uL + src_col];
      out_indices[(ulong)d * uM + (ulong)k] = v;
    }
  }
}

#define INSTANTIATE_DENSE_SPARSE_MUL(DTYPE)                              \
  template [[host_name("dense_sparse_mul_kernel_" #DTYPE)]] kernel void \
  dense_sparse_mul_kernel<DTYPE>(                                        \
      device const DTYPE* dense [[buffer(0)]],                           \
      device const DTYPE* values [[buffer(1)]],                          \
      device DTYPE* out_values [[buffer(2)]],                            \
      device const long* indices [[buffer(3)]],                          \
      device const long* sizes [[buffer(4)]],                            \
      constant uint3& sparse_params [[buffer(5)]],                       \
      uint3 gid [[thread_position_in_grid]]);

INSTANTIATE_DENSE_SPARSE_MUL(float);
INSTANTIATE_DENSE_SPARSE_MUL(half);
INSTANTIATE_DENSE_SPARSE_MUL(bfloat);

#define INSTANTIATE_FUSED_GATHER_MUL(DTYPE)                              \
  template [[host_name("fused_gather_mul_kernel_" #DTYPE)]] kernel void \
  fused_gather_mul_kernel<DTYPE>(                                        \
      device const DTYPE* lhs_vals [[buffer(0)]],                        \
      device const DTYPE* rhs_vals [[buffer(1)]],                        \
      device const long* lhs_sel [[buffer(2)]],                          \
      device const long* rhs_sel [[buffer(3)]],                          \
      device const long* lhs_indices [[buffer(4)]],                      \
      device long* out_indices [[buffer(5)]],                            \
      device DTYPE* out_vals [[buffer(6)]],                              \
      constant uint2& dims_input [[buffer(7)]],                          \
      constant uint2& dims_output [[buffer(8)]],                         \
      uint3 gid [[thread_position_in_grid]]);

INSTANTIATE_FUSED_GATHER_MUL(float);
INSTANTIATE_FUSED_GATHER_MUL(half);
INSTANTIATE_FUSED_GATHER_MUL(bfloat);
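Note: a minimal CPU reference (not part of the diff) for what the fused gather + multiply stage computes, under the same row-major layout assumption (one row of `cols` scalars per nnz entry, indices stored as ndim_i rows of nnz columns). All names here are illustrative stand-ins.

#include <cstdint>
#include <vector>

void gather_mul_reference(const std::vector<float>& lhs_vals,      // lhs_nnz x cols
                          const std::vector<float>& rhs_vals,      // rhs_nnz x cols
                          const std::vector<int64_t>& lhs_sel,     // M matched rows in lhs
                          const std::vector<int64_t>& rhs_sel,     // M matched rows in rhs
                          const std::vector<int64_t>& lhs_indices, // ndim_i x lhs_nnz
                          int64_t ndim_i, int64_t lhs_nnz, int64_t cols,
                          std::vector<float>& out_vals,            // M x cols
                          std::vector<int64_t>& out_indices) {     // ndim_i x M
  const int64_t M = static_cast<int64_t>(lhs_sel.size());
  out_vals.assign(M * cols, 0.f);
  out_indices.assign(ndim_i * M, 0);
  for (int64_t k = 0; k < M; ++k) {
    const int64_t iL = lhs_sel[k], iR = rhs_sel[k];
    // elementwise product of the two matched value rows
    for (int64_t c = 0; c < cols; ++c) {
      out_vals[k * cols + c] = lhs_vals[iL * cols + c] * rhs_vals[iR * cols + c];
    }
    // copy the k-th output index column from the lhs indices
    for (int64_t d = 0; d < ndim_i; ++d) {
      out_indices[d * M + k] = lhs_indices[d * lhs_nnz + iL];
    }
  }
}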
@@ -448,6 +448,7 @@ REGISTER_AVX2_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_AVX512_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_VSX_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_ZVECTOR_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_SVE_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_SVE256_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_HPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_meta)
@@ -95,6 +95,72 @@
#endif
#endif

#if defined(USE_ROCM) && (defined(USE_FLASH_ATTENTION) || defined(USE_MEM_EFF_ATTENTION))
namespace pytorch_flash {
std::tuple<
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor,
    at::Tensor>
mha_fwd(
    const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size
    const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size
    const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size
    std::optional<at::Tensor>&
        out_, // batch_size x seqlen_q x num_heads x head_size
    std::optional<at::Tensor>&
        alibi_slopes_, // num_heads or batch_size x num_heads
    const float p_dropout,
    const float softmax_scale,
    bool is_causal,
    std::optional<int64_t> window_size_left,
    std::optional<int64_t> window_size_right,
    const float softcap,
    const bool return_softmax,
    std::optional<at::Generator> gen_) {
#if defined(USE_ROCM_CK_SDPA)
  if (at::globalContext().getROCmFAPreferredBackend() ==
      at::ROCmFABackend::Ck) {
    const int non_null_window_left = window_size_left.value_or(-1);
    const int non_null_window_right = window_size_right.value_or(-1);
    std::optional<at::Tensor> dummy_attn_bias = std::nullopt;
    return mha_fwd_ck(
        q,
        k,
        v,
        out_,
        p_dropout,
        softmax_scale,
        is_causal,
        non_null_window_left,
        non_null_window_right,
        return_softmax,
        gen_,
        dummy_attn_bias); // Not used in flash attention
  }
#endif
  return mha_fwd_aot(
      q,
      k,
      v,
      out_,
      alibi_slopes_,
      p_dropout,
      softmax_scale,
      is_causal,
      window_size_left,
      window_size_right,
      return_softmax,
      gen_);
}
}
#endif

namespace at {

namespace cuda::philox {
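Note: the wrapper above only routes between two backends based on a runtime preference and normalizes the optional window sizes (missing values become -1 on the CK path). A standalone sketch of that shape, with stand-in names (Backend, run_ck, run_aot) rather than the real flash-attention signatures:

#include <cstdio>
#include <optional>

enum class Backend { Default, Ck };

int run_ck(int window_left, int window_right) {
  std::printf("ck backend: %d %d\n", window_left, window_right);
  return 0;
}
int run_aot(std::optional<int> window_left, std::optional<int> window_right) {
  std::printf("aot backend: %d %d\n", window_left.value_or(-1), window_right.value_or(-1));
  return 1;
}

int forward(Backend preferred,
            std::optional<int> window_left,
            std::optional<int> window_right) {
  if (preferred == Backend::Ck) {
    // The CK entry point takes plain ints; absent windows are encoded as -1.
    return run_ck(window_left.value_or(-1), window_right.value_or(-1));
  }
  // The default entry point keeps the optionals as-is.
  return run_aot(window_left, window_right);
}

int main() {
  forward(Backend::Ck, std::nullopt, 128);
  forward(Backend::Default, 64, std::nullopt);
}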
@@ -270,7 +270,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor> mha_varle
#endif

TORCH_API
inline std::tuple<
std::tuple<
    at::Tensor,
    at::Tensor,
    at::Tensor,
@@ -294,42 +294,7 @@ mha_fwd(
    std::optional<int64_t> window_size_right,
    const float softcap,
    const bool return_softmax,
    std::optional<at::Generator> gen_) {
#if defined(USE_ROCM_CK_SDPA)
  if (at::globalContext().getROCmFAPreferredBackend() ==
      at::ROCmFABackend::Ck) {
    const int non_null_window_left = window_size_left.value_or(-1);
    const int non_null_window_right = window_size_right.value_or(-1);
    std::optional<at::Tensor> dummy_attn_bias = std::nullopt;
    return mha_fwd_ck(
        q,
        k,
        v,
        out_,
        p_dropout,
        softmax_scale,
        is_causal,
        non_null_window_left,
        non_null_window_right,
        return_softmax,
        gen_,
        dummy_attn_bias); // Not used in flash attention
  }
#endif
  return mha_fwd_aot(
      q,
      k,
      v,
      out_,
      alibi_slopes_,
      p_dropout,
      softmax_scale,
      is_causal,
      window_size_left,
      window_size_right,
      return_softmax,
      gen_);
}
    std::optional<at::Generator> gen_);

inline std::tuple<
    at::Tensor,
@@ -134,7 +134,7 @@ namespace {
  TYPED_TEST(Memory, UnAlignedLoadStore) {
    using vec = TypeParam;
    using VT = ValueType<TypeParam>;
    constexpr size_t b_size = vec::size() * sizeof(VT);
    const size_t b_size = vec::size() * sizeof(VT);
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN unsigned char ref_storage[128 * b_size];
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
@@ -164,7 +164,7 @@ namespace {
    for (size_t offset = 0; offset < b_size; offset += 1) {
      unsigned char* p1 = ref_storage + offset;
      unsigned char* p2 = storage + offset;
      for (; p1 + b_size <= std::end(ref_storage); p1 += b_size, p2 += b_size) {
      for (; p1 + b_size <= &ref_storage[128 * b_size]; p1 += b_size, p2 += b_size) {
        vec v = vec::loadu(p1);
        v.store(p2);
      }
@@ -381,7 +381,7 @@ namespace {
  TYPED_TEST(Hyperbolic, Tanh) {
    using vec = TypeParam;
    // NOTE: Because SVE uses ACL logic, the precision changes, hence the adjusted tolerance.
#if defined(CPU_CAPABILITY_SVE)
#if defined(CPU_CAPABILITY_SVE) || defined(CPU_CAPABILITY_SVE256)
    using UVT = UvalueType<vec>;
    UVT tolerance = getDefaultTolerance<UVT>();
    test_unary<vec>(
@@ -586,7 +586,7 @@ namespace {
      }
    }
  }
#if defined(CPU_CAPABILITY_SVE) && defined(__ARM_FEATURE_BF16)
#if (defined(CPU_CAPABILITY_SVE256)) && defined(__ARM_FEATURE_BF16)
  TEST(NanBfloat16, IsNan) {
    for (unsigned int ii = 0; ii < 0xFFFF; ++ii) {
      c10::BFloat16 val(ii, c10::BFloat16::from_bits());
@@ -598,6 +598,19 @@ namespace {
      }
    }
  }
#endif
#if (defined(CPU_CAPABILITY_SVE)) && defined(__ARM_FEATURE_BF16)
  TEST(NanBfloat16, IsNan) {
    for (unsigned int ii = 0; ii < 0xFFFF; ++ii) {
      c10::BFloat16 val(ii, c10::BFloat16::from_bits());
      bool expected = std::isnan(val);
      CACHE_ALIGN c10::BFloat16 actual_vals[at::vec::SVE::Vectorized<c10::BFloat16>::size()];
      at::vec::SVE::Vectorized<c10::BFloat16>(val).isnan().store(actual_vals);
      for (int jj = 0; jj < at::vec::SVE::Vectorized<c10::BFloat16>::size(); ++jj) {
        EXPECT_EQ(expected, c10::bit_cast<uint16_t>(actual_vals[jj]) != 0) << "bf16 isnan failure for bit pattern " << std::hex << ii << std::dec;
      }
    }
  }
#endif
  TYPED_TEST(LGamma, LGamma) {
    using vec = TypeParam;
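Note: the new SVE bf16 test sweeps every 16-bit pattern and compares a scalar std::isnan against the vectorized isnan(). For reference, a self-contained scalar check of whether a bfloat16 bit pattern encodes a NaN (1 sign, 8 exponent, 7 mantissa bits); this is a sketch for illustration, not the test's implementation:

#include <cassert>
#include <cstdint>

bool bf16_bits_is_nan(uint16_t bits) {
  const uint16_t exponent = bits & 0x7F80;  // 8 exponent bits of bfloat16
  const uint16_t mantissa = bits & 0x007F;  // 7 mantissa bits
  return exponent == 0x7F80 && mantissa != 0;  // all-ones exponent, non-zero mantissa
}

int main() {
  assert(bf16_bits_is_nan(0x7FC0));   // canonical quiet NaN
  assert(!bf16_bits_is_nan(0x7F80));  // +inf
  assert(!bf16_bits_is_nan(0x3F80));  // 1.0
}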
@@ -653,7 +666,7 @@ namespace {
  TYPED_TEST(Interleave, Interleave) {
    using vec = TypeParam;
    using VT = ValueType<TypeParam>;
    constexpr auto N = vec::size() * 2LL;
    const auto N = vec::size() * 2LL;
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN VT vals[N];
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
@@ -663,7 +676,7 @@ namespace {
    for (VT& v : vals) {
      v = generator.get();
    }
    copy_interleave(vals, interleaved);
    copy_interleave<VT>(vals, interleaved, N);
    auto a = vec::loadu(vals);
    auto b = vec::loadu(vals + vec::size());
    auto cc = interleave2(a, b);
@@ -673,7 +686,7 @@ namespace {
  TYPED_TEST(Interleave, DeInterleave) {
    using vec = TypeParam;
    using VT = ValueType<TypeParam>;
    constexpr auto N = vec::size() * 2LL;
    const auto N = vec::size() * 2LL;
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN VT vals[N];
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
@@ -683,7 +696,7 @@ namespace {
    for (VT& v : vals) {
      v = generator.get();
    }
    copy_interleave(vals, interleaved);
    copy_interleave<VT>(vals, interleaved, N);
    // test interleaved with vals this time
    auto a = vec::loadu(interleaved);
    auto b = vec::loadu(interleaved + vec::size());
@@ -1017,78 +1030,70 @@ namespace {
        RESOLVE_OVERLOAD(filter_fmadd));
  }
#endif
  template<typename vec, typename VT, int64_t mask>
  typename std::enable_if_t<(mask < 0 || mask > 255), void>
  template<typename vec, typename VT>
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
  test_blend(VT expected_val[vec::size()], VT a[vec::size()], VT b[vec::size()])
  {
  void test_blend(VT * expected_val, VT * a, VT * b, int64_t mask) {
    if (mask >= 0 && mask <= 255) {
      // generate expected_val
      int64_t m = mask;
      for (int64_t i = 0; i < vec::size(); i++) {
        expected_val[i] = (m & 0x01) ? b[i] : a[i];
        m = m >> 1;
      }
      // test with blend
      auto vec_a = vec::loadu(a);
      auto vec_b = vec::loadu(b);
      auto expected = vec::loadu(expected_val);
      auto actual = vec::blend(vec_a, vec_b, mask);
      auto mask_str = std::string("\nblend mask: ") + std::to_string(mask);
      if (AssertVectorized<vec>(std::string(NAME_INFO(test_blend)) + mask_str, expected, actual).check()) return;
      test_blend<vec, VT>(expected_val, a, b, mask - 1);
    }
  }
  template<typename vec, typename VT, int64_t mask>
  typename std::enable_if_t<(mask >= 0 && mask <= 255), void>
  template<typename vec, typename VT>
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
  test_blend(VT expected_val[vec::size()], VT a[vec::size()], VT b[vec::size()]) {
    // generate expected_val
    int64_t m = mask;
    for (int64_t i = 0; i < vec::size(); i++) {
      expected_val[i] = (m & 0x01) ? b[i] : a[i];
      m = m >> 1;
    }
    // test with blend
    auto vec_a = vec::loadu(a);
    auto vec_b = vec::loadu(b);
    auto expected = vec::loadu(expected_val);
    auto actual = vec::template blend<mask>(vec_a, vec_b);
    auto mask_str = std::string("\nblend mask: ") + std::to_string(mask);
    if (AssertVectorized<vec>(std::string(NAME_INFO(test_blend)) + mask_str, expected, actual).check()) return;
    test_blend<vec, VT, mask - 1>(expected_val, a, b);
  bool test_blendv(VT * expected_val, VT * a, VT * b, VT * mask, int64_t idx, size_t N) {
    if ((size_t) idx == N) {
      using bit_rep = BitType<VT>;
      // generate expected_val
      for (int64_t i = 0; i < vec::size(); i++) {
        bit_rep hex_mask = 0;
        hex_mask = c10::bit_cast<bit_rep>(mask[i]);
        expected_val[i] = (hex_mask & 0x01) ? b[i] : a[i];
      }
      // test with blendv
      auto vec_a = vec::loadu(a);
      auto vec_b = vec::loadu(b);
      auto vec_m = vec::loadu(mask);
      auto expected = vec::loadu(expected_val);
      auto actual = vec::blendv(vec_a, vec_b, vec_m);
      auto mask_str = std::string("\nblendv mask: ");
      for (int64_t i = 0; i < vec::size(); i++) {
        mask_str += std::to_string(mask[i]) + " ";
      }
      if (AssertVectorized<vec>(std::string(NAME_INFO(test_blendv)) + mask_str, expected, actual).check()) {
        return false;
      }
      return true;
    } else {
      // shuffle mask and do blendv test
      VT m = mask[idx];
      if (!test_blendv<vec, VT>(expected_val, a, b, mask, idx+1, N)) return false;
      if (m != (VT)0) {
        mask[idx] = (VT)0;
      }
      else {
        uint64_t hex_mask = 0xFFFFFFFFFFFFFFFF;
        std::memcpy(&mask[idx], &hex_mask, sizeof(VT));
      }
      if (!test_blendv<vec, VT>(expected_val, a, b, mask, idx+1, N)) return false;
      mask[idx] = m;
      return true;
    }
  }
  template<typename vec, typename VT, int64_t idx, int64_t N>
  std::enable_if_t<(!is_complex<VT>::value && idx == N), bool>
  template<typename T>
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
  test_blendv(VT expected_val[vec::size()], VT a[vec::size()], VT b[vec::size()], VT mask[vec::size()]) {
    using bit_rep = BitType<VT>;
    // generate expected_val
    for (int64_t i = 0; i < vec::size(); i++) {
      bit_rep hex_mask = 0;
      hex_mask = c10::bit_cast<bit_rep>(mask[i]);
      expected_val[i] = (hex_mask & 0x01) ? b[i] : a[i];
    }
    // test with blendv
    auto vec_a = vec::loadu(a);
    auto vec_b = vec::loadu(b);
    auto vec_m = vec::loadu(mask);
    auto expected = vec::loadu(expected_val);
    auto actual = vec::blendv(vec_a, vec_b, vec_m);
    auto mask_str = std::string("\nblendv mask: ");
    for (int64_t i = 0; i < vec::size(); i++) {
      mask_str += std::to_string(mask[i]) + " ";
    }
    if (AssertVectorized<vec>(std::string(NAME_INFO(test_blendv)) + mask_str, expected, actual).check()) {
      return false;
    }
    return true;
  }
  template<typename vec, typename VT, int64_t idx, int64_t N>
  std::enable_if_t<(!is_complex<VT>::value && idx != N), bool>
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
  test_blendv(VT expected_val[vec::size()], VT a[vec::size()], VT b[vec::size()], VT mask[vec::size()]) {
    // shuffle mask and do blendv test
    VT m = mask[idx];
    if (!test_blendv<vec, VT, idx+1, N>(expected_val, a, b, mask)) return false;
    if (m != (VT)0) {
      mask[idx] = (VT)0;
    }
    else {
      uint64_t hex_mask = 0xFFFFFFFFFFFFFFFF;
      std::memcpy(&mask[idx], &hex_mask, sizeof(VT));
    }
    if (!test_blendv<vec, VT, idx+1, N>(expected_val, a, b, mask)) return false;
    mask[idx] = m;
    return true;
  }
  template<typename T, int N>
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
  void blend_init(T(&a)[N], T(&b)[N]) {
  void blend_init(T * a, T * b, int N) {
    a[0] = (T)1.0;
    b[0] = a[0] + (T)N;
    for (const auto i : c10::irange(1, N)) {
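Note: the rewrite above replaces the compile-time, template-recursive sweep over blend masks with a runtime sweep. A scalar sketch (not part of the diff) of the same reference semantics, where bit i of the mask selects b[i] over a[i], checked for every mask value:

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int> blend_reference(const std::vector<int>& a,
                                 const std::vector<int>& b,
                                 int64_t mask) {
  std::vector<int> out(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    out[i] = ((mask >> i) & 0x01) ? b[i] : a[i];  // bit i picks lane i's source
  }
  return out;
}

int main() {
  const std::vector<int> a{1, 2, 3, 4}, b{10, 20, 30, 40};
  const int64_t power_sets = 1LL << a.size();  // all 2^lanes masks
  for (int64_t mask = 0; mask < power_sets; ++mask) {
    auto out = blend_reference(a, b, mask);
    for (size_t i = 0; i < a.size(); ++i) {
      assert(out[i] == (((mask >> i) & 0x01) ? b[i] : a[i]));
    }
  }
}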
@@ -1107,8 +1112,8 @@ namespace {
    CACHE_ALIGN VT mask[vec::size()] = {0};
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN VT expected_val[vec::size()];
    blend_init(a, b);
    test_blendv<vec, VT, 0, vec::size()>(expected_val, a, b, mask);
    blend_init(a, b, vec::size());
    test_blendv<vec, VT>(expected_val, a, b, mask, 0, vec::size());
  }
  TYPED_TEST(BitwiseFloatsAdditional2, Blend) {
    using vec = TypeParam;
@@ -1119,9 +1124,9 @@ namespace {
    CACHE_ALIGN VT b[vec::size()];
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN VT expected_val[vec::size()];
    blend_init(a, b);
    constexpr int64_t power_sets = 1LL << (vec::size());
    test_blend<vec, VT, power_sets - 1>(expected_val, a, b);
    blend_init(a, b, vec::size());
    const int64_t power_sets = 1LL << (vec::size());
    test_blend<vec, VT>(expected_val, a, b, power_sets - 1);
  }
  template<typename vec, typename VT>
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
@@ -1152,7 +1157,7 @@ namespace {
    CACHE_ALIGN VT b[vec::size()];
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN VT expected_val[vec::size()];
    blend_init(a, b);
    blend_init(a, b, vec::size());
    test_set<vec, VT>(expected_val, a, b, vec::size());
  }
  template<typename T>
@@ -1218,7 +1223,7 @@ namespace {
    // NOLINTNEXTLINE(bugprone-signed-char-misuse)
    constexpr int min_val = std::numeric_limits<underlying>::min();
    constexpr int max_val = std::numeric_limits<underlying>::max();
    constexpr int el_count = vfloat::size();
    const int el_count = vfloat::size();
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
    CACHE_ALIGN float unit_float_vec[el_count];
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
@@ -1566,7 +1571,7 @@ namespace {
    using vec = TypeParam;
    using VT = ValueType<TypeParam>;
    constexpr auto R = 2LL; // residual
    constexpr auto N = vec::size() + R;
    const auto N = vec::size() + R;
    CACHE_ALIGN VT x1[N];
    CACHE_ALIGN VT x2[N];
    CACHE_ALIGN VT x3[N];
@@ -2130,7 +2135,7 @@ namespace {
    ASSERT_TRUE(vec_pinf.has_inf_nan()) << "Test failed for positive Infinity\n";
    ASSERT_TRUE(vec_ninf.has_inf_nan()) << "Test failed for negative Infinity\n";
  }
#if !defined(CPU_CAPABILITY_SVE)
#if !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
  template <typename vec, typename dst_t>
  void test_convert_to(const char* dst_t_name) {
    using src_t = ValueType<vec>;
@@ -2213,13 +2218,13 @@ namespace {
  TYPED_TEST(VecMaskTests, MaskedLoad) {
    using vec = TypeParam;
    using src_t = ValueType<TypeParam>;
    constexpr auto size = vec::size();
    const auto size = vec::size();

#define TEST_MASK_LOAD(dst_t, mask_t, mask_n) \
  do { \
    constexpr int dst_size = at::vec::Vectorized<dst_t>::size(); \
    constexpr int dst_n = mask_n * size / dst_size; \
    if constexpr(dst_n * dst_size >= mask_n * size) { \
    int dst_size = at::vec::Vectorized<dst_t>::size(); \
    int dst_n = mask_n * size / dst_size; \
    if (dst_n * dst_size >= mask_n * size) { \
      CACHE_ALIGN dst_t x[mask_n * size]; \
      CACHE_ALIGN dst_t y[mask_n * size]; \
      CACHE_ALIGN dst_t ref[mask_n * size]; \
@@ -2230,9 +2235,47 @@ namespace {
        x[i] = generator.get(); \
      } \
      auto vec_mask = generate_vec_mask<mask_t, mask_n>(seed); \
      constexpr int rnd_n = (mask_n * size + dst_size - 1) / dst_size;\
      auto x_vec = vec_mask.template loadu<dst_t, rnd_n>(x); \
      x_vec.store(y); \
      int rnd_n = (mask_n * size + dst_size - 1) / dst_size;\
      switch (rnd_n) { \
        case 1: \
        { \
          auto x_vec = vec_mask.template loadu<dst_t, 1>(x); \
          x_vec.store(y); \
          break; \
        } \
        case 2: \
        { \
          auto x_vec = vec_mask.template loadu<dst_t, 2>(x); \
          x_vec.store(y); \
          break; \
        } \
        case 3: \
        { \
          auto x_vec = vec_mask.template loadu<dst_t, 3>(x); \
          x_vec.store(y); \
          break; \
        } \
        case 4: \
        { \
          auto x_vec = vec_mask.template loadu<dst_t, 4>(x); \
          x_vec.store(y); \
          break; \
        } \
        case 8: \
        { \
          auto x_vec = vec_mask.template loadu<dst_t, 8>(x); \
          x_vec.store(y); \
          break; \
        } \
        case 16: \
        { \
          auto x_vec = vec_mask.template loadu<dst_t, 16>(x); \
          x_vec.store(y); \
          break; \
        } \
        default: \
          throw std::out_of_range("Unexpected rnd_n call to vec_mask"); \
      } \
      for (const auto i : c10::irange(mask_n * size)) { \
        if (vec_mask.is_masked(i)) { \
          ref[i] = x[i]; \
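Note: with dst_size no longer constexpr, the macro has to map a runtime element count onto the compile-time template argument that loadu<dst_t, N>() requires, which it does with a switch over the supported counts. The same pattern in miniature, with a stand-in process<N>():

#include <cstdio>
#include <stdexcept>

template <int N>
void process() {
  // N is a compile-time constant inside this function.
  std::printf("processing %d lanes\n", N);
}

void dispatch(int n) {
  // Map the runtime value onto the finite set of supported template instantiations.
  switch (n) {
    case 1:  process<1>();  break;
    case 2:  process<2>();  break;
    case 4:  process<4>();  break;
    case 8:  process<8>();  break;
    case 16: process<16>(); break;
    default: throw std::out_of_range("unsupported lane count");
  }
}

int main() {
  dispatch(4);
  dispatch(16);
}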
@@ -2269,7 +2312,7 @@ namespace {
#undef TEST_MASK_LOAD
#undef TEST_MASK_LOAD_N
  }
#if !defined(CPU_CAPABILITY_SVE)
#if !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
  TYPED_TEST(VecMaskTests, MaskedCheck) {
    using VT = ValueType<TypeParam>;
    using vec = TypeParam;
@@ -2294,7 +2337,7 @@ namespace {
#undef TEST_MASK_CHECK_N
  }
#endif
#if !defined(CPU_CAPABILITY_SVE)
#if !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
  TYPED_TEST(VecMaskTests, ToFrom) {
    using vec = TypeParam;
    using VT = ValueType<TypeParam>;
@@ -2321,7 +2364,7 @@ namespace {
    }
  }
#endif
#if !defined(CPU_CAPABILITY_SVE)
#if !defined(CPU_CAPABILITY_SVE256) && !defined(CPU_CAPABILITY_SVE)
  TYPED_TEST(VecMaskTests, Cast) {
    using vec = TypeParam;
    using src_t = ValueType<TypeParam>;
@@ -56,7 +56,7 @@ CACHE_ALIGN #define
    defined(CPU_CAPABILITY_AVX512) && (defined(__GNUC__) || defined(__GNUG__))
#undef CHECK_DEQUANT_WITH_LOW_PRECISION
#define CHECK_WITH_FMA 1
#elif defined(CPU_CAPABILITY_SVE)
#elif defined(CPU_CAPABILITY_SVE256)
#define CHECK_DEQUANT_WITH_LOW_PRECISION 1
#define CHECK_WITH_FMA 1
#elif !defined(CPU_CAPABILITY_VSX) && !defined(CPU_CAPABILITY_AVX2)
@@ -136,7 +136,7 @@ template<typename T>
struct VecTypeHelper {
  using holdType = typename T::value_type;
  using memStorageType = typename T::value_type;
  static constexpr int holdCount = T::size();
  static inline int holdCount = T::size();
  static constexpr int unitStorageCount = 1;
};

@@ -399,9 +399,9 @@ T clamp_min(const T& a, const T& min) {
  return a < min ? min : a;
}

template <class VT, size_t N>
void copy_interleave(VT(&vals)[N], VT(&interleaved)[N]) {
  static_assert(N % 2 == 0, "should be even");
template <class VT>
void copy_interleave(VT * vals, VT * interleaved, size_t N) {
  assert(N % 2 == 0);
  auto ptr1 = vals;
  auto ptr2 = vals + N / 2;
  for (size_t i = 0; i < N; i += 2) {
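Note: copy_interleave changes from array references with a compile-time length to pointers plus a runtime length. A standalone sketch of the behavior suggested by the two half-pointers (pairing one element from the first half with one from the second half); the loop body is not shown in the hunk, so this is an assumption for illustration:

#include <cassert>
#include <cstddef>
#include <vector>

template <class VT>
void copy_interleave_ref(const VT* vals, VT* interleaved, size_t N) {
  assert(N % 2 == 0);
  const VT* ptr1 = vals;          // first half
  const VT* ptr2 = vals + N / 2;  // second half
  for (size_t i = 0; i < N; i += 2) {
    interleaved[i] = *ptr1++;
    interleaved[i + 1] = *ptr2++;
  }
}

int main() {
  std::vector<int> vals{0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<int> out(vals.size());
  copy_interleave_ref(vals.data(), out.data(), vals.size());
  // out is {0, 4, 1, 5, 2, 6, 3, 7}
  assert(out[1] == 4 && out[7] == 7);
}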
@@ -871,10 +871,10 @@ public:
    using UVT = UvalueType<T>;
    using BVT = BitType<UVT>;
    UVT absErr = correctEpsilon(toleranceEps);
    constexpr int sizeX = VecTypeHelper<T>::holdCount * VecTypeHelper<T>::unitStorageCount;
    const int sizeX = VecTypeHelper<T>::holdCount * VecTypeHelper<T>::unitStorageCount;
    constexpr int unitStorageCount = VecTypeHelper<T>::unitStorageCount;
    CACHE_ALIGN UVT expArr[sizeX];
    CACHE_ALIGN UVT actArr[sizeX];
    UVT expArr[sizeX];
    UVT actArr[sizeX];
    exp.store(expArr);
    act.store(actArr);
    if (bitwise)
@@ -942,7 +942,7 @@ void test_unary(
  using vec_type = T;
  using VT = ValueType<T>;
  using UVT = UvalueType<T>;
  constexpr int el_count = vec_type::size();
  const int el_count = vec_type::size();
  CACHE_ALIGN VT vals[el_count];
  CACHE_ALIGN VT expected[el_count];
  bool bitwise = testCase.isBitwise();
@@ -1000,7 +1000,7 @@ void test_binary(
  using vec_type = T;
  using VT = ValueType<T>;
  using UVT = UvalueType<T>;
  constexpr int el_count = vec_type::size();
  const int el_count = vec_type::size();
  CACHE_ALIGN VT vals0[el_count];
  CACHE_ALIGN VT vals1[el_count];
  CACHE_ALIGN VT expected[el_count];
@@ -1163,7 +1163,7 @@ void test_ternary(
  using vec_type = T;
  using VT = ValueType<T>;
  using UVT = UvalueType<T>;
  constexpr int el_count = vec_type::size();
  const int el_count = vec_type::size();
  CACHE_ALIGN VT vals0[el_count];
  CACHE_ALIGN VT vals1[el_count];
  CACHE_ALIGN VT vals2[el_count];
@@ -1203,12 +1203,15 @@ void test_ternary(
    auto input1 = vec_type::loadu(vals1);
    auto input2 = vec_type::loadu(vals2);
    auto actual = actualFunction(input0, input1, input2);
    CACHE_ALIGN VT actual_[vec_type::size()];
    actual.store(actual_);
    auto vec_expected = vec_type::loadu(expected);

    AssertVectorized<vec_type> vecAssert(
        testNameInfo, seed, vec_expected, actual, input0, input1, input2);
    if (vecAssert.check(
            bitwise, dmn.CheckWithTolerance, dmn.ToleranceError))
      return;
      return;
    } // trial
    changeSeedBy += 1;
  }
@@ -1573,19 +1576,19 @@ double getDefaultTolerance() {

template<typename T, int N = 1>
at::vec::VecMask<T, N> create_vec_mask(uint64_t bitmask) {
  constexpr auto size = at::vec::Vectorized<T>::size();
  std::array<int, N * size> mask;
  const auto size = at::vec::Vectorized<T>::size();
  int mask[N * size];
  for (int n = 0; n < N; n++) {
    for (int i = 0; i < size; i++) {
      mask[n * size + i] = (bitmask >> i) & 1;
    }
  }
  return at::vec::VecMask<T, N>::from(mask.data());
  return at::vec::VecMask<T, N>::from(mask);
}

template<typename T, int N = 1>
at::vec::VecMask<T, N> generate_vec_mask(int seed) {
  constexpr auto size = at::vec::Vectorized<T>::size();
  const auto size = at::vec::Vectorized<T>::size();
  ValueGen<uint64_t> generator(0, (1ULL << size) - 1, seed);
  auto bitmask = generator.get();
  return create_vec_mask<T, N>(bitmask);