Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-30 19:54:53 +08:00)

Compare commits: gh/davidbe... and delete-qua... (210 commits)
| SHA1 | Author | Date | |
|---|---|---|---|
| 239ea930a3 | |||
| edf7bb4f51 | |||
| bbb930aba2 | |||
| 60b41de0ca | |||
| e38a335d7f | |||
| 9d8cf24b3b | |||
| be56a8d7ac | |||
| 3effe0c293 | |||
| 2fde2090d0 | |||
| 5d8d126249 | |||
| 378c121d5e | |||
| 7e83d50845 | |||
| 6f05d58f2b | |||
| a7eb153bba | |||
| ed6df0e324 | |||
| 5c79a55e7e | |||
| 3d06ff82a8 | |||
| 2efa5eaa65 | |||
| c2510fcd86 | |||
| 510c398a4f | |||
| 63a96eaeb8 | |||
| 2b8d3b1b2b | |||
| bf1ebe0531 | |||
| 433a247102 | |||
| 8a47f9d03b | |||
| 9e5f4a844c | |||
| 7c1f627828 | |||
| b3b4d28f4c | |||
| ae1094b72b | |||
| eda0a9cc90 | |||
| d74ccf4ffe | |||
| 689fba032d | |||
| c1d69d5dd5 | |||
| e49acfc5c5 | |||
| 034e996d37 | |||
| 16c3b4143b | |||
| 54a4d34d10 | |||
| 78684e27ac | |||
| 40e39ae21f | |||
| e466dab164 | |||
| d27d36136c | |||
| 815545f2dd | |||
| d26ca5de05 | |||
| 2022588295 | |||
| 02715d0876 | |||
| 17687eb792 | |||
| 7cda4017dd | |||
| 3e56a9cdfb | |||
| ee9ac36c23 | |||
| 9be5860bc3 | |||
| 548c9d8281 | |||
| 71a650ad56 | |||
| 2471cc3355 | |||
| db00e1699a | |||
| 5ea832e5f6 | |||
| a952956d05 | |||
| 63e87d6d05 | |||
| f7127b9b94 | |||
| 44f5b93122 | |||
| e0fd48be7d | |||
| 43f7216327 | |||
| 8a8fac1131 | |||
| bcc98bb2a4 | |||
| 524e827095 | |||
| 9968edd002 | |||
| 7275f28045 | |||
| 7be862ab8f | |||
| 336f1e2d35 | |||
| a46ea8a364 | |||
| f41d017aa6 | |||
| 52e4e41cbc | |||
| 64f2ec77f8 | |||
| fdc5b42a8f | |||
| d58ed04d89 | |||
| 386bc9e2e9 | |||
| f2e712ca14 | |||
| 99c1a6bdd9 | |||
| 4ed1b03f72 | |||
| 8f9a191db6 | |||
| ef97bd4713 | |||
| f0b388665e | |||
| c9a5bf09ba | |||
| dfcda613b6 | |||
| 0e7f02fe2e | |||
| 308b88bde9 | |||
| c51da57b55 | |||
| f9544f1f0c | |||
| 11c71053e0 | |||
| 22abe6ded4 | |||
| 2b82c61f04 | |||
| f651e28f80 | |||
| e7167dbacf | |||
| 6c42afe196 | |||
| f7130c097e | |||
| ad86c05b78 | |||
| b359571c60 | |||
| a6fab82b16 | |||
| dd3e7170c2 | |||
| 7081b8233a | |||
| 7b392bac13 | |||
| 19ae5afdaa | |||
| 3fd84a8592 | |||
| d56f11a1f2 | |||
| 794b95d54b | |||
| e3fe001d9e | |||
| 660dbea909 | |||
| 5cfe4377d6 | |||
| 898179331e | |||
| 2e64e45b0b | |||
| e472daa809 | |||
| ec816d73b4 | |||
| f17f658125 | |||
| c9174a20f7 | |||
| b6276a425f | |||
| a0e0abd037 | |||
| b221be9140 | |||
| 8408522976 | |||
| c329a8f19c | |||
| 5dfd8a9c7a | |||
| 8c2e450082 | |||
| 2bb33e7a08 | |||
| 4ce6e6ec88 | |||
| 382598ef87 | |||
| dc524efb4d | |||
| 5d5a5b3501 | |||
| 404008e3ef | |||
| b642a5c118 | |||
| 493f42a541 | |||
| 662c1cfed2 | |||
| 5cc4e856fd | |||
| 7597988f1b | |||
| 9620994067 | |||
| e124a0d88c | |||
| 7cfd054075 | |||
| d40aaa42ee | |||
| 11c07c848c | |||
| db259bd6b8 | |||
| d5cdc36943 | |||
| 541584d22e | |||
| c0e155a8d2 | |||
| 48560eef80 | |||
| fd4f704905 | |||
| 60e66d11ab | |||
| 4b4c2a7b1d | |||
| af9c92b4cb | |||
| c09cf29d7d | |||
| 6f60cfe9b1 | |||
| e20784f228 | |||
| 172853547a | |||
| e0ab1b538a | |||
| 3f569f9af7 | |||
| 94716db222 | |||
| 06f39a71b6 | |||
| 36dd598bda | |||
| 32983ea698 | |||
| 5e636d664a | |||
| eaf32fffb7 | |||
| 0e9d8032a3 | |||
| 0105cd89ab | |||
| d5d14ee823 | |||
| 156bc243f0 | |||
| bd6b5fddbf | |||
| 54701a0c94 | |||
| 0edc1b91f7 | |||
| 9f5276dc07 | |||
| 9d175bc7e6 | |||
| b096341963 | |||
| 82eefaedd9 | |||
| c553c55be7 | |||
| d5a89178b0 | |||
| bdb7819166 | |||
| 34c8033fd3 | |||
| ab2294d828 | |||
| 3173616532 | |||
| 8c0df6fe17 | |||
| 0364db7cd1 | |||
| f8c0a4bd28 | |||
| 4c8eb65efb | |||
| 5a2db5152d | |||
| 0a63053fe9 | |||
| bb476310a4 | |||
| fa1c20ae92 | |||
| 77676753ec | |||
| 617e3f69f8 | |||
| ab6cb34480 | |||
| c6a27bae36 | |||
| 563fd95563 | |||
| 6ef70edd9a | |||
| 3df6360e8c | |||
| d0a9629435 | |||
| 22edb457c9 | |||
| e5f6ffd810 | |||
| 019e30e3b8 | |||
| 4500a4aa50 | |||
| 6bc263809d | |||
| ffac0de07e | |||
| 01b0f09931 | |||
| 6401d1d53d | |||
| 3a5677a380 | |||
| 02608e560a | |||
| e1aee86646 | |||
| 1c8844d9e7 | |||
| 720c2c46b1 | |||
| 3bc6bdc866 | |||
| 47f10d0ad0 | |||
| 0f9c1b374f | |||
| b146e1a264 | |||
| c78fce9e79 | |||
| 023887fc5a | |||
| 1586521461 |
@@ -383,7 +383,7 @@ if [[ -n "${CI:-}" ]]; then
fi

# Build image
tar ch . | docker build \
docker build \
${no_cache_flag} \
${progress_flag} \
--build-arg "BUILD_ENVIRONMENT=${image}" \
@@ -422,7 +422,7 @@ tar ch . | docker build \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \
-
.

# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could

@@ -1 +1 @@
c8757738a7418249896224430ce84888e8ecdd79
ae848267bebc65c6181e8cc5e64a6357d2679260
@@ -10,6 +10,8 @@ else
arch_path='sbsa'
fi

NVSHMEM_VERSION=3.3.9

function install_cuda {
version=$1
runfile=$2

@@ -40,13 +42,52 @@ function install_cudnn {
rm -rf tmp_cudnn
}

function install_nvshmem {
cuda_major_version=$1 # e.g. "12"
nvshmem_version=$2 # e.g. "3.3.9"

case "${arch_path}" in
sbsa)
dl_arch="aarch64"
;;
x86_64)
dl_arch="x64"
;;
*)
dl_arch="${arch}"
;;
esac

tmpdir="tmp_nvshmem"
mkdir -p "${tmpdir}" && cd "${tmpdir}"

# nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"

# download, unpack, install
wget -q "${url}"
tar xf "${filename}.tar.gz"
cp -a "libnvshmem/include/"* /usr/local/include/
cp -a "libnvshmem/lib/"* /usr/local/lib/

# cleanup
cd ..
rm -rf "${tmpdir}"

echo "nvSHMEM ${nvshmem_version} for CUDA ${cuda_major_version} (${arch_path}) installed."
}

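As a quick illustration of the download scheme used by the new `install_nvshmem` helper above, the sketch below rebuilds the same filename and URL in Python. The arch mapping and URL template are copied from the diff; the example inputs (CUDA 12, nvSHMEM 3.3.9, x86_64) are just sample values, and the fallback branch simply passes the architecture through instead of reading the script's separate `$arch` variable.

```python
def nvshmem_download_url(cuda_major_version: str, nvshmem_version: str, arch_path: str) -> str:
    # sbsa -> aarch64, x86_64 -> x64; anything else is passed through here
    # (the shell script falls back to its own $arch variable in that case)
    dl_arch = {"sbsa": "aarch64", "x86_64": "x64"}.get(arch_path, arch_path)
    filename = f"libnvshmem_cuda{cuda_major_version}-linux-{arch_path}-{nvshmem_version}"
    return (
        "https://developer.download.nvidia.com/compute/redist/nvshmem/"
        f"{nvshmem_version}/builds/cuda{cuda_major_version}/txz/agnostic/{dl_arch}/"
        f"{filename}.tar.gz"
    )

# Sample values only: the CUDA 12 / 3.3.9 build requested by install_126/128/129 below.
print(nvshmem_download_url("12", "3.3.9", "x86_64"))
```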
function install_126 {
CUDNN_VERSION=9.10.2.21
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux

install_cudnn 12 $CUDNN_VERSION

install_nvshmem 12 $NVSHMEM_VERSION

CUDA_VERSION=12.6 bash install_nccl.sh

CUDA_VERSION=12.6 bash install_cusparselt.sh

@@ -56,13 +97,15 @@ function install_126 {

function install_129 {
CUDNN_VERSION=9.10.2.21
echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 12.9.1 in the same container
install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux

# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
install_cudnn 12 $CUDNN_VERSION

install_nvshmem 12 $NVSHMEM_VERSION

CUDA_VERSION=12.9 bash install_nccl.sh

CUDA_VERSION=12.9 bash install_cusparselt.sh

@@ -106,13 +149,15 @@ function prune_126 {

function install_128 {
CUDNN_VERSION=9.8.0.87
echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 12.8.1 in the same container
install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux

# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
install_cudnn 12 $CUDNN_VERSION

install_nvshmem 12 $NVSHMEM_VERSION

CUDA_VERSION=12.8 bash install_nccl.sh

CUDA_VERSION=12.8 bash install_cusparselt.sh

@@ -383,6 +383,6 @@ cmake==4.0.0
tlparse==0.3.30
#Description: required for log parsing

cuda-bindings>=12.0,<13.0
cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"
#Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits.
#test that import: test_cuda.py

@@ -1 +0,0 @@
../../docs/requirements.txt

.ci/docker/requirements-docs.txt (new file, 61 lines)
@@ -0,0 +1,61 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2

# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought is probably
# something related to Docker setup. We can investigate this later

sphinxcontrib.katex==0.8.6
#Description: This is used to generate PyTorch docs
#Pinned versions: 0.8.6

sphinxext-opengraph==0.9.1
#Description: This is used to generate PyTorch docs
#Pinned versions: 0.9.1

sphinx_sitemap==2.6.0
#Description: This is used to generate sitemap for PyTorch docs
#Pinned versions: 2.6.0

matplotlib==3.5.3 ; python_version < "3.13"
matplotlib==3.6.3 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 3.6.3 if python > 3.12. Otherwise 3.5.3.

tensorboard==2.13.0 ; python_version < "3.13"
tensorboard==2.18.0 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 2.13.0

breathe==4.34.0
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 4.34.0

exhale==0.2.3
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.2.3

docutils==0.16
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.16

bs4==0.0.1
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.0.1

IPython==8.12.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 8.12.0

myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 0.13.2

# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
sphinx-copybutton==0.5.0
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==0.18.1
@@ -1 +1 @@
3.3.1
3.4.0

@@ -51,20 +51,22 @@ else
fi

cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")

TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
case ${CUDA_VERSION} in
12.8|12.9)
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" #removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
#removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
12.8)
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0"
;;
12.9)
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
# WAR to resolve the ld error in libtorch build with CUDA 12.9
if [[ "$DESIRED_CUDA" == "cu129" && "$PACKAGE_TYPE" == "libtorch" ]]; then
if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
fi
;;
12.6)
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
;;
*)
echo "unknown cuda version $CUDA_VERSION"

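For clarity, here is a small Python sketch of the arch lists the reworked case statement above selects per CUDA version, including the libtorch workaround for the CUDA 12.9 linker error. The lists are copied from the diff; the function name and the wheel/libtorch parameter are illustrative only.

```python
def torch_cuda_arch_list(cuda_version: str, package_type: str = "wheel") -> str:
    # Lists copied from the updated case statement; sm_50-sm_70 are dropped for 12.8/12.9.
    if cuda_version == "12.6":
        return "5.0;6.0;7.0;7.5;8.0;8.6;9.0"
    if cuda_version == "12.8":
        return "7.5;8.0;8.6;9.0;10.0;12.0"
    if cuda_version == "12.9":
        if package_type == "libtorch":
            # workaround for the ld error in libtorch builds (see comment in the diff)
            return "7.5;8.0;9.0;10.0;12.0+PTX"
        return "7.5;8.0;8.6;9.0;10.0;12.0+PTX"
    raise ValueError(f"unknown cuda version {cuda_version}")

print(torch_cuda_arch_list("12.9", "libtorch"))
```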
@@ -393,10 +393,8 @@ else
# This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
# is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
# 16 CPUs
if [ -z "$MAX_JOBS_OVERRIDE" ]; then
MAX_JOBS=$(nproc --ignore=4)
export MAX_JOBS
fi
MAX_JOBS=$(nproc --ignore=4)
export MAX_JOBS

# NB: Install outside of source directory (at the same level as the root
# pytorch folder) so that it doesn't get cleaned away prior to docker push.

@@ -13,6 +13,13 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then
fi

if which sccache > /dev/null; then
# Clear SCCACHE_BUCKET and SCCACHE_REGION if they are empty, otherwise
# sccache will complain about invalid bucket configuration
if [[ -z "${SCCACHE_BUCKET:-}" ]]; then
unset SCCACHE_BUCKET
unset SCCACHE_REGION
fi

# Save sccache logs to file
sccache --stop-server > /dev/null 2>&1 || true
rm -f ~/sccache_error.log || true

@@ -11,6 +11,8 @@ export TERM=vt100

# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# shellcheck source=./common-build.sh
source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"

# Do not change workspace permissions for ROCm and s390x CI jobs
# as it can leave workspace with bad permissions for cancelled jobs

@@ -163,8 +165,6 @@ elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
# setting PYTHON_TEST_EXTRA_OPTION
export PYTHON_TEST_EXTRA_OPTION="--xpu"
# Disable sccache for xpu test due to flaky issue https://github.com/pytorch/pytorch/issues/143585
sudo rm -rf /opt/cache
fi

if [[ "$TEST_CONFIG" == *crossref* ]]; then

@@ -333,9 +333,9 @@ test_h100_distributed() {
test_h100_symm_mem() {
# symmetric memory test
time python test/run_test.py --include distributed/test_symmetric_memory.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time TORCH_SYMMMEM=NVSHMEM python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time TORCH_SYMMMEM=NVSHMEM python test/run_test.py --include distributed/test_nvshmem_triton.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time TORCH_SYMMMEM=NCCL python test/run_test.py --include distributed/test_nccl.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time python test/run_test.py --include distributed/test_nvshmem_triton.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time python test/run_test.py --include distributed/test_nccl.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}

@@ -368,6 +368,16 @@ test_dynamo_wrapped_shard() {
assert_git_not_dirty
}

test_einops() {
pip install einops==0.6.1
time python test/run_test.py --einops --verbose --upload-artifacts-while-running
pip install einops==0.7.0
time python test/run_test.py --einops --verbose --upload-artifacts-while-running
pip install einops==0.8.1
time python test/run_test.py --einops --verbose --upload-artifacts-while-running
assert_git_not_dirty
}


test_inductor_distributed() {
# Smuggle a few multi-gpu tests here so that we don't have to request another large node

@@ -426,14 +436,21 @@ test_inductor_aoti() {
python3 tools/amd_build/build_amd.py
fi
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
BUILD_AOT_INDUCTOR_TEST=1 TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop)
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
LD_LIBRARY_PATH=/opt/conda/envs/py_3.10/lib/:${TORCH_LIB_DIR}:$LD_LIBRARY_PATH
CPP_TESTS_DIR="${BUILD_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
else
BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
BUILD_COMMAND=(python setup.py develop)
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
fi

# aoti cmake custom command requires `torch` to be installed
# initialize the cmake build cache and install torch
/usr/bin/env "${BUILD_COMMAND[@]}"
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"

/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
}

test_inductor_cpp_wrapper_shard() {

@@ -446,47 +463,26 @@ test_inductor_cpp_wrapper_shard() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"

if [[ "$1" -eq "2" ]]; then
# For now, manually put the opinfo tests in shard 2, and all other tests in
# shard 1. Run all CPU tests, as well as specific GPU tests triggering past
# bugs, for now.
python test/run_test.py \
--include inductor/test_torchinductor_opinfo \
-k 'linalg or to_sparse or TestInductorOpInfoCPU' \
--verbose
exit
fi

# Run certain inductor unit tests with cpp wrapper. In the end state, we
# should be able to run all the inductor unit tests with cpp_wrapper.
#
# TODO: I'm pretty sure that "TestInductorOpInfoCPU" is not a valid filter,
# but change that in another PR to more accurately monitor the increased CI
# usage.
python test/run_test.py \
--include inductor/test_torchinductor_opinfo \
-k 'linalg or to_sparse or TestInductorOpInfoCPU' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
python test/run_test.py \
--include inductor/test_torchinductor inductor/test_max_autotune inductor/test_cpu_repro \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
python test/run_test.py --inductor \
--include test_torch \
-k 'take' \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
python test/run_test.py --inductor --include test_torch -k 'take' --verbose

# Run inductor benchmark tests with cpp wrapper.
# Skip benchmark tests if it's in rerun-disabled-mode.
if [[ "${PYTORCH_TEST_RERUN_DISABLED_TESTS}" == "1" ]]; then
echo "skip dynamo benchmark tests for rerun-disabled-test"
else
echo "run dynamo benchmark tests with cpp wrapper"
python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
--training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
--output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_timm_training.csv"

python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_torchbench_inference.csv"
fi
}

# "Global" flags for inductor benchmarking controlled by TEST_CONFIG

@@ -1698,11 +1694,11 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchaudio cuda
install_torchvision
checkout_install_torchbench hf_T5 llama moco
PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
test_inductor_aoti
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
install_torchvision
test_inductor_shard "${SHARD_NUMBER}"

@@ -1711,6 +1707,8 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
test_inductor_distributed
fi
fi
elif [[ "${TEST_CONFIG}" == *einops* ]]; then
test_einops
elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
install_torchvision
test_dynamo_wrapped_shard "${SHARD_NUMBER}"

@@ -1760,7 +1758,7 @@ elif [[ "${TEST_CONFIG}" == smoke ]]; then
test_python_smoke
elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
test_h100_distributed
elif [[ "${TEST_CONFIG}" == test_h100_symm_mem ]]; then
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
test_h100_symm_mem
else
install_torchvision

@@ -75,8 +75,8 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"

# CUDA 12.8 builds have triton for Linux and Linux aarch64 binaries.
if [[ "$DESIRED_CUDA" == cu128 ]]; then
# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries.
if [[ "$DESIRED_CUDA" == "cu129" ]]; then
TRITON_CONSTRAINT="platform_system == 'Linux'"
fi

@@ -125,7 +125,7 @@ runs:
TAG: ${{ steps.parse-ref.outputs.tag }}
EVENT_NAME: ${{ github.event_name }}
SCHEDULE: ${{ github.event.schedule }}
HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
HEAD_BRANCH: ${{ steps.parse-ref.outputs.branch }}
id: filter
run: |
echo "Workflow: ${GITHUB_WORKFLOW}"

@@ -304,8 +304,7 @@ def unzip_artifact_and_replace_files() -> None:


def set_output() -> None:
# Disable for now so we can monitor first
# pass
print("Setting output reuse=true")
if os.getenv("GITHUB_OUTPUT"):
with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
print("reuse=true", file=env)

.github/ci_commit_pins/audio.txt (vendored, 2 changed lines)
@@ -1 +1 @@
4e94321c54617dd738a05bfedfc28bc0fa635b5c
70caf76066ef2c1054d6128b11769dc816a779e7

.github/ci_commit_pins/xla.txt (vendored, 2 changed lines)
@@ -1 +1 @@
926700d7832caa552ba2e1fc8302f6a2f4d2f6d8
1c00dea2c9adb2137903c86b4191e8c247f8fda9

.github/scripts/delete_old_branches.py (vendored, 30 changed lines)
@@ -275,7 +275,7 @@ def delete_branches() -> None:
delete_branch(git_repo, branch)


def delete_old_ciflow_tags() -> None:
def delete_old_tags() -> None:
# Deletes ciflow tags if they are associated with a closed PR or a specific
# commit. Lightweight tags don't have information about the date they were
# created, so we can't check how old they are. The script just assumes that

@@ -288,23 +288,29 @@ def delete_old_ciflow_tags() -> None:
delete_branch(git_repo, f"refs/tags/{tag}")

tags = git_repo._run_git("tag").splitlines()
open_pr_numbers = [x["number"] for x in get_open_prs()]

CIFLOW_TAG_REGEX = re.compile(r"^ciflow\/.*\/(\d{5,6}|[0-9a-f]{40})$")
AUTO_REVERT_TAG_REGEX = re.compile(r"^trunk\/[0-9a-f]{40}$")
for tag in tags:
try:
if ESTIMATED_TOKENS[0] > 400:
print("Estimated tokens exceeded, exiting")
break
if not tag.startswith("ciflow/"):

if not CIFLOW_TAG_REGEX.match(tag) and not AUTO_REVERT_TAG_REGEX.match(tag):
continue
re_match_pr = re.match(r"^ciflow\/.*\/(\d{5,6})$", tag)
re_match_sha = re.match(r"^ciflow\/.*\/([0-9a-f]{40})$", tag)
if re_match_pr:
pr_number = int(re_match_pr.group(1))
if pr_number in open_pr_numbers:
continue
delete_tag(tag)
elif re_match_sha:

# This checks the date of the commit associated with the tag instead
# of the tag itself since lightweight tags don't have this
# information. I think it should be ok since this only runs once a
# day
tag_info = git_repo._run_git("show", "-s", "--format=%ct", tag)
tag_timestamp = int(tag_info.strip())
# Maybe some timezone issues, but a few hours shouldn't matter
tag_age_days = (datetime.now().timestamp() - tag_timestamp) / SEC_IN_DAY

if tag_age_days > 7:
print(f"[{tag}] Tag is older than 7 days, deleting")
delete_tag(tag)
except Exception as e:
print(f"Failed to check tag {tag}: {e}")

@@ -312,4 +318,4 @@ def delete_old_ciflow_tags() -> None:

if __name__ == "__main__":
delete_branches()
delete_old_ciflow_tags()
delete_old_tags()

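To make the new tag filter concrete, here is a small self-contained sketch using the two regexes from the hunk above; the example tag names are illustrative (the non-matching one is borrowed from the new test file further down).

```python
import re

# Patterns copied from the updated delete_old_tags()
CIFLOW_TAG_REGEX = re.compile(r"^ciflow\/.*\/(\d{5,6}|[0-9a-f]{40})$")
AUTO_REVERT_TAG_REGEX = re.compile(r"^trunk\/[0-9a-f]{40}$")

examples = [
    "ciflow/trunk/156789",                                       # PR-number ciflow tag
    "ciflow/inductor/1234567890abcdef1234567890abcdef12345678",  # commit-sha ciflow tag
    "trunk/1234567890abcdef1234567890abcdef12345678",            # auto-revert tag
    "ciflow/doesntseemtomatch",                                  # neither, so it is skipped
]
for tag in examples:
    eligible = bool(CIFLOW_TAG_REGEX.match(tag) or AUTO_REVERT_TAG_REGEX.match(tag))
    print(f"{tag}: {'considered for deletion' if eligible else 'skipped'}")
```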
.github/scripts/filter_test_configs.py (vendored, 16 changed lines)
@@ -18,6 +18,7 @@ import yaml


REENABLE_TEST_REGEX = "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) (#|https://github.com/pytorch/pytorch/issues/)([0-9]+)"
MAIN_BRANCH = "main"

PREFIX = "test-config/"

@@ -97,7 +98,7 @@ def parse_args() -> Any:
parser.add_argument(
"--branch",
type=str,
default="main",
default=MAIN_BRANCH,
help="the branch name",
)
return parser.parse_args()

@@ -456,6 +457,7 @@ def download_json(url: str, headers: dict[str, str], num_retries: int = 3) -> An


def set_output(name: str, val: Any) -> None:
print(f"Setting output {name}={val}")
if os.getenv("GITHUB_OUTPUT"):
with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
print(f"{name}={val}", file=env)

@@ -495,13 +497,20 @@ def check_for_setting(labels: set[str], body: str, setting: str) -> bool:


def perform_misc_tasks(
labels: set[str], test_matrix: dict[str, list[Any]], job_name: str, pr_body: str
labels: set[str],
test_matrix: dict[str, list[Any]],
job_name: str,
pr_body: str,
branch: Optional[str] = None,
) -> None:
"""
In addition to apply the filter logic, the script also does the following
misc tasks to set keep-going and is-unstable variables
"""
set_output("keep-going", check_for_setting(labels, pr_body, "keep-going"))
set_output(
"keep-going",
branch == MAIN_BRANCH or check_for_setting(labels, pr_body, "keep-going"),
)
set_output(
"ci-verbose-test-logs",
check_for_setting(labels, pr_body, "ci-verbose-test-logs"),

@@ -624,6 +633,7 @@ def main() -> None:
test_matrix=filtered_test_matrix,
job_name=args.job_name,
pr_body=pr_body if pr_body else "",
branch=args.branch,
)

# Set the filtered test matrix as the output

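Roughly, the effect of threading `--branch` into `perform_misc_tasks()` is that runs on the main branch always set keep-going, while other branches still opt in via the label or PR-body tag. A minimal sketch, with `check_for_setting()` stubbed out (the real helper in filter_test_configs.py inspects PR labels and body text):

```python
from typing import Optional

MAIN_BRANCH = "main"

def check_for_setting(labels: set, body: str, setting: str) -> bool:
    # Simplified stand-in for the real helper in filter_test_configs.py.
    return setting in labels or f"[{setting}]" in body

def keep_going(branch: Optional[str], labels: set, pr_body: str) -> bool:
    return branch == MAIN_BRANCH or check_for_setting(labels, pr_body, "keep-going")

print(keep_going("main", set(), ""))                   # True: main always keeps going
print(keep_going("gh/user/feature", {"keep-going"}, ""))  # True: opted in via label
print(keep_going("gh/user/feature", set(), ""))        # False
```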
@@ -17,7 +17,7 @@ from typing import Optional

# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
CUDA_ARCHES = ["12.6", "12.8", "12.9"]
CUDA_STABLE = "12.6"
CUDA_STABLE = "12.8"
CUDA_ARCHES_FULL_VERSION = {
"12.6": "12.6.3",
"12.8": "12.8.1",

@@ -54,7 +54,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"

@@ -71,7 +71,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"

@@ -88,6 +88,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"

.github/scripts/get_workflow_job_id.py (vendored, 2 changed lines)
@@ -136,10 +136,10 @@ def find_job_id_name(args: Any) -> tuple[str, str]:


def set_output(name: str, val: Any) -> None:
print(f"Setting output {name}={val}")
if os.getenv("GITHUB_OUTPUT"):
with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
print(f"{name}={val}", file=env)
print(f"setting {name}={val}")
else:
print(f"::set-output name={name}::{val}")


.github/scripts/parse_ref.py (vendored, 1 changed line)
@@ -5,6 +5,7 @@ import re


def set_output(name: str, val: str) -> None:
print(f"Setting output {name}={val}")
if os.getenv("GITHUB_OUTPUT"):
with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
print(f"{name}={val}", file=env)

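Both of these scripts (and filter_test_configs.py above) share the same output-setting pattern: append a `name=value` line to the file named by `GITHUB_OUTPUT`, falling back to the legacy `::set-output` command when that variable is unset. A compact standalone version, assembled from the lines in these hunks:

```python
import os
from typing import Any

def set_output(name: str, val: Any) -> None:
    print(f"Setting output {name}={val}")
    if os.getenv("GITHUB_OUTPUT"):
        # GitHub Actions reads step outputs from this file, one name=value per line.
        with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
            print(f"{name}={val}", file=env)
    else:
        # Legacy fallback for runners without GITHUB_OUTPUT.
        print(f"::set-output name={name}::{val}")

set_output("branch", "main")
```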
.github/scripts/test_delete_old_branches.py (vendored, new file, 56 lines)
@@ -0,0 +1,56 @@
import os
import unittest
from datetime import datetime
from unittest.mock import MagicMock, patch


os.environ["GITHUB_TOKEN"] = "test_token"

from delete_old_branches import delete_old_tags


@patch("delete_old_branches.delete_branch")
@patch("gitutils.GitRepo._run_git")
class TestDeleteTag(unittest.TestCase):
def test_delete_tag(
self, mock_run_git: "MagicMock", mock_delete_tag: "MagicMock"
) -> None:
for tag in [
"ciflow/branch/12345",
"ciflow/commitsha/1234567890abcdef1234567890abcdef12345678",
"trunk/1234567890abcdef1234567890abcdef12345678",
]:
mock_run_git.side_effect = [
tag,
str(int(datetime.now().timestamp() - 8 * 24 * 60 * 60)), # 8 days ago
]
delete_old_tags()
mock_delete_tag.assert_called_once()
mock_delete_tag.reset_mock()

# Don't delete if the tag is not old enough
mock_run_git.side_effect = [
tag,
str(int(datetime.now().timestamp() - 6 * 24 * 60 * 60)), # 6 days ago
]
delete_old_tags()
mock_delete_tag.assert_not_called()

def test_do_not_delete_tag(
self, mock_run_git: "MagicMock", mock_delete_tag: "MagicMock"
) -> None:
for tag in [
"ciflow/doesntseemtomatch",
"trunk/doesntseemtomatch",
"doesntseemtomatch",
]:
mock_run_git.side_effect = [
tag,
str(int(datetime.now().timestamp() - 8 * 24 * 60 * 60)), # 8 days ago
]
delete_old_tags()
mock_delete_tag.assert_not_called()


if __name__ == "__main__":
unittest.main()
.github/workflows/_linux-build.yml (vendored, 15 changed lines)
@@ -69,11 +69,6 @@ on:
required: false
type: string
default: ""
max-jobs:
description: |
Overwrite the number of jobs to use for the build
required: false
type: string
disable-monitor:
description: |
Disable utilization monitoring for build job

@@ -266,7 +261,6 @@ jobs:
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
MAX_JOBS_OVERRIDE: ${{ inputs.max-jobs }}
run: |
START_TIME=$(date +%s)
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then

@@ -286,12 +280,6 @@ jobs:
DOCKER_SHELL_CMD=
fi

if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then
MAX_JOBS="$(nproc --ignore=2)"
else
MAX_JOBS="${MAX_JOBS_OVERRIDE}"
fi

# Leaving 1GB for the runner and other things
TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
# https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap

@@ -303,8 +291,7 @@ jobs:
# shellcheck disable=SC2086
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS=${MAX_JOBS} \
-e MAX_JOBS_OVERRIDE \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e PR_NUMBER \
-e SHA1 \

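The net effect of this hunk is that the build container no longer honors a max-jobs override: parallelism is always the CPU count minus two, and the container memory limit still leaves 1 GB of MemTotal for the runner. A rough Python equivalent of those two shell expressions (a standard Linux /proc/meminfo is assumed):

```python
import os

def max_jobs() -> int:
    # Equivalent of: nproc --ignore=2 (nproc never reports fewer than 1)
    return max(1, (os.cpu_count() or 1) - 2)

def total_available_memory_gb(meminfo_path: str = "/proc/meminfo") -> float:
    # Equivalent of: awk '/MemTotal/ { printf "%.3f", $2/1024/1024 - 1 }' /proc/meminfo
    with open(meminfo_path) as f:
        for line in f:
            if line.startswith("MemTotal"):
                kib = int(line.split()[1])  # value is reported in KiB
                return kib / 1024 / 1024 - 1
    raise RuntimeError("MemTotal not found in /proc/meminfo")

print(max_jobs(), round(total_available_memory_gb(), 3))
```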
.github/workflows/_linux-test.yml (vendored, 53 changed lines)
@@ -90,10 +90,13 @@ jobs:
environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }}
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
permissions:
id-token: write
contents: read
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
if: ${{ !contains(matrix.runner, 'gcp.a100') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
if: ${{ matrix.runner != 'B200' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |

@@ -105,18 +108,31 @@ jobs:
with:
no-sudo: true

- name: Setup Python
if: matrix.runner == 'B200'
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.12'
cache: pip

- name: Setup Linux
uses: ./.github/actions/setup-linux
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
if: inputs.build-environment != 'linux-s390x-binary-manywheel' && matrix.runner != 'B200'

- name: configure aws credentials
if : ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: ${{ inputs.aws-role-to-assume }}
role-session-name: gha-linux-test
aws-region: us-east-1

- name: Login to Amazon ECR
if: ${{ inputs.aws-role-to-assume != '' && matrix.runner == 'B200' }}
id: login-ecr
continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1

- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main

@@ -148,17 +164,17 @@ jobs:
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }}

- name: Setup GPU_FLAG for docker run
id: setup-gpu-flag
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || matrix.runner == 'B200') }}

- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
id: setup-sscache-port-flag
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && matrix.runner != 'B200' }}

- name: Lock NVIDIA A100 40GB Frequency
run: |

@@ -225,6 +241,12 @@ jobs:
run: |
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"

- name: Preserve github env variables for use in docker
shell: bash
run: |
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

- name: Test
id: test
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}

@@ -253,8 +275,8 @@ jobs:
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
# Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
SCCACHE_BUCKET: ${{ matrix.runner != 'B200' && 'ossci-compiler-cache-circleci-v2' || '' }}
SCCACHE_REGION: ${{ matrix.runner != 'B200' && 'us-east-1' || '' }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}

@@ -264,7 +286,6 @@ jobs:
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
run: |
set -x

@@ -290,10 +311,6 @@ jobs:
# if for some reason cleanup action doesn't stop container
# when job is cancelled
DOCKER_SHELL_CMD="sleep 12h"

# since some steps are skipped on s390x, if they are necessary, run them here
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
else
SHM_OPTS="--shm-size=${SHM_SIZE}"
JENKINS_USER="--user jenkins"

@@ -345,7 +362,6 @@ jobs:
-e HUGGING_FACE_HUB_TOKEN \
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
-e DASHBOARD_TAG \
-e IS_A100_RUNNER \
-e ARTIFACTS_FILE_SUFFIX \
--memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
--memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \

@@ -384,6 +400,15 @@ jobs:
test_config: ${{ matrix.config }}
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

- name: Authenticate with AWS
if: ${{ matrix.runner == 'B200' }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
# The max duration enforced by the server side
role-duration-seconds: 18000
aws-region: us-east-1

- name: Upload the benchmark results
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'

.github/workflows/_mac-build.yml (vendored, 2 changed lines)
@@ -123,7 +123,7 @@ jobs:
else
# The runner has access to the S3 bucket via IAM profile without the need
# for any credential
echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"0
echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
echo "SCCACHE_S3_KEY_PREFIX=${GITHUB_WORKFLOW}" >> "${GITHUB_ENV}"
fi

.github/workflows/_xpu-test.yml (vendored, 3 changed lines)
@@ -191,9 +191,6 @@ jobs:
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}

.github/workflows/build-triton-wheel.yml (vendored, 24 changed lines)
@@ -52,6 +52,7 @@ jobs:
matrix:
py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ]
device: ["cuda", "rocm", "xpu", "aarch64"]
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include:
- device: "rocm"
rocm_version: "6.4"

@@ -67,6 +68,7 @@ jobs:
runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge"
timeout-minutes: 40
env:
DOCKER_IMAGE: ${{ matrix.device == 'rocm' && format('pytorch/manylinux2_28-builder:rocm{0}', matrix.rocm_version) || matrix.device == 'aarch64' && 'pytorch/manylinux2_28_aarch64-builder:cpu-aarch64' || matrix.docker-image }}
PY_VERS: ${{ matrix.py_vers }}
BUILD_DEVICE: ${{ matrix.device }}
PLATFORM: 'manylinux_2_28_x86_64'

@@ -84,34 +86,14 @@ jobs:
- name: Setup Linux
uses: ./.github/actions/setup-linux

- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000

- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: ${{ matrix.device == 'aarch64' && 'manylinux2_28_aarch64-builder' || 'manylinux2_28-builder' }}
# NOTE: CUDA builds are currently built using the cpu tag
custom-tag-prefix: ${{ matrix.device == 'rocm' && format('rocm{0}', matrix.rocm_version) || matrix.device == 'aarch64' && 'cpu-aarch64' || 'cpu' }}
docker-build-dir: .ci/docker

- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
docker-image: ${{ env.DOCKER_IMAGE }}

- name: Build Triton wheel
env:
IS_RELEASE_TAG: ${{ startsWith(github.event.ref, 'refs/tags/v') }}
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
run: |
set -x
mkdir -p "${RUNNER_TEMP}/artifacts/"

.github/workflows/create_release.yml (vendored, 57 changed lines)
@@ -35,7 +35,6 @@ jobs:
contents: write
outputs:
pt_release_name: ${{ steps.release_name.outputs.pt_release_name }}
pt_pep517_release_name: ${{ steps.release_name.outputs.pt_pep517_release_name }}
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:

@@ -54,57 +53,36 @@ jobs:
tag_or_branch="${tag_or_branch#refs/heads/}"
# replace directory separators with _ in branch name
tag_or_branch="${tag_or_branch//\//_}"
torch_version="$(python -c 'from tools.generate_torch_version import get_torch_version; print(get_torch_version())')"
{
echo "PT_RELEASE_NAME=pytorch-$tag_or_branch";
echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz";
echo "PT_PEP517_RELEASE_FILE=torch-${torch_version}.tar.gz";
} >> "$GITHUB_ENV"
echo "PT_RELEASE_NAME=pytorch-$tag_or_branch" >> "$GITHUB_ENV"
echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
- name: Checkout optional submodules
run: python3 tools/optional_submodules.py
- name: Create source distribution
run: |
# Create new folder with specified name so extracting the archive yields that
rm -rf "/tmp/$PT_RELEASE_NAME"
cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
mv "/tmp/$PT_RELEASE_NAME" .
# Cleanup
rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
# Create archive
tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
- name: Create PEP 517 compatible source distribution
run: |
pip install build==1.2.2.post1 || exit 1
python -m build --sdist || exit 1
cd dist || exit 1
# Create new folder with specified name so extracting the archive yields that
rm -rf "/tmp/$PT_RELEASE_NAME"
cp -r "$PWD" "/tmp/$PT_RELEASE_NAME"
mv "/tmp/$PT_RELEASE_NAME" .
# Cleanup
rm -rf "$PT_RELEASE_NAME"/{.circleci,.ci}
find "$PT_RELEASE_NAME" -name '.git*' -exec rm -rv {} \; || true
# Create archive
tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
- name: Upload source distribution for release
if: ${{ github.event_name == 'release' }}
uses: softprops/action-gh-release@da05d552573ad5aba039eaac05058a918a7bf631 # v2.2.2
with:
files: |
${{ env.PT_RELEASE_FILE }}
${{ env.PT_PEP517_RELEASE_FILE }}
files: ${{env.PT_RELEASE_FILE}}
- name: Upload source distribution to GHA artifacts for release tags
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: ${{ env.PT_RELEASE_FILE }}
path: ${{ env.PT_RELEASE_FILE }}
- name: Upload PEP 517 source distribution to GHA artifacts for release tags
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
with:
name: ${{ env.PT_PEP517_RELEASE_FILE }}
path: dist/${{ env.PT_PEP517_RELEASE_FILE }}
- name: Set output
id: release_name
run: |
{
echo "name=pt_release_name::${{ env.PT_RELEASE_FILE }}";
echo "name=pt_pep517_release_name::${{ env.PT_PEP517_RELEASE_FILE }}";
} >> "${GITHUB_OUTPUT}"
run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}"

upload_source_code_to_s3:
if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}

@@ -120,9 +98,6 @@ jobs:
- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
with:
name: ${{ needs.release.outputs.pt_release_name }}
- uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7
with:
name: ${{ needs.release.outputs.pt_pep517_release_name }}
- name: Configure AWS credentials(PyTorch account)
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
with:

@@ -133,9 +108,7 @@ jobs:
s3-bucket: pytorch
s3-prefix: source_code/test
if-no-files-found: warn
path: |
${{ needs.release.outputs.pt_release_name }}
${{ needs.release.outputs.pt_pep517_release_name }}
path: ${{ needs.release.outputs.pt_release_name }}

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}

.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (generated, vendored, 12 changed lines)
@@ -136,7 +136,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

@@ -252,7 +252,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -368,7 +368,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -484,7 +484,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -600,7 +600,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -716,7 +716,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
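Note on the hunks above (and the x86_64 files that follow): each change swaps a single PYTORCH_EXTRA_INSTALL_REQUIREMENTS value, which is a "|"-separated list of PEP 508 requirement strings whose environment markers restrict them to Linux x86_64 hosts. The CUDA 12.9 jobs gain a nvidia-nvshmem-cu12==3.3.9 pin that was previously absent, and existing nvidia-nvshmem-cu12==3.2.5 pins are bumped to 3.3.9. The sketch below is illustrative only, not the generator or the builder code that actually consumes this variable; the example string and the split on a literal " | " separator are assumptions made for the demo, and it relies on the third-party `packaging` library.

```python
# Illustrative sketch only: split a "|"-separated PYTORCH_EXTRA_INSTALL_REQUIREMENTS
# string and evaluate its PEP 508 environment markers on the current host.
from packaging.requirements import Requirement

extra_requirements = (
    "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64'"
)

for raw in extra_requirements.split(" | "):
    req = Requirement(raw.strip())
    # marker.evaluate() checks the marker against the current interpreter/platform
    applies = req.marker is None or req.marker.evaluate()
    print(f"{req.name}{req.specifier}: install on this host -> {applies}")
```

Entries whose markers evaluate to False are skipped at install time, so the new pin only takes effect on hosts that match the marker.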
.github/workflows/generated-linux-binary-manywheel-main.yml: 6 changes (generated, vendored)
@@ -61,7 +61,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
@@ -108,7 +108,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-test: # Testing
@@ -155,7 +155,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test: # Testing
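The nightly workflow below repeats the same one-line substitution for every Python/CUDA combination (bumping nvidia-nvshmem-cu12 from 3.2.5 to 3.3.9, or adding it where the CUDA 12.9 rows lacked it), and additionally moves the py3_11 "-full" build/test/upload jobs from CUDA 12.6 to CUDA 12.8. A hypothetical consistency check such as the one below (not part of the repository; the file layout and the 3.3.9 target are taken from this diff) could confirm after regeneration that no generated manywheel workflow still carries a stale nvshmem pin.

```python
# Hypothetical post-regeneration check, assumed to run from the repository root:
# flag any PYTORCH_EXTRA_INSTALL_REQUIREMENTS line whose nvshmem pin is not 3.3.9.
import re
from pathlib import Path

PIN = re.compile(r"nvidia-nvshmem-cu12==([0-9.]+)")

for path in sorted(Path(".github/workflows").glob("generated-linux-*binary-manywheel-*.yml")):
    for lineno, line in enumerate(path.read_text().splitlines(), start=1):
        if "PYTORCH_EXTRA_INSTALL_REQUIREMENTS" not in line:
            continue
        match = PIN.search(line)
        if match and match.group(1) != "3.3.9":
            print(f"{path}:{lineno}: stale nvidia-nvshmem-cu12 pin {match.group(1)}")
```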
.github/workflows/generated-linux-binary-manywheel-nightly.yml: 172 changes (generated, vendored)
@@ -131,7 +131,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
@@ -200,7 +200,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-test: # Testing
@@ -269,7 +269,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test: # Testing
@@ -744,7 +744,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_6-test: # Testing
@@ -813,7 +813,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_8-test: # Testing
@@ -882,7 +882,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
@@ -1357,7 +1357,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-test: # Testing
@@ -1407,74 +1407,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_11-cuda12_6-full-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6-full
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-full-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_11-cuda12_6-full-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_6-full
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-full-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda12_6-full-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu126
GPU_ARCH_VERSION: 12.6
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.6
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_6-full
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_11-cuda12_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -1494,7 +1426,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-test: # Testing
@@ -1544,6 +1476,74 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_11-cuda12_8-full-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8-full
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-full-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_11-cuda12_8-full-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_8-full
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8 and 12.9 build need sm_70+ runner
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-full-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_11-cuda12_8-full-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu128
GPU_ARCH_VERSION: 12.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.8
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_8-full
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

manywheel-py3_11-cuda12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@@ -1563,7 +1563,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
@ -2038,7 +2038,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_6-test: # Testing
@ -2107,7 +2107,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
@ -2176,7 +2176,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing
@ -2651,7 +2651,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_6-test: # Testing
@ -2720,7 +2720,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_8-test: # Testing
@ -2789,7 +2789,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing
@ -3264,7 +3264,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_6-test: # Testing
@ -3333,7 +3333,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_8-test: # Testing
@ -3402,7 +3402,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-test: # Testing
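The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values in the hunks above are `|`-separated PEP 508 requirement strings whose environment markers (`platform_system == 'Linux' and platform_machine == 'x86_64'`) restrict each pin to Linux x86_64 wheels. A minimal Python sketch of how such a string can be split and its markers evaluated with the third-party `packaging` library follows; the variable names are illustrative and not part of the workflow.

```python
# Minimal sketch: split a PYTORCH_EXTRA_INSTALL_REQUIREMENTS-style value and
# evaluate its PEP 508 environment markers on the current interpreter.
# Assumes the third-party "packaging" library is installed.
from packaging.requirements import Requirement

extra_requirements = (
    "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
    " | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64'"
)

for spec in extra_requirements.split(" | "):
    req = Requirement(spec)
    applies = req.marker is None or req.marker.evaluate()
    print(f"{req.name}{req.specifier}: install={applies}")
```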
2  .github/workflows/h100-symm-mem.yml  vendored

@ -38,7 +38,7 @@ jobs:
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
{ config: "h100_distributed", shard: 1, num_shards: 1, runner: "linux.aws.h100.4" },
{ config: "h100-symm-mem", shard: 1, num_shards: 1, runner: "linux.aws.h100.4" },
]}
secrets: inherit
@ -13,7 +13,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
linux-jammy-cpu-py3_9-gcc11-inductor-build:

@ -13,7 +13,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
get-default-label-prefix:
4  .github/workflows/inductor-nightly.yml  vendored

@ -16,7 +16,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
get-default-label-prefix:
4  .github/workflows/inductor-perf-compare.yml  vendored

@ -10,7 +10,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
get-default-label-prefix:
@ -48,7 +48,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
get-label-type:

@ -63,7 +63,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
get-label-type:
@ -5,7 +5,7 @@ on:
tags:
- ciflow/inductor-perf-test-nightly-rocm/*
schedule:
- cron: 0 7 * * 0
- cron: 0 7 * * 0,3
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
@ -88,18 +88,23 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
{ config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
]}
secrets: inherit
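Each test-matrix entry above carries a shard index and a shard count, so jobs with the same config run in parallel and each runner takes its slice of the test list. The Python sketch below shows one common round-robin way to carve a list into `num_shards` slices; it is an illustration only, and the actual partitioning used by PyTorch CI (which may weight tests by runtime) can differ.

```python
# Illustrative only: round-robin sharding of a test list, mirroring the
# shard/num_shards fields in the test-matrix entries above.
def shard(tests: list[str], shard_index: int, num_shards: int) -> list[str]:
    # shard_index is 1-based, matching the workflow's "shard: 1..N" fields.
    return tests[shard_index - 1::num_shards]

tests = [f"test_{i}" for i in range(10)]
for i in range(1, 5):  # num_shards: 4, as in the updated huggingface matrix
    print(i, shard(tests, i, 4))
```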
@ -53,7 +53,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
get-label-type:

@ -58,7 +58,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
get-label-type:

@ -63,7 +63,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
get-label-type:
4  .github/workflows/inductor-periodic.yml  vendored

@ -15,7 +15,9 @@ concurrency:
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
get-default-label-prefix:
4  .github/workflows/inductor-unittest.yml  vendored

@ -12,7 +12,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-unittest
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
get-label-type:
4  .github/workflows/inductor.yml  vendored

@ -22,7 +22,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
unit-test:
4  .github/workflows/operator_benchmark.yml  vendored

@ -19,7 +19,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
linux-jammy-cpu-py3_9-gcc11-opbenchmark-build:
4  .github/workflows/periodic.yml  vendored

@ -20,7 +20,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
llm-td:
6  .github/workflows/pull.yml  vendored

@ -19,7 +19,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
llm-td:
@ -201,6 +203,7 @@ jobs:
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }
]}
secrets: inherit

@ -236,6 +239,7 @@ jobs:
{ config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }
]}
secrets: inherit
4  .github/workflows/s390x-periodic.yml  vendored

@ -15,7 +15,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
llm-td:
4  .github/workflows/slow.yml  vendored

@ -18,7 +18,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
llm-td:
4  .github/workflows/trunk.yml  vendored

@ -16,7 +16,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true

permissions: read-all
permissions:
id-token: write
contents: read

jobs:
llm-td:
@ -122,6 +122,7 @@ is_formatter = true
[[linter]]
code = 'MYPY'
include_patterns = [
'setup.py',
'torch/**/*.py',
'torch/**/*.pyi',
'caffe2/**/*.py',
@ -1156,6 +1157,7 @@ exclude_patterns = [
'torch/_vendor/**',
'torch/_inductor/fx_passes/serialized_patterns/**',
'torch/_inductor/autoheuristic/artifacts/**',
'torch/utils/model_dump/preact.mjs',
# These files are all grandfathered in, feel free to remove from this list
# as necessary
# NOTE: remove the patterns in the order they are listed
@ -1167,17 +1169,10 @@ exclude_patterns = [
'test/**',
'test/test_*',
'test/[a-hA-h]*/**',
'test/inductor/**',
'test/dynamo/**',
'test/distributed/**',
'torch/**',
'torch/_*/**',
'torch/fx/**',
'torch/distributed/tensor/**',
'torch/[j-o]*/**',
'torch/utils/**',
'torch/csrc/jit/**',
'torch/csrc/jit/[a-o]*/**',
]
init_command = [
'python3',
111  MANIFEST.in

@ -1,91 +1,50 @@
# Include individual top-level files
include MANIFEST.in
include BUCK.oss
include BUILD.bazel
include CITATION.cff
include CODEOWNERS
include Dockerfile
include LICENSE
include Makefile
include NOTICE
include WORKSPACE
include .bazelignore .bazelrc .bazelversion
include .clang-format .clang-tidy
include .cmakelintrc
include .coveragerc
include .dockerignore
include .flake8
include .gdbinit
include .lintrunner.toml
include .lldbinit
include docker.Makefile
include ubsan.supp
# Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html

# Include bazel related files
include *.bzl
# Include general configuration files
include *.ini
# Include important top-level information
include *.md
# Include technical text files
include *.txt

# Include ctags configuration
include .ctags.d/*.ctags

# Include subfolders completely
graft .devcontainer
graft .vscode
# Include source files in SDist
include CMakeLists.txt
include *.bzl *.bazel .bazel* BUILD *.BUILD BUILD.* WORKSPACE
include BUCK BUCK.*
include requirements*.txt
include version.txt
include [Mm]akefile *.[Mm]akefile [Mm]akefile.*
include [Dd]ockerfile *.[Dd]ockerfile [Dd]ockerfile.* .dockerignore
graft android
# The following folder (assets) is empty except for a .gitignore file, which
# will not be included in the sdist, hence we include the directory explicitly.
include android/test_app/app/src/main/assets
graft aten
graft binaries
graft c10
graft caffe2
graft cmake
graft torch
graft tools
graft test
graft docs
graft ios
graft third_party
graft test
graft benchmarks
graft scripts
graft mypy_plugins
graft modules
graft functorch
graft third_party
graft tools
graft torch
graft torchgen
# FIXME: torch-xla build during codegen will fail if include this file in wheel
exclude torchgen/BUILD.bazel

# The following exclusions omit parts from third-party dependencies that
# contain invalid symlinks[1] and that are not needed for pytorch, such as
# bindings for unused languages
prune third_party/ittapi/rust
prune third_party/flatbuffers/java
prune third_party/flatbuffers/kotlin
prune third_party/nccl/pkg/debian
prune third_party/opentelemetry-cpp/third_party/prometheus-cpp/cmake/project-import-*
# Misc files and directories in SDist
include *.md
include CITATION.cff
include LICENSE NOTICE
include mypy*.ini
graft benchmarks
graft docs
graft mypy_plugins
graft scripts

# The following document is also an invalid symlink[1] and superfluous
exclude third_party/flatbuffers/docs/source/CONTRIBUTING.md
# Misc files needed for custom setuptools command
include .gitignore
include .gitmodules

# Omit autogenerated code
# Include test suites in SDist
graft test
include pytest.ini
include .coveragerc

# Prune generated/compiled files
prune torchgen/packaged

# Omit caches, compiled, and scm related content
prune */__pycache__
prune **/.github
prune **/.gitlab
global-exclude *.o *.so *.dylib *.a
global-exclude *.pyc *.swp
global-exclude .git .git-blame-ignore-revs .gitattributes .gitignore .gitmodules
global-exclude .gitlab-ci.yml
global-exclude *.o *.obj *.so *.a *.dylib *.pxd *.dll *.lib *.py[cod]

# [1] Invalid symlinks for the purposes of Python source distributions are,
# according to the source distribution format[2] links pointing outside the
# destination directory or links with a `..` component, which is those of
# concern here.

# [2] https://packaging.python.org/en/latest/specifications/source-distribution-format/#source-distribution-archive-features
prune */.git
global-exclude .git *~ *.swp
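Several of the rewritten MANIFEST.in rules rely on shell-style glob patterns such as `[Mm]akefile`, `*.[Mm]akefile`, and `*.[Dd]ockerfile`. The Python sketch below uses `fnmatch`, which implements the same `*`, `?`, and `[seq]` wildcards, to show what those character-class patterns match; the candidate filenames are made up for illustration and MANIFEST.in path handling has additional rules of its own.

```python
# Illustration of the shell-style globs used by the rewritten MANIFEST.in
# rules; fnmatch implements the same *, ? and [seq] wildcards.
from fnmatch import fnmatch

candidates = ["Makefile", "makefile.inc", "docker.Makefile", "prod.Dockerfile", "notes.txt"]
patterns = ["[Mm]akefile", "*.[Mm]akefile", "[Mm]akefile.*", "*.[Dd]ockerfile"]

for name in candidates:
    matches = [p for p in patterns if fnmatch(name, p)]
    print(f"{name}: {matches or 'no match'}")
```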
6  Makefile

@ -57,7 +57,8 @@ setup-env-cuda:
setup-env-rocm:
$(MAKE) setup-env PYTHON="$(PYTHON)" NIGHTLY_TOOL_OPTS="$(NIGHTLY_TOOL_OPTS) --rocm"

.lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrunner.toml
.PHONY: setup-lint
setup-lint .lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrunner.toml
@echo "Setting up lintrunner..."
$(PIP) install lintrunner
lintrunner init
@ -65,9 +66,6 @@ setup-env-rocm:
@mkdir -p .lintbin
@sha256sum requirements.txt pyproject.toml .lintrunner.toml > .lintbin/.lintrunner.sha256

.PHONY: setup-lint
setup-lint: .lintbin/.lintrunner.sha256

.PHONY: lazy-setup-lint
lazy-setup-lint: .lintbin/.lintrunner.sha256
@if [ ! -x "$(shell command -v lintrunner)" ]; then \
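The Makefile change above turns lint setup into a stamp-file rule: `.lintbin/.lintrunner.sha256` records the checksums of the lint inputs, and `lazy-setup-lint` only re-runs the expensive `lintrunner init` when those inputs change. The Python sketch below restates that stamp-file pattern in simplified form; the file names are taken from the diff, but the logic is an illustration, not the actual Makefile behavior.

```python
# Sketch of the stamp-file pattern used by the new lazy-setup-lint rule:
# recompute checksums of the lint inputs and compare with the recorded stamp;
# only re-initialize when they differ. Simplified, not the real Makefile logic.
import hashlib
from pathlib import Path

INPUTS = ["requirements.txt", "pyproject.toml", ".lintrunner.toml"]
STAMP = Path(".lintbin/.lintrunner.sha256")

def checksums() -> str:
    lines = []
    for name in INPUTS:
        digest = hashlib.sha256(Path(name).read_bytes()).hexdigest()
        lines.append(f"{digest}  {name}")
    return "\n".join(lines) + "\n"

def lazy_setup_lint() -> None:
    current = checksums()
    if not STAMP.exists() or STAMP.read_text() != current:
        print("lint inputs changed; would run: pip install lintrunner && lintrunner init")
        STAMP.parent.mkdir(exist_ok=True)
        STAMP.write_text(current)
```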
@ -200,7 +200,7 @@ If you want to compile with CUDA support, [select a supported version of CUDA fr
- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above
- [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA

Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware
Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver, and NVIDIA hardware.

If you want to disable CUDA support, export the environment variable `USE_CUDA=0`.
Other potentially useful environment variables may be found in `setup.py`. If
@ -50,6 +50,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:

| PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm |
| --- | --- | --- | --- | --- | --- |
| 2.8 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 12.9 (CUDNN 9.10.2.21) | ROCm 6.4 |
| 2.7 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8 (CUDNN 9.1.0.70), CUDA 12.6 (CUDNN 9.5.1.17) | CUDA 12.8 (CUDNN 9.7.1.26) | ROCm 6.3 |
| 2.6 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8, CUDA 12.4 (CUDNN 9.1.0.70) | CUDA 12.6 (CUDNN 9.5.1.17) | ROCm 6.2.4 |
| 2.5 | >=3.9, <=3.12, (3.13 experimental) | C++17 | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 |
@ -73,9 +74,9 @@ Following is the release cadence. All future dates below are tentative. For late
| 2.4 | Jun 2024 | Jul 2024 | Sept 2024 | Not planned |
| 2.5 | Sep 2024 | Oct 2024 | Nov 2024 | Not planned |
| 2.6 | Dec 2024 | Jan 2025 | Not planned | Not planned |
| 2.7 | Mar 2025 | Apr 2025 | (May 2025) | (Jun 2025) |
| 2.7 | Mar 2025 | Apr 2025 | Jun 2025 | Not planned |
| 2.8 | Jun 2025 | Jul 2025 | (Aug 2025) | (Sep 2025) |
| 2.9 | Aug 2025 | Oct 2025 | (Nov 2025) | (Dec 2025) |
| 2.9 | Sept 2025 | Oct 2025 | (Nov 2025) | (Dec 2025) |
| 2.10 | Dec 2025 | Jan 2026 | (Feb 2026) | (Mar 2026) |
| 2.11 | Mar 2026 | Apr 2026 | (Jun 2026) | (Jul 2026) |
@ -30,7 +30,7 @@ TORCH_API bool isAccelerator(c10::DeviceType device_type);
template <
typename... T,
typename = std::enable_if_t<(std::is_same_v<T, c10::DeviceType> && ...)>>
TORCH_API inline bool isAcceleratorExcluded(
inline bool isAcceleratorExcluded(
c10::DeviceType device_type,
c10::DeviceType first_excluded,
T... rest_excluded) {
@ -178,7 +178,7 @@ bool FunctionalTensorWrapper::is_up_to_date() const {
// See Note [Functionalization Pass - Inplace View Ops]
void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) {
view_metas_.push_back(meta);
// Manually track the fact that this tensor recieved a metadata mutation!
// Manually track the fact that this tensor received a metadata mutation!
has_metadata_mutation_ = true;
// Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation.
maybe_mark_symbolic(meta);
@ -579,7 +579,7 @@ std::vector<Tensor> from_functional_tensor(ITensorListRef t_list) {
for (const auto& tensor : t_list) {
// from_functional_tensor(Tensor) has asserts to make sure you don't accidentally call
// it on a non-functional input,
// but from_functional_tensor(TensorList) can recieve a list containing both
// but from_functional_tensor(TensorList) can receive a list containing both
// functional and non-functional tensors.
// Example of when that can happen: torch.cat(function_input_tensor, global_state_tensor).
// When that happens, we're okay with only unwrapping the functional tensors.
@ -300,7 +300,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
namespace functionalization {
namespace impl {

TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(
inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(
const Tensor& tensor) {
auto functional_impl =
static_cast<FunctionalTensorWrapper*>(tensor.unsafeGetTensorImpl());
@ -167,14 +167,14 @@ TORCH_API TensorImpl* propagate_names(

TORCH_API void propagate_names(TensorImpl* result, /*const */ TensorImpl* src);

TORCH_API inline void propagate_names(
inline void propagate_names(
const TensorBase& result,
DimnameList names,
bool validate_names = false) {
propagate_names(result.unsafeGetTensorImpl(), names, validate_names);
}

TORCH_API inline void propagate_names_if_nonempty(
inline void propagate_names_if_nonempty(
const TensorBase& result,
DimnameList names,
bool validate_names = false) {
@ -182,9 +182,7 @@ TORCH_API inline void propagate_names_if_nonempty(
result.unsafeGetTensorImpl(), names, validate_names);
}

TORCH_API inline void propagate_names(
const TensorBase& result,
const TensorBase& src) {
inline void propagate_names(const TensorBase& result, const TensorBase& src) {
propagate_names(result.unsafeGetTensorImpl(), src.unsafeGetTensorImpl());
}
@@ -25,7 +25,7 @@ TORCH_API void set_autocast_cache_enabled(bool enabled);
// deprecated CUDA-specific autocast APIs
C10_DEPRECATED_MESSAGE(
"at::autocast::is_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(at::kCUDA) instead.")
TORCH_API inline bool is_enabled() {
inline bool is_enabled() {
TORCH_WARN_DEPRECATION(
"at::autocast::",
__func__,
@@ -34,7 +34,7 @@ TORCH_API inline bool is_enabled() {
}
C10_DEPRECATED_MESSAGE(
"at::autocast::set_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(at::kCUDA, enabled) instead.")
TORCH_API inline void set_enabled(bool enabled) {
inline void set_enabled(bool enabled) {
TORCH_WARN_DEPRECATION(
"at::autocast::",
__func__,
@@ -43,7 +43,7 @@ TORCH_API inline void set_enabled(bool enabled) {
}
C10_DEPRECATED_MESSAGE(
"at::autocast::get_autocast_gpu_dtype() is deprecated. Please use at::autocast::get_autocast_dtype(at::kCUDA) instead.")
TORCH_API inline at::ScalarType get_autocast_gpu_dtype() {
inline at::ScalarType get_autocast_gpu_dtype() {
TORCH_WARN_DEPRECATION(
"at::autocast::",
__func__,
@@ -52,7 +52,7 @@ TORCH_API inline at::ScalarType get_autocast_gpu_dtype() {
}
C10_DEPRECATED_MESSAGE(
"at::autocast::set_autocast_gpu_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(at::kCUDA, dtype) instead.")
TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
TORCH_WARN_DEPRECATION(
"at::autocast::",
__func__,
@@ -65,7 +65,7 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
"at::autocast::is_" #name \
"_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(" #device_type \
") instead.") \
TORCH_API inline bool is_##name##_enabled() { \
inline bool is_##name##_enabled() { \
TORCH_WARN_DEPRECATION( \
"at::autocast::", \
__func__, \
@@ -78,7 +78,7 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
"at::autocast::set_" #name \
"_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(" #device_type \
", enabled) instead.") \
TORCH_API inline void set_##name##_enabled(bool enabled) { \
inline void set_##name##_enabled(bool enabled) { \
TORCH_WARN_DEPRECATION( \
"at::autocast::", \
__func__, \
@@ -91,7 +91,7 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
"at::autocast::get_autocast_" #name \
"_dtype() is deprecated. Please use at::autocast::get_autocast_dtype(" #device_type \
") instead.") \
TORCH_API inline at::ScalarType get_autocast_##name##_dtype() { \
inline at::ScalarType get_autocast_##name##_dtype() { \
TORCH_WARN_DEPRECATION( \
"at::autocast::", \
__func__, \
@@ -104,7 +104,7 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
"at::autocast::set_autocast_" #name \
"_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(" #device_type \
", dtype) instead.") \
TORCH_API inline void set_autocast_##name##_dtype(at::ScalarType dtype) { \
inline void set_autocast_##name##_dtype(at::ScalarType dtype) { \
TORCH_WARN_DEPRECATION( \
"at::autocast::", \
__func__, \

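The deprecation messages above spell out the migration path; the following is a minimal illustrative sketch (not part of the diff) of how the old CUDA-specific calls map onto the device-generic API, assuming the ATen/autocast_mode.h header:

// Hypothetical migration sketch based on the deprecation messages above.
#include <ATen/autocast_mode.h>

void migrate_autocast_calls() {
  // Old, deprecated CUDA-specific forms:
  //   at::autocast::is_enabled();
  //   at::autocast::set_enabled(true);
  //   at::autocast::get_autocast_gpu_dtype();
  //   at::autocast::set_autocast_gpu_dtype(at::kHalf);

  // Device-generic replacements named in the deprecation messages:
  bool enabled = at::autocast::is_autocast_enabled(at::kCUDA);
  at::autocast::set_autocast_enabled(at::kCUDA, !enabled);
  at::ScalarType dtype = at::autocast::get_autocast_dtype(at::kCUDA);
  at::autocast::set_autocast_dtype(at::kCUDA, dtype);
}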
@@ -265,7 +265,7 @@ class TORCH_API TensorBase {
return impl_->is_contiguous(memory_format);
}

// Like is_contiguous, but more dynamic shape-friendly. Maybe returns a symbolic representation of
// Like is_contiguous, but more dynamic shape-friendly. May return a symbolic representation of
// contiguity instead of SymTrue SymFalse, when results are data-dependent.
c10::SymBool sym_is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Contiguous) const {
if (impl_->has_symbolic_sizes_strides()) {

@@ -105,7 +105,7 @@ using supported_primitive_arg_types = guts::typelist::typelist<
// So a valid input type is one that our boxed functor wrapper can
// unbox from an IValue into a C++ value.
//
// Whereas a valid output type is one that our wrapper can recieve
// Whereas a valid output type is one that our wrapper can receive
// as a C++ value from the unboxed functor, and box into an IValue.

//

@@ -677,7 +677,7 @@ inline TypePtr Type::withContained(std::vector<TypePtr> contained_types) {
}


TORCH_API inline bool operator==(const Type& lhs, const Type& rhs) {
inline bool operator==(const Type& lhs, const Type& rhs) {
if (C10_UNLIKELY(!rhs.symmetric())) {
return rhs.equals(lhs);
}

@@ -202,18 +202,14 @@ class Vectorized<float> {
store(tmp);
return tmp[idx];
}
// For boolean version where we want to if any 1/all zero
// etc. can be done faster in a different way.
int zero_mask() const {
__at_align__ float tmp[size()];
store(tmp);
int mask = 0;
for (int i = 0; i < size(); ++i) {
if (tmp[i] == 0.f) {
mask |= (1 << i);
}
}
return mask;
uint32x4_t is_zero_vec = vceqzq_f32(values);
const int32x4_t shift = vcombine_s32(
vcreate_s32(0x0 | (int64_t(0x1) << 32)),
vcreate_s32(0x2 | (int64_t(0x3) << 32)));
uint32x4_t bits_vec =
vshlq_u32(vandq_u32(is_zero_vec, vdupq_n_u32(1)), shift);
return vaddvq_u32(bits_vec);
}
Vectorized<float> isnan() const {
return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(values, values)));

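The NEON replacement above produces the same bitmask as the scalar loop it removes: bit i of the result is set when lane i compares equal to zero. A scalar model of that contract, for illustration only:

// Scalar model of the vectorized zero_mask() above: bit i is set when lane i == 0.
#include <cstdint>

int zero_mask_reference(const float* lanes, int size) {
  int mask = 0;
  for (int i = 0; i < size; ++i) {
    if (lanes[i] == 0.0f) {
      mask |= (1 << i);  // vshlq_u32 shifts each lane's 0/1 flag into bit position i,
    }                    // vaddvq_u32 then sums the lanes into a single integer mask.
  }
  return mask;
}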
@ -220,8 +220,32 @@ class Vectorized<c10::Half> : public Vectorized16<
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(float16_t));
|
||||
}
|
||||
}
|
||||
// For boolean version where we want to if any 1/all zero
|
||||
// etc. can be done faster in a different way.
|
||||
int zero_mask() const {
|
||||
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||
uint16x8_t is_zero_vec = vceqzq_f16(values);
|
||||
const int16x8_t shift = vcombine_s16(
|
||||
vcreate_s16(
|
||||
0x0 | (int64_t(0x1) << 16) | (int64_t(0x2) << 32) |
|
||||
(int64_t(0x3) << 48)),
|
||||
vcreate_s16(
|
||||
0x4 | (int64_t(0x5) << 16) | (int64_t(0x6) << 32) |
|
||||
(int64_t(0x7) << 48)));
|
||||
uint16x8_t bits_vec =
|
||||
vshlq_u16(vandq_u16(is_zero_vec, vdupq_n_u16(1)), shift);
|
||||
return vaddvq_u16(bits_vec);
|
||||
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||
// use known working implmentation.
|
||||
__at_align__ value_type tmp[size()];
|
||||
store(tmp);
|
||||
int mask = 0;
|
||||
for (int i = 0; i < size(); ++i) {
|
||||
if (tmp[i] == 0) {
|
||||
mask |= (1 << i);
|
||||
}
|
||||
}
|
||||
return mask;
|
||||
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||
}
|
||||
Vectorized<c10::Half> isnan() const {
|
||||
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||
return vreinterpretq_f16_u16(vmvnq_u16(vceqq_f16(values, values)));
|
||||
|
||||
@ -348,26 +348,6 @@ class Vectorized<int16_t> {
|
||||
DEFINE_MEMBER_OP(operator^, int16_t, vec_xor)
|
||||
};
|
||||
|
||||
template <>
|
||||
Vectorized<int16_t> inline operator<<(
|
||||
const Vectorized<int16_t>& a,
|
||||
const Vectorized<int16_t>& b) {
|
||||
vuint16 shift_vec0 = reinterpret_cast<vuint16>(b.vec0());
|
||||
vuint16 shift_vec1 = reinterpret_cast<vuint16>(b.vec1());
|
||||
return Vectorized<int16_t>{
|
||||
vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)};
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int16_t> inline operator>>(
|
||||
const Vectorized<int16_t>& a,
|
||||
const Vectorized<int16_t>& b) {
|
||||
vuint16 shift_vec0 = reinterpret_cast<vuint16>(b.vec0());
|
||||
vuint16 shift_vec1 = reinterpret_cast<vuint16>(b.vec1());
|
||||
return Vectorized<int16_t>{
|
||||
vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)};
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int16_t> inline maximum(
|
||||
const Vectorized<int16_t>& a,
|
||||
@ -382,6 +362,8 @@ Vectorized<int16_t> inline minimum(
|
||||
return a.minimum(b);
|
||||
}
|
||||
|
||||
DEFINE_SHIFT_FUNCS(int16_t)
|
||||
|
||||
template <>
|
||||
Vectorized<int16_t> C10_ALWAYS_INLINE
|
||||
operator+(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
|
||||
|
||||
@ -278,26 +278,6 @@ class Vectorized<int32_t> {
|
||||
DEFINE_MEMBER_OP(operator^, int32_t, vec_xor)
|
||||
};
|
||||
|
||||
template <>
|
||||
Vectorized<int32_t> inline operator<<(
|
||||
const Vectorized<int32_t>& a,
|
||||
const Vectorized<int32_t>& b) {
|
||||
vuint32 shift_vec0 = reinterpret_cast<vuint32>(b.vec0());
|
||||
vuint32 shift_vec1 = reinterpret_cast<vuint32>(b.vec1());
|
||||
return Vectorized<int32_t>{
|
||||
vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)};
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int32_t> inline operator>>(
|
||||
const Vectorized<int32_t>& a,
|
||||
const Vectorized<int32_t>& b) {
|
||||
vuint32 shift_vec0 = reinterpret_cast<vuint32>(b.vec0());
|
||||
vuint32 shift_vec1 = reinterpret_cast<vuint32>(b.vec1());
|
||||
return Vectorized<int32_t>{
|
||||
vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)};
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int32_t> inline maximum(
|
||||
const Vectorized<int32_t>& a,
|
||||
@ -312,6 +292,8 @@ Vectorized<int32_t> inline minimum(
|
||||
return a.minimum(b);
|
||||
}
|
||||
|
||||
DEFINE_SHIFT_FUNCS(int32_t)
|
||||
|
||||
template <>
|
||||
Vectorized<int32_t> C10_ALWAYS_INLINE
|
||||
operator+(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
|
||||
|
||||
@ -231,26 +231,6 @@ class Vectorized<int64_t> {
|
||||
DEFINE_MEMBER_OP(operator^, int64_t, vec_xor)
|
||||
};
|
||||
|
||||
template <>
|
||||
Vectorized<int64_t> inline operator<<(
|
||||
const Vectorized<int64_t>& a,
|
||||
const Vectorized<int64_t>& b) {
|
||||
vuint64 shift_vec0 = reinterpret_cast<vuint64>(b.vec0());
|
||||
vuint64 shift_vec1 = reinterpret_cast<vuint64>(b.vec1());
|
||||
return Vectorized<int64_t>{
|
||||
vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)};
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int64_t> inline operator>>(
|
||||
const Vectorized<int64_t>& a,
|
||||
const Vectorized<int64_t>& b) {
|
||||
vuint64 shift_vec0 = reinterpret_cast<vuint64>(b.vec0());
|
||||
vuint64 shift_vec1 = reinterpret_cast<vuint64>(b.vec1());
|
||||
return Vectorized<int64_t>{
|
||||
vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)};
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<int64_t> inline maximum(
|
||||
const Vectorized<int64_t>& a,
|
||||
@ -265,6 +245,8 @@ Vectorized<int64_t> inline minimum(
|
||||
return a.minimum(b);
|
||||
}
|
||||
|
||||
DEFINE_SHIFT_FUNCS(int64_t)
|
||||
|
||||
template <>
|
||||
Vectorized<int64_t> C10_ALWAYS_INLINE
|
||||
operator+(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <cstdint>
|
||||
|
||||
@ -39,6 +40,19 @@ using vfloat32 = __attribute__((altivec(vector__))) float;
|
||||
using vfloat64 = __attribute__((altivec(vector__))) double;
|
||||
#endif
|
||||
|
||||
inline auto make_vuint(vint8 v) {
|
||||
return reinterpret_cast<vuint8>(v);
|
||||
}
|
||||
inline auto make_vuint(vint16 v) {
|
||||
return reinterpret_cast<vuint16>(v);
|
||||
}
|
||||
inline auto make_vuint(vint32 v) {
|
||||
return reinterpret_cast<vuint32>(v);
|
||||
}
|
||||
inline auto make_vuint(vint64 v) {
|
||||
return reinterpret_cast<vuint64>(v);
|
||||
}
|
||||
|
||||
#if !defined(vec_float)
|
||||
C10_ALWAYS_INLINE vfloat32 vec_float(const vint32& vec_in) {
|
||||
vfloat32 vec_out;
|
||||
@@ -521,6 +535,42 @@ const vfloat64 vd_imag_half = vfloat64{0.0, 0.5};
const vfloat64 vd_sqrt2_2 = vfloat64{0.70710678118654757, 0.70710678118654757};
const vfloat64 vd_pi_2 = vfloat64{M_PI / 2.0, 0.0};

template <typename T>
Vectorized<T> VsxShiftRightArith(
const Vectorized<T>& a,
const Vectorized<T>& b) {
const Vectorized<T> max_shift(sizeof(T) * CHAR_BIT - std::is_signed_v<T>);
const auto mask = (b < Vectorized<T>(0)) | (b >= max_shift);
const auto shift = Vectorized<T>::blendv(b, max_shift, mask);
return Vectorized<T>{
vec_sra(a.vec0(), make_vuint(shift.vec0())),
vec_sra(a.vec1(), make_vuint(shift.vec1()))};
}

template <typename T>
Vectorized<T> VsxShiftLeftArith(
const Vectorized<T>& a,
const Vectorized<T>& b) {
const Vectorized<T> max_shift(sizeof(T) * CHAR_BIT);
const auto mask = (b < Vectorized<T>(0)) | (b >= max_shift);
Vectorized<T> ret(
vec_sl(a.vec0(), make_vuint(b.vec0())),
vec_sl(a.vec1(), make_vuint(b.vec1())));
return Vectorized<T>::blendv(ret, Vectorized<T>(0), mask);
}

#define DEFINE_SHIFT_FUNCS(operand_type) \
template <> \
Vectorized<operand_type> C10_ALWAYS_INLINE operator>>( \
const Vectorized<operand_type>& a, const Vectorized<operand_type>& b) { \
return VsxShiftRightArith(a, b); \
} \
template <> \
Vectorized<operand_type> C10_ALWAYS_INLINE operator<<( \
const Vectorized<operand_type>& a, const Vectorized<operand_type>& b) { \
return VsxShiftLeftArith(a, b); \
}

} // namespace CPU_CAPABILITY
} // namespace vec
} // namespace at

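VsxShiftRightArith and VsxShiftLeftArith above guard against shift counts that would be undefined for the element type; a per-element scalar model of the assumed semantics (illustrative, not taken from the diff):

// Per-element model of the VSX shift helpers above (assumed semantics).
#include <climits>
#include <type_traits>

template <typename T>
T shift_right_arith_model(T a, T b) {
  const T max_shift = T(sizeof(T) * CHAR_BIT - std::is_signed_v<T>);
  // Negative or too-large counts are clamped to the maximum defined shift.
  T shift = (b < 0 || b >= max_shift) ? max_shift : b;
  return a >> shift;
}

template <typename T>
T shift_left_arith_model(T a, T b) {
  const T max_shift = T(sizeof(T) * CHAR_BIT);
  // Negative or too-large counts yield zero, matching the blendv with 0 above.
  if (b < 0 || b >= max_shift) {
    return T(0);
  }
  // Shift the bit pattern via the unsigned type, as vec_sl does.
  return static_cast<T>(static_cast<std::make_unsigned_t<T>>(a) << b);
}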
aten/src/ATen/cpu/vec/vec_quant.h (new file, 153 lines)
@@ -0,0 +1,153 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
namespace at::vec {
|
||||
// See Note [CPU_CAPABILITY namespace]
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
// Transpose a [4, 64] block to [64, 4] (with contiguous output, ld=4)
|
||||
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
|
||||
static inline void transpose_pad_4x64_block(
|
||||
const scalar_t* src,
|
||||
scalar_t* dst,
|
||||
int64_t ld_src,
|
||||
int krem = 4,
|
||||
int nrem = 64) {
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
__m512i r[4];
|
||||
// Load with mask if partial
|
||||
if (nrem < 64) {
|
||||
__mmask64 mask = (1ULL << nrem) - 1;
|
||||
for (int i = 0; i < krem; ++i) {
|
||||
r[i] = _mm512_maskz_loadu_epi8(mask, src + i * ld_src);
|
||||
}
|
||||
for (int i = krem; i < 4; ++i) {
|
||||
r[i] = _mm512_setzero_si512();
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < krem; ++i) {
|
||||
r[i] = _mm512_loadu_si512(
|
||||
reinterpret_cast<const __m512i*>(src + i * ld_src));
|
||||
}
|
||||
for (int i = krem; i < 4; ++i) {
|
||||
r[i] = _mm512_setzero_si512();
|
||||
}
|
||||
}
|
||||
|
||||
// Transpose 4x64 bytes using unpack and shuffle
|
||||
__m512i t0 = _mm512_unpacklo_epi8(r[0], r[1]);
|
||||
__m512i t1 = _mm512_unpackhi_epi8(r[0], r[1]);
|
||||
__m512i t2 = _mm512_unpacklo_epi8(r[2], r[3]);
|
||||
__m512i t3 = _mm512_unpackhi_epi8(r[2], r[3]);
|
||||
|
||||
__m512i u0 = _mm512_unpacklo_epi16(t0, t2);
|
||||
__m512i u1 = _mm512_unpackhi_epi16(t0, t2);
|
||||
__m512i u2 = _mm512_unpacklo_epi16(t1, t3);
|
||||
__m512i u3 = _mm512_unpackhi_epi16(t1, t3);
|
||||
|
||||
__m512i v0 = _mm512_shuffle_i32x4(u0, u1, 0x88);
|
||||
__m512i v1 = _mm512_shuffle_i32x4(u0, u1, 0xdd);
|
||||
__m512i v2 = _mm512_shuffle_i32x4(u2, u3, 0x88);
|
||||
__m512i v3 = _mm512_shuffle_i32x4(u2, u3, 0xdd);
|
||||
|
||||
__m512i r0 = _mm512_shuffle_i32x4(v0, v2, 0x88);
|
||||
__m512i r1 = _mm512_shuffle_i32x4(v1, v3, 0x88);
|
||||
__m512i r2 = _mm512_shuffle_i32x4(v0, v2, 0xdd);
|
||||
__m512i r3 = _mm512_shuffle_i32x4(v1, v3, 0xdd);
|
||||
|
||||
// Store output
|
||||
if (nrem < 16) {
|
||||
__mmask64 mask = (1ULL << (nrem * 4)) - 1;
|
||||
_mm512_mask_storeu_epi8(dst, mask, r0);
|
||||
} else if (nrem == 16) {
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
|
||||
} else if (nrem < 32) {
|
||||
int n_bytes1 = 64;
|
||||
int n_bytes2 = (nrem * 4) - n_bytes1;
|
||||
__mmask64 mask = (1ULL << n_bytes2) - 1;
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
|
||||
_mm512_mask_storeu_epi8(reinterpret_cast<__m512i*>(dst + 64), mask, r1);
|
||||
} else if (nrem == 32) {
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1);
|
||||
} else if (nrem < 48) {
|
||||
int n_bytes1 = 64 * 2;
|
||||
int n_bytes2 = (nrem * 4) - n_bytes1;
|
||||
__mmask64 mask = (1ULL << n_bytes2) - 1;
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1);
|
||||
_mm512_mask_storeu_epi8(reinterpret_cast<__m512i*>(dst + 64 * 2), mask, r2);
|
||||
} else if (nrem == 48) {
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1);
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64 * 2), r2);
|
||||
} else if (nrem < 64) {
|
||||
int n_bytes1 = 64 * 3;
|
||||
int n_bytes2 = (nrem * 4) - n_bytes1;
|
||||
__mmask64 mask = (1ULL << n_bytes2) - 1;
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1);
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64 * 2), r2);
|
||||
_mm512_mask_storeu_epi8(reinterpret_cast<__m512i*>(dst + 64 * 3), mask, r3);
|
||||
} else {
|
||||
// normal case, nrem == 64
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0);
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1);
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64 * 2), r2);
|
||||
_mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64 * 3), r3);
|
||||
}
|
||||
#else
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"transpose_pad_4x64_block is only supported when AVX-512 is supported")
|
||||
#endif
|
||||
}
|
||||
|
||||
// Reorder [K, N] → [K/4, N, 4] (VNNI4-style layout for bit8)
|
||||
template <typename scalar_t, typename = std::enable_if_t<sizeof(scalar_t) == 1>>
|
||||
static inline void pack_vnni4(
|
||||
const scalar_t* src,
|
||||
scalar_t* dst,
|
||||
int64_t ld_src,
|
||||
int64_t K,
|
||||
int64_t N) {
|
||||
#if defined(CPU_CAPABILITY_AVX512)
|
||||
int64_t bk = 0;
|
||||
int64_t _K = K / 4 * 4;
|
||||
int64_t _N = N / 64 * 64;
|
||||
for (; bk < _K; bk += 4) {
|
||||
int64_t bn = 0;
|
||||
for (; bn < _N; bn += 64) {
|
||||
transpose_pad_4x64_block(
|
||||
src + bk * ld_src + bn, dst + bk * N + bn * 4, ld_src);
|
||||
}
|
||||
int64_t nrem = N - bn;
|
||||
if (nrem > 0) {
|
||||
transpose_pad_4x64_block(
|
||||
src + bk * ld_src + bn, dst + bk * N + bn * 4, ld_src, 4, nrem);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle leftover K rows (< 4)
|
||||
if (K % 4 != 0) {
|
||||
int krem = K - bk;
|
||||
int64_t bn = 0;
|
||||
for (; bn < _N; bn += 64) {
|
||||
transpose_pad_4x64_block(
|
||||
src + bk * ld_src + bn, dst + bk * N + bn * 4, ld_src, krem);
|
||||
}
|
||||
int64_t nrem = N - bn;
|
||||
if (nrem > 0) {
|
||||
transpose_pad_4x64_block(
|
||||
src + bk * ld_src + bn, dst + bk * N + bn * 4, ld_src, krem, nrem);
|
||||
}
|
||||
}
|
||||
#else
|
||||
TORCH_CHECK(false, "pack_vnni4 is only supported when AVX-512 is supported")
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
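A minimal usage sketch for pack_vnni4 from the new header, assuming an AVX-512 build; the shapes follow the [K, N] -> [K/4, N, 4] comment above, and the wrapper name here is hypothetical:

// Illustrative only: repacks a K x N int8 matrix into the VNNI4-style layout.
#include <ATen/cpu/vec/vec_quant.h>
#include <cstdint>
#include <vector>

void repack_for_vnni4(const int8_t* src, int64_t K, int64_t N, std::vector<int8_t>& packed) {
  // Destination holds ceil(K / 4) groups of N * 4 bytes (partial K rows are zero-padded).
  packed.resize(((K + 3) / 4) * N * 4);
  // ld_src is the leading dimension of the row-major source, here N.
  at::vec::pack_vnni4(src, packed.data(), /*ld_src=*/N, K, N);
}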
@@ -154,7 +154,7 @@ programmatically since the settings become fixed. Use the C++ or Python APIs ins
| PYTORCH_TUNABLEOP_MAX_WARMUP_ITERATIONS | Default is 0, meaning it is not used. |
| PYTORCH_TUNABLEOP_ICACHE_FLUSH_ENABLED | Default is 1. Set to 0 to disable. |
| PYTORCH_TUNABLEOP_ROTATING_BUFFER_SIZE | Default (or < 0) is to query L2 cache size. Set to 0 to disable. Otherwise, set to the number of MiB to use for the pool of operator parameters. For example, setting this to the size of your device's memory cache will guarantee that every tuning iteration will use a cold cache. |
| PYTORCH_TUNABLEOP_BLAS_LOG | Default is 0. Set to 1 to enable. Write BLAS paramters to tuning CSV file. |
| PYTORCH_TUNABLEOP_BLAS_LOG | Default is 0. Set to 1 to enable. Write BLAS parameters to tuning CSV file. |

### Python Interface
All python APIs exist in the `torch.cuda.tunable` module.

@ -697,7 +697,7 @@ TORCH_META_FUNC(linalg_cholesky_ex)(const Tensor& A,
|
||||
auto ndim = A_shape.size();
|
||||
|
||||
// L
|
||||
auto L_strides = at::native::batched_matrix_contiguous_strides(A_shape, /*f-contig*=*/A.device().type() != at::kMPS);
|
||||
auto L_strides = at::native::batched_matrix_contiguous_strides(A_shape, /*f-contig*=*/true);
|
||||
set_output_strided(0, A_shape, L_strides, A.options(), {});
|
||||
|
||||
// info
|
||||
|
||||
@ -127,6 +127,9 @@ TORCH_IMPL_FUNC(smooth_l1_loss_out)
|
||||
|
||||
TORCH_IMPL_FUNC(mse_loss_out)
|
||||
(const Tensor& input, const Tensor& target, int64_t reduction, const Tensor& result) {
|
||||
TORCH_CHECK(input.device() == target.device(),
|
||||
"Expected all tensors to be on the same device, but found at least two devices, ",
|
||||
input.device(), " and ", target.device(), "!");
|
||||
if (reduction != Reduction::None) {
|
||||
Tensor loss;
|
||||
auto iter = TensorIterator::borrowing_binary_op(loss, input, target);
|
||||
|
||||
@@ -2862,7 +2862,7 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_t_forward(T x, int64_t n) {
T q = x;
T r;

for (int64_t k = 2; k <= n; k++) {
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
r = (x + x) * q - p;
p = q;
q = r;
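Each of these hunks makes the same change: the loop now also stops once q becomes NaN, since the recurrence can only keep producing NaN after that point. A standalone sketch of the Chebyshev recurrence T_{k+1}(x) = 2x*T_k(x) - T_{k-1}(x) with that guard, for illustration:

// Illustrative Chebyshev T_n(x) recurrence with the early-exit-on-NaN guard.
#include <cmath>

double chebyshev_t_sketch(double x, long n) {
  if (n == 0) return 1.0;
  if (n == 1) return x;
  double p = 1.0;  // T_{k-1}
  double q = x;    // T_k
  double r = q;
  for (long k = 2; (k <= n) && !std::isnan(q); k++) {
    r = (x + x) * q - p;  // T_{k+1} = 2x * T_k - T_{k-1}
    p = q;
    q = r;
  }
  return r;
}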
@ -2910,7 +2910,7 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_u_forward(T x, int64_t n) {
|
||||
T q = x + x;
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
|
||||
r = (x + x) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
@ -2966,7 +2966,7 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_v_forward(T x, int64_t n) {
|
||||
T q = x + x - T(1.0);
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
|
||||
r = (x + x) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
@ -3026,7 +3026,7 @@ inline C10_HOST_DEVICE T chebyshev_polynomial_w_forward(T x, int64_t n) {
|
||||
T q = x + x + T(1.0);
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
|
||||
r = (x + x) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
@ -3150,7 +3150,7 @@ inline C10_HOST_DEVICE T laguerre_polynomial_l_forward(T x, int64_t n) {
|
||||
T q = T(1.0) - x;
|
||||
T r;
|
||||
|
||||
for (int64_t k = 1; k < n; k++) {
|
||||
for (int64_t k = 1; (k < n) && !std::isnan(q); k++) {
|
||||
r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1);
|
||||
p = q;
|
||||
q = r;
|
||||
@ -3190,7 +3190,7 @@ inline C10_HOST_DEVICE T legendre_polynomial_p_forward(T x, int64_t n) {
|
||||
T q = x;
|
||||
T r;
|
||||
|
||||
for (int64_t k = 1; k < n; k++) {
|
||||
for (int64_t k = 1; (k < n) && !std::isnan(q); k++) {
|
||||
r = ((k + k + 1) * x * q - k * p) / (k + 1);
|
||||
p = q;
|
||||
q = r;
|
||||
@ -3733,7 +3733,7 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_t_forward(T x, int64_t n)
|
||||
T q = x + x - T(1.0);
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
|
||||
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
@ -3785,7 +3785,7 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_u_forward(T x, int64_t n)
|
||||
T q = x + x - T(1.0) + (x + x - T(1.0));
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
|
||||
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
@ -3841,7 +3841,7 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_v_forward(T x, int64_t n)
|
||||
T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0);
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
|
||||
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
@ -3897,7 +3897,7 @@ inline C10_HOST_DEVICE T shifted_chebyshev_polynomial_w_forward(T x, int64_t n)
|
||||
T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0);
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !std::isnan(q); k++) {
|
||||
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
|
||||
@ -165,6 +165,12 @@ inline void transpose<uint16_t>(int64_t M, int64_t N, const uint16_t* src, int64
|
||||
TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
|
||||
fbgemm::transpose_simd<uint16_t>(M, N, src, ld_src, dst, ld_dst);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void transpose<uint8_t>(int64_t M, int64_t N, const uint8_t* src, int64_t ld_src, uint8_t* dst, int64_t ld_dst) {
|
||||
TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM.");
|
||||
fbgemm::transpose_simd<uint8_t>(M, N, src, ld_src, dst, ld_dst);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename index_t, typename F>
|
||||
|
||||
@ -369,7 +369,7 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices,
|
||||
|
||||
int warp_size = at::cuda::warp_size();
|
||||
TORCH_INTERNAL_ASSERT(num_threads() % warp_size == 0 &&
|
||||
num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads(),
|
||||
num_threads() <= static_cast<uint32_t>(cuda_utils::kCUDABlockReduceMaxThreads()),
|
||||
"BlockReduceSum requires all warps be active");
|
||||
const int64_t *num_unique_indices_ptr = num_unique_indices.const_data_ptr<int64_t>();
|
||||
dim3 grid = unique_indices.numel();
|
||||
|
||||
@ -1946,7 +1946,7 @@ const auto chebyshev_polynomial_t_string = jiterator_stringify(
|
||||
T q = x;
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
|
||||
r = (x + x) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
@ -1996,7 +1996,7 @@ const auto chebyshev_polynomial_u_string = jiterator_stringify(
|
||||
T q = x + x;
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
|
||||
r = (x + x) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
@ -2054,7 +2054,7 @@ const auto chebyshev_polynomial_v_string = jiterator_stringify(
|
||||
T q = x + x - T(1.0);
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
|
||||
r = (x + x) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
@ -2116,7 +2116,7 @@ const auto chebyshev_polynomial_w_string = jiterator_stringify(
|
||||
T q = x + x + T(1.0);
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
|
||||
r = (x + x) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
@ -2252,7 +2252,7 @@ const auto laguerre_polynomial_l_string = jiterator_stringify(
|
||||
T q = T(1.0) - x;
|
||||
T r;
|
||||
|
||||
for (int64_t k = 1; k < n; k++) {
|
||||
for (int64_t k = 1; (k < n) && !isnan(q); k++) {
|
||||
r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1);
|
||||
p = q;
|
||||
q = r;
|
||||
@ -2294,7 +2294,7 @@ const auto legendre_polynomial_p_string = jiterator_stringify(
|
||||
T q = x;
|
||||
T r;
|
||||
|
||||
for (int64_t k = 1; k < n; k++) {
|
||||
for (int64_t k = 1; (k < n) && !isnan(q); k++) {
|
||||
r = ((k + k + 1) * x * q - k * p) / (k + 1);
|
||||
p = q;
|
||||
q = r;
|
||||
@ -2851,7 +2851,7 @@ const auto shifted_chebyshev_polynomial_t_string = jiterator_stringify(
|
||||
T q = x + x - T(1.0);
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
|
||||
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
@ -2905,7 +2905,7 @@ const auto shifted_chebyshev_polynomial_u_string = jiterator_stringify(
|
||||
T q = x + x - T(1.0) + (x + x - T(1.0));
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
|
||||
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
@ -2963,7 +2963,7 @@ const auto shifted_chebyshev_polynomial_v_string = jiterator_stringify(
|
||||
T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0);
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
|
||||
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
@ -3021,7 +3021,7 @@ const auto shifted_chebyshev_polynomial_w_string = jiterator_stringify(
|
||||
T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0);
|
||||
T r;
|
||||
|
||||
for (int64_t k = 2; k <= n; k++) {
|
||||
for (int64_t k = 2; (k <= n) && !isnan(q); k++) {
|
||||
r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p;
|
||||
p = q;
|
||||
q = r;
|
||||
|
||||
@ -155,6 +155,12 @@ static void check_shape_forward(const Tensor& input,
|
||||
// but weight/bias and grad_weight/grad_bias are always CPU tensor.
|
||||
//
|
||||
|
||||
static bool mkldnn_conv_enabled_fpmath_mode_bf16(){
|
||||
return at::globalContext().float32Precision("mkldnn", "conv") == "bf16" &&
|
||||
mkldnn_bf16_device_check();
|
||||
}
|
||||
|
||||
|
||||
static inline at::MemoryFormat mkldnn_convolution_memory_format(int64_t dims, bool is_channels_last) {
|
||||
auto memory_format = at::MemoryFormat::Contiguous;
|
||||
if (is_channels_last) {
|
||||
@ -163,7 +169,7 @@ static inline at::MemoryFormat mkldnn_convolution_memory_format(int64_t dims, bo
|
||||
return memory_format;
|
||||
}
|
||||
|
||||
static void _mkldnn_convolution_out (
|
||||
static void _mkldnn_convolution_out(
|
||||
const Tensor& input_t,
|
||||
const Tensor& weight_t,
|
||||
const Tensor& bias,
|
||||
@ -261,6 +267,10 @@ static Tensor _mkldnn_convolution(
|
||||
output.resize_(output_sizes, memory_format);
|
||||
y = itensor_from_tensor(output);
|
||||
}
|
||||
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
|
||||
input_t.scalar_type() == at::kFloat) {
|
||||
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
|
||||
}
|
||||
_mkldnn_convolution_out(
|
||||
input_t,
|
||||
weight_t,
|
||||
@ -442,6 +452,10 @@ Tensor mkldnn_convolution_pointwise_binary(
|
||||
op_attr.set_post_ops(po);
|
||||
auto aprop_kind = ideep::prop_kind::forward_inference;
|
||||
|
||||
if (mkldnn_conv_enabled_fpmath_mode_bf16() && input_t.scalar_type() ==at::kFloat){
|
||||
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
|
||||
}
|
||||
|
||||
if (bias.defined()) {
|
||||
const ideep::tensor b = itensor_from_tensor(bias);
|
||||
ideep::convolution_forward::compute_binary(
|
||||
@ -579,6 +593,10 @@ Tensor& mkldnn_convolution_pointwise_binary_(
|
||||
op_attr = ideep::attr_t::fuse_sum();
|
||||
}
|
||||
auto aprop_kind = ideep::prop_kind::forward_inference;
|
||||
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
|
||||
input_t.scalar_type() == at::kFloat) {
|
||||
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
|
||||
}
|
||||
_mkldnn_convolution_out(
|
||||
input_t,
|
||||
weight_t,
|
||||
@ -697,6 +715,10 @@ Tensor _mkldnn_convolution_transpose(
|
||||
y = itensor_from_tensor(output);
|
||||
}
|
||||
|
||||
if (mkldnn_conv_enabled_fpmath_mode_bf16() && input_t.scalar_type() ==at::kFloat){
|
||||
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
|
||||
}
|
||||
|
||||
if (bias.defined()) {
|
||||
const ideep::tensor b = itensor_from_tensor(bias, /*from_const_data_ptr*/true);
|
||||
ideep::convolution_transpose_forward::compute_v3(
|
||||
@ -781,6 +803,11 @@ Tensor mkldnn_convolution_backward_input(
|
||||
grad_input.resize_(input_size, memory_format);
|
||||
grad_x = itensor_from_tensor(grad_input);
|
||||
}
|
||||
ideep::attr_t op_attr = ideep::attr_t();
|
||||
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
|
||||
weight.scalar_type() == at::kFloat) {
|
||||
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
|
||||
}
|
||||
ideep::convolution_backward_data::compute_v2(
|
||||
grad_y,
|
||||
w,
|
||||
@ -791,7 +818,17 @@ Tensor mkldnn_convolution_backward_input(
|
||||
padding.vec(),
|
||||
padding.vec(),
|
||||
groups,
|
||||
#if IDEEP_PREREQ(3, 4, 1, 3)
|
||||
is_channels_last,
|
||||
op_attr);
|
||||
#else
|
||||
is_channels_last);
|
||||
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
|
||||
weight.scalar_type() == at::kFloat) {
|
||||
TORCH_WARN_ONCE(
|
||||
"Unexpected ideep version to support fpmath_mode_bf16, please update ideep version to align with pytorch main branch");
|
||||
}
|
||||
#endif
|
||||
|
||||
if (grad_output.is_mkldnn()) {
|
||||
return MKLDNNTensor(grad_x, grad_output.options());
|
||||
@ -816,6 +853,11 @@ std::tuple<Tensor, Tensor> mkldnn_convolution_backward_weights(
|
||||
const ideep::tensor x = itensor_from_tensor(input, /*from_const_data_ptr*/true);
|
||||
|
||||
ideep::tensor grad_w, grad_b;
|
||||
ideep::attr_t op_attr = ideep::attr_t();
|
||||
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
|
||||
input.scalar_type() == at::kFloat) {
|
||||
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
|
||||
}
|
||||
if (bias_defined) {
|
||||
ideep::convolution_backward_weights::compute_v2(
|
||||
x,
|
||||
@ -828,7 +870,8 @@ std::tuple<Tensor, Tensor> mkldnn_convolution_backward_weights(
|
||||
padding.vec(),
|
||||
padding.vec(),
|
||||
groups,
|
||||
is_channels_last);
|
||||
is_channels_last,
|
||||
op_attr);
|
||||
} else {
|
||||
ideep::convolution_backward_weights::compute_v2(
|
||||
x,
|
||||
@ -840,7 +883,8 @@ std::tuple<Tensor, Tensor> mkldnn_convolution_backward_weights(
|
||||
padding.vec(),
|
||||
padding.vec(),
|
||||
groups,
|
||||
is_channels_last);
|
||||
is_channels_last,
|
||||
op_attr);
|
||||
}
|
||||
|
||||
if (!is_channels_last) {
|
||||
@ -962,6 +1006,11 @@ Tensor mkldnn_convolution_transpose_backward_input(
|
||||
grad_input.resize_(input_size, memory_format);
|
||||
grad_x = itensor_from_tensor(grad_input);
|
||||
}
|
||||
ideep::attr_t op_attr = ideep::attr_t();
|
||||
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
|
||||
weight.scalar_type() == at::kFloat) {
|
||||
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
|
||||
}
|
||||
ideep::convolution_transpose_backward_data::compute_v3(
|
||||
grad_y,
|
||||
w,
|
||||
@ -972,7 +1021,8 @@ Tensor mkldnn_convolution_transpose_backward_input(
|
||||
padding_r(padding, output_padding),
|
||||
dilation.vec(),
|
||||
groups,
|
||||
is_channels_last);
|
||||
is_channels_last,
|
||||
op_attr);
|
||||
|
||||
if (grad_output.is_mkldnn()) {
|
||||
return MKLDNNTensor(grad_x, grad_output.options());
|
||||
@ -998,6 +1048,11 @@ std::tuple<Tensor,Tensor> mkldnn_convolution_transpose_backward_weights(
|
||||
auto x = itensor_from_tensor(input, /*from_const_data_ptr*/true);
|
||||
|
||||
ideep::tensor grad_w, grad_b;
|
||||
ideep::attr_t op_attr = ideep::attr_t();
|
||||
if (mkldnn_conv_enabled_fpmath_mode_bf16() &&
|
||||
input.scalar_type() == at::kFloat) {
|
||||
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
|
||||
}
|
||||
if (bias_defined) {
|
||||
ideep::convolution_transpose_backward_weights::compute_v3(
|
||||
x,
|
||||
@ -1010,7 +1065,8 @@ std::tuple<Tensor,Tensor> mkldnn_convolution_transpose_backward_weights(
|
||||
padding_r(padding, output_padding),
|
||||
dilation.vec(),
|
||||
groups,
|
||||
is_channels_last);
|
||||
is_channels_last,
|
||||
op_attr);
|
||||
} else {
|
||||
ideep::convolution_transpose_backward_weights::compute_v3(
|
||||
x,
|
||||
@ -1022,7 +1078,8 @@ std::tuple<Tensor,Tensor> mkldnn_convolution_transpose_backward_weights(
|
||||
padding_r(padding, output_padding),
|
||||
dilation.vec(),
|
||||
groups,
|
||||
is_channels_last);
|
||||
is_channels_last,
|
||||
op_attr);
|
||||
}
|
||||
|
||||
if (!is_channels_last) {
|
||||
|
||||
@ -68,6 +68,11 @@ mkldnn_scaled_mm(const Tensor& mat1, const Tensor& mat2,
|
||||
|
||||
namespace at::native {
|
||||
|
||||
static bool use_mkldnn_bf32_linear() {
|
||||
return at::globalContext().float32Precision("mkldnn", "matmul") == "bf16" &&
|
||||
mkldnn_bf16_device_check();
|
||||
}
|
||||
|
||||
Tensor mkldnn_linear(
|
||||
const Tensor& self,
|
||||
const Tensor& weight_t, const std::optional<Tensor>& bias_opt) {
|
||||
@ -251,7 +256,9 @@ Tensor mkldnn_linear_pointwise(
|
||||
it != fusion_unary_attr_map().end(), "Fusion behavior undefined.");
|
||||
op_attr = it->second(scalars, algorithm);
|
||||
}
|
||||
|
||||
if (use_mkldnn_bf32_linear() && input_t.scalar_type() == at::kFloat){
|
||||
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
|
||||
}
|
||||
if (mkldnn_bias.has_value()) {
|
||||
ideep::inner_product_forward::compute</*reorder_src=*/false, /*reorder_weight=*/false>(
|
||||
mkldnn_input,
|
||||
@ -341,6 +348,10 @@ Tensor mkldnn_linear_pointwise_binary(
|
||||
auto op_attr = ideep::attr_t::fuse_binary(it_binary->second, other_desc);
|
||||
auto aprop_kind = ideep::prop_kind::forward_inference;
|
||||
|
||||
if (use_mkldnn_bf32_linear() && input_t.scalar_type() == at::kFloat){
|
||||
op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16);
|
||||
}
|
||||
|
||||
if (mkldnn_bias.has_value()) {
|
||||
ideep::inner_product_forward::compute_binary</*reorder_src=*/false, /*reorder_weight=*/false>(
|
||||
mkldnn_input,
|
||||
|
||||
@ -134,6 +134,58 @@ struct chebyshev_polynomial_w_functor {
|
||||
}
|
||||
};
|
||||
|
||||
struct shifted_chebyshev_polynomial_t_functor {
|
||||
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
|
||||
inline T operator()(const T a, const T b) {
|
||||
return static_cast<T>(
|
||||
c10::metal::shifted_chebyshev_polynomial_t_forward(a, b));
|
||||
}
|
||||
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
|
||||
inline float operator()(const T a, const T b) {
|
||||
return c10::metal::shifted_chebyshev_polynomial_t_forward(
|
||||
float(a), float(b));
|
||||
}
|
||||
};
|
||||
|
||||
struct shifted_chebyshev_polynomial_u_functor {
|
||||
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
|
||||
inline T operator()(const T a, const T b) {
|
||||
return static_cast<T>(
|
||||
c10::metal::shifted_chebyshev_polynomial_u_forward(a, b));
|
||||
}
|
||||
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
|
||||
inline float operator()(const T a, const T b) {
|
||||
return c10::metal::shifted_chebyshev_polynomial_u_forward(
|
||||
float(a), float(b));
|
||||
}
|
||||
};
|
||||
|
||||
struct shifted_chebyshev_polynomial_v_functor {
|
||||
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
|
||||
inline T operator()(const T a, const T b) {
|
||||
return static_cast<T>(
|
||||
c10::metal::shifted_chebyshev_polynomial_v_forward(a, b));
|
||||
}
|
||||
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
|
||||
inline float operator()(const T a, const T b) {
|
||||
return c10::metal::shifted_chebyshev_polynomial_v_forward(
|
||||
float(a), float(b));
|
||||
}
|
||||
};
|
||||
|
||||
struct shifted_chebyshev_polynomial_w_functor {
|
||||
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
|
||||
inline T operator()(const T a, const T b) {
|
||||
return static_cast<T>(
|
||||
c10::metal::shifted_chebyshev_polynomial_w_forward(a, b));
|
||||
}
|
||||
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
|
||||
inline float operator()(const T a, const T b) {
|
||||
return c10::metal::shifted_chebyshev_polynomial_w_forward(
|
||||
float(a), float(b));
|
||||
}
|
||||
};
|
||||
|
||||
struct hermite_polynomial_h_functor {
|
||||
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
|
||||
inline T operator()(const T a, const T b) {
|
||||
@ -342,6 +394,14 @@ REGISTER_FLOAT_BINARY_OP(chebyshev_polynomial_v);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(chebyshev_polynomial_w);
|
||||
REGISTER_FLOAT_BINARY_OP(chebyshev_polynomial_w);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(chebyshev_polynomial_v);
|
||||
REGISTER_FLOAT_BINARY_OP(shifted_chebyshev_polynomial_t);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(shifted_chebyshev_polynomial_t);
|
||||
REGISTER_FLOAT_BINARY_OP(shifted_chebyshev_polynomial_u);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(shifted_chebyshev_polynomial_u);
|
||||
REGISTER_FLOAT_BINARY_OP(shifted_chebyshev_polynomial_v);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(shifted_chebyshev_polynomial_v);
|
||||
REGISTER_FLOAT_BINARY_OP(shifted_chebyshev_polynomial_w);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(shifted_chebyshev_polynomial_w);
|
||||
REGISTER_FLOAT_BINARY_OP(hermite_polynomial_h);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(hermite_polynomial_h);
|
||||
REGISTER_FLOAT_BINARY_OP(hermite_polynomial_he);
|
||||
|
||||
@@ -145,6 +145,28 @@ inline float blockReduceSum(
return sharedScratch[0];
}

template <bool col_major>
inline device float& get_ref(device float* A, uint row, uint col, uint N);

template <>
inline device float& get_ref<true>(
device float* A,
uint row,
uint col,
uint N) {
return A[row * N + col];
}

template <>
inline device float& get_ref<false>(
device float* A,
uint row,
uint col,
uint N) {
return A[row + col * N];
}

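get_ref<> above lets the upper- and lower-triangular Cholesky kernels share one code path by switching how the N x N tile is addressed; a host-side C++ model of the two specializations (illustrative only):

// Host-side model of the Metal get_ref<> specializations above.
template <bool col_major>
inline float& get_ref_model(float* A, unsigned row, unsigned col, unsigned N) {
  // get_ref<true> indexes A[row * N + col]; get_ref<false> indexes A[row + col * N].
  return col_major ? A[row * N + col] : A[row + col * N];
}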
template <bool upper>
|
||||
kernel void factorDiagonalBlock(
|
||||
device float* A [[buffer(0)]],
|
||||
device int* info [[buffer(1)]],
|
||||
@ -171,7 +193,7 @@ kernel void factorDiagonalBlock(
|
||||
for (uint i = linear_tid; i < tileSize; i += group_size) {
|
||||
uint r = i / actSize;
|
||||
uint c = i % actSize;
|
||||
tile[r][c] = A[batch_offset + (row0 + r) * N + (col0 + c)];
|
||||
tile[r][c] = get_ref<upper>(A + batch_offset, row0 + r, col0 + c, N);
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
@ -244,10 +266,33 @@ kernel void factorDiagonalBlock(
|
||||
for (uint i = linear_tid; i < tileSize; i += group_size) {
|
||||
uint r = i / actSize;
|
||||
uint c = i % actSize;
|
||||
A[batch_offset + (row0 + r) * N + (col0 + c)] = tile[r][c];
|
||||
get_ref<upper>(A + batch_offset, row0 + r, col0 + c, N) = tile[r][c];
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("factorDiagonalBlockU")]]
|
||||
kernel void factorDiagonalBlock<true>(
|
||||
device float* A [[buffer(0)]],
|
||||
device int* info [[buffer(1)]],
|
||||
constant uint& N [[buffer(2)]],
|
||||
constant uint& NB [[buffer(3)]],
|
||||
constant uint& k [[buffer(4)]],
|
||||
uint3 tid [[thread_position_in_threadgroup]],
|
||||
uint3 bid [[threadgroup_position_in_grid]],
|
||||
uint3 tpg [[threads_per_threadgroup]]);
|
||||
|
||||
template [[host_name("factorDiagonalBlockL")]]
|
||||
kernel void factorDiagonalBlock<false>(
|
||||
device float* A [[buffer(0)]],
|
||||
device int* info [[buffer(1)]],
|
||||
constant uint& N [[buffer(2)]],
|
||||
constant uint& NB [[buffer(3)]],
|
||||
constant uint& k [[buffer(4)]],
|
||||
uint3 tid [[thread_position_in_threadgroup]],
|
||||
uint3 bid [[threadgroup_position_in_grid]],
|
||||
uint3 tpg [[threads_per_threadgroup]]);
|
||||
|
||||
template <bool upper>
|
||||
kernel void applyTRSM(
|
||||
device float* A [[buffer(0)]],
|
||||
constant uint& N [[buffer(2)]],
|
||||
@ -283,12 +328,12 @@ kernel void applyTRSM(
|
||||
for (uint i = linear_tid; i < actSize_k * actSize_k; i += group_size) {
|
||||
uint r = i / actSize_k;
|
||||
uint c = i % actSize_k;
|
||||
diag[i] = A[batch_offset + (k * NB + r) * N + (k * NB + c)];
|
||||
diag[i] = get_ref<upper>(A + batch_offset, k * NB + r, k * NB + c, N);
|
||||
}
|
||||
for (uint i = linear_tid; i < actSize_j * actSize_k; i += group_size) {
|
||||
uint r = i / actSize_k;
|
||||
uint c = i % actSize_k;
|
||||
target[i] = A[batch_offset + (row0 + r) * N + (col0 + c)];
|
||||
target[i] = get_ref<upper>(A + batch_offset, row0 + r, col0 + c, N);
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
@ -332,10 +377,31 @@ kernel void applyTRSM(
|
||||
for (uint i = linear_tid; i < actSize_j * actSize_k; i += group_size) {
|
||||
uint r = i / actSize_k;
|
||||
uint c = i % actSize_k;
|
||||
A[batch_offset + (row0 + r) * N + (col0 + c)] = target[i];
|
||||
get_ref<upper>(A + batch_offset, row0 + r, col0 + c, N) = target[i];
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("applyTRSMU")]]
|
||||
kernel void applyTRSM<true>(
|
||||
device float* A [[buffer(0)]],
|
||||
constant uint& N [[buffer(2)]],
|
||||
constant uint& NB [[buffer(3)]],
|
||||
constant uint& k [[buffer(4)]],
|
||||
uint3 tid [[thread_position_in_threadgroup]],
|
||||
uint3 tgid [[threadgroup_position_in_grid]],
|
||||
uint3 tpg [[threads_per_threadgroup]]);
|
||||
|
||||
template [[host_name("applyTRSML")]]
|
||||
kernel void applyTRSM<false>(
|
||||
device float* A [[buffer(0)]],
|
||||
constant uint& N [[buffer(2)]],
|
||||
constant uint& NB [[buffer(3)]],
|
||||
constant uint& k [[buffer(4)]],
|
||||
uint3 tid [[thread_position_in_threadgroup]],
|
||||
uint3 tgid [[threadgroup_position_in_grid]],
|
||||
uint3 tpg [[threads_per_threadgroup]]);
|
||||
|
||||
template <bool upper>
|
||||
kernel void applySYRK(
|
||||
device float* A [[buffer(0)]],
|
||||
constant uint& N [[buffer(2)]],
|
||||
@ -403,17 +469,25 @@ kernel void applySYRK(
|
||||
// Same logic to load/store Cfrag, Afrag, Bfrag...
|
||||
simdgroup_matrix<float, 8, 8> Cfrag;
|
||||
simdgroup_load(
|
||||
Cfrag, &A[batch_offset + (row0 + sb_y) * N + (col0 + sb_x)], N);
|
||||
Cfrag,
|
||||
&get_ref<upper>(A + batch_offset, row0 + sb_y, col0 + sb_x, N),
|
||||
N,
|
||||
0,
|
||||
!upper);
|
||||
|
||||
for (uint kk = 0; kk < actSize_k; kk += 8) {
|
||||
simdgroup_load(
|
||||
Afrag, &A[batch_offset + (row0 + sb_y) * N + (k * NB + kk)], N);
|
||||
Afrag,
|
||||
&get_ref<upper>(A + batch_offset, row0 + sb_y, k * NB + kk, N),
|
||||
N,
|
||||
0,
|
||||
!upper);
|
||||
simdgroup_load(
|
||||
Bfrag,
|
||||
&A[batch_offset + (col0 + sb_x) * N + (k * NB + kk)],
|
||||
&get_ref<upper>(A + batch_offset, col0 + sb_x, k * NB + kk, N),
|
||||
N,
|
||||
/* matrix_origin = */ 0,
|
||||
/* transpose = */ true);
|
||||
/* transpose = */ upper);
|
||||
|
||||
simdgroup_multiply(Prod, Afrag, Bfrag);
|
||||
simdgroup_multiply(Prod, Prod, negative_identity);
|
||||
@ -421,7 +495,11 @@ kernel void applySYRK(
|
||||
}
|
||||
|
||||
simdgroup_store(
|
||||
Cfrag, &A[batch_offset + (row0 + sb_y) * N + (col0 + sb_x)], N);
|
||||
Cfrag,
|
||||
&get_ref<upper>(A + batch_offset, row0 + sb_y, col0 + sb_x, N),
|
||||
N,
|
||||
0,
|
||||
!upper);
|
||||
}
|
||||
} else {
|
||||
// Fallback for non-multiple-of-8 dimensions
|
||||
@ -442,8 +520,10 @@ kernel void applySYRK(
|
||||
|
||||
float sum = 0.0f;
|
||||
for (uint i = 0; i < actSize_k; i++) {
|
||||
float a_val = A[batch_offset + (row0 + y) * N + k * NB + i];
|
||||
float b_val = A[batch_offset + (col0 + x) * N + k * NB + i];
|
||||
float a_val =
|
||||
get_ref<upper>(A + batch_offset, row0 + y, k * NB + i, N);
|
||||
float b_val =
|
||||
get_ref<upper>(A + batch_offset, col0 + x, k * NB + i, N);
|
||||
sum = fma(a_val, b_val, sum);
|
||||
}
|
||||
sum_accumulator[y * tpg.x + x] += sum;
|
||||
@ -452,13 +532,35 @@ kernel void applySYRK(
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
for (uint y = ty; y < actSize_j; y += tpg.y) {
|
||||
for (uint x = tx; x < actSize_h; x += tpg.x) {
|
||||
A[batch_offset + (row0 + y) * N + col0 + x] -=
|
||||
get_ref<upper>(A + batch_offset, row0 + y, col0 + x, N) -=
|
||||
sum_accumulator[y * tpg.x + x];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("applySYRKU")]]
|
||||
kernel void applySYRK<true>(
|
||||
device float* A [[buffer(0)]],
|
||||
constant uint& N [[buffer(2)]],
|
||||
constant uint& NB [[buffer(3)]],
|
||||
constant uint& k [[buffer(4)]],
|
||||
uint3 tid [[thread_position_in_threadgroup]],
|
||||
uint3 tgid [[threadgroup_position_in_grid]],
|
||||
uint3 tpg [[threads_per_threadgroup]],
|
||||
uint sgitg [[simdgroup_index_in_threadgroup]]);
|
||||
|
||||
template [[host_name("applySYRKL")]]
|
||||
kernel void applySYRK<false>(
|
||||
device float* A [[buffer(0)]],
|
||||
constant uint& N [[buffer(2)]],
|
||||
constant uint& NB [[buffer(3)]],
|
||||
constant uint& k [[buffer(4)]],
|
||||
uint3 tid [[thread_position_in_threadgroup]],
|
||||
uint3 tgid [[threadgroup_position_in_grid]],
|
||||
uint3 tpg [[threads_per_threadgroup]],
|
||||
uint sgitg [[simdgroup_index_in_threadgroup]]);
|
||||
|
||||
kernel void applyPivots(
|
||||
device float* P [[buffer(0)]],
|
||||
device const int* pivots [[buffer(1)]],
|
||||
|
||||
@ -27,3 +27,14 @@ struct PoolingParams {
|
||||
_ARRAY_NS::array<int64_t, N - 2> padding;
|
||||
_ARRAY_NS::array<int64_t, N - 2> dilation;
|
||||
};
|
||||
|
||||
template <unsigned N = 5>
|
||||
struct PoolingBackwardParams {
|
||||
int32_t dims;
|
||||
int32_t pooling_dims;
|
||||
_ARRAY_NS::array<int64_t, N> grad_input_sizes;
|
||||
_ARRAY_NS::array<int64_t, N> grad_input_strides;
|
||||
_ARRAY_NS::array<int64_t, N> grad_output_sizes;
|
||||
_ARRAY_NS::array<int64_t, N> grad_output_strides;
|
||||
_ARRAY_NS::array<int64_t, N> indices_strides;
|
||||
};
|
||||
|
||||
@ -1,7 +1,10 @@
|
||||
#include <ATen/native/mps/kernels/Pooling.h>
|
||||
#include <c10/metal/atomic.h>
|
||||
#include <metal_array>
|
||||
#include <metal_stdlib>
|
||||
|
||||
using namespace metal;
|
||||
using namespace c10::metal;
|
||||
|
||||
// Iterates through all the input elements that this kernel needs to
|
||||
// apply max to. Specialized for 3 pooling dimensions.
|
||||
@ -83,6 +86,50 @@ void max_pool_3d_input_iter(
|
||||
*indices = max_index;
|
||||
}
|
||||
|
||||
struct PoolOffsets {
|
||||
int64_t output;
|
||||
int64_t indices;
|
||||
int64_t input_leading;
|
||||
|
||||
PoolOffsets() : output(0), indices(0), input_leading(0) {}
|
||||
};
|
||||
|
||||
// Finds the offset of the output element that a forward pass thread will
|
||||
// calculate, `output[N, C, d, h, w]`. Also, find the offset of the input for
|
||||
// the leading dim indices, `input[N, C]`. Optionally, keep track of the output
|
||||
// pooling dimension indices, `[d, h , w]`.
|
||||
PoolOffsets find_pool_offsets(
|
||||
constant int64_t* output_sizes,
|
||||
constant int64_t* output_strides,
|
||||
constant int64_t* indices_strides,
|
||||
constant int64_t* input_strides,
|
||||
device int64_t* work_pooling_dim_indices,
|
||||
int32_t dims,
|
||||
int32_t leading_dims,
|
||||
uint tid) {
|
||||
int64_t output_idx = static_cast<int64_t>(tid);
|
||||
PoolOffsets offsets;
|
||||
|
||||
for (int64_t dim = dims - 1; dim >= 0; dim--) {
|
||||
int64_t dim_idx = output_idx % (output_sizes[dim]);
|
||||
offsets.output += output_strides[dim] * dim_idx;
|
||||
offsets.indices += indices_strides[dim] * dim_idx;
|
||||
|
||||
if (dim < leading_dims) {
|
||||
offsets.input_leading += input_strides[dim] * dim_idx;
|
||||
} else {
|
||||
// Keep track of pooling dimension indices of the output element, so we
|
||||
// can use them in the input iteration later on.
|
||||
if (work_pooling_dim_indices != nullptr) {
|
||||
work_pooling_dim_indices[dim - leading_dims] = dim_idx;
|
||||
}
|
||||
}
|
||||
output_idx = output_idx / output_sizes[dim];
|
||||
}
|
||||
|
||||
return offsets;
|
||||
}
|
||||
|
||||
// Kernel computes one element of the output per kernel call.
|
||||
template <typename T>
|
||||
kernel void max_pool(
|
||||
@ -113,32 +160,20 @@ kernel void max_pool(
|
||||
// element of the output. We need to fill it with the proper values below.
|
||||
device int64_t* work_pooling_dim_indices =
|
||||
work_pooling_dim_indices_ + tid * pooling_dims;
|
||||
int64_t output_idx = static_cast<int64_t>(tid);
|
||||
int64_t output_offset = 0;
|
||||
int64_t indices_offset = 0;
|
||||
int64_t input_leading_offset = 0;
|
||||
|
||||
// First, find the offset of the output element this thread will calculate,
|
||||
// `output[N, C, d, h, w]`. Also, find the offset of the input for the leading
|
||||
// dim indices, `input[N, C]` and keep track of the pooling dimension indices,
|
||||
// `[d, h , w]`.
|
||||
for (int64_t dim = dims - 1; dim >= 0; dim--) {
|
||||
int64_t dim_idx = output_idx % (output_sizes[dim]);
|
||||
output_offset += output_strides[dim] * dim_idx;
|
||||
indices_offset += indices_strides[dim] * dim_idx;
|
||||
PoolOffsets offsets = find_pool_offsets(
|
||||
output_sizes,
|
||||
output_strides,
|
||||
indices_strides,
|
||||
input_strides,
|
||||
work_pooling_dim_indices,
|
||||
dims,
|
||||
leading_dims,
|
||||
tid);
|
||||
|
||||
if (dim < leading_dims) {
|
||||
input_leading_offset += input_strides[dim] * dim_idx;
|
||||
} else {
|
||||
// Keep track of pooling dimension indices of the output element, so we
|
||||
// can use them in the input iteration later on.
|
||||
work_pooling_dim_indices[dim - leading_dims] = dim_idx;
|
||||
}
|
||||
output_idx = output_idx / output_sizes[dim];
|
||||
}
|
||||
output += output_offset;
|
||||
indices += indices_offset;
|
||||
input += input_leading_offset;
|
||||
output += offsets.output;
|
||||
indices += offsets.indices;
|
||||
input += offsets.input_leading;
|
||||
|
||||
max_pool_3d_input_iter<T>(
|
||||
input,
|
||||
@ -153,6 +188,69 @@ kernel void max_pool(
|
||||
dilation);
|
||||
}
|
||||
|
||||
// Finds the element in the grad input which corresponds to the index into the
// pool, and then adds the grad output element to it.
template <typename T>
void max_pool_backward_impl(
device AtomicType_t<T>* grad_input,
T grad_output_element,
int32_t input_index,
constant int64_t* grad_input_sizes,
constant int64_t* grad_input_strides,
int32_t grad_input_leading_offset,
int32_t pooling_dims) {
int32_t size_prod = 1;
int32_t pool_offset = 0;

for (int32_t dim = pooling_dims - 1; dim >= 0; dim--) {
int32_t next_size_prod = grad_input_sizes[dim] * size_prod;
pool_offset +=
grad_input_strides[dim] * ((input_index % next_size_prod) / size_prod);
size_prod *= grad_input_sizes[dim];
}

AtomicType<T>::atomic_add(
grad_input, grad_input_leading_offset + pool_offset, grad_output_element);
}

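max_pool_backward_impl above unflattens the pool index saved by the forward pass into per-dimension coordinates and turns them into a strided offset before the atomic add; a host-side model of that unflattening (illustrative):

// Host-side model of the index unflattening in max_pool_backward_impl.
#include <cstdint>

int64_t strided_offset_from_flat_index(int64_t flat_index,
                                       const int64_t* sizes,
                                       const int64_t* strides,
                                       int32_t ndim) {
  int64_t size_prod = 1;
  int64_t offset = 0;
  // Walk dimensions from innermost to outermost, peeling off one coordinate
  // per step: coord = (flat_index % (size * size_prod)) / size_prod.
  for (int32_t dim = ndim - 1; dim >= 0; dim--) {
    int64_t next_size_prod = sizes[dim] * size_prod;
    offset += strides[dim] * ((flat_index % next_size_prod) / size_prod);
    size_prod *= sizes[dim];
  }
  return offset;
}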
// Kernel computes one element of the grad input per kernel call.
|
||||
template <typename T>
|
||||
kernel void max_pool_backward(
|
||||
device AtomicType_t<T>* grad_input [[buffer(0)]],
|
||||
constant T* grad_output [[buffer(1)]],
|
||||
constant int64_t* indices [[buffer(2)]],
|
||||
constant PoolingBackwardParams<5>& params [[buffer(3)]],
|
||||
uint tid [[thread_position_in_grid]]) {
|
||||
int32_t pooling_dims = params.pooling_dims;
|
||||
int32_t dims = params.dims;
|
||||
constant int64_t* grad_input_sizes = params.grad_input_sizes.data();
|
||||
constant int64_t* grad_input_strides = params.grad_input_strides.data();
|
||||
constant int64_t* grad_output_sizes = params.grad_output_sizes.data();
|
||||
constant int64_t* grad_output_strides = params.grad_output_strides.data();
|
||||
constant int64_t* indices_strides = params.indices_strides.data();
|
||||
|
||||
int32_t leading_dims = dims - pooling_dims;
|
||||
|
||||
PoolOffsets offsets = find_pool_offsets(
|
||||
grad_output_sizes,
|
||||
grad_output_strides,
|
||||
indices_strides,
|
||||
grad_input_strides,
|
||||
nullptr,
|
||||
dims,
|
||||
leading_dims,
|
||||
tid);
|
||||
|
||||
max_pool_backward_impl<T>(
|
||||
grad_input,
|
||||
grad_output[offsets.output],
|
||||
indices[offsets.indices],
|
||||
grad_input_sizes + leading_dims,
|
||||
grad_input_strides + leading_dims,
|
||||
offsets.input_leading,
|
||||
pooling_dims);
|
||||
}
|
||||
|
||||
#define REGISTER_MAX_POOL_OP(DTYPE) \
|
||||
template [[host_name("max_pool_" #DTYPE)]] kernel void max_pool<DTYPE>( \
|
||||
constant void* input_ [[buffer(0)]], \
|
||||
@ -162,6 +260,15 @@ kernel void max_pool(
|
||||
constant PoolingParams<5>& params [[buffer(4)]], \
|
||||
uint tid [[thread_position_in_grid]]);
|
||||
|
||||
#define REGISTER_MAX_POOL_BACKWARD_OP(DTYPE) \
|
||||
template [[host_name("max_pool_backward_" #DTYPE)]] \
|
||||
kernel void max_pool_backward<DTYPE>( \
|
||||
device AtomicType_t<DTYPE> * grad_input [[buffer(0)]], \
|
||||
constant DTYPE * grad_output_ [[buffer(1)]], \
|
||||
constant int64_t* grad_indices_ [[buffer(2)]], \
|
||||
constant PoolingBackwardParams<5>& params [[buffer(3)]], \
|
||||
uint tid [[thread_position_in_grid]]);
|
||||
|
||||
REGISTER_MAX_POOL_OP(float);
|
||||
REGISTER_MAX_POOL_OP(half);
|
||||
REGISTER_MAX_POOL_OP(int);
|
||||
@ -170,6 +277,11 @@ REGISTER_MAX_POOL_OP(short);
|
||||
REGISTER_MAX_POOL_OP(char);
|
||||
REGISTER_MAX_POOL_OP(uchar);
|
||||
REGISTER_MAX_POOL_OP(bool);
|
||||
|
||||
REGISTER_MAX_POOL_BACKWARD_OP(float);
|
||||
REGISTER_MAX_POOL_BACKWARD_OP(half);
|
||||
|
||||
#if __METAL_VERSION__ >= 310
|
||||
REGISTER_MAX_POOL_OP(bfloat);
|
||||
REGISTER_MAX_POOL_BACKWARD_OP(bfloat);
|
||||
#endif
|
||||
|
||||
@@ -7,6 +7,24 @@ using namespace metal;

using c10::metal::accum_t;

struct LogAddExp {
template <typename T>
T operator()(T x, T y) {
// Reference:
// https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp
T min_val = c10::metal::min(x, y);
T max_val = c10::metal::max(x, y);

if (min_val != max_val || metal::isfinite(min_val)) {
// nan will be propagated here
return c10::metal::log1p(metal::exp(min_val - max_val)) + max_val;
} else {
// special case to correctly handle infinite cases
return x;
}
};
};

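LogAddExp evaluates log(exp(x) + exp(y)) in the stable max + log1p(exp(min - max)) form and short-circuits the equal-infinities case so, for example, two -infinity inputs stay -infinity rather than becoming NaN. A host-side C++ model (illustrative):

// Host-side model of the Metal LogAddExp functor above.
#include <algorithm>
#include <cmath>

float log_add_exp_model(float x, float y) {
  float min_val = std::min(x, y);
  float max_val = std::max(x, y);
  if (min_val != max_val || std::isfinite(min_val)) {
    // Stable form: log(exp(x) + exp(y)) = max + log1p(exp(min - max)).
    return std::log1p(std::exp(min_val - max_val)) + max_val;
  }
  // Both operands are the same infinity; return it unchanged to avoid NaN.
  return x;
}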
#if __METAL_VERSION__ < 310
|
||||
template <typename T, typename acc_t = accum_t<T>>
|
||||
struct CumMinOp {
|
||||
@ -32,6 +50,16 @@ struct CumMaxOp {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename acc_t = accum_t<T>>
|
||||
struct LogCumSumExpOp {
|
||||
static acc_t apply(acc_t x, acc_t y) {
|
||||
return LogAddExp{}(x, y);
|
||||
}
|
||||
static acc_t identity() {
|
||||
return -metal::numeric_limits<acc_t>::infinity();
|
||||
}
|
||||
};
|
||||
|
||||
// Inclusive scan along innermost dimension for contiguous tensors
|
||||
template <typename T, typename Op, typename acc_t = accum_t<T>>
|
||||
kernel void scan_contiguous_innermost_dim(
|
||||
@ -345,6 +373,10 @@ kernel void scan_with_indices_strided(
|
||||
constant uint& scan_dim [[buffer(8)]], \
|
||||
uint thread_index [[thread_position_in_grid]]);
|
||||
|
||||
// Simple scan operations
|
||||
REGISTER_SCAN_OP(logcumsumexp, LogCumSumExpOp, float);
|
||||
REGISTER_SCAN_OP(logcumsumexp, LogCumSumExpOp, half);
|
||||
|
||||
// Scan operations with indices
|
||||
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, float);
|
||||
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, half);
|
||||
@@ -438,6 +470,30 @@ inline bool simd_shuffle(bool data, uint16_t lane) {
    return simd_shuffle_and_fill_up(val, init, 1); \
  }

template <typename T, typename acc_t = accum_t<T>>
struct LogCumSumExpOp {
  static constexpr constant acc_t init = static_cast<acc_t>(
      metal::is_floating_point_v<T> ? -metal::numeric_limits<T>::infinity()
                                    : metal::numeric_limits<T>::lowest());

  acc_t operator()(acc_t a, acc_t b) {
    return LogAddExp{}(a, b);
  }

  acc_t simd_scan(acc_t x) {
    for (int i = 1; i <= 16; i *= 2) {
      acc_t other = simd_shuffle_and_fill_up(x, init, i);
      x = LogAddExp{}(x, other);
    }
    return x;
  }

  acc_t simd_exclusive_scan(acc_t x) {
    x = simd_scan(x);
    return simd_shuffle_and_fill_up(x, init, 1);
  }
};

// Pair structure to hold value and index for cummin/cummax operations
template <typename T, typename acc_t = accum_t<T>>
struct ValueIndexPair {
@@ -642,6 +698,203 @@ inline T ceildiv(T N, U M) {
  return (N + M - 1) / M;
}

// Inclusive scan along innermost dimension for contiguous tensors
template <typename T, typename Op, int N_READS, typename acc_t = accum_t<T>>
kernel void scan_innermost_dim(
    const device T* in [[buffer(0)]],
    device T* out [[buffer(1)]],
    const constant size_t& axis_size [[buffer(2)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 lsize [[threads_per_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  constexpr int simd_size = 32;
  Op op;

  // Position the pointers
  size_t offset = (gid.y + gsize.y * size_t(gid.z)) * axis_size;
  in += offset;
  out += offset;

  // Compute the number of simd_groups
  uint simd_groups = lsize.x / simd_size;

  // Allocate memory
  acc_t prefix = Op::init;
  acc_t values[N_READS];
  threadgroup acc_t simdgroup_sums[32];

  // Loop over the reduced axis in blocks of size ceildiv(axis_size,
  // N_READS*lsize)
  //   Read block
  //   Compute inclusive scan of the block
  //     Compute inclusive scan per thread
  //     Compute exclusive scan of thread sums in simdgroup
  //     Write simdgroup sums in SM
  //     Compute exclusive scan of simdgroup sums
  //     Compute the output by scanning prefix, prev_simdgroup, prev_thread,
  //     value
  //   Write block

  for (uint r = 0; r < ceildiv(axis_size, N_READS * lsize.x); r++) {
    // Compute the block offset
    uint offset = r * lsize.x * N_READS + lid.x * N_READS;

    // Read the values
    if ((offset + N_READS) < axis_size) {
      load_unsafe<T, N_READS>(values, in + offset);
    } else {
      load_safe<T, N_READS>(values, in + offset, offset, axis_size, Op::init);
    }

    // Compute an inclusive scan per thread
    for (int i = 1; i < N_READS; i++) {
      values[i] = op(values[i], values[i - 1]);
    }

    // Compute exclusive scan of thread sums
    acc_t prev_thread = op.simd_exclusive_scan(values[N_READS - 1]);

    // Write simdgroup_sums to SM
    if (simd_lane_id == simd_size - 1) {
      simdgroup_sums[simd_group_id] = op(prev_thread, values[N_READS - 1]);
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Compute exclusive scan of simdgroup_sums
    if (simd_group_id == 0) {
      acc_t prev_simdgroup =
          op.simd_exclusive_scan(simdgroup_sums[simd_lane_id]);
      simdgroup_sums[simd_lane_id] = prev_simdgroup;
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Compute the output
    for (int i = 0; i < N_READS; i++) {
      values[i] = op(values[i], prefix);
      values[i] = op(values[i], simdgroup_sums[simd_group_id]);
      values[i] = op(values[i], prev_thread);
    }

    // Write the values
    if ((offset + N_READS) < axis_size) {
      write_unsafe<T, N_READS>(values, out + offset);
    } else {
      write_safe<T, N_READS>(values, out + offset, offset, axis_size);
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Share the prefix
    if (simd_group_id == simd_groups - 1 && simd_lane_id == simd_size - 1) {
      simdgroup_sums[0] = values[N_READS - 1];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    prefix = simdgroup_sums[0];
  }
}

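The kernel above follows the usual hierarchical inclusive-scan pattern: each thread scans its N_READS values, simdgroups combine the per-thread totals with an exclusive scan, threadgroup memory combines the simdgroup totals, and a running prefix carries the result across successive blocks of the axis. A serial C++ sketch of the same block-plus-carried-prefix structure (illustrative only; the names here are made up and are not the kernel's API):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Serial model of a blocked inclusive scan with a carried prefix, analogous
    // to what the Metal kernel does across iterations of its outer `r` loop.
    template <typename Op, typename T>
    void blocked_inclusive_scan(const std::vector<T>& in, std::vector<T>& out,
                                size_t block_size, Op op, T init) {
      T prefix = init;
      for (size_t start = 0; start < in.size(); start += block_size) {
        size_t end = std::min(in.size(), start + block_size);
        T running = prefix;
        for (size_t i = start; i < end; ++i) {  // scan one block
          running = op(running, in[i]);
          out[i] = running;
        }
        prefix = running;                       // carry the block total forward
      }
    }

    int main() {
      std::vector<int> in{1, 2, 3, 4, 5}, out(5);
      blocked_inclusive_scan(in, out, 2, [](int a, int b) { return a + b; }, 0);
      for (int v : out) std::printf("%d ", v);  // 1 3 6 10 15
      return 0;
    }
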
// Inclusive scan along outer dimension for contiguous tensors
template <typename T, typename Op, int N_READS, typename acc_t = accum_t<T>>
kernel void scan_outer_dim(
    const device T* in [[buffer(0)]],
    device T* out [[buffer(1)]],
    const constant size_t& axis_size [[buffer(2)]],
    const constant size_t& stride [[buffer(3)]],
    const constant size_t& stride_blocks [[buffer(4)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  constexpr int simd_size = 32;
  constexpr int BM = 32;
  constexpr int BN = 32;
  constexpr int BN_pad = 32 + 16 / sizeof(T);
  constexpr int n_simds = BN / N_READS;
  constexpr int n_scans = BN / n_simds;
  Op op;

  threadgroup acc_t read_buffer[BM * BN_pad];
  acc_t values[n_scans];
  acc_t prefix[n_scans];
  for (int i = 0; i < n_scans; i++) {
    prefix[i] = Op::init;
  }

  // Compute offsets
  size_t full_gid = gid.y + gsize.y * size_t(gid.z);
  size_t offset = full_gid / stride_blocks * axis_size * stride;
  size_t global_index_x = full_gid % stride_blocks * BN;
  uint read_offset_y = (lid.x * N_READS) / BN;
  uint read_offset_x = (lid.x * N_READS) % BN;
  uint scan_offset_y = simd_lane_id;
  uint scan_offset_x = simd_group_id * n_scans;

  uint stride_limit = stride - global_index_x;
  in += offset + global_index_x + read_offset_x;
  out += offset + global_index_x + read_offset_x;
  threadgroup acc_t* read_into =
      read_buffer + read_offset_y * BN_pad + read_offset_x;
  threadgroup acc_t* read_from =
      read_buffer + scan_offset_y * BN_pad + scan_offset_x;

  for (uint j = 0; j < axis_size; j += BM) {
    // Calculate the indices for the current thread
    uint index_y = j + read_offset_y;
    uint check_index_y = index_y;

    // Read into shared memory with type conversion
    if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
      for (int i = 0; i < N_READS; i++) {
        read_into[i] = static_cast<acc_t>(in[index_y * stride + i]);
      }
    } else {
      for (int i = 0; i < N_READS; i++) {
        if (check_index_y < axis_size && (read_offset_x + i) < stride_limit) {
          read_into[i] = static_cast<acc_t>(in[index_y * stride + i]);
        } else {
          read_into[i] = Op::init;
        }
      }
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Read strided into registers
    for (int i = 0; i < n_scans; i++) {
      values[i] = read_from[i];
    }
    simdgroup_barrier(mem_flags::mem_threadgroup);

    // Perform the scan
    for (int i = 0; i < n_scans; i++) {
      values[i] = op.simd_scan(values[i]);
      values[i] = op(values[i], prefix[i]);
      prefix[i] = simd_shuffle(values[i], simd_size - 1);
    }

    // Write to shared memory
    for (int i = 0; i < n_scans; i++) {
      read_from[i] = values[i];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);

    // Write to device memory with type conversion
    if (check_index_y < axis_size && (read_offset_x + N_READS) < stride_limit) {
      for (int i = 0; i < N_READS; i++) {
        out[index_y * stride + i] = static_cast<T>(read_into[i]);
      }
    } else {
      for (int i = 0; i < N_READS; i++) {
        if (check_index_y < axis_size && (read_offset_x + i) < stride_limit) {
          out[index_y * stride + i] = static_cast<T>(read_into[i]);
        }
      }
    }
  }
}

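As a worked example of the tile constants above, assuming N_READS = 4 (the value the registrations below use for these dtypes): n_simds = BN / N_READS = 8 simdgroups per threadgroup and n_scans = BN / n_simds = 4, so each 8 x 32-thread threadgroup covers a 32-column tile of the stride dimension and advances BM = 32 rows of the scanned axis per iteration of the j loop.
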
template <typename T, typename Op, int N_READS, typename acc_t = accum_t<T>>
kernel void scan_with_indices_innermost_dim(
    const device T* in [[buffer(0)]],
@@ -829,6 +1082,32 @@ kernel void scan_with_indices_outer_dim(
  }
}

#define REGISTER_SCAN_OP(OP_NAME, OP_CLASS, DTYPE, NREADS)               \
  template [[host_name(#OP_NAME "_innermost_" #DTYPE)]] [[kernel]] void  \
  scan_innermost_dim<DTYPE, OP_CLASS<DTYPE>, NREADS>(                    \
      const device DTYPE* in [[buffer(0)]],                              \
      device DTYPE* out [[buffer(1)]],                                   \
      const constant size_t& axis_size [[buffer(2)]],                    \
      uint3 gid [[threadgroup_position_in_grid]],                        \
      uint3 gsize [[threadgroups_per_grid]],                             \
      uint3 lid [[thread_position_in_threadgroup]],                      \
      uint3 lsize [[threads_per_threadgroup]],                           \
      uint simd_lane_id [[thread_index_in_simdgroup]],                   \
      uint simd_group_id [[simdgroup_index_in_threadgroup]]);            \
                                                                         \
  template [[host_name(#OP_NAME "_outer_" #DTYPE)]] [[kernel]] void      \
  scan_outer_dim<DTYPE, OP_CLASS<DTYPE>, NREADS>(                        \
      const device DTYPE* in [[buffer(0)]],                              \
      device DTYPE* out [[buffer(1)]],                                   \
      const constant size_t& axis_size [[buffer(2)]],                    \
      const constant size_t& stride [[buffer(3)]],                       \
      const constant size_t& stride_blocks [[buffer(4)]],                \
      uint3 gid [[threadgroup_position_in_grid]],                        \
      uint3 gsize [[threadgroups_per_grid]],                             \
      uint3 lid [[thread_position_in_threadgroup]],                      \
      uint simd_lane_id [[thread_index_in_simdgroup]],                   \
      uint simd_group_id [[simdgroup_index_in_threadgroup]])

#define REGISTER_SCAN_WITH_INDICES_OP(OP_NAME, OP_CLASS, DTYPE, NREADS)  \
  template [[host_name(#OP_NAME "_innermost_" #DTYPE)]] [[kernel]] void  \
  scan_with_indices_innermost_dim<DTYPE, OP_CLASS<DTYPE>, NREADS>(       \
@@ -857,6 +1136,11 @@ kernel void scan_with_indices_outer_dim(
      uint simd_lane_id [[thread_index_in_simdgroup]],                   \
      uint simd_group_id [[simdgroup_index_in_threadgroup]])

// Simple scan operations
REGISTER_SCAN_OP(logcumsumexp, LogCumSumExpOp, float, 4);
REGISTER_SCAN_OP(logcumsumexp, LogCumSumExpOp, half, 4);
REGISTER_SCAN_OP(logcumsumexp, LogCumSumExpOp, bfloat, 4);

// Scan with indices operations for cummin/cummax
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, float, 4);
REGISTER_SCAN_WITH_INDICES_OP(cummin, CumMinOp, half, 4);
@@ -119,6 +119,30 @@ static void chebyshev_polynomial_w_mps_kernel(TensorIteratorBase& iter) {
  lib.exec_binary_kernel(iter, "chebyshev_polynomial_w");
}

static void shifted_chebyshev_polynomial_t_mps_kernel(TensorIteratorBase& iter) {
  TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()),
                   "shifted_chebyshev_polynomial_t_mps not implemented for non-floating types");
  lib.exec_binary_kernel(iter, "shifted_chebyshev_polynomial_t");
}

static void shifted_chebyshev_polynomial_u_mps_kernel(TensorIteratorBase& iter) {
  TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()),
                   "shifted_chebyshev_polynomial_u_mps not implemented for non-floating types");
  lib.exec_binary_kernel(iter, "shifted_chebyshev_polynomial_u");
}

static void shifted_chebyshev_polynomial_v_mps_kernel(TensorIteratorBase& iter) {
  TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()),
                   "shifted_chebyshev_polynomial_v_mps not implemented for non-floating types");
  lib.exec_binary_kernel(iter, "shifted_chebyshev_polynomial_v");
}

static void shifted_chebyshev_polynomial_w_mps_kernel(TensorIteratorBase& iter) {
  TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()),
                   "shifted_chebyshev_polynomial_w_mps not implemented for non-floating types");
  lib.exec_binary_kernel(iter, "shifted_chebyshev_polynomial_w");
}

static void hermite_polynomial_h_mps_kernel(TensorIteratorBase& iter) {
  TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()),
                   "hermite_polynomial_h_mps not implemented for non-floating types");
@@ -177,6 +201,10 @@ REGISTER_DISPATCH(chebyshev_polynomial_t_stub, &chebyshev_polynomial_t_mps_kerne
REGISTER_DISPATCH(chebyshev_polynomial_u_stub, &chebyshev_polynomial_u_mps_kernel)
REGISTER_DISPATCH(chebyshev_polynomial_v_stub, &chebyshev_polynomial_v_mps_kernel)
REGISTER_DISPATCH(chebyshev_polynomial_w_stub, &chebyshev_polynomial_w_mps_kernel)
REGISTER_DISPATCH(shifted_chebyshev_polynomial_t_stub, &shifted_chebyshev_polynomial_t_mps_kernel)
REGISTER_DISPATCH(shifted_chebyshev_polynomial_u_stub, &shifted_chebyshev_polynomial_u_mps_kernel)
REGISTER_DISPATCH(shifted_chebyshev_polynomial_v_stub, &shifted_chebyshev_polynomial_v_mps_kernel)
REGISTER_DISPATCH(shifted_chebyshev_polynomial_w_stub, &shifted_chebyshev_polynomial_w_mps_kernel)
REGISTER_DISPATCH(hermite_polynomial_h_stub, &hermite_polynomial_h_mps_kernel)
REGISTER_DISPATCH(hermite_polynomial_he_stub, &hermite_polynomial_he_mps_kernel)
REGISTER_DISPATCH(polar_stub, &polar_mps_kernel);

@@ -2,6 +2,7 @@

#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/BatchLinearAlgebra.h>
#include <ATen/native/LinearAlgebra.h>
#include <ATen/native/LinearAlgebraUtils.h>
#include <ATen/native/Resize.h>
@@ -22,7 +23,6 @@
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/cholesky_native.h>
#include <ATen/ops/linalg_cholesky_ex_native.h>
#include <ATen/ops/linalg_cholesky_native.h>
#include <ATen/ops/linalg_inv_ex_native.h>
#include <ATen/ops/linalg_lu_factor_ex_native.h>
#include <ATen/ops/linalg_lu_factor_native.h>
@@ -1097,25 +1097,8 @@ static void lu_unpack_mps_impl(const Tensor& LU_data,
  }
}

static void linalg_cholesky_mps_impl(const Tensor& input,
                                     bool upper,
                                     bool check_errors,
                                     const Tensor& out,
                                     const Tensor& info) {
  using namespace mps;

  TORCH_CHECK(out.is_mps());
  TORCH_CHECK(input.scalar_type() == at::ScalarType::Float, "linalg.cholesky: Input tensor must be float32");
  TORCH_CHECK(input.dim() >= 2, "linalg.cholesky: Input tensor must be at least 2D");
  TORCH_CHECK(input.size(-2) == input.size(-1), "linalg.cholesky: Input tensor must be square");
  auto input_sizes = input.sizes();
  resize_output(out, input_sizes);
  resize_output(info, {input_sizes.begin(), input_sizes.end() - 2});
  if (input.numel() == 0) {
    info.zero_();
    return;
  }
  out.copy_(input);
static void cholesky_stub_impl(const Tensor& out, const Tensor& info, bool upper) {
  auto input_sizes = out.sizes();

  int64_t ndim = out.dim();
  int64_t N = out.size(-1);
@@ -1124,9 +1107,9 @@ static void linalg_cholesky_mps_impl(const Tensor& input,
  auto stream = getCurrentMPSStream();
  auto device = MPSDevice::getInstance()->device();

  auto factorDiagonalPSO = lib.getPipelineStateForFunc("factorDiagonalBlock");
  auto applyTRSMPSO = lib.getPipelineStateForFunc("applyTRSM");
  auto applySYRKPSO = lib.getPipelineStateForFunc("applySYRK");
  auto factorDiagonalPSO = lib.getPipelineStateForFunc(upper ? "factorDiagonalBlockU" : "factorDiagonalBlockL");
  auto applyTRSMPSO = lib.getPipelineStateForFunc(upper ? "applyTRSMU" : "applyTRSML");
  auto applySYRKPSO = lib.getPipelineStateForFunc(upper ? "applySYRKU" : "applySYRKL");

  int64_t NB = std::min<int64_t>(32, N);
  int64_t numBlocks = (N + NB - 1) / NB;
@@ -1168,33 +1151,8 @@ static void linalg_cholesky_mps_impl(const Tensor& input,
      }
    });
  }
  int status;
  if (check_errors) {
    if (info_.dim() > 0) {
      // batch case
      for (const auto i : c10::irange(B)) {
        status = info_[i].item<int>();
        TORCH_CHECK(
            status == 0,
            "linalg.cholesky(): (Batch element ",
            i,
            "): The factorization could not be completed because the input is not positive-definite (the leading minor of order ",
            status,
            " is not positive-definite).");
      }
    } else {
      // single matrix case(no batch size)
      status = info.item<int>();
      TORCH_CHECK(
          status == 0,
          "linalg.cholesky(): The factorization could not be completed because the input is not positive-definite (the leading minor of order ",
          status,
          " is not positive-definite).");
    }
  }
  out.tril_();
  upper ? out.transpose_(ndim - 2, ndim - 1) : out;
}

} // namespace mps

Tensor addr_mps(const Tensor& self, const Tensor& vec1, const Tensor& vec2, const Scalar& beta, const Scalar& alpha) {
@@ -1355,23 +1313,6 @@ Tensor& addbmm_out_mps(const Tensor& self,
  return result;
}

Tensor cholesky_mps(const Tensor& self, bool upper) {
  auto out = at::empty_like(self, MemoryFormat::Contiguous);
  cholesky_mps_out(self, upper, out);
  return out;
}

Tensor& cholesky_mps_out(const Tensor& self, bool upper, Tensor& out) {
  auto info = at::empty({}, self.options().dtype(kInt));
  mps::linalg_cholesky_mps_impl(self, upper, true, out, info);
  return out;
}

TORCH_IMPL_FUNC(linalg_cholesky_ex_out_mps)
(const Tensor& self, bool upper, bool check_errors, const Tensor& L, const Tensor& info) {
  mps::linalg_cholesky_mps_impl(self, upper, check_errors, L, info);
}

Tensor addbmm_mps(const Tensor& self,
                  const Tensor& batch1,
                  const Tensor& batch2,
@@ -1460,4 +1401,6 @@ TORCH_IMPL_FUNC(linalg_lu_factor_ex_out_mps)
TORCH_IMPL_FUNC(linalg_inv_ex_out_mps)(const Tensor& A, bool check_errors, const Tensor& result, const Tensor& info) {
  mps::linalg_inv_ex_out_mps_impl(A, check_errors, result, info);
}

REGISTER_DISPATCH(cholesky_stub, mps::cholesky_stub_impl)
} // namespace at::native

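For readers unfamiliar with the pipeline names selected above (factorDiagonalBlock*, applyTRSM*, applySYRK*), they correspond to the three steps of a standard right-looking blocked Cholesky factorization: factor a diagonal block, triangular-solve the panel below it, then apply a symmetric rank-k update to the trailing matrix. A compact serial C++ sketch of the unblocked lower-triangular variant, with comments mapping loops to those steps (illustrative only, not the MPS implementation):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Plain lower Cholesky (A = L * L^T), in place on a row-major n x n matrix.
    // In the blocked GPU variant, the sqrt/divide on the diagonal becomes
    // "factor diagonal block", the column solve becomes a TRSM-like step, and
    // the accumulated subtractions become SYRK-like trailing updates.
    bool cholesky_lower(std::vector<double>& a, int n) {
      for (int j = 0; j < n; ++j) {
        double d = a[j * n + j];
        for (int k = 0; k < j; ++k) d -= a[j * n + k] * a[j * n + k];
        if (d <= 0.0) return false;          // input not positive-definite
        a[j * n + j] = std::sqrt(d);
        for (int i = j + 1; i < n; ++i) {    // TRSM-like column solve
          double s = a[i * n + j];
          for (int k = 0; k < j; ++k) s -= a[i * n + k] * a[j * n + k];
          a[i * n + j] = s / a[j * n + j];
        }
      }
      return true;
    }

    int main() {
      std::vector<double> a{4, 2, 2, 3};  // SPD 2x2 example
      std::printf("%d\n", cholesky_lower(a, 2) ? 1 : 0);  // prints 1
      return 0;
    }
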
@@ -18,6 +18,7 @@
#include <ATen/ops/max_pool2d_native.h>
#include <ATen/ops/max_pool2d_with_indices_backward_native.h>
#include <ATen/ops/max_pool2d_with_indices_native.h>
#include <ATen/ops/max_pool3d_with_indices_backward_native.h>
#include <ATen/ops/max_pool3d_with_indices_native.h>
#endif

@@ -270,16 +271,16 @@ static IntArrayRef tensor_to_intarrayref(const Tensor& tensor) {
  return IntArrayRef(data_ptr, length);
}

static void max_pool_with_indices_out_mps_template(const Tensor& output,
                                                    const Tensor& indices,
                                                    const Tensor& input,
                                                    IntArrayRef kernel_size,
                                                    IntArrayRef stride,
                                                    IntArrayRef padding,
                                                    IntArrayRef dilation,
                                                    bool ceil_mode,
                                                    const int32_t pooling_dims,
                                                    const std::string& op_name) {
using PoolSizes = std::tuple<int32_t, Tensor, Tensor, Tensor, Tensor, Tensor>;

static PoolSizes process_pool_sizes(const Tensor& input,
                                    IntArrayRef kernel_size,
                                    IntArrayRef stride,
                                    IntArrayRef padding,
                                    IntArrayRef dilation,
                                    bool ceil_mode,
                                    const int32_t pooling_dims,
                                    const std::string& op_name) {
  TORCH_INTERNAL_ASSERT(pooling_dims == 1 || pooling_dims == 2 || pooling_dims == 3);

  const int32_t dims = input.dim();
@@ -387,9 +388,27 @@ static void max_pool_with_indices_out_mps_template(const Tensor& output,

  t_output_size.slice(0, leading_dims) = t_output_pooling_size;

  return std::tuple<int32_t, Tensor, Tensor, Tensor, Tensor, Tensor>(
      dims, t_output_size, t_kernel_size, t_stride, t_padding, t_dilation);
}

static void max_pool_with_indices_out_mps_template(const Tensor& output,
                                                    const Tensor& indices,
                                                    const Tensor& input,
                                                    IntArrayRef kernel_size,
                                                    IntArrayRef stride,
                                                    IntArrayRef padding,
                                                    IntArrayRef dilation,
                                                    bool ceil_mode,
                                                    const int32_t pooling_dims,
                                                    const std::string& op_name) {
  auto [dims, t_output_size, t_kernel_size, t_stride, t_padding, t_dilation] =
      process_pool_sizes(input, kernel_size, stride, padding, dilation, ceil_mode, pooling_dims, op_name);

  IntArrayRef output_size = tensor_to_intarrayref(t_output_size);
  output.resize_(output_size);
  indices.resize_(output_size);
  const auto memory_format = input.suggest_memory_format();
  output.resize_(output_size, memory_format);
  indices.resize_(output_size, memory_format);

  auto iter = TensorIteratorConfig().add_output(output).resize_outputs(false).check_all_same_dtype(false).build();

@@ -436,6 +455,52 @@ static void max_pool_with_indices_out_mps_template(const Tensor& output,
  });
}

static void max_pool_with_indices_backward_out_mps_template(Tensor& grad_input,
                                                             const Tensor& indices,
                                                             const Tensor& input,
                                                             const Tensor& grad_output,
                                                             IntArrayRef kernel_size,
                                                             IntArrayRef stride,
                                                             IntArrayRef padding,
                                                             IntArrayRef dilation,
                                                             bool ceil_mode,
                                                             const int32_t pooling_dims,
                                                             const std::string& op_name) {
  auto [dims, t_output_size, t_kernel_size, t_stride, t_padding, t_dilation] =
      process_pool_sizes(input, kernel_size, stride, padding, dilation, ceil_mode, pooling_dims, op_name);

  const auto memory_format = input.suggest_memory_format();
  grad_input.resize_(input.sizes(), memory_format);
  grad_input.fill_(0);

  id<MTLDevice> device = MPSDevice::getInstance()->device();
  MPSStream* mpsStream = getCurrentMPSStream();
  const auto numThreads = grad_output.numel();
  PoolingBackwardParams<5> params;

  params.dims = dims;
  params.pooling_dims = pooling_dims;
  memcpy(params.grad_input_sizes.data(), grad_input.sizes().data(), dims * sizeof(int64_t));
  memcpy(params.grad_input_strides.data(), grad_input.strides().data(), dims * sizeof(int64_t));
  memcpy(params.grad_output_strides.data(), grad_output.strides().data(), dims * sizeof(int64_t));
  memcpy(params.grad_output_sizes.data(), grad_output.sizes().data(), dims * sizeof(int64_t));
  memcpy(params.indices_strides.data(), indices.strides().data(), dims * sizeof(int64_t));

  dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
    @autoreleasepool {
      id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder();
      auto maxPoolPSO = lib.getPipelineStateForFunc("max_pool_backward_" + scalarToMetalTypeString(input));

      getMPSProfiler().beginProfileKernel(maxPoolPSO, op_name, {input});
      [computeEncoder setComputePipelineState:maxPoolPSO];
      mtl_setArgs(computeEncoder, grad_input, grad_output, indices, params);

      mtl_dispatch1DJob(computeEncoder, maxPoolPSO, numThreads);
      getMPSProfiler().endProfileKernel(maxPoolPSO);
    }
  });
}

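The backward template above launches one thread per grad_output element and scatters into grad_input at the flat index stored during the forward pass, which is why the Metal kernel takes grad_input as an AtomicType_t buffer: overlapping pooling windows can route several output positions to the same input element. A serial C++ sketch of that scatter on a flattened view (illustrative only, not the kernel's exact indexing):

    #include <vector>

    // grad_output[i] is accumulated into grad_input[indices[i]]; the same input
    // index can be hit more than once, hence += (done atomically on the GPU).
    void max_pool_backward_scatter(const std::vector<float>& grad_output,
                                   const std::vector<long>& indices,
                                   std::vector<float>& grad_input) {
      for (size_t i = 0; i < grad_output.size(); ++i) {
        grad_input[indices[i]] += grad_output[i];
      }
    }
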
static void avg_pool2d_template(const Tensor& input,
                                const Tensor& output,
                                const std::optional<Tensor>& grad_output_opt,
@@ -738,6 +803,52 @@ std::tuple<Tensor, Tensor> max_pool3d_with_indices_mps(const Tensor& input,
  return std::tuple<Tensor, Tensor>(output, indices);
}

Tensor& max_pool3d_with_indices_backward_out_mps(const Tensor& grad_output,
                                                 const Tensor& input,
                                                 IntArrayRef kernel_size,
                                                 IntArrayRef stride,
                                                 IntArrayRef padding,
                                                 IntArrayRef dilation,
                                                 bool ceil_mode,
                                                 const Tensor& indices,
                                                 Tensor& grad_input) {
  mps::max_pool_with_indices_backward_out_mps_template(grad_input,
                                                       indices,
                                                       input,
                                                       grad_output,
                                                       kernel_size,
                                                       stride,
                                                       padding,
                                                       dilation,
                                                       ceil_mode,
                                                       /*pooling_dims=*/3,
                                                       "max_pool3d_backward");
  return grad_input;
}

Tensor max_pool3d_with_indices_backward_mps(const Tensor& grad_output,
                                            const Tensor& input,
                                            IntArrayRef kernel_size,
                                            IntArrayRef stride,
                                            IntArrayRef padding,
                                            IntArrayRef dilation,
                                            bool ceil_mode,
                                            const Tensor& indices) {
  auto grad_input = at::empty({0}, input.options());
  mps::max_pool_with_indices_backward_out_mps_template(grad_input,
                                                       indices,
                                                       input,
                                                       grad_output,
                                                       kernel_size,
                                                       stride,
                                                       padding,
                                                       dilation,
                                                       ceil_mode,
                                                       /*pooling_dims=*/3,
                                                       "max_pool3d_backward");
  return grad_input;
}

TORCH_IMPL_FUNC(avg_pool2d_out_mps)
(const Tensor& input,
 int64_t kH,

@@ -10,6 +10,7 @@
#else
#include <ATen/ops/_cummax_helper_native.h>
#include <ATen/ops/_cummin_helper_native.h>
#include <ATen/ops/_logcumsumexp_native.h>
#endif
#include <fmt/format.h>

@@ -163,6 +164,111 @@ static std::pair<uint32_t, uint32_t> get_2d_grid_dims(const IntArrayRef& shape,
  return {static_cast<uint32_t>(grid_x), static_cast<uint32_t>(grid_y)};
}

static void scan_simple_mps_impl(const Tensor& self, const Tensor& output, int64_t dim, const std::string& op_name) {
  if (output.numel() == 0) {
    return;
  }

  const int64_t ndim = self.dim();
  const int64_t wrapped_dim = maybe_wrap_dim(dim, ndim);
  const int64_t axis_size = self.size(wrapped_dim);

  // Preprocess input tensor - ensure it's contiguous for Metal shaders
  Tensor input_tensor = self.contiguous();

  // Preprocess output tensor - ensure it's contiguous for Metal shaders
  Tensor output_tensor = output;
  bool output_needs_copy = !output.is_contiguous();
  Tensor temp_output;

  if (output_needs_copy) {
    // Create a temporary contiguous tensor with the same shape and type
    temp_output = at::empty_like(output, output.options()).contiguous();
    output_tensor = temp_output;
  }

  // Determine which kernel to use based on scan dimension position
  bool is_innermost_scan = (wrapped_dim == ndim - 1);

  MPSStream* mpsStream = getCurrentMPSStream();
  dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
    @autoreleasepool {
      id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder();

      // Build kernel name based on scan dimension position
      const auto type_str = scalarToMetalTypeString(input_tensor);
      const auto kernel_name = fmt::format("{}_{}_{}", op_name, is_innermost_scan ? "innermost" : "outer", type_str);

      id<MTLComputePipelineState> scanPSO = lib.getPipelineStateForFunc(kernel_name);

      // this function call is a no-op if MPS Profiler is not enabled
      getMPSProfiler().beginProfileKernel(scanPSO, op_name, [&]() {
        std::vector<Tensor> all_tensors = {input_tensor, output_tensor};
        return all_tensors;
      }());

      [computeEncoder setComputePipelineState:scanPSO];

      // Set input and output buffers (both guaranteed contiguous)
      mtl_setBuffer(computeEncoder, input_tensor, 0);
      mtl_setBuffer(computeEncoder, output_tensor, 1);

      if (is_innermost_scan) {
        // Contiguous scan dispatch (scanning innermost dimension)
        mtl_setBytes(computeEncoder, axis_size, 2);

        int n_reads = (input_tensor.element_size() <= 4) ? 4 : 2;
        constexpr int simd_size = 32;
        int elements_per_simd = n_reads * simd_size;
        int thread_group_size = static_cast<int>(scanPSO.maxTotalThreadsPerThreadgroup);

        if (axis_size <= n_reads * 1024) {
          thread_group_size = ((axis_size + elements_per_simd - 1) / elements_per_simd) * simd_size;
        } else if (axis_size <= n_reads * 2048) {
          thread_group_size = ((axis_size / 2 + elements_per_simd - 1) / elements_per_simd) * simd_size;
        }
        thread_group_size = std::min(thread_group_size, static_cast<int>(scanPSO.maxTotalThreadsPerThreadgroup));

        auto tmp_grid_dims = get_2d_grid_dims(input_tensor.sizes(), wrapped_dim);

        [computeEncoder dispatchThreads:MTLSizeMake(thread_group_size, tmp_grid_dims.first, tmp_grid_dims.second)
                  threadsPerThreadgroup:MTLSizeMake(thread_group_size, 1, 1)];
      } else {
        // Strided scan dispatch (scanning non-innermost dimension)
        size_t stride = input_tensor.strides()[wrapped_dim];
        constexpr int bn = 32;
        size_t stride_blocks = (stride + bn - 1) / bn;

        mtl_setBytes(computeEncoder, axis_size, 2);
        mtl_setBytes(computeEncoder, stride, 3);
        mtl_setBytes(computeEncoder, stride_blocks, 4);

        int n_reads = (input_tensor.element_size() <= 4) ? 4 : 2;
        int n_simdgroups = bn / n_reads;
        constexpr int simd_size = 32;
        int thread_group_size = n_simdgroups * simd_size;

        auto tmp_grid_dims = get_2d_grid_dims(input_tensor.sizes(), wrapped_dim);
        if (tmp_grid_dims.first * stride_blocks <= UINT_MAX) {
          tmp_grid_dims.first *= stride_blocks;
        } else {
          tmp_grid_dims.second *= stride_blocks;
        }

        [computeEncoder dispatchThreads:MTLSizeMake(thread_group_size, tmp_grid_dims.first, tmp_grid_dims.second)
                  threadsPerThreadgroup:MTLSizeMake(thread_group_size, 1, 1)];
      }

      getMPSProfiler().endProfileKernel(scanPSO);
    }
  });

  // Post-process: copy result back to original output tensor if needed
  if (output_needs_copy) {
    output.copy_(output_tensor);
  }
}

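As a worked example of the innermost-scan launch math above (assuming a float input, so element_size is 4): n_reads = 4, elements_per_simd = 128, and an axis of length 1000 falls into the axis_size <= n_reads * 1024 branch, giving thread_group_size = ceil(1000 / 128) * 32 = 8 * 32 = 256 threads per threadgroup, which is then clamped by the pipeline's maxTotalThreadsPerThreadgroup.
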
// Specialized implementation for cummin/cummax that returns both values and indices
static void scan_with_indices_mps_impl(const Tensor& self,
                                       const Tensor& values_output,
@@ -284,4 +390,29 @@ void cummin_helper_mps(const Tensor& self, Tensor& values, Tensor& indices, int6
  }
}

Tensor& _logcumsumexp_out_mps(const Tensor& self, int64_t dim, Tensor& result) {
  const auto wrap_dim = maybe_wrap_dim(dim, self.dim());
  result.resize_(self.sizes());
  if (self.dim() == 0) {
    result.fill_(self);
    return result;
  }
  if (self.numel() == 0) {
    result.zero_();
    return result;
  }

  if (is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) {
    mps::scan_simple_mps_impl(self, result, wrap_dim, "logcumsumexp");
  } else {
    mps::scan_mps_impl(self, {result}, wrap_dim, "logcumsumexp");
  }
  return result;
}

Tensor _logcumsumexp_mps(const Tensor& self, int64_t dim) {
  Tensor result = at::empty_like(self, MemoryFormat::Contiguous);
  return _logcumsumexp_out_mps(self, dim, result);
}

} // namespace at::native

@@ -3741,11 +3741,13 @@
  dispatch:
    CPU: _logcumsumexp_cpu
    CUDA: _logcumsumexp_cuda
    MPS: _logcumsumexp_mps

- func: _logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU: _logcumsumexp_out_cpu
    CUDA: _logcumsumexp_out_cuda
    MPS: _logcumsumexp_out_mps

- func: logcumsumexp(Tensor self, int dim) -> Tensor
  variants: function, method
@@ -9478,14 +9480,12 @@

- func: cholesky.out(Tensor self, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
    CPU, CUDA: cholesky_out
    MPS: cholesky_mps_out
    CPU, CUDA, MPS: cholesky_out

- func: cholesky(Tensor self, bool upper=False) -> Tensor
  variants: method, function
  dispatch:
    CPU, CUDA: cholesky
    MPS: cholesky_mps
    CPU, CUDA, MPS: cholesky

- func: cholesky_solve.out(Tensor self, Tensor input2, bool upper=False, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
@@ -12442,12 +12442,14 @@
  dispatch:
    CPU: max_pool3d_with_indices_backward_out_cpu
    CUDA: max_pool3d_with_indices_backward_out_cuda
    MPS: max_pool3d_with_indices_backward_out_mps

- func: max_pool3d_with_indices_backward(Tensor grad_output, Tensor self, int[3] kernel_size, int[3] stride, int[3] padding, int[3] dilation, bool ceil_mode, Tensor indices) -> Tensor
  python_module: nn
  dispatch:
    CPU: max_pool3d_with_indices_backward_cpu
    CUDA: max_pool3d_with_indices_backward_cuda
    MPS: max_pool3d_with_indices_backward_mps

- func: max_unpool2d.out(Tensor self, Tensor indices, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!)
  python_module: nn
@@ -13939,8 +13941,7 @@
  python_module: linalg
  structured: True
  dispatch:
    CPU, CUDA: linalg_cholesky_ex_out
    MPS: linalg_cholesky_ex_out_mps
    CPU, CUDA, MPS: linalg_cholesky_ex_out

- func: linalg_cholesky(Tensor self, *, bool upper=False) -> Tensor
  python_module: linalg
@@ -15592,7 +15593,7 @@
- func: special_shifted_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck
  dispatch:
    CPU, CUDA: special_shifted_chebyshev_polynomial_t_out
    CPU, CUDA, MPS: special_shifted_chebyshev_polynomial_t_out
  python_module: special
  structured_inherits: TensorIteratorBase
  structured: True
@@ -15641,7 +15642,7 @@
- func: special_shifted_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck
  dispatch:
    CPU, CUDA: special_shifted_chebyshev_polynomial_u_out
    CPU, CUDA, MPS: special_shifted_chebyshev_polynomial_u_out
  python_module: special
  structured_inherits: TensorIteratorBase
  structured: True
@@ -15690,7 +15691,7 @@
- func: special_shifted_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck
  dispatch:
    CPU, CUDA: special_shifted_chebyshev_polynomial_v_out
    CPU, CUDA, MPS: special_shifted_chebyshev_polynomial_v_out
  python_module: special
  structured_inherits: TensorIteratorBase
  structured: True
@@ -15739,7 +15740,7 @@
- func: special_shifted_chebyshev_polynomial_w.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck
  dispatch:
    CPU, CUDA: special_shifted_chebyshev_polynomial_w_out
    CPU, CUDA, MPS: special_shifted_chebyshev_polynomial_w_out
  python_module: special
  structured_inherits: TensorIteratorBase
  structured: True

@@ -5,7 +5,7 @@ namespace at {

namespace detail {

TORCH_API inline void noopDelete(void*) {}
inline void noopDelete(void*) {}

} // namespace detail


@@ -83,16 +83,16 @@ ${Functions_declarations}
// Special C++ only overloads for std()-like functions (See gh-40287)
// These are needed because int -> bool conversion takes precedence over int -> IntArrayRef
// So, for example std(0) would select the std(unbiased=False) overload
TORCH_API inline Tensor var(const Tensor& self, int dim) {
inline Tensor var(const Tensor& self, int dim) {
  return at::var(self, IntArrayRef{dim});
}
TORCH_API inline std::tuple<Tensor, Tensor> var_mean(const Tensor& self, int dim) {
inline std::tuple<Tensor, Tensor> var_mean(const Tensor& self, int dim) {
  return at::var_mean(self, IntArrayRef{dim});
}
TORCH_API inline Tensor std(const Tensor& self, int dim) {
inline Tensor std(const Tensor& self, int dim) {
  return at::std(self, IntArrayRef{dim});
}
TORCH_API inline std::tuple<Tensor, Tensor> std_mean(const Tensor& self, int dim) {
inline std::tuple<Tensor, Tensor> std_mean(const Tensor& self, int dim) {
  return at::std_mean(self, IntArrayRef{dim});
}

@@ -5,6 +5,9 @@

#include <c10/mobile/CPUCachingAllocator.h>

// At the moment caching allocator is only exposed to mobile cpu allocator.
#ifdef C10_MOBILE

TEST(CPUCachingAllocatorTest, check_alloc_free) {
  c10::CPUCachingAllocator caching_allocator;
  c10::WithCPUCachingAllocatorGuard cachine_allocator_guard(
@@ -41,10 +44,9 @@ TEST(CPUCachingAllocatorTest, check_alloc_inside_free_outside) {
}

int main(int argc, char* argv[]) {
  // At the moment caching allocator is only exposed to mobile cpu allocator.
#ifdef C10_MOBILE
  ::testing::InitGoogleTest(&argc, argv);
  at::manual_seed(42);
  return RUN_ALL_TESTS();
#endif /* C10_Mobile */
}

#endif /* C10_Mobile */

@@ -5,14 +5,14 @@
__global__ void test_thrust_kernel() {
  // thrust conversion
  {
    constexpr float num1 = float(1.23);
    constexpr float num2 = float(4.56);
    [[maybe_unused]] constexpr float num1 = float(1.23);
    [[maybe_unused]] constexpr float num2 = float(4.56);
    assert(c10::complex<float>(thrust::complex<float>(num1, num2)).real() == num1);
    assert(c10::complex<float>(thrust::complex<float>(num1, num2)).imag() == num2);
  }
  {
    constexpr double num1 = double(1.23);
    constexpr double num2 = double(4.56);
    [[maybe_unused]] constexpr double num1 = double(1.23);
    [[maybe_unused]] constexpr double num2 = double(4.56);
    assert(c10::complex<double>(thrust::complex<double>(num1, num2)).real() == num1);
    assert(c10::complex<double>(thrust::complex<double>(num1, num2)).imag() == num2);
  }
@@ -46,11 +46,11 @@ __global__ void test_reinterpret_cast() {
  assert(zzzz.real() == double(1));
  assert(zzzz.imag() == double(2));

  cuComplex cuComplex_zz = *reinterpret_cast<cuComplex*>(&zz);
  [[maybe_unused]] cuComplex cuComplex_zz = *reinterpret_cast<cuComplex*>(&zz);
  assert(cuComplex_zz.x == float(1));
  assert(cuComplex_zz.y == float(2));

  cuDoubleComplex cuDoubleComplex_zzzz = *reinterpret_cast<cuDoubleComplex*>(&zzzz);
  [[maybe_unused]] cuDoubleComplex cuDoubleComplex_zzzz = *reinterpret_cast<cuDoubleComplex*>(&zzzz);
  assert(cuDoubleComplex_zzzz.x == double(1));
  assert(cuDoubleComplex_zzzz.y == double(2));
}

@@ -33,7 +33,7 @@ __device__ void test(){
  // use the std namespace, but just "::" so that the function
  // gets resolved from nvcc math_functions.hpp

  float threshold = 0.00001;
  [[maybe_unused]] float threshold = 0.00001;
  assert(::abs(::lgamma(Half(10.0)) - ::lgamma(10.0f)) <= threshold);
  assert(::abs(::exp(Half(1.0)) - ::exp(1.0f)) <= threshold);
  assert(::abs(::log(Half(1.0)) - ::log(1.0f)) <= threshold);

@@ -61,6 +61,8 @@ namespace {
  template <typename T>
  class QuantizationTests : public ::testing::Test {};
  template <typename T>
  class Quantization8BitTests : public ::testing::Test {};
  template <typename T>
  class Quantization8BitWithTailTests : public ::testing::Test {};
  template <typename T>
  class FunctionalTests : public ::testing::Test {};
@@ -79,6 +81,7 @@ namespace {
  using FloatTestedTypes = ::testing::Types<vfloat, vdouble, vcomplex, vcomplexDbl>;
  using ALLTestedTypes = ::testing::Types<vfloat, vdouble, vcomplex, vlong, vint, vshort, vqint8, vquint8, vqint>;
  using QuantTestedTypes = ::testing::Types<vqint8, vquint8, vqint>;
  using Quantization8BitTestedTypes = ::testing::Types<vqint8, vquint8>;
#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
  using Quantization8BitWithTailTestedTypes =
      ::testing::Types<vqint8, vquint8>;
@@ -116,6 +119,7 @@ namespace {
  TYPED_TEST_SUITE(BitwiseFloatsAdditional, RealFloatReducedFloatTestedTypes);
  TYPED_TEST_SUITE(BitwiseFloatsAdditional2, FloatTestedTypes);
  TYPED_TEST_SUITE(QuantizationTests, QuantTestedTypes);
  TYPED_TEST_SUITE(Quantization8BitTests, Quantization8BitTestedTypes);
  TYPED_TEST_SUITE(InfiniteTests, RealFloatTestedTypes);
#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER)
  TYPED_TEST_SUITE(
@@ -1496,6 +1500,68 @@ namespace {
        },
        test_case);
  }
#ifndef _WIN32
  TYPED_TEST(Quantization8BitTests, Transpose) {
    using VT = ValueType<TypeParam>;
    constexpr auto M = 4;
    constexpr auto N = 64;
    constexpr auto L = M * N;
    constexpr auto ld_src = N;
    constexpr auto ld_dst = M;
    CACHE_ALIGN VT x[L];
    CACHE_ALIGN VT y[L];
    CACHE_ALIGN VT ref[L];
    auto seed = TestSeed();
    ValueGen<VT> generator(VT(-100), VT(100), seed);
    for (const auto i : c10::irange(L)) {
      x[i] = generator.get();
    }
    at::native::utils::transpose<uint8_t>(
        M, N,
        reinterpret_cast<uint8_t*>(x), ld_src,
        reinterpret_cast<uint8_t*>(y), ld_dst);
    for (int64_t j = 0; j < N; j++) {
      for (int64_t i = 0; i < M; i++) {
        ref[j * ld_dst + i] = c10::load(&(x[i * ld_src + j]));
      }
    }
    for (const auto i : c10::irange(L)) {
      ASSERT_EQ(y[i], ref[i])
          << "Failure Details:\nTest Seed to reproduce: " << seed;
    }
  }
#endif
#if defined(CPU_CAPABILITY_AVX512)
  TYPED_TEST(Quantization8BitTests, PackVNNI4) {
    using VT = ValueType<TypeParam>;
    constexpr auto K = 8;
    constexpr auto N = 128;
    constexpr auto L = K * N;
    constexpr auto ld_src = N;
    CACHE_ALIGN VT x[L];
    CACHE_ALIGN VT y[L];
    CACHE_ALIGN VT ref[L];
    auto seed = TestSeed();
    ValueGen<VT> generator(VT(-100), VT(100), seed);
    for (const auto i : c10::irange(L)) {
      x[i] = generator.get();
    }
    at::vec::pack_vnni4(x, y, ld_src, K, N);
    int64_t _K = K / 4;
    for (int64_t k = 0; k < _K; k++) {
      for(int64_t n = 0; n < N; n++) {
        for(int64_t l = 0; l < 4; l++) {
          ref[k * N * 4 + n * 4 + l] =
              c10::load(&(x[k * ld_src * 4 + l * ld_src + n]));
        }
      }
    }
    for (const auto i : c10::irange(L)) {
      ASSERT_EQ(y[i], ref[i])
          << "Failure Details:\nTest Seed to reproduce: " << seed;
    }
  }
#endif
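For reference, the expectation loops in the PackVNNI4 test above spell out the VNNI4 layout: groups of four consecutive rows are interleaved along each column, i.e. ref[k * N * 4 + n * 4 + l] = x[(4k + l) * ld_src + n] for k in [0, K/4), n in [0, N), l in [0, 4), which is exactly the index arithmetic the nested loops compute before comparing against the output of at::vec::pack_vnni4.
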
  TYPED_TEST(FunctionalTests, Map) {
    using vec = TypeParam;
    using VT = ValueType<TypeParam>;

@@ -1,6 +1,7 @@
#pragma once
#include <ATen/cpu/vec/vec.h>
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#include <ATen/cpu/vec/vec_quant.h>
#include <c10/util/bit_cast.h>
#include <c10/util/irange.h>
#include <gtest/gtest.h>
@@ -21,7 +22,9 @@
#else
#define CACHE_LINE 32
#endif

#ifndef _WIN32
#include <ATen/native/cpu/utils.h>
#endif
#if defined(__GNUC__)
#define CACHE_ALIGN __attribute__((aligned(CACHE_LINE)))
#define not_inline __attribute__((noinline))

@@ -601,6 +601,7 @@ libtorch_nativert_sources = [
    "torch/nativert/executor/Placement.cpp",
    "torch/nativert/executor/ExecutionPlanner.cpp",
    "torch/nativert/executor/ExecutionFrame.cpp",
    "torch/nativert/executor/Executor.cpp",
    "torch/nativert/executor/GraphExecutorBase.cpp",
    "torch/nativert/executor/ConstantFolder.cpp",
    "torch/nativert/executor/OpKernel.cpp",

@@ -634,7 +634,7 @@ class DispatchKeySet final {
C10_API std::string toString(DispatchKeySet);
C10_API std::ostream& operator<<(std::ostream&, DispatchKeySet);

C10_API inline int getDispatchTableIndexForDispatchKey(DispatchKey k) {
inline int getDispatchTableIndexForDispatchKey(DispatchKey k) {
  return DispatchKeySet(k).getDispatchTableIndexForDispatchKeySet();
}

@@ -1559,7 +1559,7 @@ float chebyshev_polynomial_t_forward(T x, int64_t n) {
  float q = x;
  float r;

  for (int64_t k = 2; k <= n; k++) {
  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
    r = (x + x) * q - p;
    p = q;
    q = r;
@@ -1603,7 +1603,7 @@ float chebyshev_polynomial_u_forward(T x, int64_t n) {
  auto p = 1.0;
  float r;

  for (int64_t k = 2; k <= n; k++) {
  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
    r = 2 * x * q - p;
    p = q;
    q = r;
@@ -1656,7 +1656,7 @@ float chebyshev_polynomial_v_forward(T x, int64_t n) {
  auto p = 1.0;
  float r;

  for (int64_t k = 2; k <= n; k++) {
  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
    r = 2 * x * q - p;
    p = q;
    q = r;
@@ -1713,7 +1713,7 @@ float chebyshev_polynomial_w_forward(T x, int64_t n) {
  auto p = 1.0;
  float r;

  for (int64_t k = 2; k <= n; k++) {
  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
    r = 2.0 * x * q - p;
    p = q;
    q = r;
@@ -1722,6 +1722,207 @@ float chebyshev_polynomial_w_forward(T x, int64_t n) {
  return r;
} // chebyshev_polynomial_w_forward(T x, int64_t n)

template <typename T>
float shifted_chebyshev_polynomial_t_forward(T x, int64_t n) {
  if (n < 0) {
    return 0.0;
  }

  if (x == T(1.0)) {
    return 1.0;
  }

  if (x == 0.0) {
    if (n % 2 == 0) {
      return 1.0;
    }

    return -1.0;
  }

  const float xpxm1 = x + x - 1.0;
  if ((n > 6) && (::metal::abs(xpxm1) < 1.0)) {
    return ::metal::precise::cos(n * ::metal::precise::acos(xpxm1));
  }

  if (n == 0) {
    return 1.0;
  }

  if (n == 1) {
    return xpxm1;
  }

  float p = 1.0;
  float q = xpxm1;
  float r;

  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
    r = (xpxm1 + xpxm1) * q - p;
    p = q;
    q = r;
  }

  return r;
} // shifted_chebyshev_polynomial_t_forward(T x, int64_t n)

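For context on the shifted variants added here: they are the ordinary Chebyshev polynomials composed with the affine map that sends [0, 1] onto [-1, 1] (the code's xpxm1 = 2x - 1). In LaTeX, the closed form used on the |2x - 1| < 1 branch and the recurrence the loop implements are:

    T^{*}_{n}(x) = T_{n}(2x - 1) = \cos\bigl(n \arccos(2x - 1)\bigr), \qquad
    T^{*}_{n}(x) = 2(2x - 1)\,T^{*}_{n-1}(x) - T^{*}_{n-2}(x)
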
template <typename T>
float shifted_chebyshev_polynomial_u_forward(T x, int64_t n) {
  if (n < 0) {
    return 0.0;
  }

  if (x == 1.0) {
    return n + 1;
  }

  if (x == 0.0) {
    if (n % 2 == 0) {
      return n + 1;
    }

    return -(n + 1);
  }
  const float xpxm1 = x + x - 1.0;
  if ((n > 6) && (::metal::abs(xpxm1) < 1.0)) {
    const float acos_2xm1 = ::metal::precise::acos(xpxm1);
    const float divisor = ::metal::precise::sin(acos_2xm1);
    if (divisor != 0.0) {
      return ::metal::precise::sin((n + 1) * acos_2xm1) / divisor;
    }

    return (n + 1) * ::metal::precise::cos((n + 1) * acos_2xm1) / xpxm1;
  }

  if (n == 0) {
    return 1.0;
  }

  if (n == 1) {
    return xpxm1 + xpxm1;
  }

  float p = 1.0;
  float q = xpxm1 + xpxm1;
  float r;

  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
    r = (xpxm1 + xpxm1) * q - p;
    p = q;
    q = r;
  }

  return r;
} // shifted_chebyshev_polynomial_u_forward(T x, int64_t n)

template <typename T>
float shifted_chebyshev_polynomial_v_forward(T x, int64_t n) {
  if (n < 0) {
    return 0.0;
  }

  if (x == 1.0) {
    return 1.0;
  }

  if (x == 0.0) {
    if (n % 2 == 0) {
      return (n + n + 1);
    }

    return -(n + n + 1);
  }

  const float xpxm1 = x + x - 1.0;
  if ((n > 6) && (::metal::abs(xpxm1) < 1.0)) {
    const float acos_2xm1 = ::metal::precise::acos(xpxm1);
    if (::metal::precise::sin(acos_2xm1 / 2.0) != 1.0) {
      return ::metal::precise::cos((n + 0.5) * acos_2xm1) /
          ::metal::precise::cos(acos_2xm1 / 2.0);
    }

    if (n % 2 == 0) {
      return n + n + 1;
    }

    return -(n + n + 1);
  }

  if (n == 0) {
    return T(1.0);
  }

  if (n == 1) {
    return xpxm1 + xpxm1 - 1.0;
  }

  float p = 1.0;
  float q = xpxm1 + xpxm1 - 1.0;
  float r;

  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
    r = (xpxm1 + xpxm1) * q - p;
    p = q;
    q = r;
  }

  return r;
} // shifted_chebyshev_polynomial_v_forward(T x, int64_t n)

template <typename T>
float shifted_chebyshev_polynomial_w_forward(T x, int64_t n) {
  if (n < 0) {
    return 0.0;
  }

  if (x == 1.0) {
    return n + n + 1;
  }

  if (x == 0.0) {
    if (n % 2 == 0) {
      return 1.0;
    }

    return -1.0;
  }

  const float xpxm1 = x + x - 1.0;
  if ((n > 4) && (::metal::abs(xpxm1) < 1.0)) {
    const float acos_2xm1 = ::metal::precise::acos(xpxm1);
    if (::metal::precise::cos(acos_2xm1 / 2.0) != 1.0) {
      return ::metal::precise::sin((n + 0.5) * acos_2xm1) /
          ::metal::precise::sin(acos_2xm1 / 2.0);
    }

    if (n % 2 == 0) {
      return 1.0;
    }

    return -1.0;
  }

  if (n == 0) {
    return 1.0;
  }

  if (n == 1) {
    return xpxm1 + xpxm1 + 1.0;
  }

  float p = 1.0;
  float q = xpxm1 + xpxm1 + 1.0;
  float r;

  for (int64_t k = 2; (k <= n) && !::metal::isnan(q); k++) {
    r = (xpxm1 + xpxm1) * q - p;
    p = q;
    q = r;
  }

  return r;
} // shifted_chebyshev_polynomial_w_forward(T x, int64_t n)

template <typename T>
// TODO: Add 512 if/when double will be supported in Metal
inline constexpr int getHermitianLimit() {

Some files were not shown because too many files have changed in this diff.