Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-05 16:44:58 +08:00)
Compare commits: `revert-cpp` ... `gh/mlazos/` (340 commits)
[Commit list: 340 bare commit SHAs, from `12cec625d3` down to `2a1a2804ca`; the Author and Date columns of the compare table were empty in this mirror capture.]
@@ -195,13 +195,16 @@ case "$tag" in
     NINJA_VERSION=1.9.0
     TRITON=yes
     ;;
-  pytorch-linux-jammy-xpu-n-py3)
+  pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=11
     VISION=yes
     XPU_VERSION=2025.2
     NINJA_VERSION=1.9.0
     TRITON=yes
+    if [[ $tag =~ "benchmarks" ]]; then
+      INDUCTOR_BENCHMARKS=yes
+    fi
     ;;
   pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
     ANACONDA_PYTHON_VERSION=3.10
@@ -3,7 +3,7 @@

 set -eux

-ACL_VERSION=${ACL_VERSION:-"v25.02"}
+ACL_VERSION=${ACL_VERSION:-"v52.6.0"}
 ACL_INSTALL_DIR="/acl"

 # Clone ACL
@@ -49,12 +49,20 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
     export SYSROOT_DEP="sysroot_linux-64=2.17"
   fi

+  # Install correct Python version
+  # Also ensure sysroot is using a modern GLIBC to match system compilers
+  if [ "$ANACONDA_PYTHON_VERSION" = "3.14" ]; then
+    as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
+      python="3.14.0" \
+      ${SYSROOT_DEP} \
+      -c conda-forge
+  else
   # Install correct Python version
   # Also ensure sysroot is using a modern GLIBC to match system compilers
   as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
     python="$ANACONDA_PYTHON_VERSION" \
     ${SYSROOT_DEP}
+  fi
   # libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
   # which is provided in libstdcxx 12 and up.
   conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge
@@ -10,7 +10,7 @@ else
   arch_path='sbsa'
 fi

-NVSHMEM_VERSION=3.3.24
+NVSHMEM_VERSION=3.4.5

 function install_cuda {
   version=$1
@@ -40,11 +40,7 @@ EOF

   # Default url values
   rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
-  amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
-
-  # Add amdgpu repository
   UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
-  echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list

   # Add rocm repository
   wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
@@ -12,8 +12,8 @@ function do_install() {

     rocm_version_nodot=${rocm_version//./}

-    # https://github.com/icl-utk-edu/magma/pull/65
-    MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+    # post merge of https://github.com/icl-utk-edu/magma/pull/65
+    MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
     magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"

     rocm_dir="/opt/rocm"
@@ -97,7 +97,7 @@ case ${image} in
   manylinux2_28-builder:xpu)
     TARGET=xpu_final
     GPU_IMAGE=amd64/almalinux:8
-    DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
+    DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13"
     MANY_LINUX_VERSION="2_28"
     ;;
   *)
@@ -138,10 +138,12 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #test_binary_ufuncs.py
 numpy==1.22.4; python_version == "3.10"
 numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
-numpy==2.1.2; python_version >= "3.13"
+numpy==2.1.2; python_version >= "3.13" and python_version < "3.14"
+numpy==2.3.4; python_version >= "3.14"

 pandas==2.0.3; python_version < "3.13"
-pandas==2.2.3; python_version >= "3.13"
+pandas==2.2.3; python_version >= "3.13" and python_version < "3.14"
+pandas==2.3.3; python_version >= "3.14"

 #onnxruntime
 #Description: scoring engine for Open Neural Network Exchange (ONNX) models
@@ -153,7 +155,8 @@ opt-einsum==3.3
 #Pinned versions: 3.3
 #test that import: test_linalg.py

-optree==0.13.0
+optree==0.13.0 ; python_version < "3.14"
+optree==0.17.0 ; python_version >= "3.14"
 #Description: A library for tree manipulation
 #Pinned versions: 0.13.0
 #test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
@@ -252,7 +255,8 @@ scikit-image==0.22.0
 #test that import:

 scipy==1.10.1 ; python_version <= "3.11"
-scipy==1.14.1 ; python_version >= "3.12"
+scipy==1.14.1 ; python_version > "3.11" and python_version < "3.14"
+scipy==1.16.2 ; python_version >= "3.14"
 # Pin SciPy because of failing distribution tests (see #60347)
 #Description: scientific python
 #Pinned versions: 1.10.1
@@ -324,7 +328,8 @@ pywavelets==1.7.0 ; python_version >= "3.12"
 #Pinned versions: 1.4.1
 #test that import:

-lxml==5.3.0
+lxml==5.3.0 ; python_version < "3.14"
+lxml==6.0.2 ; python_version >= "3.14"
 #Description: This is a requirement of unittest-xml-reporting

 PyGithub==2.3.0
@@ -334,7 +339,9 @@ sympy==1.13.3
 #Pinned versions:
 #test that import:

-onnx==1.19.1
+onnx==1.19.1 ; python_version < "3.14"
+# Unpin once Python 3.14 is supported. See onnxruntime issue 26309.
+onnx==1.18.0 ; python_version == "3.14"
 #Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
@@ -359,7 +366,7 @@ pwlf==2.2.1
 #test that import: test_sac_estimator.py

 # To build PyTorch itself
-pyyaml==6.0.2
+pyyaml==6.0.3
 pyzstd
 setuptools==78.1.1
 packaging==23.1
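The pins above select wheels with PEP 508 environment markers, so every interpreter version should match exactly one line per package. A minimal sketch of how to sanity-check the new 3.13/3.14 boundary with the `packaging` library (assumed available; it is the same marker engine pip uses):

```python
# Sanity-check that the new markers partition cleanly at the 3.13/3.14 boundary.
from packaging.markers import Marker

pins = {
    "numpy==2.1.2": 'python_version >= "3.13" and python_version < "3.14"',
    "numpy==2.3.4": 'python_version >= "3.14"',
    "pandas==2.2.3": 'python_version >= "3.13" and python_version < "3.14"',
    "pandas==2.3.3": 'python_version >= "3.14"',
}

for py in ("3.13", "3.14"):
    # evaluate() merges the given keys over the current default environment
    chosen = [p for p, m in pins.items() if Marker(m).evaluate({"python_version": py})]
    print(py, "->", chosen)
# 3.13 -> ['numpy==2.1.2', 'pandas==2.2.3']
# 3.14 -> ['numpy==2.3.4', 'pandas==2.3.3']
```

Splitting the old open-ended `>= "3.13"` pin into `< "3.14"` and `>= "3.14"` halves keeps the selection unambiguous once 3.14 images join the matrix.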
@@ -54,12 +54,15 @@ ENV OPENSSL_DIR /opt/openssl
 RUN rm install_openssl.sh

 ARG INDUCTOR_BENCHMARKS
+ARG ANACONDA_PYTHON_VERSION
+ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
 COPY ci_commit_pins/timm.txt timm.txt
+COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt

 # Install XPU Dependencies
 ARG XPU_VERSION
@@ -100,6 +100,8 @@ COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
 COPY ci_commit_pins/timm.txt timm.txt
 COPY ci_commit_pins/torchbench.txt torchbench.txt
+# Only build aoti cpp tests when INDUCTOR_BENCHMARKS is set to True
+ENV BUILD_AOT_INDUCTOR_TEST ${INDUCTOR_BENCHMARKS}
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
@@ -6,7 +6,7 @@ dependencies = [
     "GitPython==3.1.45",
     "docker==7.1.0",
     "pytest==7.3.2",
-    "uv==0.9.5"
+    "uv==0.9.6"
 ]

 [tool.setuptools]
@@ -1,7 +1,7 @@
 SHELL=/usr/bin/env bash

 DOCKER_CMD ?= docker
-DESIRED_ROCM ?= 7.0
+DESIRED_ROCM ?= 7.1
 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
 PACKAGE_NAME = magma-rocm
 # inherit this from underlying docker image, do not pass this env var to docker
@@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	magma-rocm/build_magma.sh

 .PHONY: all
+all: magma-rocm71
 all: magma-rocm70
 all: magma-rocm64
@@ -24,6 +25,11 @@ clean:
 	$(RM) -r magma-*
 	$(RM) -r output

+.PHONY: magma-rocm71
+magma-rocm71: DESIRED_ROCM := 7.1
+magma-rocm71:
+	$(DOCKER_RUN)
+
 .PHONY: magma-rocm70
 magma-rocm70: DESIRED_ROCM := 7.0
 magma-rocm70:
@@ -6,8 +6,8 @@ set -eou pipefail
 # The script expects DESIRED_CUDA and PACKAGE_NAME to be set
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

-# https://github.com/icl-utk-edu/magma/pull/65
-MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
+# post merge of https://github.com/icl-utk-edu/magma/pull/65
+MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f

 # Folders for the build
 PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE
 # Fetch magma sources and verify checksum
 pushd ${PACKAGE_DIR}
-git clone https://github.com/jeffdaily/magma
+git clone https://github.com/icl-utk-edu/magma
 pushd magma
 git checkout ${MAGMA_VERSION}
 popd
@@ -426,7 +426,7 @@ fi
 if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then
   # export test times so that potential sharded tests that'll branch off this build will use consistent data
   # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
-  python tools/stats/export_test_times.py
+  PYTHONPATH=. python tools/stats/export_test_times.py
 fi
 # don't do this for bazel or s390x or riscv64 as they don't use sccache
 if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
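For the `PYTHONPATH=.` change above: when a script is run by path, CPython puts the script's own directory, not the current directory, at the front of `sys.path`, so repo-root absolute imports (hypothetically, `from tools.stats import ...`; the exact imports are not shown in this diff) only resolve if the repo root is added explicitly. A minimal sketch, assuming it is run from the repo root:

```python
# For `python tools/stats/export_test_times.py`, sys.path[0] is tools/stats/,
# so a top-level `tools` package is not importable from there on its own;
# PYTHONPATH=. prepends the repo root and makes such imports resolve.
import os
import subprocess
import sys

env = {**os.environ, "PYTHONPATH": "."}  # "." resolves to the repo root
subprocess.run(
    [sys.executable, "tools/stats/export_test_times.py"],  # mirrors the CI change
    env=env,
    check=True,
)
```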
@@ -460,28 +460,18 @@ test_inductor_shard() {
     --verbose
 }

-test_inductor_aoti() {
-  # docker build uses bdist_wheel which does not work with test_aot_inductor
-  # TODO: need a faster way to build
+test_inductor_aoti_cpp() {
   if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
     # We need to hipify before building again
     python3 tools/amd_build/build_amd.py
   fi
   if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
-    BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
     # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
     TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
   else
-    BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
     TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
   fi
-
-  # aoti cmake custom command requires `torch` to be installed
-  # initialize the cmake build cache and install torch
-  /usr/bin/env "${BUILD_COMMAND[@]}"
-  # rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
-  /usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"

   /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
 }

@@ -582,6 +572,8 @@ fi

 if [[ "${TEST_CONFIG}" == *cpu* ]]; then
   DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
+elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
+  DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
 else
   DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
 fi
@@ -675,6 +667,8 @@ test_perf_for_dashboard() {
     device=cuda_b200
   elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
     device=rocm
+  elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
+    device=xpu
   fi

   for mode in "${modes[@]}"; do
@@ -1767,7 +1761,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
   else
     # Do this after checkout_install_torchbench to ensure we clobber any
     # nightlies that torchbench may pull in
-    if [[ "${TEST_CONFIG}" != *cpu* ]]; then
+    if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
       install_torchrec_and_fbgemm
     fi
     PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
@@ -1776,7 +1770,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
   install_torchvision
   PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
   if [[ "$SHARD_NUMBER" -eq "1" ]]; then
-    test_inductor_aoti
+    test_inductor_aoti_cpp
   fi
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
   install_torchvision
@@ -7,12 +7,9 @@ if "%DESIRED_PYTHON%" == "3.13t" (
     set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
     set ADDITIONAL_OPTIONS="Include_freethreaded=1"
     set PYTHON_EXEC="python3.13t"
-) else if "%DESIRED_PYTHON%"=="3.14" (
-    echo Python version is set to 3.14 or 3.14t
-    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
 ) else if "%DESIRED_PYTHON%"=="3.14t" (
     echo Python version is set to 3.14 or 3.14t
-    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
+    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe"
     set ADDITIONAL_OPTIONS="Include_freethreaded=1"
     set PYTHON_EXEC="python3.14t"
 ) else (
.github/actions/diskspace-cleanup/action.yml (4 changes)

@@ -27,7 +27,9 @@ runs:
       docker system prune -af
       diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
       if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
-        echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
+        diskspace_cutoff_int=$((diskspace_cutoff + 0))
+        difference=$((100 - diskspace_cutoff_int))
+        echo "Error: Available diskspace is less than $difference percent. Not enough diskspace."
         echo "$msg"
         exit 1
       else
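The rewritten error message above fixes an inverted quantity: `diskspace_cutoff` bounds the *used* percentage, so the *available* percentage quoted to the user should be `100 - cutoff`, not the cutoff itself. Restated in Python with example values:

```python
# Example values, restating the action.yml shell arithmetic.
diskspace_cutoff = 90  # maximum allowed *used* disk percentage
diskspace_new = 93     # used percentage parsed from `df -H --output=pcent`

if diskspace_new > diskspace_cutoff:
    diskspace_cutoff_int = int(diskspace_cutoff)  # mirrors $((diskspace_cutoff + 0))
    difference = 100 - diskspace_cutoff_int       # available-space threshold
    # The old message printed the cutoff (90) here, reading as if 90% had to be
    # free; what the check actually guarantees is that < 10% is available.
    print(f"Error: Available diskspace is less than {difference} percent. Not enough diskspace.")
```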
.github/ci_commit_pins/audio.txt (2 changes)

@@ -1 +1 @@
-69bbe7363897764f9e758d851cd0340147d27f94
+3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2
.github/ci_commit_pins/vision.txt (2 changes)

@@ -1 +1 @@
-1752fe6809b74921644866275ab80244b96e80bc
+cfbc5c2f1c798991715a6b06bb3ce46478c4487c
.github/merge_rules.yaml (20 changes)

@@ -540,6 +540,26 @@
   - Lint
   - pull

+- name: PrivateUse1
+  patterns:
+  - torch/accelerator/**
+  - torch/utils/backend_registration.py
+  - torch/csrc/acc/**
+  - torch/csrc/DeviceAccelerator.*
+  - torch/csrc/profiler/standalone/privateuse1_observer.*
+  - aten/src/ATen/DeviceAccelerator.*
+  - aten/src/ATen/core/GeneratorForPrivateuseone.*
+  - aten/src/ATen/detail/PrivateUse1HooksInterface.*
+  - docs/source/accelerator/**
+  - test/cpp_extensions/open_registration_extension/torch_openreg/**
+  approved_by:
+  - albanD
+  - fffrog
+  mandatory_checks_name:
+  - EasyCLA
+  - Lint
+  - pull
+
 - name: superuser
   patterns:
   - '*'
.github/pytorch-probot.yml (2 changes)

@@ -19,6 +19,7 @@ ciflow_push_tags:
 - ciflow/inductor-perf-test-nightly-rocm-mi300
 - ciflow/inductor-perf-test-nightly-rocm-mi355
 - ciflow/inductor-perf-test-nightly-x86-zen
+- ciflow/inductor-perf-test-nightly-xpu
 - ciflow/inductor-periodic
 - ciflow/inductor-rocm
 - ciflow/linux-aarch64
@@ -26,6 +27,7 @@ ciflow_push_tags:
 - ciflow/nightly
 - ciflow/op-benchmark
 - ciflow/periodic
+- ciflow/periodic-rocm-mi200
 - ciflow/periodic-rocm-mi300
 - ciflow/pull
 - ciflow/quantization-periodic
.github/scripts/generate_binary_build_matrix.py (97 changes)

@@ -11,11 +11,17 @@ architectures:
 * Latest XPU
 """

+import json
 import os
+import re
+from pathlib import Path
 from typing import Optional


-# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
+SCRIPT_DIR = Path(__file__).absolute().parent
+REPO_ROOT = SCRIPT_DIR.parent.parent
+
+
 CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
 CUDA_STABLE = "12.8"
 CUDA_ARCHES_FULL_VERSION = {
@@ -31,8 +37,7 @@ CUDA_ARCHES_CUDNN_VERSION = {
     "13.0": "9",
 }

-# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
-ROCM_ARCHES = ["6.4", "7.0"]
+ROCM_ARCHES = ["7.0", "7.1"]

 XPU_ARCHES = ["xpu"]

@@ -56,7 +61,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
         "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
         "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
         "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
         "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
         "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
@@ -73,7 +78,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
         "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
         "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
         "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
         "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
         "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
@@ -90,7 +95,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
         "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
         "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
         "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
         "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
         "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
         "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
@@ -107,7 +112,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
         "nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | "
         "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
         "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | "
         "nvidia-nvtx==13.0.85; platform_system == 'Linux' | "
         "nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | "
         "nvidia-cufile==1.15.1.6; platform_system == 'Linux'"
@@ -137,9 +142,48 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
 }


-def get_nccl_wheel_version(arch_version: str) -> str:
-    import re
-
+# Used by tools/nightly.py
+PYTORCH_NIGHTLY_PIP_INDEX_URL = "https://download.pytorch.org/whl/nightly"
+NIGHTLY_SOURCE_MATRIX = {
+    "cpu": dict(
+        name="cpu",
+        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cpu",
+        supported_platforms=["Linux", "macOS", "Windows"],
+        accelerator="cpu",
+    )
+}
+CUDA_NIGHTLY_SOURCE_MATRIX = {
+    f"cuda-{major}.{minor}": dict(
+        name=f"cuda-{major}.{minor}",
+        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cu{major}{minor}",
+        supported_platforms=["Linux", "Windows"],
+        accelerator="cuda",
+    )
+    for major, minor in (map(int, version.split(".")) for version in CUDA_ARCHES)
+}
+ROCM_NIGHTLY_SOURCE_MATRIX = {
+    f"rocm-{major}.{minor}": dict(
+        name=f"rocm-{major}.{minor}",
+        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/rocm{major}.{minor}",
+        supported_platforms=["Linux"],
+        accelerator="rocm",
+    )
+    for major, minor in (map(int, version.split(".")) for version in ROCM_ARCHES)
+}
+XPU_NIGHTLY_SOURCE_MATRIX = {
+    "xpu": dict(
+        name="xpu",
+        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/xpu",
+        supported_platforms=["Linux"],
+        accelerator="xpu",
+    )
+}
+NIGHTLY_SOURCE_MATRIX.update(CUDA_NIGHTLY_SOURCE_MATRIX)
+NIGHTLY_SOURCE_MATRIX.update(ROCM_NIGHTLY_SOURCE_MATRIX)
+NIGHTLY_SOURCE_MATRIX.update(XPU_NIGHTLY_SOURCE_MATRIX)
+
+
+def get_nccl_wheel_version(arch_version: str) -> str:
     requirements = map(
         str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
     )
@@ -147,17 +191,14 @@ def get_nccl_wheel_version(arch_version: str) -> str:


 def read_nccl_pin(arch_version: str) -> str:
-    from pathlib import Path
-
-    nccl_pin_path = os.path.join(
-        Path(__file__).absolute().parents[2],
-        ".ci",
-        "docker",
-        "ci_commit_pins",
-        f"nccl-cu{arch_version[:2]}.txt",
+    nccl_pin_path = (
+        REPO_ROOT
+        / ".ci"
+        / "docker"
+        / "ci_commit_pins"
+        / f"nccl-cu{arch_version[:2]}.txt"
     )
-    with open(nccl_pin_path) as f:
-        return f.read().strip()
+    return nccl_pin_path.read_text().strip()


@@ -165,7 +206,8 @@ def validate_nccl_dep_consistency(arch_version: str) -> None:
     wheel_ver = get_nccl_wheel_version(arch_version)
     if not nccl_release_tag.startswith(f"v{wheel_ver}"):
         raise RuntimeError(
-            f"{arch_version} NCCL release tag version {nccl_release_tag} does not correspond to wheel version {wheel_ver}"
+            f"{arch_version} NCCL release tag version {nccl_release_tag} "
+            f"does not correspond to wheel version {wheel_ver}"
         )


@@ -412,7 +454,14 @@ def generate_wheels_matrix(
     return ret


-validate_nccl_dep_consistency("13.0")
-validate_nccl_dep_consistency("12.9")
-validate_nccl_dep_consistency("12.8")
-validate_nccl_dep_consistency("12.6")
+arch_version = ""
+for arch_version in CUDA_ARCHES:
+    validate_nccl_dep_consistency(arch_version)
+del arch_version
+
+
+if __name__ == "__main__":
+    # Used by tools/nightly.py
+    (SCRIPT_DIR / "nightly_source_matrix.json").write_text(
+        json.dumps(NIGHTLY_SOURCE_MATRIX, indent=4) + "\n"
+    )
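The new `NIGHTLY_SOURCE_MATRIX` above is serialized to `nightly_source_matrix.json` for `tools/nightly.py` to consume. A condensed, runnable sketch of the same construction (arch lists shortened here for brevity; the real script derives them from `CUDA_ARCHES`/`ROCM_ARCHES`):

```python
# Condensed sketch of the nightly source matrix construction.
import json
from pathlib import Path

PYTORCH_NIGHTLY_PIP_INDEX_URL = "https://download.pytorch.org/whl/nightly"
CUDA_ARCHES = ["12.6", "13.0"]
ROCM_ARCHES = ["7.0", "7.1"]

matrix = {
    "cpu": dict(
        name="cpu",
        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cpu",
        supported_platforms=["Linux", "macOS", "Windows"],
        accelerator="cpu",
    )
}
# CUDA channels are keyed cu{major}{minor} (e.g. cu126); ROCm keeps the dot.
matrix.update({
    f"cuda-{v}": dict(
        name=f"cuda-{v}",
        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cu{v.replace('.', '')}",
        supported_platforms=["Linux", "Windows"],
        accelerator="cuda",
    )
    for v in CUDA_ARCHES
})
matrix.update({
    f"rocm-{v}": dict(
        name=f"rocm-{v}",
        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/rocm{v}",
        supported_platforms=["Linux"],
        accelerator="rocm",
    )
    for v in ROCM_ARCHES
})

# Written next to the script in the real workflow; cwd here for the sketch.
Path("nightly_source_matrix.json").write_text(json.dumps(matrix, indent=4) + "\n")
```

Keeping the matrix in the generator script means the nightly channel list is updated in exactly one place when an arch list changes, which is what made the old "Please also update `PIP_SOURCES` in tools/nightly.py" comments obsolete.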
.github/workflows/_xpu-test.yml (13 changes)

@@ -38,6 +38,10 @@ on:
         default: ""
         description: |
           List of tests to include (empty string implies default list)
+      dashboard-tag:
+        required: false
+        type: string
+        default: ""
       disable-monitor:
         description: |
           [Experimental] Disable utilization monitoring for tests.
@@ -58,6 +62,11 @@ on:
         required: false
         type: number
         default: 1
+    secrets:
+      HUGGING_FACE_HUB_TOKEN:
+        required: false
+        description: |
+          HF Auth token to avoid rate limits when downloading models or datasets from hub
 permissions:
   id-token: write
   contents: read
@@ -196,6 +205,8 @@ jobs:
         PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
         PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
         TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
+        DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
       timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
       run: |
         # Fetch aws credential from IMDs
@@ -246,6 +257,8 @@ jobs:
           -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
           -e TESTS_TO_INCLUDE \
           -e ZE_AFFINITY_MASK \
+          -e HUGGING_FACE_HUB_TOKEN \
+          -e DASHBOARD_TAG \
           --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
           --ulimit stack=10485760:83886080 \
           --ulimit core=0 \
.github/workflows/build-almalinux-images.yml (2 changes)

@@ -36,7 +36,7 @@ jobs:
     runs-on: linux.9xlarge.ephemeral
     strategy:
       matrix:
-        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
+        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm7.0", "rocm7.1", "cpu"]
     steps:
       - name: Build docker image
         uses: pytorch/pytorch/.github/actions/binary-docker-build@main
.github/workflows/build-libtorch-images.yml (2 changes)

@@ -52,8 +52,8 @@ jobs:
           { tag: "cuda12.9" },
           { tag: "cuda12.8" },
           { tag: "cuda12.6" },
-          { tag: "rocm6.4" },
           { tag: "rocm7.0" },
+          { tag: "rocm7.1" },
           { tag: "cpu" },
         ]
     steps:
.github/workflows/build-magma-rocm-linux.yml (2 changes)

@@ -34,7 +34,7 @@ jobs:
       id-token: write
     strategy:
       matrix:
-        rocm_version: ["70", "64"]
+        rocm_version: ["71", "70"]
     steps:
       - name: Checkout PyTorch
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
.github/workflows/build-manywheel-images.yml (2 changes)

@@ -54,8 +54,8 @@ jobs:
           { name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" },
           { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
           { name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
-          { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
           { name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
+          { name: "manylinux2_28-builder", tag: "rocm7.1", runner: "linux.9xlarge.ephemeral" },
           { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
           { name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
           { name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" },
.github/workflows/build-triton-wheel.yml (7 changes)

@@ -55,7 +55,7 @@ jobs:
       docker-image: ["pytorch/manylinux2_28-builder:cpu"]
       include:
         - device: "rocm"
-          rocm_version: "7.0"
+          rocm_version: "7.1"
           runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
         - device: "cuda"
           rocm_version: ""
@@ -159,12 +159,7 @@ jobs:
           WITH_CLANG_LDD="--with-clang-ldd"
         fi

-        if [[ "${BUILD_DEVICE}" == xpu ]]; then
-          docker exec -t "${container_name}" bash -c "dnf install -y gcc-toolset-13-gcc-c++"
-          docker exec -t "${container_name}" bash -c "source /opt/rh/gcc-toolset-13/enable && ${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE"
-        else
-          docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"
-        fi
+        docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"

         if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "xpu") ]]; then
           docker exec -t "${container_name}" bash -c "auditwheel repair --plat ${PLATFORM} //artifacts/*.whl"
.github/workflows/docker-builds.yml (2 changes)

@@ -57,6 +57,7 @@ jobs:
           pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
           pytorch-linux-jammy-py3.10-clang12,
           pytorch-linux-jammy-py3.13-clang12,
+          pytorch-linux-jammy-py3.14-clang12,
           pytorch-linux-jammy-rocm-n-py3,
           pytorch-linux-noble-rocm-n-py3,
           pytorch-linux-jammy-rocm-n-py3-benchmarks,
@@ -66,6 +67,7 @@ jobs:
           pytorch-linux-jammy-py3.12-halide,
           pytorch-linux-jammy-xpu-n-1-py3,
           pytorch-linux-jammy-xpu-n-py3,
+          pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
           pytorch-linux-jammy-py3-clang18-asan,
           pytorch-linux-jammy-py3-clang12-onnx,
           pytorch-linux-jammy-linter,
56
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
56
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
@ -132,7 +132,7 @@ jobs:
|
|||||||
ALPINE_IMAGE: "arm64v8/alpine"
|
ALPINE_IMAGE: "arm64v8/alpine"
|
||||||
build_name: manywheel-py3_10-cuda-aarch64-12_6
|
build_name: manywheel-py3_10-cuda-aarch64-12_6
|
||||||
build_environment: linux-aarch64-binary-manywheel
|
build_environment: linux-aarch64-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||||
timeout-minutes: 420
|
timeout-minutes: 420
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
@ -178,7 +178,7 @@ jobs:
|
|||||||
ALPINE_IMAGE: "arm64v8/alpine"
|
ALPINE_IMAGE: "arm64v8/alpine"
|
||||||
build_name: manywheel-py3_10-cuda-aarch64-12_8
|
build_name: manywheel-py3_10-cuda-aarch64-12_8
|
||||||
build_environment: linux-aarch64-binary-manywheel
|
build_environment: linux-aarch64-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||||
timeout-minutes: 420
|
timeout-minutes: 420
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
@@ -224,7 +224,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -270,7 +270,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -381,7 +381,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -427,7 +427,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -473,7 +473,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -519,7 +519,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -630,7 +630,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -676,7 +676,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -722,7 +722,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -768,7 +768,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -879,7 +879,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -925,7 +925,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -971,7 +971,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1017,7 +1017,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1128,7 +1128,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1174,7 +1174,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1220,7 +1220,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1266,7 +1266,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1377,7 +1377,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1423,7 +1423,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1469,7 +1469,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1515,7 +1515,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1626,7 +1626,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14t-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1672,7 +1672,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14t-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1718,7 +1718,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_14t-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1764,7 +1764,7 @@ jobs:
  ALPINE_IMAGE: "arm64v8/alpine"
  build_name: manywheel-py3_14t-cuda-aarch64-13_0
  build_environment: linux-aarch64-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
  timeout-minutes: 420
  secrets:
  github-token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/generated-linux-binary-libtorch-nightly.yml (generated, vendored; 236 changed lines)
@@ -384,124 +384,6 @@ jobs:
  github-token: ${{ secrets.GITHUB_TOKEN }}
  uses: ./.github/workflows/_binary-upload.yml
- libtorch-rocm6_4-shared-with-deps-release-build:
- if: ${{ github.repository_owner == 'pytorch' }}
- uses: ./.github/workflows/_binary-build-linux.yml
- needs: get-label-type
- with:
- PYTORCH_ROOT: /pytorch
- PACKAGE_TYPE: libtorch
- # TODO: This is a legacy variable that we eventually want to get rid of in
- # favor of GPU_ARCH_VERSION
- DESIRED_CUDA: rocm6.4
- GPU_ARCH_VERSION: "6.4"
- GPU_ARCH_TYPE: rocm
- DOCKER_IMAGE: libtorch-cxx11-builder
- DOCKER_IMAGE_TAG_PREFIX: rocm6.4
- LIBTORCH_CONFIG: release
- LIBTORCH_VARIANT: shared-with-deps
- runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
- timeout-minutes: 300
- build_name: libtorch-rocm6_4-shared-with-deps-release
- build_environment: linux-binary-libtorch
- secrets:
- github-token: ${{ secrets.GITHUB_TOKEN }}
- libtorch-rocm6_4-shared-with-deps-release-test: # Testing
- if: ${{ github.repository_owner == 'pytorch' }}
- needs:
- - libtorch-rocm6_4-shared-with-deps-release-build
- - get-label-type
- runs-on: linux.rocm.gpu.mi250
- timeout-minutes: 240
- env:
- PYTORCH_ROOT: /pytorch
- PACKAGE_TYPE: libtorch
- # TODO: This is a legacy variable that we eventually want to get rid of in
- # favor of GPU_ARCH_VERSION
- DESIRED_CUDA: rocm6.4
- GPU_ARCH_VERSION: "6.4"
- GPU_ARCH_TYPE: rocm
- SKIP_ALL_TESTS: 1
- DOCKER_IMAGE: libtorch-cxx11-builder
- DOCKER_IMAGE_TAG_PREFIX: rocm6.4
- LIBTORCH_CONFIG: release
- LIBTORCH_VARIANT: shared-with-deps
- permissions:
- id-token: write
- contents: read
- steps:
- - name: Setup ROCm
- uses: ./.github/actions/setup-rocm
- - uses: actions/download-artifact@v4.1.7
- name: Download Build Artifacts
- with:
- name: libtorch-rocm6_4-shared-with-deps-release
- path: "${{ runner.temp }}/artifacts/"
- - name: Checkout PyTorch
- uses: actions/checkout@v4
- with:
- ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
- submodules: recursive
- path: pytorch
- show-progress: false
- - name: Clean PyTorch checkout
- run: |
- # Remove any artifacts from the previous checkouts
- git clean -fxd
- working-directory: pytorch
- - name: ROCm set GPU_FLAG
- run: |
- echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- - name: configure aws credentials
- id: aws_creds
- if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
- uses: aws-actions/configure-aws-credentials@v4
- with:
- role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
- aws-region: us-east-1
- role-duration-seconds: 18000
- - name: Calculate docker image
- id: calculate-docker-image
- uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
- with:
- docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
- docker-image-name: libtorch-cxx11-builder
- custom-tag-prefix: rocm6.4
- docker-build-dir: .ci/docker
- working-directory: pytorch
- - name: Pull Docker image
- uses: pytorch/test-infra/.github/actions/pull-docker-image@main
- with:
- docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- - name: Test Pytorch binary
- uses: ./pytorch/.github/actions/test-pytorch-binary
- env:
- DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- - name: Teardown ROCm
- uses: ./.github/actions/teardown-rocm
- libtorch-rocm6_4-shared-with-deps-release-upload: # Uploading
- if: ${{ github.repository_owner == 'pytorch' }}
- permissions:
- id-token: write
- contents: read
- needs: libtorch-rocm6_4-shared-with-deps-release-test
- with:
- PYTORCH_ROOT: /pytorch
- PACKAGE_TYPE: libtorch
- # TODO: This is a legacy variable that we eventually want to get rid of in
- # favor of GPU_ARCH_VERSION
- DESIRED_CUDA: rocm6.4
- GPU_ARCH_VERSION: "6.4"
- GPU_ARCH_TYPE: rocm
- DOCKER_IMAGE: libtorch-cxx11-builder
- DOCKER_IMAGE_TAG_PREFIX: rocm6.4
- LIBTORCH_CONFIG: release
- LIBTORCH_VARIANT: shared-with-deps
- build_name: libtorch-rocm6_4-shared-with-deps-release
- secrets:
- github-token: ${{ secrets.GITHUB_TOKEN }}
- uses: ./.github/workflows/_binary-upload.yml
  libtorch-rocm7_0-shared-with-deps-release-build:
  if: ${{ github.repository_owner == 'pytorch' }}
  uses: ./.github/workflows/_binary-build-linux.yml
@@ -619,3 +501,121 @@ jobs:
  secrets:
  github-token: ${{ secrets.GITHUB_TOKEN }}
  uses: ./.github/workflows/_binary-upload.yml
+ libtorch-rocm7_1-shared-with-deps-release-build:
+ if: ${{ github.repository_owner == 'pytorch' }}
+ uses: ./.github/workflows/_binary-build-linux.yml
+ needs: get-label-type
+ with:
+ PYTORCH_ROOT: /pytorch
+ PACKAGE_TYPE: libtorch
+ # TODO: This is a legacy variable that we eventually want to get rid of in
+ # favor of GPU_ARCH_VERSION
+ DESIRED_CUDA: rocm7.1
+ GPU_ARCH_VERSION: "7.1"
+ GPU_ARCH_TYPE: rocm
+ DOCKER_IMAGE: libtorch-cxx11-builder
+ DOCKER_IMAGE_TAG_PREFIX: rocm7.1
+ LIBTORCH_CONFIG: release
+ LIBTORCH_VARIANT: shared-with-deps
+ runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+ timeout-minutes: 300
+ build_name: libtorch-rocm7_1-shared-with-deps-release
+ build_environment: linux-binary-libtorch
+ secrets:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ libtorch-rocm7_1-shared-with-deps-release-test: # Testing
+ if: ${{ github.repository_owner == 'pytorch' }}
+ needs:
+ - libtorch-rocm7_1-shared-with-deps-release-build
+ - get-label-type
+ runs-on: linux.rocm.gpu.mi250
+ timeout-minutes: 240
+ env:
+ PYTORCH_ROOT: /pytorch
+ PACKAGE_TYPE: libtorch
+ # TODO: This is a legacy variable that we eventually want to get rid of in
+ # favor of GPU_ARCH_VERSION
+ DESIRED_CUDA: rocm7.1
+ GPU_ARCH_VERSION: "7.1"
+ GPU_ARCH_TYPE: rocm
+ SKIP_ALL_TESTS: 1
+ DOCKER_IMAGE: libtorch-cxx11-builder
+ DOCKER_IMAGE_TAG_PREFIX: rocm7.1
+ LIBTORCH_CONFIG: release
+ LIBTORCH_VARIANT: shared-with-deps
+ permissions:
+ id-token: write
+ contents: read
+ steps:
+ - name: Setup ROCm
+ uses: ./.github/actions/setup-rocm
+ - uses: actions/download-artifact@v4.1.7
+ name: Download Build Artifacts
+ with:
+ name: libtorch-rocm7_1-shared-with-deps-release
+ path: "${{ runner.temp }}/artifacts/"
+ - name: Checkout PyTorch
+ uses: actions/checkout@v4
+ with:
+ ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+ submodules: recursive
+ path: pytorch
+ show-progress: false
+ - name: Clean PyTorch checkout
+ run: |
+ # Remove any artifacts from the previous checkouts
+ git clean -fxd
+ working-directory: pytorch
+ - name: ROCm set GPU_FLAG
+ run: |
+ echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
+ - name: configure aws credentials
+ id: aws_creds
+ if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
+ uses: aws-actions/configure-aws-credentials@v4
+ with:
+ role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+ aws-region: us-east-1
+ role-duration-seconds: 18000
+ - name: Calculate docker image
+ id: calculate-docker-image
+ uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+ with:
+ docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
+ docker-image-name: libtorch-cxx11-builder
+ custom-tag-prefix: rocm7.1
+ docker-build-dir: .ci/docker
+ working-directory: pytorch
+ - name: Pull Docker image
+ uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+ with:
+ docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+ - name: Test Pytorch binary
+ uses: ./pytorch/.github/actions/test-pytorch-binary
+ env:
+ DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
+ - name: Teardown ROCm
+ uses: ./.github/actions/teardown-rocm
+ libtorch-rocm7_1-shared-with-deps-release-upload: # Uploading
+ if: ${{ github.repository_owner == 'pytorch' }}
+ permissions:
+ id-token: write
+ contents: read
+ needs: libtorch-rocm7_1-shared-with-deps-release-test
+ with:
+ PYTORCH_ROOT: /pytorch
+ PACKAGE_TYPE: libtorch
+ # TODO: This is a legacy variable that we eventually want to get rid of in
+ # favor of GPU_ARCH_VERSION
+ DESIRED_CUDA: rocm7.1
+ GPU_ARCH_VERSION: "7.1"
+ GPU_ARCH_TYPE: rocm
+ DOCKER_IMAGE: libtorch-cxx11-builder
+ DOCKER_IMAGE_TAG_PREFIX: rocm7.1
+ LIBTORCH_CONFIG: release
+ LIBTORCH_VARIANT: shared-with-deps
+ build_name: libtorch-rocm7_1-shared-with-deps-release
+ secrets:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ uses: ./.github/workflows/_binary-upload.yml
.github/workflows/generated-linux-binary-manywheel-nightly.yml (generated, vendored; 1666 changed lines)
File diff suppressed because it is too large.
.github/workflows/inductor-perf-test-nightly-xpu.yml (vendored; new file, 148 lines)
@@ -0,0 +1,148 @@
+ name: inductor-perf-nightly-xpu
+
+ on:
+ push:
+ tags:
+ - ciflow/inductor-perf-test-nightly-xpu/*
+ schedule:
+ - cron: 30 17 * * *
+ workflow_dispatch:
+ inputs:
+ training:
+ description: Run training (on by default)?
+ required: false
+ type: boolean
+ default: true
+ inference:
+ description: Run inference (on by default)?
+ required: false
+ type: boolean
+ default: true
+ default:
+ description: Run inductor_default?
+ required: false
+ type: boolean
+ default: false
+ dynamic:
+ description: Run inductor_dynamic_shapes?
+ required: false
+ type: boolean
+ default: false
+ cppwrapper:
+ description: Run inductor_cpp_wrapper?
+ required: false
+ type: boolean
+ default: false
+ cudagraphs:
+ description: Run inductor_cudagraphs?
+ required: false
+ type: boolean
+ default: false
+ freezing_cudagraphs:
+ description: Run inductor_cudagraphs with freezing for inference?
+ required: false
+ type: boolean
+ default: false
+ aotinductor:
+ description: Run aot_inductor for inference?
+ required: false
+ type: boolean
+ default: false
+ maxautotune:
+ description: Run inductor_max_autotune?
+ required: false
+ type: boolean
+ default: false
+ benchmark_configs:
+ description: The list of configs used the benchmark
+ required: false
+ type: string
+ default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
+
+ concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+ cancel-in-progress: true
+
+ permissions: read-all
+
+ jobs:
+ get-label-type:
+ name: get-label-type
+ uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+ if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
+ with:
+ triggering_actor: ${{ github.triggering_actor }}
+ issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+ curr_branch: ${{ github.head_ref || github.ref_name }}
+ curr_ref_type: ${{ github.ref_type }}
+ opt_out_experiments: lf
+
+ xpu-n-py3_10-inductor-benchmark-build:
+ name: xpu-n-py3.10-inductor-benchmark
+ uses: ./.github/workflows/_linux-build.yml
+ needs: get-label-type
+ with:
+ runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+ build-environment: linux-jammy-xpu-n-py3.10
+ docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
+ runner: linux.c7i.12xlarge
+ test-matrix: |
+ { include: [
+ { config: "inductor_huggingface_perf_xpu", shard: 1, num_shards: 5, runner: "linux.idc.xpu" },
+ { config: "inductor_huggingface_perf_xpu", shard: 2, num_shards: 5, runner: "linux.idc.xpu" },
+ { config: "inductor_huggingface_perf_xpu", shard: 3, num_shards: 5, runner: "linux.idc.xpu" },
+ { config: "inductor_huggingface_perf_xpu", shard: 4, num_shards: 5, runner: "linux.idc.xpu" },
+ { config: "inductor_huggingface_perf_xpu", shard: 5, num_shards: 5, runner: "linux.idc.xpu" },
+ { config: "inductor_timm_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
+ { config: "inductor_timm_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
+ { config: "inductor_timm_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
+ { config: "inductor_timm_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
+ { config: "inductor_timm_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
+ { config: "inductor_timm_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
+ { config: "inductor_torchbench_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
+ { config: "inductor_torchbench_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
+ { config: "inductor_torchbench_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
+ { config: "inductor_torchbench_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
+ { config: "inductor_torchbench_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
+ { config: "inductor_torchbench_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
+ ]}
+ secrets: inherit
+
+ xpu-n-py3_10-inductor-benchmark-test-nightly:
+ permissions:
+ id-token: write
+ contents: read
+ if: github.event_name != 'workflow_dispatch'
+ name: xpu-n-py3.10-inductor-benchmark
+ uses: ./.github/workflows/_xpu-test.yml
+ needs: xpu-n-py3_10-inductor-benchmark-build
+ with:
+ build-environment: linux-jammy-xpu-n-py3.10
+ dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
+ docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
+ test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
+ timeout-minutes: 720
+ # Disable monitor in perf tests for more investigation
+ disable-monitor: true
+ monitor-log-interval: 10
+ monitor-data-collect-interval: 2
+ secrets: inherit
+
+ xpu-n-py3_10-inductor-benchmark-test:
+ permissions:
+ id-token: write
+ contents: read
+ if: github.event_name == 'workflow_dispatch'
+ name: xpu-n-py3.10-inductor-test
+ uses: ./.github/workflows/_xpu-test.yml
+ needs: xpu-n-py3_10-inductor-benchmark-build
+ with:
+ build-environment: linux-jammy-xpu-n-py3.10
+ dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
+ docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
+ test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
+ timeout-minutes: 720
+ disable-monitor: false
+ monitor-log-interval: 15
+ monitor-data-collect-interval: 4
+ secrets: inherit
.github/workflows/periodic-rocm-mi200.yml (vendored; new file, 84 lines)
@@ -0,0 +1,84 @@
+ name: periodic-rocm-mi200
+
+ on:
+ schedule:
+ # We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
+ # Also run less frequently on weekends.
+ - cron: 45 0,8,16 * * 1-5
+ - cron: 45 4 * * 0,6
+ - cron: 45 4,12,20 * * 1-5
+ - cron: 45 12 * * 0,6
+ - cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
+ push:
+ tags:
+ - ciflow/periodic/*
+ - ciflow/periodic-rocm-mi200/*
+ branches:
+ - release/*
+ workflow_dispatch:
+
+ concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
+ cancel-in-progress: true
+
+ permissions:
+ id-token: write
+ contents: read
+
+ jobs:
+ llm-td:
+ if: github.repository_owner == 'pytorch'
+ name: before-test
+ uses: ./.github/workflows/llm_td_retrieval.yml
+ permissions:
+ id-token: write
+ contents: read
+
+ target-determination:
+ name: before-test
+ uses: ./.github/workflows/target_determination.yml
+ needs: llm-td
+ permissions:
+ id-token: write
+ contents: read
+
+ get-label-type:
+ name: get-label-type
+ uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+ if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
+ with:
+ triggering_actor: ${{ github.triggering_actor }}
+ issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+ curr_branch: ${{ github.head_ref || github.ref_name }}
+ curr_ref_type: ${{ github.ref_type }}
+
+ linux-jammy-rocm-py3_10-build:
+ name: linux-jammy-rocm-py3.10
+ uses: ./.github/workflows/_linux-build.yml
+ needs: get-label-type
+ with:
+ runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+ build-environment: linux-jammy-rocm-py3.10
+ docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+ test-matrix: |
+ { include: [
+ { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
+ { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
+ { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
+ ]}
+ secrets: inherit
+
+ linux-jammy-rocm-py3_10-test:
+ permissions:
+ id-token: write
+ contents: read
+ name: linux-jammy-rocm-py3.10
+ uses: ./.github/workflows/_rocm-test.yml
+ needs:
+ - linux-jammy-rocm-py3_10-build
+ - target-determination
+ with:
+ build-environment: linux-jammy-rocm-py3.10
+ docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
+ test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
+ secrets: inherit
.github/workflows/periodic.yml (vendored; 31 changed lines)
@@ -204,37 +204,6 @@ jobs:
  test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
  secrets: inherit

- linux-jammy-rocm-py3_10-build:
- name: linux-jammy-rocm-py3.10
- uses: ./.github/workflows/_linux-build.yml
- needs: get-label-type
- with:
- runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
- build-environment: linux-jammy-rocm-py3.10
- docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
- test-matrix: |
- { include: [
- { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
- { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
- { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
- ]}
- secrets: inherit
-
- linux-jammy-rocm-py3_10-test:
- permissions:
- id-token: write
- contents: read
- name: linux-jammy-rocm-py3.10
- uses: ./.github/workflows/_rocm-test.yml
- needs:
- - linux-jammy-rocm-py3_10-build
- - target-determination
- with:
- build-environment: linux-jammy-rocm-py3.10
- docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
- test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
- secrets: inherit
-
  linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build:
  name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck
  uses: ./.github/workflows/_linux-build.yml
.github/workflows/upload-test-stats.yml (vendored; 1 changed line)
@@ -6,6 +6,7 @@ on:
  - pull
  - trunk
  - periodic
+ - periodic-rocm-mi200
  - periodic-rocm-mi300
  - inductor
  - unstable
.github/workflows/xpu.yml (vendored; 20 changed lines)
@@ -59,14 +59,18 @@ jobs:
  runner: linux.c7i.12xlarge
  test-matrix: |
  { include: [
- { config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
+ { config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 2, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 3, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 4, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 5, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 6, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 7, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 8, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 9, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 10, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 11, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 12, num_shards: 12, runner: "linux.idc.xpu" },
  ]}
  secrets: inherit
.gitignore (vendored; 1 changed line)
@@ -143,6 +143,7 @@ scripts/release_notes/*.json
  sccache-stats*.json
  lint.json
  merge_record.json
+ .github/scripts/nightly_source_matrix.json

  # These files get copied over on invoking setup.py
  torchgen/packaged/*
CMakeLists.txt
@@ -374,7 +374,7 @@ cmake_dependent_option(
  "Build the lazy Torchscript backend, not compatible with mobile builds" ON
  "NOT INTERN_BUILD_MOBILE" OFF)
  cmake_dependent_option(BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF)
- cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
+ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder"
  OFF "USE_CUDA" OFF)
  cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
  "CPU_AARCH64" OFF)
README.md
@@ -1,4 +1,4 @@
  

  --------------------------------------------------------------------------------

@@ -72,7 +72,7 @@ Elaborating Further:

  If you use NumPy, then you have used Tensors (a.k.a. ndarray).

  

  PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the
  computation by a huge amount.
@@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc
  While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
  You get the best of speed and flexibility for your crazy research.

  

  ### Python First

@@ -260,7 +260,7 @@ IF(USE_FBGEMM_GENAI)
  if(USE_CUDA)
  # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
  # If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
- set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*")
+ set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped).*")
  file(GLOB_RECURSE fbgemm_genai_native_cuda_cu
  "${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu"
  "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
@@ -291,6 +291,7 @@ IF(USE_FBGEMM_GENAI)

  set(fbgemm_genai_cuh
  "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
+ "${FBGEMM_GENAI_SRCS}/cutlass_extensions/f4f4bf16_grouped/"
  "${FBGEMM_GENAI_SRCS}/"
  )

aten/src/ATen/Context.cpp
@@ -825,6 +825,14 @@ void Context::setDisplayVmapFallbackWarnings(bool enabled) {
|
|||||||
display_vmap_fallback_warnings_ = enabled;
|
display_vmap_fallback_warnings_ = enabled;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Context::warnOnAccumulateGradStreamMismatch() const {
|
||||||
|
return warn_on_accumulate_grad_stream_mismatch_;
|
||||||
|
}
|
||||||
|
|
||||||
|
void Context::setWarnOnAccumulateGradStreamMismatch(bool enabled) {
|
||||||
|
warn_on_accumulate_grad_stream_mismatch_ = enabled;
|
||||||
|
}
|
||||||
|
|
||||||
bool Context::isDefaultMobileCPUAllocatorSet() {
|
bool Context::isDefaultMobileCPUAllocatorSet() {
|
||||||
return prev_allocator_ptr_ != nullptr;
|
return prev_allocator_ptr_ != nullptr;
|
||||||
}
|
}
|
||||||
|
aten/src/ATen/Context.h
@@ -404,6 +404,9 @@ class TORCH_API Context {
  void setDisplayVmapFallbackWarnings(bool enabled);
  bool areVmapFallbackWarningsEnabled() const;

+ void setWarnOnAccumulateGradStreamMismatch(bool enabled);
+ bool warnOnAccumulateGradStreamMismatch() const;
+
  bool isDefaultMobileCPUAllocatorSet();
  void setDefaultMobileCPUAllocator();
  void unsetDefaultMobileCPUAllocator();
@@ -494,6 +497,7 @@ class TORCH_API Context {
  bool release_original_weights = false;
  #endif
  bool display_vmap_fallback_warnings_ = false;
+ bool warn_on_accumulate_grad_stream_mismatch_ = true;
  std::atomic<at::QEngine> quantized_engine = at::QEngine::NoQEngine;
  bool enable_sparse_tensor_invariant_checks = false;
  bool allow_fp16_reduction_cpu = false;
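For context, the new flag follows the existing Context accessor pattern, so C++ callers can flip it through the global context. A minimal sketch, assuming an ATen build that includes this patch (the member defaults to true, per the diff above):

#include <ATen/Context.h>

// Hypothetical helper: silence the accumulate-grad stream-mismatch warning
// process-wide, read the flag back, then restore the default.
void example_toggle_stream_mismatch_warning() {
  at::globalContext().setWarnOnAccumulateGradStreamMismatch(false);
  bool enabled = at::globalContext().warnOnAccumulateGradStreamMismatch();
  (void)enabled; // false at this point
  at::globalContext().setWarnOnAccumulateGradStreamMismatch(true);
}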
@@ -677,8 +677,8 @@ struct CachingHostAllocatorImpl {
  // size. This allows us to quickly find a free block of the right size.
  // We use deque to store per size free list and guard the list with its own
  // mutex.
- alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ =
-     std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
+ alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>>
+     free_list_{MAX_SIZE_INDEX};

  alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
  std::deque<std::pair<E, B*>> events_; // event queue paired with block
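The alignas(hardware_destructive_interference_size) on both members keeps the per-size free lists and the event queue on separate cache lines, so a thread hammering one does not invalidate the line holding the other (false sharing). A standalone sketch of the same idiom, assuming a toolchain that provides std::hardware_destructive_interference_size (C++17, <new>):

#include <atomic>
#include <new>

// Two counters updated from different threads. Without the alignment they
// could land on one cache line and every increment would bounce that line
// between cores; aligning each member to the destructive-interference size
// gives each counter its own line.
struct Counters {
  alignas(std::hardware_destructive_interference_size) std::atomic<long> a{0};
  alignas(std::hardware_destructive_interference_size) std::atomic<long> b{0};
};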
@@ -19,6 +19,13 @@ inline namespace CPU_CAPABILITY {
  #error "Big endian is not supported."
  #endif

+ // GCC does not properly optimize bf16 operators
+ #if defined(__ARM_FEATURE_BF16) && (__clang_major__ >= 19)
+ #define BF16_ARITHMETIC_SUPPORTED() 1
+ #else
+ #define BF16_ARITHMETIC_SUPPORTED() 0
+ #endif
+
  // Unlike the float16_t family of types, bfloat16_t is not available
  // when we're not targeting bfloat16 hardware support on some
  // platforms (but not Mac, so we have to be careful not to shadow the
@@ -352,18 +359,35 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
  other, &Vectorized<float>::name); \
  }

- DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
  Vectorized frac() const;
  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc)
  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt)

  #ifdef __ARM_FEATURE_BF16
+ // Flip sign bit
  Vectorized<c10::BFloat16> neg() const {
- return -values;
+ return vreinterpretq_bf16_s16(vreinterpretq_s16_bf16(values) ^ (-32768));
  }
+ // Fast reciprocal is fine because we are truncating results
  Vectorized<c10::BFloat16> reciprocal() const {
- return 1.0f / values;
+ auto x = vcvtq_low_f32_bf16(values);
+ auto y = vcvtq_high_f32_bf16(values);
+ x = vrecpeq_f32(x);
+ y = vrecpeq_f32(y);
+ return vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(x), y);
  }
+ // Clearing the sign bit
+ Vectorized<c10::BFloat16> abs() const {
+ return vreinterpretq_bf16_u16(vreinterpretq_u16_bf16(values) & 0x7FFF);
+ }
+ #else
+ DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
+ DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
+ DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
+ #endif
+
+ // These functions are optimized on clang-21+
+ #if BF16_ARITHMETIC_SUPPORTED() && (__clang_major__ >= 21)
  Vectorized<c10::BFloat16> operator==(
  const Vectorized<c10::BFloat16>& other) const {
  return values == other.values;
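Both new branches are plain IEEE-754 bit tricks: neg() XORs the sign bit (0x8000 as a 16-bit pattern, which is what -32768 is as an int16), and abs() masks it off with 0x7FFF. A scalar sketch of the same operations on one bfloat16 bit pattern (hypothetical helpers, not PyTorch code):

#include <cstdint>

// bfloat16 is the top 16 bits of an IEEE-754 float: 1 sign bit, 8 exponent
// bits, 7 mantissa bits. Sign manipulation therefore works directly on the
// raw 16-bit pattern, exactly what the vectorized neg()/abs() above do
// lane-wise via vreinterpretq.
inline uint16_t bf16_neg(uint16_t bits) {
  return bits ^ 0x8000; // flip the sign bit
}
inline uint16_t bf16_abs(uint16_t bits) {
  return bits & 0x7FFF; // clear the sign bit
}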
@@ -394,8 +418,6 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
  return values >= other.values;
  }
  #else
- DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
- DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<)
@@ -451,7 +473,7 @@ template <>
  Vectorized<c10::BFloat16> inline operator+(
  const Vectorized<c10::BFloat16>& a,
  const Vectorized<c10::BFloat16>& b) {
- #ifdef __ARM_FEATURE_BF16
+ #if BF16_ARITHMETIC_SUPPORTED()
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  return x + y;
@@ -464,7 +486,7 @@ template <>
  Vectorized<c10::BFloat16> inline operator-(
  const Vectorized<c10::BFloat16>& a,
  const Vectorized<c10::BFloat16>& b) {
- #ifdef __ARM_FEATURE_BF16
+ #if BF16_ARITHMETIC_SUPPORTED()
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  return x - y;
@@ -477,7 +499,7 @@ template <>
  Vectorized<c10::BFloat16> inline operator*(
  const Vectorized<c10::BFloat16>& a,
  const Vectorized<c10::BFloat16>& b) {
- #ifdef __ARM_FEATURE_BF16
+ #if BF16_ARITHMETIC_SUPPORTED()
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  return x * y;
@@ -490,7 +512,7 @@ template <>
  Vectorized<c10::BFloat16> inline operator/(
  const Vectorized<c10::BFloat16>& a,
  const Vectorized<c10::BFloat16>& b) {
- #ifdef __ARM_FEATURE_BF16
+ #if BF16_ARITHMETIC_SUPPORTED()
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  return x / y;
@@ -607,7 +629,7 @@ Vectorized<c10::BFloat16> inline fmadd(
  const Vectorized<c10::BFloat16>& a,
  const Vectorized<c10::BFloat16>& b,
  const Vectorized<c10::BFloat16>& c) {
- #ifdef __ARM_FEATURE_BF16
+ #if BF16_ARITHMETIC_SUPPORTED()
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  bfloat16x8_t z = c;
@@ -627,7 +649,7 @@ Vectorized<c10::BFloat16> inline fnmadd(
  const Vectorized<c10::BFloat16>& a,
  const Vectorized<c10::BFloat16>& b,
  const Vectorized<c10::BFloat16>& c) {
- #ifdef __ARM_FEATURE_BF16
+ #if BF16_ARITHMETIC_SUPPORTED()
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  bfloat16x8_t z = c;
@@ -643,7 +665,7 @@ Vectorized<c10::BFloat16> inline fmsub(
  const Vectorized<c10::BFloat16>& a,
  const Vectorized<c10::BFloat16>& b,
  const Vectorized<c10::BFloat16>& c) {
- #ifdef __ARM_FEATURE_BF16
+ #if BF16_ARITHMETIC_SUPPORTED()
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  bfloat16x8_t z = c;
@@ -659,7 +681,7 @@ Vectorized<c10::BFloat16> inline fnmsub(
  const Vectorized<c10::BFloat16>& a,
  const Vectorized<c10::BFloat16>& b,
  const Vectorized<c10::BFloat16>& c) {
- #ifdef __ARM_FEATURE_BF16
+ #if BF16_ARITHMETIC_SUPPORTED()
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  bfloat16x8_t z = c;
@@ -6,9 +6,9 @@ namespace at::vec {
  inline namespace CPU_CAPABILITY {
  #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))

- // Enable auto-vectorization for GCC-13+ and clang-17+
+ // Enable auto-vectorization for clang-17+
  // GCC-12 has a bug: gcc.gnu.org/bugzilla/show_bug.cgi?id=117001
- #if __GNUC__ > 12 || (defined(__clang__) && (__clang_major__ >= 17))
+ #if defined(__clang__) && (__clang_major__ >= 17)

  template <typename from_type, typename to_type>
  inline void convertImpl(
@@ -21,12 +21,46 @@ inline void convertImpl(
  }
  }

+ template <typename to_type>
+ inline void convertFromBool(
+ const bool* __restrict src,
+ to_type* __restrict dst,
+ int64_t n) {
+ const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
+ uint64_t len = static_cast<uint64_t>(n);
+ for (uint64_t i = 0; i < len; i++) {
+ dst[i] = srcPtr[i] != 0 ? static_cast<to_type>(1) : static_cast<to_type>(0);
+ }
+ }
+
+ template <typename from_type>
+ inline void convertToBool(
+ const from_type* __restrict src,
+ bool* __restrict dst,
+ int64_t n) {
+ uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
+ uint64_t len = static_cast<uint64_t>(n);
+ for (uint64_t i = 0; i < len; i++) {
+ dstPtr[i] = src[i] != static_cast<from_type>(0) ? 1 : 0;
+ }
+ }
+
  #define CONVERT_TEMPLATE(from_type, to_type) \
  template <> \
  inline void convert(const from_type* src, to_type* dst, int64_t n) { \
  return convertImpl<from_type, to_type>(src, dst, n); \
  }

+ #define CONVERT_FROM_BOOL_TEMPLATE(to_type) \
+ inline void convert(const bool* src, to_type* dst, int64_t n) { \
+ return convertFromBool<to_type>(src, dst, n); \
+ }
+
+ #define CONVERT_TO_BOOL_TEMPLATE(from_type) \
+ inline void convert(const from_type* src, bool* dst, int64_t n) { \
+ return convertToBool<from_type>(src, dst, n); \
+ }
+
  CONVERT_TEMPLATE(uint8_t, uint8_t)
  CONVERT_TEMPLATE(uint8_t, int8_t)
  CONVERT_TEMPLATE(uint8_t, int16_t)
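The new templates normalize in both directions: a bool source is read through uint8_t (the underlying storage may hold any nonzero byte, and reading a bool object whose representation is not 0 or 1 is undefined behavior), and every nonzero input becomes exactly 1 in the destination type. A minimal standalone sketch of that contract, separate from the ATen templates:

#include <cstdint>

// Hypothetical helper mirroring the normalization above: read the bool
// buffer as raw bytes, then map nonzero -> 1 and zero -> 0 of the target type.
template <typename T>
void bool_buffer_to(const bool* src, T* dst, int64_t n) {
  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(src);
  for (int64_t i = 0; i < n; ++i) {
    dst[i] = bytes[i] != 0 ? static_cast<T>(1) : static_cast<T>(0);
  }
}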
@@ -34,6 +68,7 @@ CONVERT_TEMPLATE(uint8_t, int32_t)
  CONVERT_TEMPLATE(uint8_t, int64_t)
  CONVERT_TEMPLATE(uint8_t, float)
  CONVERT_TEMPLATE(uint8_t, double)
+ CONVERT_TO_BOOL_TEMPLATE(uint8_t)
  CONVERT_TEMPLATE(int8_t, uint8_t)
  CONVERT_TEMPLATE(int8_t, int8_t)
  CONVERT_TEMPLATE(int8_t, int16_t)
@@ -41,6 +76,7 @@ CONVERT_TEMPLATE(int8_t, int32_t)
  CONVERT_TEMPLATE(int8_t, int64_t)
  CONVERT_TEMPLATE(int8_t, float)
  CONVERT_TEMPLATE(int8_t, double)
+ CONVERT_TO_BOOL_TEMPLATE(int8_t)
  CONVERT_TEMPLATE(int16_t, uint8_t)
  CONVERT_TEMPLATE(int16_t, int8_t)
  CONVERT_TEMPLATE(int16_t, int16_t)
@@ -48,6 +84,7 @@ CONVERT_TEMPLATE(int16_t, int32_t)
  CONVERT_TEMPLATE(int16_t, int64_t)
  CONVERT_TEMPLATE(int16_t, float)
  CONVERT_TEMPLATE(int16_t, double)
+ CONVERT_TO_BOOL_TEMPLATE(int16_t)
  CONVERT_TEMPLATE(int32_t, uint8_t)
  CONVERT_TEMPLATE(int32_t, int8_t)
  CONVERT_TEMPLATE(int32_t, int16_t)
@@ -55,6 +92,7 @@ CONVERT_TEMPLATE(int32_t, int32_t)
  CONVERT_TEMPLATE(int32_t, int64_t)
  CONVERT_TEMPLATE(int32_t, float)
  CONVERT_TEMPLATE(int32_t, double)
+ CONVERT_TO_BOOL_TEMPLATE(int32_t)
  CONVERT_TEMPLATE(int64_t, uint8_t)
  CONVERT_TEMPLATE(int64_t, int8_t)
  CONVERT_TEMPLATE(int64_t, int16_t)
@@ -62,6 +100,7 @@ CONVERT_TEMPLATE(int64_t, int32_t)
  CONVERT_TEMPLATE(int64_t, int64_t)
  CONVERT_TEMPLATE(int64_t, float)
  CONVERT_TEMPLATE(int64_t, double)
+ CONVERT_TO_BOOL_TEMPLATE(int64_t)
  CONVERT_TEMPLATE(float, uint8_t)
  CONVERT_TEMPLATE(float, int8_t)
  CONVERT_TEMPLATE(float, int16_t)
@@ -69,6 +108,7 @@ CONVERT_TEMPLATE(float, int32_t)
  CONVERT_TEMPLATE(float, int64_t)
  CONVERT_TEMPLATE(float, float)
  CONVERT_TEMPLATE(float, double)
+ CONVERT_TO_BOOL_TEMPLATE(float)
  CONVERT_TEMPLATE(double, uint8_t)
  CONVERT_TEMPLATE(double, int8_t)
  CONVERT_TEMPLATE(double, int16_t)
@@ -76,6 +116,14 @@ CONVERT_TEMPLATE(double, int32_t)
  CONVERT_TEMPLATE(double, int64_t)
  CONVERT_TEMPLATE(double, float)
  CONVERT_TEMPLATE(double, double)
+ CONVERT_TO_BOOL_TEMPLATE(double)
+ CONVERT_FROM_BOOL_TEMPLATE(uint8_t)
+ CONVERT_FROM_BOOL_TEMPLATE(int8_t)
+ CONVERT_FROM_BOOL_TEMPLATE(int16_t)
+ CONVERT_FROM_BOOL_TEMPLATE(int32_t)
+ CONVERT_FROM_BOOL_TEMPLATE(int64_t)
+ CONVERT_FROM_BOOL_TEMPLATE(float)
+ CONVERT_FROM_BOOL_TEMPLATE(double)
  #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

  #define CONVERT_FROM_FP16_TEMPLATE(to_type) \
@@ -107,6 +155,41 @@ CONVERT_TO_FP16_TEMPLATE(int32_t)
  CONVERT_TO_FP16_TEMPLATE(int64_t)
  CONVERT_TO_FP16_TEMPLATE(float)
  CONVERT_TO_FP16_TEMPLATE(double)
+
+ inline void convertBoolToFp16Impl(
+ const bool* __restrict src,
+ at::Half* __restrict dst,
+ int64_t n) {
+ const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
+ float16_t* dstPtr = reinterpret_cast<float16_t*>(dst);
+ uint64_t len = static_cast<uint64_t>(n);
+ for (uint64_t i = 0; i < len; i++) {
+ dstPtr[i] = srcPtr[i] != 0 ? 1.0 : 0;
+ }
+ }
+
+ template <>
+ inline void convert(const bool* src, at::Half* dst, int64_t n) {
+ return convertBoolToFp16Impl(src, dst, n);
+ }
+
+ inline void convertFp16ToBoolImpl(
+ const at::Half* __restrict src,
+ bool* __restrict dst,
+ int64_t n) {
+ const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src);
+ uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
+ uint64_t len = static_cast<uint64_t>(n);
+ for (uint64_t i = 0; i < len; i++) {
+ dstPtr[i] = srcPtr[i] != 0.0 ? 1 : 0;
+ }
+ }
+
+ template <>
+ inline void convert(const at::Half* src, bool* dst, int64_t n) {
+ return convertFp16ToBoolImpl(src, dst, n);
+ }
+
  #endif
  #ifdef __ARM_FEATURE_BF16
  CONVERT_TEMPLATE(bfloat16_t, uint8_t)
@@ -124,6 +207,44 @@ CONVERT_TEMPLATE(int32_t, bfloat16_t)
  CONVERT_TEMPLATE(int64_t, bfloat16_t)
  CONVERT_TEMPLATE(float, bfloat16_t)
  CONVERT_TEMPLATE(double, bfloat16_t)
+
+ inline void convertBoolToBfloat16Impl(
+ const bool* __restrict src,
+ c10::BFloat16* __restrict dst,
+ int64_t n) {
+ const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
+ uint16_t* dstPtr = reinterpret_cast<uint16_t*>(dst);
+ uint64_t len = static_cast<uint64_t>(n);
+ constexpr uint16_t kBf16One = 0x3f80; // 1.0 in bfloat16
+ for (uint64_t i = 0; i < len; i++) {
+ dstPtr[i] = srcPtr[i] != 0 ? kBf16One : 0;
+ }
+ }
+
+ template <>
+ inline void convert(const bool* src, c10::BFloat16* dst, int64_t n) {
+ return convertBoolToBfloat16Impl(src, dst, n);
+ }
+
+ inline void convertBfloat16ToBoolImpl(
+ const c10::BFloat16* __restrict src,
+ bool* __restrict dst,
+ int64_t n) {
+ uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
+ const uint16_t* srcPtr = reinterpret_cast<const uint16_t*>(src);
+ uint64_t len = static_cast<uint64_t>(n);
+ for (uint64_t i = 0; i < len; i++) {
+ // Check if all non-sign bits are 0
+ bool isBf16Zero = (srcPtr[i] & 0x7fff) == 0;
+ dstPtr[i] = isBf16Zero ? 0 : 1;
+ }
+ }
+
+ template <>
+ inline void convert(const c10::BFloat16* src, bool* dst, int64_t n) {
+ return convertBfloat16ToBoolImpl(src, dst, n);
+ }
+
  #endif

  #endif
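The kBf16One constant above is just float 1.0f's bit pattern (0x3f800000) truncated to its high half, which is exactly how bfloat16 relates to float. A small sketch of that relationship:

#include <cstdint>
#include <cstring>

// bfloat16 is the high half of a float's bits, so 1.0f (0x3f800000)
// truncates to 0x3f80, the kBf16One value used above. Likewise the
// to-bool direction masks with 0x7fff so that both +0.0 and -0.0 map
// to false.
inline uint16_t float_to_bf16_truncate(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}
// float_to_bf16_truncate(1.0f) == 0x3f80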
@@ -309,7 +309,7 @@ class Vectorized<float> {
  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
  // Implementation copied from Arm Optimized Routine
  // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
- Vectorized<float> exp_u20() const {
+ inline Vectorized<float> vexpq_f32_u20() const {
  // bail out to sleef if it's a special case:
  // i.e. there's an input s.t. |input| > 87.3....
  const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
@@ -348,6 +348,9 @@ class Vectorized<float> {

  return vfmaq_f32(scale, poly, scale);
  }
+ Vectorized<float> exp_u20() const {
+ return vexpq_f32_u20();
+ }
  Vectorized<float> fexp_u20() const {
  return exp_u20();
  }
@@ -634,7 +637,7 @@ inline Vectorized<float> Vectorized<float>::erf() const {
  // - exp(- x * x)
  auto pow_2 = (*this) * (*this);
  auto neg_pow_2 = pow_2 ^ neg_zero_vec;
- auto tmp4 = neg_pow_2.exp();
+ auto tmp4 = neg_pow_2.vexpq_f32_u20();
  auto tmp5 = tmp4 ^ neg_zero_vec;
  // erf(x) = sign(x) * (1 - r * t * exp(- x * x))
  auto tmp6 = t * tmp5;
|||||||
@@ -1,8 +1,20 @@
 #include <ATen/cuda/CUDAGreenContext.h>

+#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
+#include <c10/cuda/driver_api.h>
+#include <stdexcept>
+#include <vector>
+#define HAS_CUDA_GREEN_CONTEXT() 1
+#else
+#define HAS_CUDA_GREEN_CONTEXT() 0
+// Suppress unused private field warnings as this class is not supposed to be called
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-private-field")
+#endif
+
 namespace at::cuda {
   GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
     int driver_version;
     C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
     TORCH_CHECK(
@@ -68,11 +80,11 @@ namespace at::cuda {
   std::unique_ptr<GreenContext> GreenContext::create(
       uint32_t num_sms,
       std::optional<uint32_t> device_id) {
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
     if (!device_id.has_value()) {
      device_id = at::cuda::current_device();
    }
-    return std::make_unique<GreenContext>(device_id.value(), num_sms);
+    return std::unique_ptr<GreenContext>(new GreenContext(device_id.value(), num_sms));
 #else
     TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
 #endif
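The switch from std::make_unique to a bare new in create() is the usual workaround once the constructor goes private (which the header change further down makes explicit): make_unique is a free function and cannot reach private members. A hedged sketch of the pattern, using a hypothetical Widget class:

```cpp
#include <memory>

class Widget {
 public:
  static std::unique_ptr<Widget> create() {
    // std::make_unique<Widget>() would not compile here: it is a free
    // function and cannot call a private constructor. Calling new inside
    // a member function works because members have access.
    return std::unique_ptr<Widget>(new Widget());
  }

 private:
  Widget() = default;
};
```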
@@ -80,7 +92,7 @@ namespace at::cuda {

   // Implement move operations
   GreenContext::GreenContext(GreenContext&& other) noexcept{
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
     device_id_ = std::exchange(other.device_id_, -1);
     green_ctx_ = std::exchange(other.green_ctx_, nullptr);
     context_ = std::exchange(other.context_, nullptr);
@@ -91,7 +103,7 @@ namespace at::cuda {
   }

   GreenContext& GreenContext::operator=(GreenContext&& other) noexcept{
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
     if (this != &other) {
       // Clean up current resources
       if (green_ctx_) {
@@ -120,7 +132,7 @@ namespace at::cuda {
   }

   GreenContext::~GreenContext() noexcept{
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
     C10_CUDA_DRIVER_CHECK(
         c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_));
 #else
@@ -128,25 +140,9 @@ namespace at::cuda {
 #endif
   }

-  // Get the underlying CUDA context
-  CUcontext GreenContext::getContext() const {
-#if CUDA_HAS_GREEN_CONTEXT
-    return context_;
-#else
-    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
-#endif
-  }
-
-  // Get the underlying green context
-#if CUDA_HAS_GREEN_CONTEXT
-  CUgreenCtx GreenContext::getGreenContext() const {
-    return green_ctx_;
-  }
-#endif
-
   // Make this context current
   void GreenContext::setContext() {
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
     auto current_stream = c10::cuda::getCurrentCUDAStream();
     parent_stream_ = current_stream.stream();

@@ -175,7 +171,7 @@ namespace at::cuda {
   }

   void GreenContext::popContext() {
-#if CUDA_HAS_GREEN_CONTEXT
+#if HAS_CUDA_GREEN_CONTEXT()
     // see above note about stream being hardcoded to the default stream
     at::cuda::CUDAEvent ev;
     ev.record(c10::cuda::getCurrentCUDAStream());
@@ -1,53 +1,38 @@
 #pragma once
 #include <ATen/cuda/CUDAEvent.h>

-#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
-#include <c10/cuda/driver_api.h>
 #include <cuda.h>
-#include <memory>
-#include <stdexcept>
-#include <vector>
-#define CUDA_HAS_GREEN_CONTEXT 1
-#else
-#define CUDA_HAS_GREEN_CONTEXT 0
-#endif
+
+// Forward declare green context as opaque ptr
+typedef struct CUgreenCtx_st* CUgreenCtx;

 namespace at::cuda {

 class TORCH_CUDA_CPP_API GreenContext {
  public:
-  GreenContext(uint32_t device_id, uint32_t num_sms);
-  static std::unique_ptr<GreenContext> create(uint32_t num_sms, std::optional<uint32_t> device_id);
+  // Green context creation
+  static std::unique_ptr<GreenContext> create(
+      uint32_t num_sms,
+      std::optional<uint32_t> device_id);
+  ~GreenContext() noexcept;

   // Delete copy constructor and assignment
   GreenContext(const GreenContext&) = delete;
   GreenContext& operator=(const GreenContext&) = delete;

-  // Implement move operations
-  GreenContext(GreenContext&& other) noexcept;
-  GreenContext& operator=(GreenContext&& other) noexcept;
-  ~GreenContext() noexcept;
-
-  // Get the underlying CUDA context
-  CUcontext getContext() const;
-
-  // Get the underlying green context
-#if CUDA_HAS_GREEN_CONTEXT
-  CUgreenCtx getGreenContext() const;
-#endif
-
   // Make this context current
   void setContext();

   void popContext();

  private:
-#if CUDA_HAS_GREEN_CONTEXT
+  GreenContext(uint32_t device_id, uint32_t num_sms);
+
+  // Implement move operations
+  GreenContext(GreenContext&& other) noexcept;
+  GreenContext& operator=(GreenContext&& other) noexcept;
+
   int32_t device_id_ = -1;
   CUgreenCtx green_ctx_ = nullptr;
   CUcontext context_ = nullptr;
   cudaStream_t parent_stream_ = nullptr;
-#endif
 };
 } // namespace at::cuda
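The rename from a plain object-like macro to the function-like HAS_CUDA_GREEN_CONTEXT() is a common hardening trick, and presumably the motivation here as well: if a translation unit forgets to include the header that defines it, the `#if` fails to preprocess instead of silently evaluating to 0 and compiling the wrong branch. A minimal illustration of the mechanism:

```cpp
// With the function-like spelling, a missing definition is a hard error:
// an undefined identifier in #if is replaced by 0, and "0()" is a malformed
// conditional expression, so the preprocessor rejects the file outright.
#define HAS_CUDA_GREEN_CONTEXT() 1

#if HAS_CUDA_GREEN_CONTEXT()
// CUDA-only path goes here
#endif

int main() { return 0; }
```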
@@ -7,17 +7,6 @@
 #endif

-#if defined(USE_ROCM)
-// hipSparse const API added in v2.4.0
-#if HIPSPARSE_VERSION >= 200400
-#define AT_USE_HIPSPARSE_GENERIC_API() 1
-#else
-#define AT_USE_HIPSPARSE_GENERIC_API() 1
-#endif
-#else // USE_ROCM
-#define AT_USE_HIPSPARSE_GENERIC_API() 0
-#endif // USE_ROCM
-
 // cuSparse Generic API spsv function was added in CUDA 11.3.0
 #if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11500)
 #define AT_USE_CUSPARSE_GENERIC_SPSV() 1
@@ -1,6 +1,7 @@
 #include <ATen/cuda/CUDAContextLight.h>
 #include <ATen/cuda/Sleep.h>

+#include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAException.h>
 #include <c10/cuda/CUDAStream.h>

@@ -24,8 +25,22 @@ __global__ void spin_kernel(int64_t cycles) {
 #endif
   }
 }
+
+thread_local int *flag = nullptr;
+
+__global__ void busy_wait_for_flag_kernel(int *flag) {
+  atomicExch(flag, 1);
+  while (atomicAdd(flag, 0) == 1) {
+    // do nothing
+  }
+}
+
+__global__ void clear_flag_kernel(int *flag) {
+  atomicExch(flag, 0);
+}
+
 } // anonymous namespace

 void sleep(int64_t cycles) {
   dim3 grid(1);
   dim3 block(1);
@@ -33,6 +48,26 @@ void sleep(int64_t cycles) {
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }

+void busy_wait_for_flag() {
+  if (!flag) {
+    flag = (int*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(int));
+  }
+  dim3 grid(1);
+  dim3 block(1);
+  busy_wait_for_flag_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(flag);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
+void clear_flag() {
+  if (!flag) {
+    flag = (int*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(int));
+  }
+  dim3 grid(1);
+  dim3 block(1);
+  clear_flag_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(flag);
+  C10_CUDA_KERNEL_LAUNCH_CHECK();
+}
+
 #ifdef USE_ROCM
 __global__ void flush_icache_kernel()
 {
@@ -7,6 +7,11 @@ namespace at::cuda {
 // enqueues a kernel that spins for the specified number of cycles
 TORCH_CUDA_CU_API void sleep(int64_t cycles);

+// enqueues a kernel that spins until a flag is cleared by a
+// corresponding call to clear_flag()
+TORCH_CUDA_CU_API void busy_wait_for_flag();
+TORCH_CUDA_CU_API void clear_flag();
+
 // flushes instruction cache for ROCm; no-op for CUDA
 TORCH_CUDA_CU_API void flush_icache();
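How these test helpers are likely used (my reading, not shown in this diff): hold one stream busy behind the spin kernel, enqueue the work under test behind it, then release the flag from a different stream so the release is not itself queued behind the spinner. A hedged sketch, assuming the stream-guard utilities behave as their names suggest:

```cpp
#include <ATen/cuda/Sleep.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>

void hold_and_release_sketch() {
  at::cuda::CUDAStream side = c10::cuda::getStreamFromPool();
  {
    c10::cuda::CUDAStreamGuard guard(side);
    at::cuda::busy_wait_for_flag();  // spin kernel now occupies `side`
    // ... enqueue kernels on `side` that must stay queued for the test ...
  }
  // Back on the previous stream: clearing the flag lets `side` drain.
  at::cuda::clear_flag();
}
```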
@@ -2,8 +2,6 @@
 #include <ATen/Tensor.h>
 #include <ATen/cuda/Exceptions.h>

-#include <mutex>
-
 namespace at {
 namespace cuda {
 namespace detail {
@@ -12,39 +10,36 @@ __device__ __constant__ float cublas_one_device;
 __device__ __constant__ float cublas_zero_device;

 float *get_cublas_device_one() {
-  static c10::once_flag init_flag;
-
-  c10::call_once(init_flag, []() {
+  static float *ptr = nullptr;
+  static auto init_flag = [&]() {
     const float one = 1.f;
     AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_one_device, &one, sizeof(float)));
-  });
-
-  float *ptr;
   AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_one_device));
+    return true;
+  }();

   return ptr;
 }

 float *get_cublas_device_zero() {
-  static c10::once_flag init_flag;
-
-  c10::call_once(init_flag, []() {
+  static float *ptr = nullptr;
+  static auto init_flag = [&]() {
     const float zero = 0.f;
     AT_CUDA_CHECK(cudaMemcpyToSymbol(cublas_zero_device, &zero, sizeof(float)));
-  });
-
-  float *ptr;
   AT_CUDA_CHECK(cudaGetSymbolAddress(reinterpret_cast<void**>(&ptr), cublas_zero_device));
+    return true;
+  }();

   return ptr;
 }

 float *get_user_alpha_ptr() {
   static float *alpha_ptr;

-  static c10::once_flag init_flag;
-
-  c10::call_once(init_flag, []() {
+  static bool init_flag [[maybe_unused]] = []() {
     AT_CUDA_CHECK(cudaMalloc(&alpha_ptr, sizeof(float)));
-  });
+    return true;
+  }();

   return alpha_ptr;
 }
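The replacement for c10::call_once in the getters above leans on C++11 "magic statics": the initializer of a function-local static runs exactly once and is thread-safe, so an immediately-invoked lambda can do one-time setup without a once_flag. The shape of the idiom, with a hypothetical init() standing in for the CUDA calls:

```cpp
float* init();  // hypothetical one-time setup, e.g. a device allocation

float* get_device_constant() {
  static float* ptr = nullptr;
  // The initializer of a function-local static runs exactly once, even with
  // concurrent callers; later calls skip straight to the return.
  static bool initialized [[maybe_unused]] = []() {
    ptr = init();
    return true;
  }();
  return ptr;
}
```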
@@ -580,7 +580,7 @@ std::ofstream& TuningContext::GetUntunedFile(){
       filename.append(device);
     }

-    untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::trunc);
+    untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::app);
   }
   return untuned_file_;
 }
@@ -1,5 +1,6 @@
 #pragma once

+#include <c10/core/CachingDeviceAllocator.h>
 #include <c10/core/Device.h>
 #include <c10/util/Exception.h>

@@ -151,6 +152,36 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
   }

   virtual bool isAvailable() const override;
+
+  /* MTIAGraph related APIs */
+  virtual int64_t mtiagraphCreate(bool keep_graph = false) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+    return -1;
+  }
+
+  virtual void mtiagraphCaptureBegin(int64_t handle, MempoolId_t pool) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+  }
+
+  virtual void mtiagraphCaptureEnd(int64_t handle) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+  }
+
+  virtual void mtiagraphInstantiate(int64_t handle) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+  }
+
+  virtual void mtiagraphReplay(int64_t handle) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+  }
+
+  virtual void mtiagraphReset(int64_t handle) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+  }
+
+  virtual MempoolId_t mtiagraphPool(int64_t handle) const {
+    FAIL_MTIAHOOKS_FUNC(__func__);
+  }
 };

 struct TORCH_API MTIAHooksArgs {};
@@ -534,20 +534,20 @@ Tensor trace_decomp(const Tensor& tensor) {
 std::tuple<Tensor, std::optional<int64_t>> tril_batch_rule(
     const Tensor& self,
     std::optional<int64_t> self_bdim,
-    int64_t diagonal = 0) {
+    c10::SymInt diagonal = 0) {
   TORCH_CHECK(self.dim() >= 2, "tril: The input tensor must have at least 2 dimensions.");
   auto self_ = moveBatchDimToFront(self, self_bdim);
-  auto result = at::tril(self_, diagonal);
+  auto result = at::tril_symint(self_, std::move(diagonal));
   return std::make_tuple(std::move(result), 0);
 }

 std::tuple<Tensor, std::optional<int64_t>> triu_batch_rule(
     const Tensor& self,
     std::optional<int64_t> self_bdim,
-    int64_t diagonal = 0) {
+    c10::SymInt diagonal = 0) {
   TORCH_CHECK(self.dim() >= 2, "triu: The input tensor must have at least 2 dimensions.");
   auto self_ = moveBatchDimToFront(self, self_bdim);
-  auto result = at::triu(self_, diagonal);
+  auto result = at::triu_symint(self_, std::move(diagonal));
   return std::make_tuple(std::move(result), 0);
 }
@@ -1,7 +1,5 @@
 //  Copyright © 2022 Apple Inc.

-#include <c10/util/CallOnce.h>
-
 #include <ATen/mps/IndexKernels.h>
 #include <ATen/mps/MPSAllocatorInterface.h>
 #include <ATen/mps/MPSDevice.h>
@@ -10,9 +8,6 @@

 namespace at::mps {

-static std::unique_ptr<MPSDevice> mps_device;
-static c10::once_flag mpsdev_init;
-
 static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& device) {
   // MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants)
   // host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+
@@ -21,8 +16,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
 }

 MPSDevice* MPSDevice::getInstance() {
-  c10::call_once(mpsdev_init, [] { mps_device = std::unique_ptr<MPSDevice>(new MPSDevice()); });
-  return mps_device.get();
+  static MPSDevice mps_device;
+  return &mps_device;
 }

 MPSDevice::~MPSDevice() {
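The MPSDevice change above is the classic Meyers singleton: a function-local static is constructed on first use, and since C++11 that construction is thread-safe, which makes the unique_ptr plus once_flag pair redundant. The same structure in miniature:

```cpp
class Device {
 public:
  static Device* getInstance() {
    static Device instance;  // constructed once, on first call, thread-safely
    return &instance;
  }

 private:
  Device() = default;
};
```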
@@ -25,18 +25,19 @@ TORCH_PRECOMPUTE_META_FUNC(avg_pool2d)
   // #20866, #22032: Guarantee this for the official C++ API?
   TORCH_CHECK(kernel_size.size() == 1 || kernel_size.size() == 2,
     "avg_pool2d: kernel_size must either be a single int, or a tuple of two ints");
-  const int64_t kH = kernel_size[0];
-  const int64_t kW = kernel_size.size() == 1 ? kH : kernel_size[1];
+  const int kH = safe_downcast<int, int64_t>(kernel_size[0]);
+  const int kW = kernel_size.size() == 1 ? kH : safe_downcast<int, int64_t>(kernel_size[1]);

   TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 2,
     "avg_pool2d: stride must either be omitted, a single int, or a tuple of two ints");
-  const int64_t dH = stride.empty() ? kH : stride[0];
-  const int64_t dW = stride.empty() ? kW : stride.size() == 1 ? dH : stride[1];
+  const int dH = stride.empty() ? kH : safe_downcast<int, int64_t>(stride[0]);
+  const int dW = stride.empty() ? kW :
+    stride.size() == 1 ? dH : safe_downcast<int, int64_t>(stride[1]);

   TORCH_CHECK(padding.size() == 1 || padding.size() == 2,
     "avg_pool2d: padding must either be a single int, or a tuple of two ints");
-  const int64_t padH = padding[0];
-  const int64_t padW = padding.size() == 1 ? padH : padding[1];
+  const int padH = safe_downcast<int, int64_t>(padding[0]);
+  const int padW = padding.size() == 1 ? padH : safe_downcast<int, int64_t>(padding[1]);

   TORCH_CHECK(!divisor_override.has_value() || divisor_override.value() != 0,
     "divisor must be not zero");
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
|
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
|
||||||
// broken on cuDNN 9.8
|
// broken on cuDNN 9.8 - 9.14
|
||||||
if (cudnn_version >= 90800) {
|
if (cudnn_version >= 90800 && cudnn_version < 91500) {
|
||||||
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
|
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
|
||||||
(input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
|
(input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
|
||||||
weight.dim() == 5) {
|
weight.dim() == 5) {
|
||||||
@ -689,6 +689,10 @@ static void check_shape_forward(const at::Tensor& input,
|
|||||||
", but got bias of size ", at::symint::sizes<T>(bias), " instead");
|
", but got bias of size ", at::symint::sizes<T>(bias), " instead");
|
||||||
|
|
||||||
for (const auto i : c10::irange(2, k)) {
|
for (const auto i : c10::irange(2, k)) {
|
||||||
|
// T could be int64_t or SymInt, Specialized numeric_limts<SymInt> in c10/core/SymInt.h
|
||||||
|
TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
|
||||||
|
"Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
|
||||||
|
(std::numeric_limits<T>::max() / 2));
|
||||||
input_shape.push_back(at::symint::size<T>(input, i) + 2 * padding[i-2]);
|
input_shape.push_back(at::symint::size<T>(input, i) + 2 * padding[i-2]);
|
||||||
// log new kernel size considering dilation
|
// log new kernel size considering dilation
|
||||||
kernel_shape.push_back(dilation[i-2] * (weight_sizes[i]-1) + 1);
|
kernel_shape.push_back(dilation[i-2] * (weight_sizes[i]-1) + 1);
|
||||||
@ -715,6 +719,11 @@ static void check_shape_forward(const at::Tensor& input,
|
|||||||
"Kernel size: (", kernel_ss.str(), "). Kernel size can't be greater than actual input size");
|
"Kernel size: (", kernel_ss.str(), "). Kernel size can't be greater than actual input size");
|
||||||
}
|
}
|
||||||
} else { // transposed
|
} else { // transposed
|
||||||
|
for (const auto i : c10::irange(2, k)) {
|
||||||
|
TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
|
||||||
|
"Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
|
||||||
|
(std::numeric_limits<T>::max() / 2));
|
||||||
|
}
|
||||||
TORCH_CHECK(at::symint::size<T>(input, 1) == weight_sizes[0],
|
TORCH_CHECK(at::symint::size<T>(input, 1) == weight_sizes[0],
|
||||||
"Given transposed=", transposed, ", weight of size ", weight_sizes,
|
"Given transposed=", transposed, ", weight of size ", weight_sizes,
|
||||||
", expected input", at::symint::sizes<T>(input), " to have ", weight_sizes[0],
|
", expected input", at::symint::sizes<T>(input), " to have ", weight_sizes[0],
|
||||||
|
|||||||
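The padding check added above is written as `padding <= max - padding` precisely so that the doubled value is never computed: it is equivalent to `2 * padding <= max`, but cannot itself overflow. A standalone check of the equivalence at the boundary:

```cpp
#include <cassert>
#include <cstdint>
#include <limits>

bool doubling_fits(int64_t p) {
  // Same predicate as the TORCH_CHECK: true iff 2*p would not overflow.
  return p <= std::numeric_limits<int64_t>::max() - p;
}

int main() {
  const int64_t half = std::numeric_limits<int64_t>::max() / 2;
  assert(doubling_fits(half));       // 2*half == max - 1: fine
  assert(!doubling_fits(half + 1));  // 2*(half + 1) would overflow
  return 0;
}
```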
@@ -52,8 +52,7 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in
   for (const auto k : c10::irange(kw)) {
     int iShift = std::max(0, static_cast<int>(k - real_pad));
     int oShift = std::max(0, static_cast<int>(real_pad - k));
-    // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-    int t = std::min(ilen + real_pad - k, olen) - oShift;
+    long t = std::min(ilen + real_pad - k, olen) - oShift;
     // Note: gemm assumes column-major matrices
     // input is l*m (row-major)
     // weight is m*r (row-major)
@@ -16,8 +16,7 @@ bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) {
   auto linearId = elements - 1;

   // NOTE: Assumes all strides are positive, which is true for now
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  for (int i = t.dim() - 1; i >= 0; --i) {
+  for (auto i = t.dim() - 1; i >= 0; --i) {
     auto curDimIndex = linearId % t.sym_size(i);
     auto curDimOffset = curDimIndex * t.sym_stride(i);
     offset += curDimOffset;
@@ -68,7 +68,6 @@ Tensor fbgemm_linear_int8_weight_fp32_activation(
   const float* input_ptr = input_contig.const_data_ptr<float>();

   TORCH_CHECK(input.dim() >= 2);
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
   const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
   const int64_t K = input.size(input.dim() - 1);
   TORCH_CHECK(weight.dim() == 2);
@@ -160,10 +160,9 @@ struct Dist {
   // value of k.
   parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) {
     const Vec pvec(p);
-    double n2 = n - .5;
+    double n2 = static_cast<double>(n) - .5;
     // The -1 accounts for floating point truncation issues
-    // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-    int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
+    int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2.0 * static_cast<double>(k) - 1.0)));
     int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;

     const scalar_t * self_i = self_start + i * m;
@@ -139,7 +139,8 @@ void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, dou
       }
     );
   } else {
-    AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] {
+    AT_DISPATCH_ALL_TYPES_AND(kHalf, dtype, "smooth_l1_backward_cpu_out", [&] {
       auto norm_val = norm.to<scalar_t>();
       scalar_t beta_val(beta);
       auto norm_val_vec = Vectorized<scalar_t>(norm_val);
@@ -170,10 +170,14 @@ static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const
 #if defined(CUDA_VERSION) || defined(USE_ROCM)
   const auto scalar_type = mat1.scalar_type();
   return (beta.toComplexDouble() == 1.0
-          // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
-          // is to use lt interface only when self is bias.
-          && self.dim() == 1 && self.sizes()[0] == mat2_sizes[1] && self.is_contiguous()
           && result.dim() == 2 && result.is_contiguous()
+          // Conditions for bias to be fusable
+          && (
+              self.is_contiguous() &&
+              // NOTE: fine to have 1-len dims to the left from the right-most one
+              (self.dim() == 1 || self.squeeze().dim() == 1) &&
+              self.sizes().back() == mat2_sizes[1]
+             )
           && ( // some dtype restrictions
 #ifndef USE_ROCM
               scalar_type == at::ScalarType::Double ||
@@ -208,6 +208,62 @@ _f8_f8_bf16_rowwise_grouped_mm(
 #endif
 }

+Tensor&
+_f4_f4_bf16_grouped_mm_fbgemm(
+    const Tensor& mat_a,
+    const Tensor& mat_b,
+    const Tensor& scale_a,
+    const std::optional<Tensor>& global_scale_a,
+    const Tensor& scale_b,
+    const std::optional<Tensor>& global_scale_b,
+    const std::optional<Tensor>& offs,
+    const std::optional<Tensor>& bias,
+    Tensor& out) {
+#if !defined(USE_ROCM) && defined(USE_FBGEMM_GENAI)
+  // Typing checks
+  TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2,
+                    "mat_a must be Float4_e2m1fn_x2, got: ", mat_a.scalar_type());
+  TORCH_CHECK_VALUE(mat_b.scalar_type() == at::kFloat4_e2m1fn_x2,
+                    "mat_b must be Float4_e2m1fn_x2, got: ", mat_b.scalar_type());
+
+  std::optional<Tensor> combined_global_scale = std::nullopt;
+  if (global_scale_a.has_value() || global_scale_b.has_value()) {
+    // NVFP4
+    TORCH_CHECK_VALUE(global_scale_a.has_value() && global_scale_b.has_value(),
+                      "For NVFP4 grouped gemm both of global_scale_{a,b} must have values")
+    TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e4m3fn,
+                      "scale_a must be Float8_e4m3fn, got: ", scale_a.scalar_type());
+    TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e4m3fn,
+                      "scale_b must be Float8_e4m3fn, got: ", scale_b.scalar_type());
+    TORCH_CHECK_VALUE(global_scale_a.value().scalar_type() == at::kFloat,
+                      "global_scale_a must be Float, got: ", global_scale_a.value().scalar_type());
+    TORCH_CHECK_VALUE(global_scale_b.value().scalar_type() == at::kFloat,
+                      "global_scale_b must be Float, got: ", global_scale_b.value().scalar_type());
+    combined_global_scale = global_scale_a.value().mul(global_scale_b.value());
+  } else {
+    // MXFP4
+    TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e8m0fnu,
+                      "scale_a must be Float8_e8m0fnu, got: ", scale_a.scalar_type());
+    TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e8m0fnu,
+                      "scale_b must be Float8_e8m0fnu, got: ", scale_b.scalar_type());
+  }
+
+  auto o = fbgemm_gpu::f4f4bf16_grouped_mm(
+      mat_a,
+      mat_b,
+      scale_a,
+      scale_b,
+      offs.value(),
+      out,
+      combined_global_scale
+  );
+#else
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "nvfp4 grouped gemm is not supported without USE_FBGEMM_GENAI, and only for CUDA")
+#endif
+
+  return out;
+}
+
 void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
   // Checks scales for 2d or 3d target tensors (`mat`).
   if (mat.dim() == 2) {
@@ -245,7 +301,15 @@ void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int
   }
 }

-void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
+void _check_scales_blocked(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
+  // if {mx,nv}fp4, will need to modify K later
+  bool is_fp4 = (mat.scalar_type() == kFloat4_e2m1fn_x2);
+  int blocksize = 32;
+  // check for nvfp4 vs. mxfp4 to fix blocksize
+  if (is_fp4 && scale.scalar_type() == kFloat8_e4m3fn) {
+    blocksize = 16;
+  }
+
   // Checks scales for 2d or 3d target tensors (`mat`).
   if (mat.dim() == 2) {
     // For MXFP8, 2d tensors have variable size groups represented as subtensors,
@@ -253,17 +317,19 @@ void _check_scales_blocked(const Tensor& mat, const Tensor& scale, const int dim,
     // so we can't check the scale sizes without doing a d2h sync to get the group sizes here.
     TORCH_CHECK(
         scale.dim() == mat.dim(),
-        "for mxfp8, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(), " and scale.dim() = ", scale.dim(), " for arg ", arg_idx);
+        "for block-scaled, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(),
+        " and scale.dim() = ", scale.dim(), " for arg ", arg_idx
+    );

-    // LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/32, 4))
-    // RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/32, 4))
+    // LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/blocksize, 4))
+    // RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/blocksize, 4))
     // * weight is transposed prior to the call, scale stays non-transposed.
     bool LHS = arg_idx == 0;
     int scale_dim_to_check = 0;
     int mat_dim_to_check = LHS ? 0 : 1;
     TORCH_CHECK(
         scale.size(scale_dim_to_check) >= mat.size(mat_dim_to_check),
-        "for mxfp8, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
+        "for block-scaled, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
         "must have scale.shape[", scale_dim_to_check, "] >= ", mat.size(mat_dim_to_check), " but got scale.shape=(", scale.size(0), ", ", scale.size(1), ")");
   } else {
     // For MXFP8, 3d tensors have static group sizes (stack of 2d tensors),
@@ -273,32 +339,40 @@ void _check_scales_blocked(const Tensor& mat, const Tensor& scale, const int dim,
   };

   // TODO: this is for 3d tensor in 2d-3d case specifically.
-  // We'll need to support 3d-3d and 3d-2d cases once mxfp8 grouped gemm supports them.
+  // We'll need to support 3d-3d and 3d-2d cases once mxfp8/nvfp4 grouped gemm supports them.
   int64_t G = mat.size(0);
   int64_t K = mat.size(1);
+  if (is_fp4) {
+    // FP4 packs 2 values into a single 8b word - the "real" K is 2x the
+    // reported K. Reverse that adjustment.
+    const int fp4_elems_per_byte = 2;
+    K *= fp4_elems_per_byte;
+  }
   int64_t N = mat.size(2);
-  int64_t blocked_scale_K = round_up(K/32, 4);
+  int64_t blocked_scale_K = round_up(K/blocksize, 4);
   int64_t blocked_scale_N = round_up(N, 128);

   // fbgemm expects stack of flattened blocked scales for 3d tensor, shape (G, blocked_scale_K * blocked_scale_N).
   TORCH_CHECK(
       scale.dim() == mat.dim() - 1,
-      "for mxfp8 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N), but scale is ", scale.dim(), "D for arg ", arg_idx
+      "for block-scaled 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N),",
+      " but scale is ", scale.dim(), "D for arg ", arg_idx
   );
   TORCH_CHECK(
       scale.size(0) == G && scale.size(1) == blocked_scale_K * blocked_scale_N,
-      "for mxfp8, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ") for arg ", arg_idx
+      "for block-scaled grouped GEMM, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ")",
+      " for arg ", arg_idx, ", got: ", scale.size(0), ", ", scale.size(1)
   );
 }

 void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
   bool using_fp8_rowwise = scale.scalar_type() == kFloat;
-  bool using_mxfp8 = scale.scalar_type() == at::kFloat8_e8m0fnu;
+  bool using_mx = scale.scalar_type() == at::kFloat8_e8m0fnu;
   if (using_fp8_rowwise) {
     _check_scales_fp8_rowwise(mat, scale, dim, arg_idx, scale_multiplier);
-  } else if (using_mxfp8) {
-    _check_scales_mxfp8(mat, scale, dim, arg_idx);
+  } else if (using_mx) {
+    _check_scales_blocked(mat, scale, dim, arg_idx);
   } else {
     TORCH_CHECK(false, "scale must be float32 or float8_e8m0fnu, but got ", scale.dtype());
   }
@@ -411,9 +485,11 @@ namespace {

 using acceptance_fn = std::function<bool(c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&, c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&)>;

-std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2> scale_grouped_kernel_dispatch = {{
+std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 4> scale_grouped_kernel_dispatch = {{
     { "rowwise_rowwise", scaled_blas::check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE},
-    { "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}};
+    { "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8},
+    { "mxfp4_mxfp4", scaled_blas::check_mxfp4_recipe, ScaledGemmImplementation::MXFP4_MXFP4},
+    { "nvfp4_nvfp4", scaled_blas::check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4}}};

 } // anonymous namespace

@@ -525,8 +601,9 @@ _scaled_grouped_mm_cuda_v2(
         out);
   }
   case ScaledGemmImplementation::MXFP8_MXFP8: {
-    _check_scales_mxfp8(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
-    _check_scales_mxfp8(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
+    // scale shape checks
+    _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
+    _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
     return _mx8_mx8_bf16_grouped_mm_fbgemm(
         mat_a,
         mat_b,
@@ -537,6 +614,36 @@ _scaled_grouped_mm_cuda_v2(
         offs.value(),
         out);
   }
+  case ScaledGemmImplementation::MXFP4_MXFP4: {
+    // scale shape checks
+    _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
+    _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
+    return _f4_f4_bf16_grouped_mm_fbgemm(
+        mat_a,
+        mat_b,
+        scale_a[0], /* block-scale A */
+        std::nullopt, /* global-scale A */
+        scale_b[0], /* block-scale B */
+        std::nullopt, /* global-scale B */
+        offs.value(),
+        std::nullopt, /* bias */
+        out);
+  }
+  case ScaledGemmImplementation::NVFP4_NVFP4: {
+    // scale shape checks
+    _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
+    _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
+    return _f4_f4_bf16_grouped_mm_fbgemm(
+        mat_a,
+        mat_b,
+        scale_a[0], /* block-scale A */
+        scale_a[1], /* global-scale A */
+        scale_b[0], /* block-scale B */
+        scale_b[1], /* global-scale B */
+        offs.value(),
+        std::nullopt, /* bias */
+        out);
+  }
   default:
     TORCH_CHECK_NOT_IMPLEMENTED(false,
         "_scaled_grouped_mm_cuda_v2 is in an inconsistent state - should never reach here");
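A worked example of the fp4 bookkeeping in _check_scales_blocked above: Float4_e2m1fn_x2 packs two 4-bit elements per byte, so the tensor reports half the logical K, and nvfp4 scales cover 16-element blocks while mxfp4/mxfp8 cover 32. Assuming the same round_up semantics as the code:

```cpp
#include <cassert>
#include <cstdint>

int64_t round_up(int64_t v, int64_t multiple) {
  return ((v + multiple - 1) / multiple) * multiple;
}

int main() {
  int64_t reported_K = 96;          // K as stored: 2 elements per byte
  int64_t real_K = reported_K * 2;  // 192 logical elements
  // nvfp4: blocksize 16 -> 12 scale blocks; block counts are padded to 4.
  assert(round_up(real_K / 16, 4) == 12);
  // mxfp4/mxfp8: blocksize 32 -> 6 blocks, padded up to 8.
  assert(round_up(real_K / 32, 4) == 8);
  return 0;
}
```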
@@ -13,7 +13,7 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx,
   if (allow_neg_indices) {
     ind = (ind < 0) ? ind + ind_dim_size : ind;
   }
-  CUDA_KERNEL_ASSERT(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds");
+  CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds", "Expected 0 <= index < ind_dim_size(%ld), but got index = %ld", ind_dim_size, ind);
   int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits
   if (off >= slice_size) return;
   auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);
@@ -794,6 +794,24 @@ void _check_deepseek_scale_stride(const Tensor& scale, const Tensor& t, const Sc
   }
 }

+void
+_check_deepseek_support() {
+#ifndef USE_ROCM
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  if (dprops->major != 9) {
+    // Only on Hopper GPUs
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        dprops->major == 9,
+        "DeepSeek style (1x128, 128x128) scaling only supported in CUDA for SM90")
+  }
+  // Only in cublasLt >= 12.9
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      CUBLAS_VERSION >= 120900 && cublasLtGetVersion() >= 120900,
+      "DeepSeek style (1x128, 128x128) scaling requires cublasLt >= 12.9"
+  );
+#endif
+}
+
 Tensor&
 _scaled_block1x128_block1x128(
     const Tensor& mat_a, const Tensor& mat_b,
@@ -802,8 +820,12 @@ _scaled_block1x128_block1x128(
     const c10::ScalarType out_dtype,
     const bool use_fast_accum,
     Tensor& out) {
+#ifndef USE_ROCM
   // Restrictions:
   // A, B are FP8, scales are fp32, shape K//128
+  // CUDA: Only Hopper GPUs
+  _check_deepseek_support();
+
   TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
       mat_a.scalar_type(), mat_b.scalar_type());
   TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
@@ -821,6 +843,12 @@ _scaled_block1x128_block1x128(
   _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);

   return out;
+#else
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false,
+      "1x128 and 128x128 scaling not available with ROCm"
+  );
+#endif
 }

 Tensor&
@@ -831,10 +859,12 @@ _scaled_block128x128_block1x128(
     const c10::ScalarType out_dtype,
     const bool use_fast_accum,
     Tensor& out) {
+#ifndef USE_ROCM
   // Restrictions:
   // A, B are FP8, scales are fp32, shape K//128
-  std::cout << "mat_b: " << mat_b.dim() << ", " << mat_b.sizes() << ", " << mat_b.strides() << std::endl;
-  std::cout << "scale_b: " << scale_b.dim() << ", " << scale_b.sizes() << ", " << scale_b.strides() << std::endl;
+  // CUDA: Only Hopper GPUs
+  _check_deepseek_support();
+
   TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
       mat_a.scalar_type(), mat_b.scalar_type());
   TORCH_CHECK_VALUE(scale_a.sizes()[0] == ceil_div<int64_t>(mat_a.sizes()[0], 128) && scale_a.sizes()[1] == ceil_div<int64_t>(mat_a.sizes()[1], 128) && scale_a.scalar_type() == kFloat,
@@ -852,6 +882,12 @@ _scaled_block128x128_block1x128(
   _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);

   return out;
+#else
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false,
+      "1x128 and 128x128 scaling not available with ROCm"
+  );
+#endif
 }

 Tensor&
@@ -862,8 +898,12 @@ _scaled_block1x128_block128x128(
     const c10::ScalarType out_dtype,
     const bool use_fast_accum,
     Tensor& out) {
+#ifndef USE_ROCM
   // Restrictions:
   // A, B are FP8, scales are fp32, A: shape K//128, B: K//128, N//128
+  // CUDA: Only Hopper GPUs
+  _check_deepseek_support();
+
   TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
       mat_a.scalar_type(), mat_b.scalar_type());
   TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
@@ -881,6 +921,12 @@ _scaled_block1x128_block128x128(
   _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);

   return out;
+#else
+  TORCH_CHECK_NOT_IMPLEMENTED(
+      false,
+      "1x128 and 128x128 scaling not available with ROCm"
+  );
+#endif
 }

 Tensor&
@@ -160,8 +160,8 @@ struct _cuda_scatter_gather_internal_kernel {
       auto offsets = offset_calc.get(i);

       int64_t idx_dim = *(index_t*)(index_ptr + offsets[2]);
-      CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
-                         && "scatter gather kernel index out of bounds");
+      CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
+                                 && "scatter gather kernel index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);

       f(
         (scalar_t*)(self_ptr + offsets[0]),
@@ -406,9 +406,8 @@ struct _cuda_scatter_fill_internal_kernel {
       auto offsets = offset_calc.get(i);

       int64_t idx_dim = *(index_t*)(index_ptr + offsets[1]);
-      CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
-                         && "index out of bounds"
-      );
+      CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
+                                 && "index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);

       f(
         (scalar_t*)(self_ptr + offsets[0]),
@@ -12,14 +12,15 @@

 namespace at::native {

-#if AT_USE_JITERATOR()
+#if 0 && AT_USE_JITERATOR()
 constexpr char tan_name[] = "tan_impl";
 #endif

 void tan_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
-#if AT_USE_JITERATOR()
+    // Disabled due to accuracy issues
+#if 0 && AT_USE_JITERATOR()
     static const auto tan_string = jiterator_stringify(
         template <typename T> T tan_impl(T a) { return std::tan(a); });
     AT_DISPATCH_COMPLEX_TYPES_AND(
@@ -12,14 +12,15 @@

 namespace at::native {

-#if AT_USE_JITERATOR()
+#if 0 && AT_USE_JITERATOR()
 constexpr char tanh_name[] = "tanh_impl";
 #endif

 void tanh_kernel_cuda(TensorIteratorBase& iter) {
   auto common_dtype = iter.common_dtype();
   if (at::isComplexType(common_dtype)) {
-#if AT_USE_JITERATOR()
+    // Disabled due to accuracy issues
+#if 0 && AT_USE_JITERATOR()
     static const auto tanh_string = jiterator_stringify(
         template <typename T> T tanh_impl(T a) { return std::tanh(a); });
     AT_DISPATCH_COMPLEX_TYPES_AND(
@@ -141,7 +141,8 @@ WelfordDataLN cuWelfordOnlineSum(
   if constexpr (!rms_norm){
     U delta = val - curr_sum.mean;
    U new_count = curr_sum.count + 1.f;
-#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
+    //Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
+#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
     U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
 #else
     U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
@@ -163,7 +164,8 @@ WelfordDataLN cuWelfordCombine(
   U count = dataA.count + dataB.count;
   U mean, sigma2;
   if (count > decltype(dataB.count){0}) {
-#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
+    //Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
+#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
     auto coef = __builtin_amdgcn_rcpf(count);
 #else
     auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division
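For reference, this is the Welford-style update both kernels perform; on the ROCm fast path the 1/new_count division becomes the hardware reciprocal approximation, which is exactly what loses too much precision on gfx90a. A plain-Python sketch, not the kernel code:

def welford_step(mean, m2, count, val):
    delta = val - mean
    count += 1.0
    mean += delta * (1.0 / count)   # __builtin_amdgcn_rcpf(count) on the fast path
    m2 += delta * (val - mean)
    return mean, m2, count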
@@ -86,6 +86,28 @@ struct zeta_functor {
   }
 };

+struct logaddexp_functor {
+  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
+  inline T operator()(const T a, const T b) {
+    return c10::metal::logaddexp(a, b);
+  }
+  template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
+  inline float operator()(const T a, const T b) {
+    return c10::metal::logaddexp(float(a), float(b));
+  }
+};
+
+struct logaddexp2_functor {
+  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
+  inline T operator()(const T a, const T b) {
+    return c10::metal::logaddexp2(a, b);
+  }
+  template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
+  inline float operator()(const T a, const T b) {
+    return c10::metal::logaddexp2(float(a), float(b));
+  }
+};
+
 struct xlog1py_functor {
   template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
   inline T operator()(const T a, const T b) {
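The functors defer to c10::metal::logaddexp / logaddexp2; the point of a dedicated kernel is the overflow-safe formulation rather than a literal log(exp(a) + exp(b)). A sketch of the standard form (the Metal header may differ in detail):

import math

def logaddexp(a, b):
    m = max(a, b)
    if math.isinf(m) and m < 0:   # both -inf: avoid (-inf) - (-inf)
        return m
    return m + math.log1p(math.exp(-abs(a - b)))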
@@ -377,6 +399,10 @@ REGISTER_FLOAT_BINARY_OP(fmin);
 REGISTER_FLOAT_BINARY_OP(nextafter);
 REGISTER_FLOAT_BINARY_OP(zeta);
 REGISTER_INT2FLOAT_BINARY_OP(zeta);
+REGISTER_FLOAT_BINARY_OP(logaddexp);
+REGISTER_INT2FLOAT_BINARY_OP(logaddexp);
+REGISTER_FLOAT_BINARY_OP(logaddexp2);
+REGISTER_INT2FLOAT_BINARY_OP(logaddexp2);
 REGISTER_FLOAT_BINARY_OP(xlog1py);
 REGISTER_INT2FLOAT_BINARY_OP(xlog1py);
 REGISTER_FLOAT_BINARY_OP(chebyshev_polynomial_t);
@@ -463,6 +489,8 @@ REGISTER_BINARY_OP(add, float2, float2);
 REGISTER_BINARY_OP(add, half2, half2);
 REGISTER_BINARY_OP(sub, float2, float2);
 REGISTER_BINARY_OP(sub, half2, half2);
+REGISTER_BINARY_OP(logaddexp, float2, float2);
+REGISTER_BINARY_OP(logaddexp, half2, half2);
 REGISTER_BINARY_ALPHA_OP(add_alpha, float2, float2, float2);
 REGISTER_BINARY_ALPHA_OP(add_alpha, half2, half2, half2);
 REGISTER_BINARY_ALPHA_OP(sub_alpha, float2, float2, float2);
@@ -89,6 +89,14 @@ static void zeta_mps_kernel(TensorIteratorBase& iter) {
   lib.exec_binary_kernel(iter, "zeta");
 }

+static void logaddexp_mps_kernel(TensorIteratorBase& iter) {
+  lib.exec_binary_kernel(iter, "logaddexp");
+}
+
+static void logaddexp2_mps_kernel(TensorIteratorBase& iter) {
+  lib.exec_binary_kernel(iter, "logaddexp2");
+}
+
 static void xlog1py_mps_kernel(TensorIteratorBase& iter) {
   TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "xlog1py_mps not implemented for non-floating types");
   lib.exec_binary_kernel(iter, "xlog1py");
@@ -211,6 +219,8 @@ REGISTER_DISPATCH(fmin_stub, &fmin_mps_kernel)
 REGISTER_DISPATCH(copysign_stub, &copysign_mps_kernel)
 REGISTER_DISPATCH(nextafter_stub, &nextafter_mps_kernel)
 REGISTER_DISPATCH(zeta_stub, &zeta_mps_kernel)
+REGISTER_DISPATCH(logaddexp_stub, &logaddexp_mps_kernel);
+REGISTER_DISPATCH(logaddexp2_stub, &logaddexp2_mps_kernel);
 REGISTER_DISPATCH(xlog1py_stub, &xlog1py_mps_kernel)
 REGISTER_DISPATCH(chebyshev_polynomial_t_stub, &chebyshev_polynomial_t_mps_kernel)
 REGISTER_DISPATCH(chebyshev_polynomial_u_stub, &chebyshev_polynomial_u_mps_kernel)
@@ -17,8 +17,6 @@
 #include <ATen/ops/ge_native.h>
 #include <ATen/ops/gt_native.h>
 #include <ATen/ops/le_native.h>
-#include <ATen/ops/logaddexp2_native.h>
-#include <ATen/ops/logaddexp_native.h>
 #include <ATen/ops/logical_and_native.h>
 #include <ATen/ops/logical_or_native.h>
 #include <ATen/ops/logical_xor_native.h>
@@ -277,30 +275,6 @@ TORCH_IMPL_FUNC(pow_Scalar_out_mps)(const Scalar& base, const Tensor& exp, const
   }
 }

-TORCH_IMPL_FUNC(logaddexp_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
-  mps::BinaryOpBlock logaddexp_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
-    MPSGraph* mpsGraph = cachedGraph->graph();
-    MPSGraphTensor* sumTensor =
-        [mpsGraph additionWithPrimaryTensor:[mpsGraph exponentWithTensor:primaryCastTensor name:nil]
-                            secondaryTensor:[mpsGraph exponentWithTensor:secondaryCastTensor name:nil]
-                                       name:nil];
-    return [mpsGraph logarithmWithTensor:sumTensor name:nil];
-  };
-  mps::binaryOpTensor(self, other, output, "logaddexp_out_mps", logaddexp_op_block);
-}
-
-TORCH_IMPL_FUNC(logaddexp2_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
-  mps::BinaryOpBlock logaddexp2_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
-    MPSGraph* mpsGraph = cachedGraph->graph();
-    MPSGraphTensor* sumTensor =
-        [mpsGraph additionWithPrimaryTensor:[mpsGraph exponentBase2WithTensor:primaryCastTensor name:nil]
-                            secondaryTensor:[mpsGraph exponentBase2WithTensor:secondaryCastTensor name:nil]
-                                       name:nil];
-    return [mpsGraph logarithmBase2WithTensor:sumTensor name:nil];
-  };
-  mps::binaryOpTensor(self, other, output, "logaddexp2_out_mps", logaddexp2_op_block);
-}
-
 TORCH_IMPL_FUNC(xlogy_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
   mps::BinaryOpBlock xlogy_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
     MPSGraph* mpsGraph = cachedGraph->graph();
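With the MPSGraph implementations above removed, the native_functions.yaml hunks later in this diff route MPS through the shared structured logaddexp_out / logaddexp2_out, backed by the new Metal functors. A smoke test of the overflow that motivated the change, assuming an MPS build:

import torch

a = torch.tensor([1000.0], device="mps")
b = torch.tensor([1000.0], device="mps")
print(torch.logaddexp(a, b))  # ~1000.6931; the removed exp-then-log graph overflowed to inf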
@@ -57,6 +57,7 @@ Tensor& random_mps_impl(Tensor& self,
   if (self.numel() == 0) {
     return self;
   }
+  at::assert_no_internal_overlap(self);
   // MPS random is broken for 5D+ tensors, see https://github.com/pytorch/pytorch/issues/147624
   const auto need_reshape = self.ndimension() > 4;
   auto mps_gen = get_generator_or_default<MPSGeneratorImpl>(gen, at::mps::detail::getDefaultMPSGenerator());
@@ -153,8 +154,16 @@ Tensor& random_mps_impl(Tensor& self,
       feeds[meanPlaceholder.getMPSGraphTensor()] = meanPlaceholder.getMPSGraphTensorData();
     }

-    Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self);
+    // Handle non-contiguous output tensors by creating a contiguous temporary
+    const auto needs_gather = needsGather(self);
+    Tensor self_ = needs_gather ? at::empty_like(self, MemoryFormat::Contiguous) : self;
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self_);
     runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
+
+    // Copy results back to original non-contiguous output
+    if (needs_gather) {
+      self.copy_(self_);
+    }
   }

   return self;
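The gather fallback above matters when an in-place random op writes through a non-contiguous view; a minimal case that exercises it, assuming an MPS device:

import torch

x = torch.zeros(8, 8, device="mps")
x.t().normal_()                   # non-contiguous output goes via the contiguous temporary
print(x.t().is_contiguous())      # False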
@@ -617,6 +617,9 @@ Tensor& index_select_out_mps(const Tensor& self, int64_t dim, const Tensor& inde
   TORCH_CHECK(self.scalar_type() == output.scalar_type(),
               "index_select(): self and output must have the same scalar type");
   TORCH_CHECK(dim == 0 || dim < self.dim(), "index_select(): Indexing dim ", dim, " is out of bounds of tensor");
+  at::assert_no_internal_overlap(output);
+  at::assert_no_overlap(output, self);
+  at::assert_no_overlap(output, index);
   auto output_size = self.sizes().vec();
   if (self.dim() > 0) {
     output_size[dim] = num_indices;
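The new overlap asserts reject aliasing out= arguments up front instead of risking silent corruption; a hypothetical trigger (error text paraphrased):

import torch

x = torch.arange(10.0, device="mps")
idx = torch.tensor([0, 1], device="mps")
torch.index_select(x, 0, idx, out=x[:2])  # should now raise: output overlaps with input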
@@ -370,7 +370,7 @@ static void nllnd_loss_backward_impl(Tensor& grad_input_arg,
                                                  onValue:-1.0f
                                                 offValue:0.0f
                                                    name:nil];
-      oneHotTensor = castMPSTensor(mpsGraph, oneHotTensor, inputTensor.dataType);
+      oneHotTensor = castMPSTensor(mpsGraph, oneHotTensor, [inputTensor dataType]);
       if (isWeightsArrayValid) {
         oneHotTensor = [mpsGraph multiplicationWithPrimaryTensor:oneHotTensor
                                                  secondaryTensor:weightTensor
@@ -705,6 +705,7 @@ static void smooth_l1_loss_template(const Tensor& input,
   TORCH_CHECK(beta >= 0, "smooth_l1_loss does not support negative values for beta.");
   TORCH_CHECK(input.is_mps());
   TORCH_CHECK(target.is_mps());
+  TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "MPS doesn't know how to do square_i64");
   if ((input.numel() == 0) || (target.numel() == 0)) {
     reduction == Reduction::Mean ? output.fill_(std::numeric_limits<float>::quiet_NaN()) : output.zero_();
     return;
@@ -771,7 +772,7 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output,
     MPSGraphTensor* targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target);
     MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);

-    MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar:beta dataType:MPSDataTypeFloat32];
+    MPSGraphTensor* betaTensor = [mpsGraph constantWithScalar:beta dataType:[inputTensor dataType]];
     // xn - yn
     MPSGraphTensor* diffTensor = [mpsGraph subtractionWithPrimaryTensor:inputTensor
                                                         secondaryTensor:targetTensor
@@ -797,7 +798,8 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output,
                                                        name:@"lossTensor"];
     MPSGraphTensor* outputTensor = lossTensor;
     if (reduction == Reduction::Mean) {
-      MPSGraphTensor* numelTensor = [mpsGraph constantWithScalar:(double)input.numel() dataType:MPSDataTypeFloat32];
+      MPSGraphTensor* numelTensor = [mpsGraph constantWithScalar:(double)input.numel()
+                                                        dataType:[lossTensor dataType]];
       outputTensor = [mpsGraph divisionWithPrimaryTensor:lossTensor secondaryTensor:numelTensor name:nil];
     }
     MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:outputTensor
@@ -84,6 +84,9 @@ std::tuple<Tensor&, Tensor&, Tensor&> batch_norm_mps_out(const Tensor& self,
                                                          Tensor& output,
                                                          Tensor& save_mean,
                                                          Tensor& save_var) {
+  TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kLong, "Long batch norm is not supported with MPS");
+  TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()),
+                              "Batch norm for complex is not supported for MPS");
   using namespace at::native::mps;
   struct CachedGraph : public MPSCachedGraph {
     CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {}
@@ -918,6 +921,7 @@ std::tuple<Tensor, Tensor, Tensor> layer_norm_mps(const Tensor& input,
   // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
   const int axis = input_ndim - normalized_ndim;
   MPSStream* stream = getCurrentMPSStream();
+  TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "Not implemented for long on MPS");
   @autoreleasepool {
     mps::dispatch_sync_with_rethrow(stream->queue(), ^() {
       // which kernel variant to use based on the normalized axis N size
@@ -1028,15 +1028,18 @@ TORCH_IMPL_FUNC(prod_out_mps)
 }

 TORCH_IMPL_FUNC(amax_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
+  TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amax is not defined for complex types");
   reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMAX, "amax_out_mps");
 }

 TORCH_IMPL_FUNC(amin_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
+  TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amin is not defined for complex types");
   reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMIN, "amin_out_mps");
 }

 TORCH_IMPL_FUNC(aminmax_out_mps)
 (const Tensor& input_t, std::optional<int64_t> dim_opt, bool keepdim, const Tensor& min_t, const Tensor& max_t) {
+  TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "aminmax is not defined for complex types");
   reduction_out_mps(input_t,
                     dim_opt.has_value() ? OptionalIntArrayRef({*dim_opt}) : std::nullopt,
                     keepdim,
@@ -31,6 +31,7 @@ void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& v
     indices.copy_(values.toType(at::ScalarType::Long));
     return;
   }
+  TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()), "kthvalue is not implemented for complex types");
   // issue #154890, raising error to prevent crash within MPSGraph until
   // workaround is implemented.
   TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890");
@@ -3622,8 +3622,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: logaddexp_out
-    MPS: logaddexp_out_mps
+    CPU, CUDA, MPS: logaddexp_out
   tags: pointwise

- func: logaddexp(Tensor self, Tensor other) -> Tensor
@@ -3635,8 +3634,7 @@
   structured: True
   structured_inherits: TensorIteratorBase
   dispatch:
-    CPU, CUDA: logaddexp2_out
-    MPS: logaddexp2_out_mps
+    CPU, CUDA, MPS: logaddexp2_out
   tags: pointwise

- func: logaddexp2(Tensor self, Tensor other) -> Tensor
@@ -8867,11 +8865,11 @@
   autogen: bitwise_right_shift.Scalar_Tensor_out
   tags: pointwise

-- func: tril_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
+- func: tril_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)
   structured_delegate: tril.out
   variants: method

-- func: triu_(Tensor(a!) self, int diagonal=0) -> Tensor(a!)
+- func: triu_(Tensor(a!) self, SymInt diagonal=0) -> Tensor(a!)
   structured_delegate: triu.out
   variants: method

@@ -8995,25 +8993,25 @@
 - func: cross(Tensor self, Tensor other, int? dim=None) -> Tensor
   variants: method, function

-- func: triu.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+- func: triu.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
   dispatch:
     CPU: triu_cpu
     CUDA: triu_cuda
     MPS: triu_mps_out

-- func: triu(Tensor self, int diagonal=0) -> Tensor
+- func: triu(Tensor self, SymInt diagonal=0) -> Tensor
   structured_delegate: triu.out
   variants: method, function

-- func: tril.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+- func: tril.out(Tensor self, SymInt diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
   structured: True
   dispatch:
     CPU: tril_cpu
     CUDA: tril_cuda
     MPS: tril_mps_out

-- func: tril(Tensor self, int diagonal=0) -> Tensor
+- func: tril(Tensor self, SymInt diagonal=0) -> Tensor
   structured_delegate: tril.out
   variants: method, function
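Switching `diagonal` from int to SymInt lets a traced program carry the value symbolically instead of specializing per concrete diagonal; a rough illustration:

import torch

@torch.compile(dynamic=True)
def upper(x, d):
    return torch.triu(x, diagonal=d)

upper(torch.ones(8, 8), 2)
upper(torch.ones(8, 8), 3)  # ideally served by the same graph, `d` flowing as a SymInt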
@@ -73,8 +73,7 @@ void upsample_bilinear2d_out_frame(
   const auto rwidth = area_pixel_compute_scale<float>(
       input_width, output_width, align_corners, scales_w);

-  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
-  float output_scale = output.q_scale() / input.q_scale();
+  float output_scale = static_cast<float>(output.q_scale() / input.q_scale());

   const int64_t input_q_zero_point = input.q_zero_point();
   const int64_t output_q_zero_point = output.q_zero_point();
@@ -148,7 +148,7 @@ Tensor qcat_nhwc_kernel(
       // Vectorized loop
       if (c + VLEN <= curr_C) {
         auto curr_scale_vec = Vectorized<float>(curr_scale);
-        auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
+        auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
         auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
         for (; c + VLEN <= curr_C; c += VLEN) {
           auto inp_vec = Vec::loadu(iptr + c);
@@ -174,7 +174,7 @@ Tensor qcat_nhwc_kernel(
       int64_t elem_size = curr_C - c;
       if ((VLEN == 4 * kVLEN) && elem_size >= kVLEN) {
         auto curr_scale_vec = Vectorized<float>(curr_scale);
-        auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
+        auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
         auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
         int64_t vec_num = elem_size / kVLEN;
         std::array<typename scalar_t::underlying, VLEN> buf_in{};
@@ -611,12 +611,10 @@ void qrelu_kernel(const Tensor& qx, Tensor& qy) {
 void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
                             const Scalar& negval_) {
   int64_t i_zp = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float i_scale = qx.q_scale();
+  float i_scale = static_cast<float>(qx.q_scale());

   int64_t o_zp = out.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float o_scale = out.q_scale();
+  float o_scale = static_cast<float>(out.q_scale());
   float o_inv_scale = 1.0f / o_scale;

   float negval = negval_.to<float>();
@@ -627,8 +625,8 @@ void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
   Vec zero_vec = Vec(0.0f);
   Vec one_vec = Vec(1.0f);

-  Vec i_scale_vec = Vec((float)i_scale);
-  Vec i_zp_vec = Vec((float)i_zp);
+  Vec i_scale_vec = Vec(i_scale);
+  Vec i_zp_vec = Vec(i_zp);
   Vec i_scale_zp_neg_premul_vec = i_scale_vec * i_zp_vec.neg();

   Vec negval_vec = Vec(negval);
@@ -738,10 +736,9 @@ void qprelu_out_kernel(Tensor& out,

 void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
   int64_t zero_point = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float scale = qx.q_scale();
+  float scale = static_cast<float>(qx.q_scale());
   auto scale_vec = Vectorized<float>(scale);
-  auto zero_point_vec = Vectorized<float>((float)zero_point);
+  auto zero_point_vec = Vectorized<float>(zero_point);
   auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
   int64_t output_zero_point = zero_point;
   float output_scale = scale;
@@ -828,10 +825,9 @@ void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
 void qsigmoid_kernel(
     const Tensor& qx, Tensor& qy, double output_scale, int64_t output_zero_point ) {
   int64_t zero_point = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float scale = qx.q_scale();
+  float scale = static_cast<float>(qx.q_scale());
   auto scale_vec = Vectorized<float>(scale);
-  auto zero_point_vec = Vectorized<float>((float)zero_point);
+  auto zero_point_vec = Vectorized<float>(zero_point);

   AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qsigmoid", [&]() {
     float inv_output_scale = 1.0 / output_scale;
@@ -870,10 +866,9 @@ void qsigmoid_kernel(

 void qhardsigmoid_kernel(const Tensor& qx, Tensor& qy) {
   int64_t zero_point = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float scale = qx.q_scale();
+  float scale = static_cast<float>(qx.q_scale());
   auto scale_vec = Vectorized<float>(scale);
-  auto zero_point_vec = Vectorized<float>((float)zero_point);
+  auto zero_point_vec = Vectorized<float>(zero_point);
   auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();

   AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qhardsigmoid", [&]() {
@@ -1029,13 +1024,10 @@ void qthreshold_kernel(

   // defines input and output scales and zero_points
   int64_t input_zero_point = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float input_scale = qx.q_scale();
+  float input_scale = static_cast<float>(qx.q_scale());
   int64_t output_zero_point = qy.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float output_scale = qy.q_scale();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float inv_output_scale = 1.0 / output_scale;
+  float output_scale = static_cast<float>(qy.q_scale());
+  float inv_output_scale = static_cast<float>(1.0 / output_scale);

   AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qthreshold", [&]() {
     qy = at::_empty_affine_quantized(
@@ -1096,8 +1088,7 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {

   const auto o_scale = qy.q_scale();
   const auto o_zero_point = qy.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  const float o_inv_scale = 1.0 / o_scale;
+  const float o_inv_scale = static_cast<float>(1.0 / o_scale);

   using fVec = Vectorized<float>;
   fVec i_scale_vec(i_scale);
@@ -1135,10 +1126,9 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {

 void qtanh_kernel(const Tensor& qx, Tensor& qy) {
   int64_t zero_point = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float scale = qx.q_scale();
+  float scale = static_cast<float>(qx.q_scale());
   auto scale_vec = Vectorized<float>(scale);
-  auto zero_point_vec = Vectorized<float>((float)zero_point);
+  auto zero_point_vec = Vectorized<float>(zero_point);
   auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();

   AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qtanh", [&]() {
@@ -1198,16 +1188,13 @@ void qelu_kernel(
   // they are NOT related to the quantization scale term

   int64_t i_zp = qx.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float i_scale = qx.q_scale();
+  float i_scale = static_cast<float>(qx.q_scale());

   // In a future PR, we can improve on output scale and zero_point
   // selection.
   int64_t o_zp = qy.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float o_scale = qy.q_scale();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float inv_o_scale = 1.0 / o_scale;
+  float o_scale = static_cast<float>(qy.q_scale());
+  float inv_o_scale = static_cast<float>(1.0 / o_scale);

   float alpha_float = alpha.to<float>();
   float scale_coef = scale.to<float>();
@@ -1227,7 +1214,7 @@ void qelu_kernel(
   Vec scale_coef_vec = Vec(scale_coef);
   Vec input_scale_coef_vec = Vec(input_scale_coef);
   Vec i_scale_vec = Vec(i_scale);
-  Vec i_zero_point_vec = Vec((float)i_zp);
+  Vec i_zero_point_vec = Vec(i_zp);
   Vec i_scale_neg_zp_premul_vec = i_scale_vec * i_zero_point_vec.neg();

   cpu_kernel_vec(
@@ -1326,23 +1313,20 @@ void qadd_scalar_kernel(Tensor& out, const Tensor& self, const Scalar& other) {
 template <bool ReLUFused = false>
 void qadd_kernel(Tensor& out, const Tensor& self, const Tensor& other) {
   int64_t zero_point = out.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float scale = out.q_scale();
+  float scale = static_cast<float>(out.q_scale());
   float inv_scale = 1.0f / scale;
   int64_t self_zero_point = self.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float self_scale = self.q_scale();
+  float self_scale = static_cast<float>(self.q_scale());
   int64_t other_zero_point = other.q_zero_point();
-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
-  float other_scale = other.q_scale();
+  float other_scale = static_cast<float>(other.q_scale());

   // Broadcast out the parameters here to amortize out that cost across
   // loop iterations.
   // TODO: we can optimize dequantization by doing a premultiplication
   // of the zero point by scale and doing FMA on scale*x_q - (scale*zero_point)
-  auto self_zero_point_vec = Vectorized<float>((float)self_zero_point);
+  auto self_zero_point_vec = Vectorized<float>(self_zero_point);
   auto self_scale_vec = Vectorized<float>(self_scale);
-  auto other_zero_point_vec = Vectorized<float>((float)other_zero_point);
+  auto other_zero_point_vec = Vectorized<float>(other_zero_point);
   auto other_scale_vec = Vectorized<float>(other_scale);

   auto self_scale_neg_zp_premul_vec = self_scale_vec * self_zero_point_vec.neg();
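The TODO in the hunk above refers to the dequantization identity these kernels exploit via the *_neg_zp_premul vectors: scale * (q - zp) == scale * q + (scale * -zp), so the second term is precomputed and the loop body becomes a fused multiply-add. In miniature:

import numpy as np

q = np.array([0, 128, 255], dtype=np.float32)   # quantized values, widened to float
scale, zp = 0.1, 128.0
premul = scale * -zp                            # the precomputed premul term
assert np.allclose(scale * (q - zp), scale * q + premul)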
@@ -2965,7 +2949,7 @@ void quantized_normalize_kernel(
     const bool beta_null = beta_data == nullptr;
     int64_t x_zp = X.q_zero_point();
     float x_scale = X.q_scale();
-    fVec x_zp_vec((float)x_zp);
+    fVec x_zp_vec(x_zp);
     fVec one_vec(1.0f);
     fVec zero_vec(0.0f);
     float x_fake_scale = 1.0f;
@@ -3253,7 +3237,7 @@ void quantized_groupnorm_nhwc_kernel(
     const bool beta_null = beta_data == nullptr;
     int64_t x_zp = X.q_zero_point();
     float x_scale = X.q_scale();
-    fVec x_zp_vec((float)x_zp);
+    fVec x_zp_vec(x_zp);
     fVec one_vec(1.0f);
     fVec zero_vec(0.0f);
     float x_fake_scale = 1.0f;
@@ -414,7 +414,6 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_impl(
   TORCH_CHECK(input.size(input.dim() - 1) == packed_weight_fp16.numRows())
   TORCH_CHECK(input.dim() >= 2);

-  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
   const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
   const int64_t N = packed_weight_fp16.numCols();
   std::vector<int64_t> output_sizes = input.sizes().vec();
@@ -467,6 +467,28 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, IntArrayRe
       !options.has_layout() || options.layout() == kSparse,
       "expected sparse layout, but got layout ",
       options.layout());

+  if (indices.numel() > 0) {
+    Tensor min_indices =
+        std::get</* values */ 0>(indices.min(/* dim */ 1, /* keepdim */ false));
+    Tensor cpu_min_indices;
+    if (!indices.is_cpu()) {
+      cpu_min_indices = min_indices.to(at::DeviceType::CPU);
+    } else {
+      cpu_min_indices = min_indices;
+    }
+    auto cpu_min_indices_accessor = cpu_min_indices.accessor<int64_t, 1>();
+    for (const auto d : c10::irange(indices.size(0))) {
+      int64_t min_index_in_dim = cpu_min_indices_accessor[d];
+      TORCH_CHECK(
+          min_index_in_dim >= 0,
+          "found negative index ",
+          min_index_in_dim,
+          " for dim ",
+          d);
+    }
+  }
+
   return at::native::_sparse_coo_tensor_unsafe(
       indices,
       values,
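The added loop validates the per-dimension minimum index before handing off to the unsafe constructor; for example:

import torch

i = torch.tensor([[0, -1]])          # -1 is not a valid COO index
v = torch.tensor([1.0, 2.0])
torch.sparse_coo_tensor(i, v, (3,))  # now raises: found negative index -1 for dim 0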
@@ -22,6 +22,7 @@
 #else
 #include <ATen/ops/empty.h>
 #include <ATen/ops/empty_like.h>
+#include <ATen/ops/zeros_like.h>
 #include <ATen/ops/reshape.h>
 #include <ATen/ops/scalar_tensor.h>
 #include <ATen/ops/sum.h>
@@ -42,7 +43,6 @@ C10_DIAGNOSTIC_POP()
 #include <static_switch.h>
 #include <ATen/native/transformers/cuda/flash_attn/flash_api.h>

-
 #include <c10/util/Exception.h>

 namespace FLASH_NAMESPACE {
@@ -417,6 +417,26 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
   const int head_size_og = sizes[3];
   const int seqlen_k = k.size(1);
   const int num_heads_k = k.size(2);
+
+  if (batch_size == 0) {
+    auto opts = q.options();
+    at::Tensor out = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
+    at::Tensor q_padded = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
+    at::Tensor k_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
+    at::Tensor v_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
+    at::Tensor softmax_lse = at::empty({0, num_heads, seqlen_q}, opts.dtype(at::kFloat));
+    at::Tensor rng_state = at::empty({2}, at::dtype(c10::kUInt64).device(at::kCUDA));
+    at::Tensor _unused = at::empty({}, at::dtype(c10::kUInt64).device(at::kCUDA));
+    at::Tensor p = at::empty({0}, opts);
+    if (return_softmax) {
+      auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+      const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
+      const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
+      p = at::empty({0, num_heads, seqlen_q_rounded, seqlen_k_rounded}, opts);
+    }
+    return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), _unused, std::move(p)};
+  }
+
 TORCH_CHECK(batch_size > 0, "batch size must be positive");
 TORCH_CHECK(head_size_og % 8 == 0, "head_size must be a multiple of 8, this is ensured by padding!");
 TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
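The early return above hands back correctly shaped empty tensors instead of tripping the batch_size > 0 check. A rough repro, assuming a CUDA build where the flash backend is selected (backend choice can vary):

import torch
import torch.nn.functional as F

q = torch.empty(0, 8, 16, 64, device="cuda", dtype=torch.float16)  # batch size 0
out = F.scaled_dot_product_attention(q, q, q)
print(out.shape)  # torch.Size([0, 8, 16, 64])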
@@ -547,7 +567,7 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
     q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og});
     softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
   }
-  return {out, q_padded, k_padded, v_padded, softmax_lse, rng_state, _unused, p};
+  return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), std::move(_unused), std::move(p)};
 }

 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
@@ -852,7 +872,6 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
   TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
   TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
   TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension");
-  TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");

   const auto sizes = q.sizes();

@@ -863,6 +882,20 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
   const int head_size = sizes[3];
   const int seqlen_k = k.size(1);
   const int num_heads_k = k.size(2);
+
+  if (batch_size == 0) {
+    auto opts = q.options();
+    at::Tensor dq = at::empty_like(q);
+    at::Tensor dk = at::empty_like(k);
+    at::Tensor dv = at::empty_like(v);
+    auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+    const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
+    at::Tensor softmax_d = at::empty({0, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat));
+    return {dq, dk, dv, softmax_d};
+  }
+
+  TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
+
   TORCH_CHECK(batch_size > 0, "batch size must be positive");
   TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8");
   TORCH_CHECK(head_size_og % 8 == 0, "head_size_og should be a multiple of 8, this is ensured by padding!");
@@ -1837,6 +1837,10 @@ class BenchmarkRunner:
     def skip_models_for_cuda(self):
         return set()

+    @property
+    def skip_models_for_xpu(self):
+        return set()
+
     @property
     def skip_models_for_cpu(self):
         return set()
@@ -3927,6 +3931,8 @@ def run(runner, args, original_dir=None):
         runner.skip_models.update(runner.skip_models_for_cpu_aarch64)
     elif args.devices == ["cuda"]:
         runner.skip_models.update(runner.skip_models_for_cuda)
+    elif args.devices == ["xpu"]:
+        runner.skip_models.update(runner.skip_models_for_xpu)

     if not args.multiprocess:
         runner.skip_models.update(runner.skip_multiprocess_models)
@@ -56,6 +56,20 @@ def list_benchmarks():
     print(f"Available benchmarks: {list(BENCHMARK_REGISTRY.keys())}")


+def _run_benchmark(
+    benchmark_cls,
+    script_args,
+):
+    benchmark = benchmark_cls(script_args)
+    benchmark.benchmark()
+    benchmark.report_geomean_speedup()
+    if script_args.print_benchmark_result:
+        print(f"Benchmarking results {benchmark.name}:")
+        print(benchmark.profiling_results)
+    if script_args.visualize:
+        benchmark.visualize()
+
+
 def run_benchmark(
     benchmark_name: str,
     script_args,
@@ -71,10 +85,7 @@ def run_benchmark(
     print("=" * 60)

     benchmark_class = BENCHMARK_REGISTRY[benchmark_name]
-    benchmark = benchmark_class(script_args)
-    benchmark.benchmark()
-    if script_args.visualize:
-        benchmark.visualize()
+    _run_benchmark(benchmark_class, script_args)

     return True

@@ -87,10 +98,7 @@ def run_all_benchmarks(script_args):

     for name, cls in BENCHMARK_REGISTRY.items():
         print(f"\n{'=' * 20} {name.upper()} {'=' * 20}")
-        benchmark = cls(script_args)
-        benchmark.benchmark()
-        if script_args.visualize:
-            benchmark.visualize()
+        _run_benchmark(cls, script_args)
         print()

@@ -149,8 +157,43 @@ Examples:
         help="Whether to exit with an error message for accuracy failure",
     )

+    parser.add_argument(
+        "--print-benchmark-result",
+        action="store_true",
+        help="Whether to print the raw benchmarking result. Easier to quickly check the benchmark results on a server without GUI",
+    )
+
+    parser.add_argument(
+        "--custom-compile-name",
+        type=str,
+        default=None,
+        help="Name for the curve with customized compilation options",
+    )
+
+    parser.add_argument(
+        "--custom-compile-options",
+        type=str,
+        default=None,
+        help="Json string for the custom compile options.",
+    )
+
     args = parser.parse_args()

+    if args.custom_compile_options:
+        import json
+
+        try:
+            args.custom_compile_options = json.loads(args.custom_compile_options)
+        except json.decoder.JSONDecodeError as e:
+            raise RuntimeError(
+                f"Invalid json string for --custom-compile-options: {args.custom_compile_options}"
+            ) from e
+
+        if not args.custom_compile_options:
+            raise RuntimeError("Found no options for --custom-compile-options")
+        if not args.custom_compile_name:
+            raise RuntimeError("Missing label name for the custom compilation")
+
     # Handle list option
     if args.list:
         list_benchmarks()
@@ -8,6 +8,15 @@ import torch
 import torch.nn.functional as F


+# more important shapes used by internal models
+extra_shapes_for_norm = (
+    (1152 * 500, 384),
+    (1152 * 500, 512),
+    (1152 * 1000, 384),
+    (1152 * 1000, 512),
+)
+
+
 class CrossEntropyForward(BenchmarkKernel):
     def __init__(self, script_args):
         super().__init__(script_args)
@@ -346,7 +355,7 @@ class RMSNormForward(BenchmarkKernel):
             (32768, 65536),
             (16384, 131072),
             (8192, 262144),
-        )
+        ) + extra_shapes_for_norm

     def get_memory_bytes(self, args, kwargs) -> int:
         x, w = args
@@ -438,8 +447,7 @@ class RMSNormBackward(BenchmarkKernel):
             (32768, 4096),
             (32768, 8192),
             (32768, 16384),
-            (32768, 32768),
-        )
+        ) + extra_shapes_for_norm

     def get_memory_bytes(self, args, kwargs) -> int:
         x, w, dy = args
@@ -553,7 +561,7 @@ class LayerNormForward(BenchmarkKernel):
             (32768, 16384),
             (32768, 32768),
             (32768, 65536),
-        )
+        ) + extra_shapes_for_norm

     def get_memory_bytes(self, args, kwargs) -> int:
         x, w = args
@@ -627,7 +635,7 @@ class LayerNormBackward(BenchmarkKernel):
             (32768, 16384),
             (32768, 32768),
             (32768, 65536),
-        )
+        ) + extra_shapes_for_norm

     def get_memory_bytes(self, args, kwargs) -> int:
         x, w, dy = args
@@ -6,6 +6,7 @@ from dataclasses import dataclass
 from typing import Any, Optional

 import matplotlib.pyplot as plt
+from scipy.stats import gmean

 import torch
 from torch._inductor.runtime.benchmarking import benchmarker
@@ -107,6 +108,18 @@ class BenchmarkKernel:
         for backend in self.available_backends:
             args_ref, kwargs_ref = self.clone_inputs(args, kwargs)
             res[backend] = getattr(self, backend)(args_ref, kwargs_ref)()
+
+        if (
+            "compiled" in self.available_backends
+            and self.script_args.custom_compile_options
+        ):
+            torch._dynamo.reset()  # cause recompile
+            with torch._inductor.config.patch(self.script_args.custom_compile_options):
+                args_ref, kwargs_ref = self.clone_inputs(args, kwargs)
+                res[self.script_args.custom_compile_name] = self.compiled(
+                    args_ref, kwargs_ref
+                )()
+
         gold = res["eager"]

         tol = {}
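What the custom-compile branch above does per run, in miniature: reset dynamo so the next call recompiles, then execute the compiled variant under the patched inductor config (the option shown is just an example):

import torch

def run_with_custom_options(options, fn, *args):
    torch._dynamo.reset()                        # force recompilation
    with torch._inductor.config.patch(options):
        return torch.compile(fn)(*args)

run_with_custom_options({"max_autotune": True}, torch.relu, torch.randn(4))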
@ -115,7 +128,7 @@ class BenchmarkKernel:
|
|||||||
"atol": self.script_args.tolerance,
|
"atol": self.script_args.tolerance,
|
||||||
"rtol": self.script_args.tolerance,
|
"rtol": self.script_args.tolerance,
|
||||||
}
|
}
|
||||||
for backend in self.available_backends:
|
for backend in res:
|
||||||
if backend == "eager":
|
if backend == "eager":
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
@@ -134,37 +147,83 @@ class BenchmarkKernel:
                     print("Exit right away since --exit-on-accuracy-failure is set")
                     sys.exit(1)

-    def benchmark_single_shape(
-        self, args, kwargs=None, should_check_accuracy=True, setting: str = ""
-    ):
-        for backend in self.available_backends:
-            args_ref, kwargs_ref = self.clone_inputs(args, kwargs)
-            try:
-                avg_time = benchmark_kernel_in_milliseconds(
-                    getattr(self, backend)(args_ref, kwargs_ref)
-                )
-            except Exception as e:
-                print(
-                    f"Failed to run {backend} backend on {self.name} kernel for {setting} due to {e}"
-                )
-                self.available_backends.remove(backend)  # noqa: B909
-                continue
-            mem_bytes = self.get_memory_bytes(args_ref, kwargs_ref)
-            perf = Performance(setting, avg_time, mem_bytes)
-            print(f"{self.name} kernel on {backend} backend. {perf}")
-            self.profiling_results[backend].append(perf)
+    def benchmark_single_shape_for_backend(
+        self, backend, args, kwargs, setting, fn=None
+    ) -> bool:
+        if fn is None:
+            fn = getattr(self, backend)
+        args_ref, kwargs_ref = self.clone_inputs(args, kwargs)
+        try:
+            avg_time = benchmark_kernel_in_milliseconds(fn(args_ref, kwargs_ref))
+        except Exception as e:
+            print(
+                f"Failed to run {backend} backend on {self.name} kernel for {setting} due to {e}"
+            )
+            self.available_backends.remove(backend)  # noqa: B909
+            return False
+        mem_bytes = self.get_memory_bytes(args_ref, kwargs_ref)
+        perf = Performance(setting, avg_time, mem_bytes)
+        print(f"{self.name} kernel on {backend} backend. {perf}")
+        self.profiling_results[backend].append(perf)
+        return True
+
+    def benchmark_single_shape(
+        self, args, kwargs=None, should_check_accuracy=True, setting: str = ""
+    ):
+        for backend in self.available_backends:
+            self.benchmark_single_shape_for_backend(backend, args, kwargs, setting)
+        if (
+            "compiled" in self.available_backends
+            and self.script_args.custom_compile_options
+        ):
+            torch._dynamo.reset()  # cause recompile
+            with torch._inductor.config.patch(self.script_args.custom_compile_options):
+                status = self.benchmark_single_shape_for_backend(
+                    self.script_args.custom_compile_name,
+                    args,
+                    kwargs,
+                    setting,
+                    fn=self.compiled,
+                )
+                if not status:
+                    self.script_args.custom_compile_options = (
+                        None  # once fail, don't run again
+                    )

         if should_check_accuracy:
             self.check_accuracy(args, kwargs)

     def visualize(self) -> None:
+        device_name = torch.cuda.get_device_name(0)
         visualize_comparison(
             self.profiling_results,
-            title=f"{self.name}",
+            title=f"{self.name} ({device_name})",
             output_path=f"{self.name}_bench",
         )
         return

+    def report_geomean_speedup(self) -> None:
+        print(f"Geomean speedup for benchmark {self.name}")
+        eager_result = {
+            result.setting: result for result in self.profiling_results["eager"]
+        }
+        print(f" eager {len(eager_result)} data points")
+        for backend, backend_result in self.profiling_results.items():
+            if backend == "eager":
+                continue
+            speeduplist = []
+            for result in backend_result:
+                eager_latency = eager_result[result.setting].latency
+                backend_latency = result.latency
+                speeduplist.append(
+                    eager_latency / backend_latency if backend_latency != 0 else 0.0
+                )
+
+            if len(speeduplist) > 0:
+                print(
+                    f" {backend} {len(speeduplist)} data points, {gmean(speeduplist):.2f}x speedup"
+                )
+

 def get_backend_colors() -> dict[str, str]:
     """Get consistent color scheme for different backends."""
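Note: gmean (imported above) is the right aggregate for speedup ratios. With an arithmetic mean, one 2x win and one 2x loss would average to 1.25x and flatter the backend; geometrically they cancel to exactly 1.0x. A quick numeric illustration with made-up latencies:

    from scipy.stats import gmean

    eager_ms = {"shape_a": 4.0, "shape_b": 10.0}
    backend_ms = {"shape_a": 2.0, "shape_b": 20.0}  # 2x faster, then 2x slower

    speedups = [eager_ms[k] / backend_ms[k] for k in eager_ms]  # [2.0, 0.5]
    print(f"arithmetic: {sum(speedups) / len(speedups):.2f}x")  # 1.25x - misleading
    print(f"geometric:  {gmean(speedups):.2f}x")                # 1.00x - cancels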
@@ -252,5 +311,6 @@ def visualize_comparison(
     os.makedirs("pics", exist_ok=True)
     full_path = os.path.join("pics", output_path + ".png")
     plt.savefig(full_path, dpi=300, bbox_inches="tight", facecolor="white")
+    print(f"Chart saved to {full_path}")

     plt.close()
@@ -74,7 +74,8 @@ REQUIRE_HIGHER_TOLERANCE = {
 REQUIRE_HIGHER_TOLERANCE_AMP = {}

 REQUIRE_EVEN_HIGHER_TOLERANCE = {
-    "beit_base_patch16_224",
+    "deit_base_distilled_patch16_224",
+    "vit_base_patch16_siglip_256",
 }

 # These models need higher tolerance in MaxAutotune mode

@@ -354,7 +355,9 @@ class TimmRunner(BenchmarkRunner):
         if is_training:
             from torch._inductor import config as inductor_config

-            if name in REQUIRE_EVEN_HIGHER_TOLERANCE or (
+            if name == "beit_base_patch16_224":
+                tolerance = 16 * 1e-2
+            elif name in REQUIRE_EVEN_HIGHER_TOLERANCE or (
                 inductor_config.max_autotune
                 and name in REQUIRE_EVEN_HIGHER_TOLERANCE_MAX_AUTOTUNE
             ):
@@ -124,6 +124,10 @@ class TorchBenchmarkRunner(BenchmarkRunner):
     def skip_models_for_cuda(self):
         return self._skip["device"]["cuda"]

+    @property
+    def skip_models_for_xpu(self):
+        return self._skip["device"]["xpu"]
+
     @property
     def skip_models_for_freezing_cuda(self):
         return self._skip["freezing"]["cuda"]
@@ -217,6 +217,9 @@ skip:

   cuda: []

+  xpu:
+    - *DETECTRON2_MODELS
+
 test:
   training:
     - *DETECTRON2_MODELS
benchmarks/transformer/config_utils.py (new file, 157 lines)
@@ -0,0 +1,157 @@
+"""Configuration utilities for parsing JSON and YAML config files."""
+
+import json
+import re
+
+
+def heads_input_type(s: str) -> tuple[int, int]:
+    """Convert string format 'Hq,Hkv' to tuple (Hq, Hkv)."""
+    try:
+        hq, hkv = map(int, s.split(","))
+        return hq, hkv
+    except Exception as e:
+        raise ValueError("Heads must be Hq,Hkv") from e
+
+
+default_config = {
+    "dynamic": False,
+    "calculate_bwd": False,
+    "dtype": "bfloat16",
+    "b": [2, 8, 16],
+    "nh": ["16,16", "16,2"],
+    "s": [512, 1024, 4096],
+    "d": [64, 128],
+    "mods": ["noop", "causal", "alibi", "sliding_window"],
+    "backend": ["efficient"],
+    "max_autotune": False,
+    "decoding": False,
+    "kv_size": None,
+    "throughput": True,
+    "save_path": None,
+    "output_json_for_dashboard": None,
+    "benchmark_name": "PyTorch operator microbenchmark",
+}
+
+
+def load_config_file(config_path: str) -> dict:
+    """Load configuration from JSON or YAML file.
+
+    Automatically converts 'nh' field from strings to tuples.
+
+    Args:
+        config_path: Path to the configuration file
+
+    Returns:
+        Dictionary containing the configuration
+
+    Raises:
+        FileNotFoundError: If config file doesn't exist
+        ValueError: If config file format is invalid
+    """
+    with open(config_path) as f:
+        config_str = f.read()
+
+    # Try to load as JSON first
+    try:
+        config = json.loads(config_str)
+    except json.JSONDecodeError:
+        # Fall back to YAML parsing
+        config = _parse_simple_yaml(config_str)
+
+    # Apply automatic conversions for 'nh' field
+    if "nh" in config and isinstance(config["nh"], list):
+        config["nh"] = [
+            heads_input_type(h) if isinstance(h, str) else h for h in config["nh"]
+        ]
+
+    return config
+
+
+def _parse_simple_yaml(yaml_str: str) -> dict:
+    """Simple YAML parser for basic configs (without external dependencies).
+
+    Supports:
+    - key: value pairs
+    - booleans (true/false)
+    - null values
+    - integers and floats
+    - strings (quoted and unquoted)
+    - lists in JSON format [item1, item2, ...]
+    - comments (lines starting with # or after #)
+
+    Args:
+        yaml_str: YAML content as string
+
+    Returns:
+        Dictionary containing parsed YAML content
+    """
+    config = {}
+
+    for line in yaml_str.split("\n"):
+        # Remove comments
+        line = line.split("#")[0].strip()
+
+        if not line or ":" not in line:
+            continue
+
+        key, value = line.split(":", 1)
+        key = key.strip()
+        value = value.strip()
+
+        # Parse value based on type
+        if value.lower() == "true":
+            config[key] = True
+        elif value.lower() == "false":
+            config[key] = False
+        elif value.lower() in ("null", "none", ""):
+            config[key] = None
+        elif value.startswith("[") and value.endswith("]"):
+            # Parse list - handle quoted strings properly
+            pattern = r'"([^"]+)"|\'([^\']+)\'|([^,\[\]\s]+)'
+            matches = re.findall(pattern, value[1:-1])  # Remove [ ]
+            parsed_items = []
+            for match in matches:
+                # match is a tuple of (double_quoted, single_quoted, unquoted)
+                item = match[0] or match[1] or match[2]
+                item = item.strip()
+                if item:
+                    try:
+                        parsed_items.append(int(item))
+                    except ValueError:
+                        parsed_items.append(item)
+            config[key] = parsed_items
+        elif value.startswith(('"', "'")):
+            config[key] = value.strip("\"'")
+        else:
+            # Try to parse as number
+            try:
+                config[key] = int(value)
+            except ValueError:
+                try:
+                    config[key] = float(value)
+                except ValueError:
+                    config[key] = value
+
+    return config
+
+
+def print_default_config(output_format: str) -> None:
+    """Print a default configuration template in JSON or YAML format.
+
+    Args:
+        output_format: Either "json" or "yaml"
+    """
+    if output_format == "json":
+        print(json.dumps(default_config, indent=2))
+    else:  # yaml
+        for key, value in default_config.items():
+            if value is None:
+                print(f"{key}: null")
+            elif isinstance(value, bool):
+                print(f"{key}: {str(value).lower()}")
+            elif isinstance(value, str):
+                print(f'{key}: "{value}"')
+            elif isinstance(value, list):
+                print(f"{key}: {json.dumps(value)}")
+            else:
+                print(f"{key}: {value}")
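Note: the hand-rolled _parse_simple_yaml exists so benchmark configs work without a PyYAML dependency; it only handles flat key/value files with JSON-style lists. A sketch of its coercion rules (keys here are arbitrary; assumes benchmarks/transformer is on the import path):

    from config_utils import _parse_simple_yaml

    snippet = """
    dynamic: false        # true/false become Python bools
    kv_size: null         # null/none/empty become None
    s: [512, 1024, 4096]  # JSON-style lists; items int-coerced when possible
    dtype: "bfloat16"     # quoted strings keep their contents
    scale: 1e-2           # falls through int() and lands in float()
    """
    print(_parse_simple_yaml(snippet))
    # -> {'dynamic': False, 'kv_size': None, 's': [512, 1024, 4096],
    #     'dtype': 'bfloat16', 'scale': 0.01}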
benchmarks/transformer/configs/config_basic.yaml (new file, 29 lines)
@@ -0,0 +1,29 @@
+# Basic benchmark configuration for PyTorch transformer benchmarks
+# Usage: python score_mod.py --config config_basic.yaml
+
+# Core parameters
+dynamic: false
+calculate_bwd: true
+dtype: "bfloat16"
+
+# Shape parameters - larger sweep
+b: [1, 2, 4, 8, 16]  # batch sizes
+nh: ["16,16", "16,2", "32,32", "32,4"]  # [query_heads,key_value_heads]
+s: [512, 1024, 2048, 4096, 8192]  # sequence lengths
+d: [64, 128]  # head dimensions (limited to 128 for Flash Attention/cuDNN compatibility)
+
+# All attention types
+mods: ["noop", "causal", "rel", "head_bias", "alibi", "sliding_window", "prefix_lm", "softcap"]
+
+# Multiple backends for comparison (SDPA + Flash Attention) - flex is always included internally
+backend: ["efficient", "math", "cudnn", "fav2"]
+max_autotune: true  # Enable torch.compile with max-autotune for optimal performance
+
+# Decoding and cache settings
+decoding: false
+kv_size: null
+
+# Metrics and output
+throughput: true  # Calculate memory bandwidth & TFLOPS
+save_path: "comprehensive_results.csv"  # Save to CSV
+output_json_for_dashboard: "attn_bench_basic.json"
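Note: loading this file through load_config_file also normalizes the nh entries from "Hq,Hkv" strings into integer tuples via heads_input_type; dtype stays a string until main() maps it onto the torch namespace. A sketch (path assumed relative to a pytorch checkout):

    from config_utils import load_config_file

    cfg = load_config_file("benchmarks/transformer/configs/config_basic.yaml")
    print(cfg["nh"])            # [(16, 16), (16, 2), (32, 32), (32, 4)]
    print(cfg["dtype"])         # 'bfloat16' - mapped to torch.bfloat16 in main()
    print(cfg["max_autotune"])  # True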
@@ -1,15 +1,19 @@
 import argparse
 import csv
+import gc
 import itertools
+import json
 import random
+import sys
 from collections import defaultdict
 from collections.abc import Callable
 from contextlib import nullcontext
 from dataclasses import asdict, dataclass
-from functools import partial
+from functools import partial, wraps
-from typing import Optional, Union
+from typing import Literal, Optional, Union

 import numpy as np
+from config_utils import heads_input_type, load_config_file, print_default_config
 from tabulate import tabulate
 from tqdm import tqdm
@@ -33,6 +37,96 @@ torch._dynamo.config.recompile_limit = 1000
 from torch._inductor.runtime.benchmarking import benchmarker


+def cleanup_memory():
+    """Aggressively free GPU memory"""
+    torch.cuda.empty_cache()
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+
+
+def safe_backend(backend_name=None, return_dict=False):
+    """Decorator that wraps backend functions with error handling
+
+    Args:
+        backend_name: Name of the backend for error messages
+        return_dict: If True, returns dict of results for all backends (for run_single_experiment)
+                     If False, returns single ExperimentResults (for individual backend functions)
+    """
+
+    def decorator(func):
+        @wraps(func)
+        def wrapper(config, *args, **kwargs):
+            try:
+                return func(config, *args, **kwargs)
+            except torch.OutOfMemoryError:
+                print(
+                    f"[SKIP] OOM for {backend_name or func.__name__} with shape {config.shape}"
+                )
+                cleanup_memory()
+            except RuntimeError as e:
+                error_msg = str(e)
+                if "out of resource" in error_msg or "OutOfMemoryError" in error_msg:
+                    print(
+                        f"[SKIP] Triton OOM for {backend_name or func.__name__} with shape {config.shape}"
+                    )
+                    cleanup_memory()
+                elif "No valid triton configs" in error_msg:
+                    print(
+                        f"[SKIP] No valid Triton config for {backend_name or func.__name__} with shape {config.shape}"
+                    )
+                else:
+                    print(
+                        f"[SKIP] Runtime error for {backend_name or func.__name__} with shape {config.shape}: {str(e)[:100]}"
+                    )
+            except Exception as e:
+                print(
+                    f"[SKIP] Error for {backend_name or func.__name__} with shape {config.shape}: {str(e)[:100]}"
+                )
+
+            # Return appropriate NaN result based on function type
+            if return_dict:
+                # For run_single_experiment: return dict with NaN for all backends
+                nan_result = ExperimentResults(
+                    fwd_time=float("nan"),
+                    bwd_time=float("nan") if config.calculate_bwd_time else None,
+                )
+                results = dict.fromkeys(config.backends, nan_result)
+                results["flex"] = ExperimentResults(
+                    fwd_time=float("nan"),
+                    bwd_time=float("nan") if config.calculate_bwd_time else None,
+                    sparsity=None,
+                )
+                return results
+            else:
+                # For individual backend functions: return single ExperimentResults
+                return ExperimentResults(
+                    fwd_time=float("nan"),
+                    bwd_time=float("nan") if config.calculate_bwd_time else None,
+                )
+
+        return wrapper
+
+    return decorator
+
+
+# Type definitions
+Backend = Literal["math", "efficient", "cudnn", "fav2", "fav3", "fakv", "og-eager"]
+AttentionType = Literal[
+    "noop",
+    "causal",
+    "rel",
+    "head_bias",
+    "alibi",
+    "sliding_window",
+    "document_mask",
+    "prefix_lm",
+    "softcap",
+]
+DtypeString = Literal["bfloat16", "float16", "float32"]
+SpeedupType = Literal["fwd", "bwd"]
+
+
 def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> float:
     # warmup
     for _ in range(5):
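Note: safe_backend turns any failure (CUDA OOM, Triton resource exhaustion, missing configs) into NaN-filled ExperimentResults, so one bad shape/backend combination no longer aborts an entire sweep; downstream consumers skip NaN rows. The same pattern stripped to its essentials (names here are illustrative):

    import math
    from functools import wraps

    def skip_on_failure(default=math.nan):
        """Turn exceptions into a sentinel so a parameter sweep keeps going."""
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    print(f"[SKIP] {func.__name__}: {e}")
                    return default
            return wrapper
        return decorator

    @skip_on_failure()
    def flaky_measurement(x: float) -> float:
        if x < 0:
            raise RuntimeError("out of resource")
        return 2.0 * x

    print(flaky_measurement(3.0))   # 6.0
    print(flaky_measurement(-1.0))  # prints [SKIP] ..., returns nan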
@@ -48,6 +142,7 @@ class ExperimentConfig:
     calculate_bwd_time: bool
     cal_bandwidth: bool
     backends: list[str]
+    max_autotune: bool

     def __post_init__(self):
         assert len(self.shape) == 6, (

@@ -62,6 +157,7 @@ class ExperimentConfig:
         d.pop("cal_bandwidth", None)
         d["shape(B,Hq,M,Hkv,N,D)"] = d.pop("shape")
         d.pop("backends", None)
+        d.pop("max_autotune", False)
         return d
@@ -209,6 +305,7 @@ def query_key_value_clones(
     return query_ref, key_ref, value_ref


+@safe_backend("SDPA")
 def run_single_backend_sdpa(
     config: ExperimentConfig,
     query: torch.Tensor,

@@ -223,6 +320,7 @@ def run_single_backend_sdpa(
     backend_context = get_backend_context(backend)
     with backend_context:
         _device = torch.device("cuda")
+
         eager_sdpa = generate_eager_sdpa(
             config.attn_type, config.shape, config.dtype, block_mask, score_mod
         )
@@ -290,6 +388,7 @@ def run_single_backend_sdpa(
     )


+@safe_backend("FlashAttention")
 def run_single_backend_FA(
     config: ExperimentConfig,
     query: torch.Tensor,
@@ -301,9 +400,9 @@ def run_single_backend_FA(
     mask_kwargs,
     backend: str,
 ) -> ExperimentResults:
-    assert backend in ["fav2", "fav3", "fakv"]
+    assert backend in ["fav3", "fakv"]
     # Generate callable for specific backend.
-    if backend in ["fav2", "fav3"]:
+    if backend in ["fav3"]:
         FA = generate_FA_callable(
             config.attn_type, config.shape, config.dtype, backend, **mask_kwargs
         )
@@ -354,10 +453,10 @@ def run_single_backend_FA(
     )


+@safe_backend("flex_attention", return_dict=True)
 def run_single_experiment(
     config: ExperimentConfig,
     dynamic=False,
-    max_autotune=False,
 ) -> dict[str, ExperimentResults]:
     device = torch.device("cuda")
     batch_size, q_heads, q_seq_len, kv_heads, kv_seq_len, head_dim = config.shape
@@ -377,7 +476,7 @@ def run_single_experiment(
     block_mask, mask_kwargs = generate_block_mask(config.attn_type, config.shape)
     kernel_options = get_kernel_options(config.attn_type, config.shape)

-    if max_autotune:
+    if config.max_autotune:
         compiled_sdpa = torch.compile(
             flex_attention, dynamic=dynamic, mode="max-autotune-no-cudagraphs"
         )
@@ -407,7 +506,7 @@ def run_single_experiment(

     results = {}
     for backend in config.backends:
-        if backend in ["fav2", "fav3", "fakv"]:
+        if backend in ["fav3", "fakv"]:
             results[backend] = run_single_backend_FA(
                 config,
                 query,
@@ -419,7 +518,7 @@ def run_single_experiment(
                 mask_kwargs,
                 backend,
             )
-        else:  # sdpa
+        else:  # sdpa (also supports fav2)
             results[backend] = run_single_backend_sdpa(
                 config,
                 query,
@@ -440,7 +539,7 @@ def run_single_experiment(
     sparsity = block_mask.sparsity() / 100.0 if block_mask is not None else 0.0
     sparsity = sparsity if config.attn_type != "document_mask" else 0.5

-    results["compiled"] = ExperimentResults(
+    results["flex"] = ExperimentResults(
         fwd_time=forward_compiled_time,
         bwd_time=backward_compile_time if config.calculate_bwd_time else None,
         sparsity=sparsity,
@@ -501,15 +600,15 @@ def calculate_tflops(config: ExperimentConfig, results: ExperimentResults) -> float:
     softmax_flops = M * N * 2  # Not counting online softmax overhead
     o_flops = M * D * N * 2
     # Not counting split k overhead
-    total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - results.sparsity)
+    sparsity = results.sparsity if results.sparsity is not None else 0.0
+    total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - sparsity)
     return total_flops / results.fwd_time / 1e6  # in TFLOPs/


 def get_average_speedups(results: list[Experiment], type: str, backend: str):
     # Calculate speedups
     speedups = [
-        calculate_speedup(r.results["compiled"], r.results[backend], type)
-        for r in results
+        calculate_speedup(r.results["flex"], r.results[backend], type) for r in results
     ]

     # Find indices of max and min speedups
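Note: plugging concrete numbers into the formula above is a useful sanity check. qk_flops = M * N * D * 2 (two flops per multiply-accumulate) is assumed here from the standard attention FLOP count; that line sits above this hunk and is not shown in it, and all shape numbers below are made up:

    B, Hq, M, N, D = 4, 16, 4096, 4096, 128
    qk_flops = M * N * D * 2        # assumed, see lead-in
    softmax_flops = M * N * 2
    o_flops = M * D * N * 2
    sparsity = 0.5                  # e.g. a causal mask skips about half the work

    total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - sparsity)
    fwd_time_us = 2000.0            # forward latency in microseconds
    tflops = total_flops / fwd_time_us / 1e6  # us->s is 1e6, flops->Tflops is 1e12
    print(f"{tflops:.0f} TFLOPS/s")  # ~138 with these numbers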
@@ -537,7 +636,7 @@ def get_average_speedups(results: list[Experiment], type: str, backend: str):
 def print_results(results: list[Experiment], save_path: Optional[str] = None):
     table_data = defaultdict(list)
     for experiment in results:
-        backends = experiment.config.backends + ["compiled"]
+        backends = experiment.config.backends + ["flex"]
         for key, value in experiment.asdict().items():
             if key in backends:
                 if value.fwd_time:

@@ -550,45 +649,43 @@ def print_results(results: list[Experiment], save_path: Optional[str] = None):
     # Calculate speedups
     for backend in results[0].config.backends:
         fwd_speedups = [
-            calculate_speedup(r.results["compiled"], r.results[backend], type="fwd")
+            calculate_speedup(r.results["flex"], r.results[backend], type="fwd")
             for r in results
         ]
-        table_data[f"fwd_{backend}_speedup"] = fwd_speedups
+        table_data[f"fwd_speedup_flex_over_{backend}"] = fwd_speedups

     if results[0].config.calculate_bwd_time:
         for backend in results[0].config.backends:
             bwd_speedups = [
-                calculate_speedup(r.results["compiled"], r.results[backend], type="bwd")
+                calculate_speedup(r.results["flex"], r.results[backend], type="bwd")
                 for r in results
             ]
-            table_data[f"bwd_{backend}_speedup"] = bwd_speedups
+            table_data[f"bwd_speedup_flex_over_{backend}"] = bwd_speedups

     # Calculate mem + computational throughput
     if results[0].config.cal_bandwidth:
         fwd_bandwidth = [
-            calculate_bandwidth(r.config, r.results["compiled"], type="fwd")
+            calculate_bandwidth(r.config, r.results["flex"], type="fwd")
             for r in results
         ]
         table_data["fwd_mem_bw (TB/s)"] = fwd_bandwidth
-        fwd_tflops = [
-            calculate_tflops(r.config, r.results["compiled"]) for r in results
-        ]
+        fwd_tflops = [calculate_tflops(r.config, r.results["flex"]) for r in results]
         table_data["TFlops/s"] = fwd_tflops

     print(tabulate(table_data, headers="keys", tablefmt="github", floatfmt=".3f"))

     for backend in results[0].config.backends:
-        if np.isnan(table_data[f"fwd_{backend}_speedup"]).all():
+        if np.isnan(table_data[f"fwd_speedup_flex_over_{backend}"]).all():
             continue
         print("\n")
-        print(f"FWD Speedups vs. {backend}".center(125, "="))
+        print(f"FWD Speedup of Flex over {backend}".center(125, "="))
         print("\n")
         average_data = get_average_speedups(results, type="fwd", backend=backend)
         print(tabulate(average_data, headers="keys", tablefmt="github", floatfmt=".3f"))

         if results[0].config.calculate_bwd_time:
             print("\n")
-            print(f"BWD Speedups vs. {backend}".center(125, "="))
+            print(f"BWD Speedup of Flex over {backend}".center(125, "="))
             print("\n")
             average_data = get_average_speedups(results, type="bwd", backend=backend)
             print(
@@ -791,14 +888,14 @@ def get_backend_context(backend: str):
     Returns a context manager for the specified backend.
     Args:
         backend (str): The name of the backend to use.
-        Valid options are 'fav2', 'cudnn', 'math', 'efficient', 'fav3', 'fakv', 'og-eager'.
+        Valid options are 'math', 'efficient', 'cudnn', 'fav2', 'fav3', 'fakv', 'og-eager'.
     Returns:
         A context manager for the specified backend.
     Raises:
         ValueError: If an invalid backend is specified.
     """
     backends = {
-        "fav2": nullcontext(),
+        "fav2": sdpa_kernel(SDPBackend.FLASH_ATTENTION),
         "cudnn": sdpa_kernel(SDPBackend.CUDNN_ATTENTION),
         "math": sdpa_kernel(SDPBackend.MATH),
         "efficient": sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION),
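Note: with this change, fav2 runs through PyTorch's built-in FlashAttention kernel via the SDPA dispatcher rather than a nullcontext (and the next hunk drops the external flash_attn import path for it accordingly). Minimal standalone usage of the same context manager, with arbitrary shapes:

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    q, k, v = (
        torch.randn(2, 16, 1024, 64, device="cuda", dtype=torch.bfloat16)
        for _ in range(3)
    )

    # Pin scaled_dot_product_attention to the flash kernel, as the "fav2"
    # entry above now does; raises if flash cannot handle the given inputs.
    with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True)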
@@ -820,15 +917,7 @@ def generate_FA_callable(
 ) -> Callable | None:
     if dtype not in [torch.float16, torch.bfloat16]:
         return None
-    if backend == "fav2":
-        try:
-            from flash_attn import flash_attn_func, flash_attn_varlen_func
-        except ImportError:
-            print(
-                "Flash attention 2 is not installed. Please install it to run fav2 backend. "
-            )
-            raise
-    elif backend == "fav3":
+    if backend == "fav3":
         try:
             from flash_attn.flash_attn_interface import (
                 flash_attn_func,
@@ -1034,6 +1123,7 @@ def generate_experiment_configs(
     kv_cache_size: list[int],
     cal_bandwidth: bool,
     backends: list[str],
+    max_autotune: bool,
 ) -> list[ExperimentConfig]:
     assert not (calculate_bwd and decoding), "Decoding does not support backward"

@@ -1077,52 +1167,333 @@ def generate_experiment_configs(
                     calculate_bwd_time=calculate_bwd,
                     cal_bandwidth=cal_bandwidth,
                     backends=backends,
+                    max_autotune=max_autotune,
                 )
             )

     return all_configs


-def main(args):
+def _output_json_for_dashboard(
+    experiments,
+    output_file,
+    benchmark_name="PyTorch operator microbenchmark",
+):
+    """
+    Write the result into JSON format for PyTorch OSS dashboard.
+    The JSON format is defined at
+    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+
+    Args:
+        experiments: List of experiment results
+        output_file: Path to output JSON file
+        benchmark_name: Name of the benchmark
+    """
+    if not experiments:
+        return
+
+    import math
+    import platform
+    from dataclasses import asdict, dataclass
+    from typing import Any, Optional
+
+    # Prepare headers and records for JSON output
+    records = []
+    for experiment in experiments:
+        config = experiment.config
+        results_dict = (
+            experiment.results
+        )  # This is a dict: backend -> ExperimentResults
+
+        # Process each backend result
+        for backend, results in results_dict.items():
+            # Skip backends that were not run (NaN results)
+            if math.isnan(results.fwd_time):
+                continue
+
+            # Extract data from experiment
+            test_name = f"{backend}_{config.attn_type}_"
+            input_config = f"shape: {config.shape}, dtype: {config.dtype}"
+
+            # Determine mode based on backward pass
+            mode = "training" if config.calculate_bwd_time else "inference"
+
+            # Extract dtype
+            dtype = (
+                str(config.dtype).split(".")[1]
+                if "." in str(config.dtype)
+                else str(config.dtype)
+            )
+
+            # Determine device
+            device = "cuda"
+
+            # Get device architecture
+            device_arch = (
+                torch.cuda.get_device_name(0)
+                if device == "cuda"
+                else platform.processor()
+                if device == "cpu"
+                else "unknown"
+            )
+
+            # Create dataclasses for JSON structure
+            @dataclass
+            class BenchmarkInfo:
+                name: str
+                mode: Optional[str]
+                dtype: str
+                extra_info: dict[str, Any]
+
+            @dataclass
+            class ModelInfo:
+                name: str
+                type: str
+                origins: list[str]
+                extra_info: dict[str, Any]
+
+            @dataclass
+            class MetricInfo:
+                name: str
+                unit: str
+                benchmark_values: list[float]
+                target_value: Optional[float]
+
+            @dataclass
+            class BenchmarkRecord:
+                benchmark: BenchmarkInfo
+                model: ModelInfo
+                metric: MetricInfo
+
+            # Benchmark extra info
+            benchmark_extra_info = {
+                "input_config": input_config,
+                "device": device,
+                "arch": device_arch,
+                "operator_name": backend,
+                "attn_type": config.attn_type,
+                "shape": str(config.shape),
+                "max_autotune": config.max_autotune,
+            }
+            # Add record for forward latency
+            record_fwd_latency = BenchmarkRecord(
+                benchmark=BenchmarkInfo(
+                    name=benchmark_name,
+                    mode=mode,
+                    dtype=dtype,
+                    extra_info=benchmark_extra_info,
+                ),
+                model=ModelInfo(
+                    name=test_name + str(config.shape),
+                    type="attention-benchmark",
+                    origins=["pytorch"],
+                    extra_info={
+                        "operator_name": backend,
+                        "attn_type": config.attn_type,
+                    },
+                ),
+                metric=MetricInfo(
+                    name="forward latency",
+                    unit="us",
+                    benchmark_values=[results.fwd_time],
+                    target_value=None,
+                ),
+            )
+            records.append(asdict(record_fwd_latency))
+
+            # Add record for forward memory bandwidth (if available)
+            if config.cal_bandwidth:
+                record_fwd_bandwidth = BenchmarkRecord(
+                    benchmark=BenchmarkInfo(
+                        name=benchmark_name,
+                        mode=mode,
+                        dtype=dtype,
+                        extra_info=benchmark_extra_info,
+                    ),
+                    model=ModelInfo(
+                        name=test_name + str(config.shape),
+                        type="attention-benchmark",
+                        origins=["pytorch"],
+                        extra_info={
+                            "operator_name": backend,
+                        },
+                    ),
+                    metric=MetricInfo(
+                        name="memory bandwidth",
+                        unit="TB/s",
+                        benchmark_values=[calculate_bandwidth(config, results, "fwd")],
+                        target_value=None,
+                    ),
+                )
+                records.append(asdict(record_fwd_bandwidth))
+
+            # Add record for forward TFLOPS (if available)
+            if config.cal_bandwidth:
+                record_fwd_tflops = BenchmarkRecord(
+                    benchmark=BenchmarkInfo(
+                        name=benchmark_name,
+                        mode=mode,
+                        dtype=dtype,
+                        extra_info=benchmark_extra_info,
+                    ),
+                    model=ModelInfo(
+                        name=test_name + str(config.shape),
+                        type="attention-benchmark",
+                        origins=["pytorch"],
+                        extra_info={
+                            "operator_name": backend,
+                        },
+                    ),
+                    metric=MetricInfo(
+                        name="tflops",
+                        unit="TFLOPS/s",
+                        benchmark_values=[calculate_tflops(config, results)],
+                        target_value=None,
+                    ),
+                )
+                records.append(asdict(record_fwd_tflops))
+
+            # Add record for backward latency (if available and not NaN)
+            if (
+                config.calculate_bwd_time
+                and results.bwd_time is not None
+                and not math.isnan(results.bwd_time)
+            ):
+                record_bwd_latency = BenchmarkRecord(
+                    benchmark=BenchmarkInfo(
+                        name=benchmark_name,
+                        mode=mode,
+                        dtype=dtype,
+                        extra_info=benchmark_extra_info,
+                    ),
+                    model=ModelInfo(
+                        name=test_name + str(config.shape),
+                        type="attention-benchmark",
+                        origins=["pytorch"],
+                        extra_info={
+                            "operator_name": backend,
+                        },
+                    ),
+                    metric=MetricInfo(
+                        name="backward latency",
+                        unit="us",
+                        benchmark_values=[results.bwd_time],
+                        target_value=None,
+                    ),
+                )
+                records.append(asdict(record_bwd_latency))
+
+    # Write all records to the output file
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(records, f, indent=2)
+
+
+def main(
+    dynamic: bool = False,
+    calculate_bwd: bool = False,
+    dtype: DtypeString = "bfloat16",
+    b: list[int] | None = None,
+    nh: list[str] | None = None,
+    s: list[int] | None = None,
+    d: list[int] | None = None,
+    mods: list[AttentionType] | None = None,
+    backend: list[Backend] | None = None,
+    max_autotune: bool = False,
+    decoding: bool = False,
+    kv_size: Optional[list[int]] = None,
+    throughput: bool = True,
+    save_path: Optional[str] = None,
+    output_json_for_dashboard: Optional[str] = None,
+    benchmark_name: str = "PyTorch operator microbenchmark",
+) -> None:
+    """Run sweep over sizes and score mods for flex attention.
+
+    Usage Examples:
+        # Use a yml config file
+        python score_mod.py --config basic_config.yaml
+
+        # Use a json config file
+        python score_mod.py --config my_config.json
+
+        # Generate a config template
+        python score_mod.py --print-config json > my_config.json  # For a json config
+        python score_mod.py --print-config yaml > my_config.yaml  # For a yaml config
+
+        # Override config with CLI args
+        python score_mod.py --config my_config.json -dtype float16 --max-autotune
+
+        # Pure CLI usage
+        python score_mod.py -b 4 8 -s 1024 2048 -mods causal alibi --backend efficient
+
+    Args:
+        dynamic: Runs a dynamic shapes version of compiled flex attention
+        calculate_bwd: Calculate backward pass times
+        dtype: Data type for tensors (bfloat16, float16, float32)
+        b: Batch sizes to benchmark
+        nh: Number of query and key/value heads in format "Hq,Hkv"
+        s: Sequence lengths to benchmark
+        d: Head dimensions to benchmark
+        mods: Score modifications: noop, causal, rel, head_bias, alibi, sliding_window, document_mask, prefix_lm, softcap
+        backend: Backends for attention computation: math, efficient, cudnn, fav2, fav3, fakv, og-eager
+        max_autotune: Turn on max-autotune optimization
+        decoding: Benchmark decoding mode (query sequence length = 1)
+        kv_size: Key/value cache size in MiB (ignores batch size if specified)
+        throughput: Calculate kernel memory bandwidth & computational throughput (always True)
+        save_path: Path to save the results CSV file
+        output_json_for_dashboard: Path to save results in JSON format for PyTorch OSS dashboard
+        benchmark_name: Name of the benchmark for dashboard output
+    """
+    # Convert dtype string to torch dtype (if not already converted)
+    import torch
+
+    if isinstance(dtype, str):
+        dtype = getattr(torch, dtype)
+
+    # Always calculate throughput
+    throughput = True
+    print("Backend: ", backend)
+
     seed = 123
     np.random.seed(seed)
     torch.manual_seed(seed)
     results = []
-    for config in tqdm(
-        generate_experiment_configs(
-            args.calculate_bwd,
-            args.dtype,
-            args.b,
-            args.nh,
-            args.s,
-            args.d,
-            args.mods,
-            args.decoding,
-            args.kv_size,
-            args.throughput,
-            args.backend,
-        )
+    for experiment_count, config in enumerate(
+        tqdm(
+            generate_experiment_configs(
+                calculate_bwd,
+                dtype,
+                b,
+                nh,
+                s,
+                d,
+                mods,
+                decoding,
+                kv_size,
+                throughput,
+                backend,
+                max_autotune,
+            )
+        ),
+        start=1,
     ):
         results.append(
             Experiment(
                 config,
                 run_single_experiment(
                     config,
-                    dynamic=args.dynamic,
-                    max_autotune=args.max_autotune,
+                    dynamic=dynamic,
                 ),
             )
         )

-    print_results(results, args.save_path)
-
-
-def heads_input_type(s):
-    try:
-        hq, hkv = map(int, s.split(","))
-        return hq, hkv
-    except Exception as e:
-        raise argparse.ArgumentTypeError("Heads must be Hq,Hkv") from e
+        # Periodic memory cleanup every 50 experiments
+        if experiment_count % 50 == 0:
+            cleanup_memory()
+
+    print_results(results, save_path)
+
+    # Output JSON for dashboard if requested
+    if output_json_for_dashboard:
+        _output_json_for_dashboard(results, output_json_for_dashboard, benchmark_name)


 if __name__ == "__main__":
@@ -1130,6 +1501,12 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Run sweep over sizes and score mods for flex attention"
     )
+    parser.add_argument(
+        "--config",
+        type=str,
+        help="Path to JSON config file. CLI args override config file values.",
+        default=None,
+    )
     parser.add_argument(
         "--dynamic",
         action="store_true",

@@ -1199,8 +1576,49 @@ Ignores -b batch size and calculate batch size from kv size instead when specifi
         default=["efficient"],
         help="Backend to use for attention computation",
     )
+    parser.add_argument(
+        "--output-json-for-dashboard",
+        type=str,
+        help="Path to save results in JSON format for PyTorch OSS dashboard",
+        default=None,
+    )
+    parser.add_argument(
+        "--benchmark-name",
+        type=str,
+        help="Name of the benchmark for dashboard output",
+        default="PyTorch operator microbenchmark",
+    )
+    parser.add_argument(
+        "--print-config",
+        type=str,
+        choices=["json", "yaml"],
+        help="Print a default config template in JSON or YAML format and exit",
+        default=None,
+    )
     # Parse arguments
     args = parser.parse_args()

+    # Handle --print-config
+    if args.print_config:
+        print_default_config(args.print_config)
+        sys.exit(0)
+
+    # Load and merge config if provided
+    if args.config:
+        config = load_config_file(args.config)
+
+        # Merge config with CLI args (CLI args take precedence)
+        json_args = argparse.Namespace()
+        json_args.__dict__ = config
+        args = parser.parse_args(namespace=json_args)
+
-    args.dtype = getattr(torch, args.dtype)
+    # Convert dtype string to torch dtype (only if it's still a string)
+    if isinstance(args.dtype, str):
+        args.dtype = getattr(torch, args.dtype)

-    main(args)
+    # Remove config and print_config from args before passing to main
+    args_dict = vars(args)
+    args_dict.pop("config", None)
+    args_dict.pop("print_config", None)
+
+    main(**args_dict)
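Note: the config/CLI merge above relies on a documented argparse behavior: parse_args only applies a declared default when the destination attribute is missing from the namespace it receives, so values pre-seeded from the config file act like defaults while flags given on the command line still win. A self-contained illustration:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--dtype", default="bfloat16")
    parser.add_argument("--max-autotune", action="store_true")

    file_config = {"dtype": "float16", "max_autotune": True}  # pretend: from YAML

    ns = argparse.Namespace()
    ns.__dict__ = dict(file_config)
    args = parser.parse_args(["--dtype", "float32"], namespace=ns)

    print(args.dtype)         # float32 - explicit CLI flag beats the file value
    print(args.max_autotune)  # True    - file value survives; flag was omitted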
@@ -482,6 +482,7 @@ inductor_core_resources = [
     "torch/csrc/inductor/aoti_torch/oss_proxy_executor.cpp",
     "torch/csrc/inductor/inductor_ops.cpp",
     "torch/csrc/jit/serialization/pickle.cpp",
+    "torch/csrc/shim_common.cpp",
 ]

 libtorch_core_sources = sorted(
@@ -556,3 +556,26 @@ inline SymBool sym_ge(const SymInt& a, const SymInt& b) {
 }

 } // namespace c10
+
+#include <limits>
+
+namespace std {
+
+template <>
+class numeric_limits<c10::SymInt> {
+ public:
+  static constexpr bool is_specialized = true;
+
+  static constexpr int64_t max() noexcept {
+    return std::numeric_limits<int64_t>::max();
+  }
+
+  static constexpr int64_t min() noexcept {
+    return std::numeric_limits<int64_t>::min();
+  }
+
+  static constexpr bool is_signed = true;
+  static constexpr bool is_integer = true;
+};
+
+} // namespace std
@@ -15,7 +15,6 @@ namespace c10::cuda {
 namespace {

 // Global stream state and constants
-c10::once_flag init_flag;
 DeviceIndex num_gpus = -1;
 constexpr int kStreamsPerPoolBits = 5;
 constexpr int kStreamsPerPool = 1 << kStreamsPerPoolBits;

@@ -226,7 +225,10 @@ void initDeviceStreamState(DeviceIndex device_index) {
 // Init front-end to ensure initialization only occurs once
 void initCUDAStreamsOnce() {
   // Inits default streams (once, globally)
-  c10::call_once(init_flag, initGlobalStreamState);
+  auto static init_flag [[maybe_unused]] = [] {
+    initGlobalStreamState();
+    return true;
+  }();

   if (current_streams) {
     return;
Some files were not shown because too many files have changed in this diff.