Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-06 00:54:56 +08:00)

Compare commits: v2.7.1-rc2...mlazos/hc (216 commits)
| SHA1 |
|---|
| 268de64005 |
| 61a64c20c4 |
| ea02aac2ca |
| 5db3a4ac88 |
| 45fec7843d |
| 0d804dec0f |
| 3b48c72141 |
| 010963032c |
| d80a70b58a |
| cd5c13d8f0 |
| f06e366532 |
| b8c0c50bbe |
| dfdf58f8cb |
| e0e8639a10 |
| dbea13ed45 |
| c0566e0dbf |
| 585fd972b8 |
| ee3a2c6ee2 |
| 5b8cc4709a |
| a8df5e5af9 |
| d5b1d99f78 |
| 381d0cb239 |
| c41196a4d0 |
| fdacf3c920 |
| 405025778d |
| 5ba437fb45 |
| 213eea216a |
| ec9e11145e |
| 6e2b2660b9 |
| bc88f6faa1 |
| b06b5c3e27 |
| 6055a4f612 |
| 9b92828d4b |
| 9cd52da45c |
| 6c2db8fab0 |
| 2e0c98ff05 |
| dea7157160 |
| b8f91bcb14 |
| 7869196482 |
| bca75fe97a |
| c43e35d6f7 |
| bb42e4d137 |
| cccdf860e2 |
| 988827cdfb |
| ebabd0efdd |
| c36ac16da1 |
| 1096443467 |
| 60523540f1 |
| c1dd75e4dc |
| 425c6d8eba |
| f9a787224c |
| 186cc7327c |
| a0ac63cbd9 |
| 811f587d86 |
| 9a78513c3c |
| b52a8bef01 |
| 46226a90c8 |
| 523bffd388 |
| 37c914ca0c |
| 78715a181f |
| 1157367c78 |
| 24cfeec2c7 |
| afa1eda901 |
| a16ada41b9 |
| d25617255c |
| a3c6e3139a |
| e4f6e4ac84 |
| 8bc7bd94a5 |
| e8dd58b8cf |
| 5e9f792479 |
| 6c7d8419e3 |
| 769f19bf95 |
| 8d7c430e84 |
| 08a644a4c4 |
| c41c2130be |
| 8cdb9adc05 |
| 224cd9f055 |
| aaa4c3d60b |
| 2a011ca904 |
| 9d37b501db |
| c7c3e77324 |
| 790f93db3a |
| b2862f1435 |
| 1cc5f6b623 |
| 916e8979d3 |
| 6048d88afe |
| bfee141666 |
| 6b1b95ad2a |
| 5905bbe745 |
| 9f33c6f0a0 |
| f80bee4934 |
| 21c2edfec8 |
| 3e2c4086ad |
| acf42b0048 |
| a9c55277d7 |
| c83c711da8 |
| e7e477c1f9 |
| 4482a65fef |
| 115fc98cc0 |
| 740ce0fa5f |
| 578160c875 |
| f4368d8872 |
| 96795e9533 |
| 1c7196f04b |
| 9ad6265d04 |
| 7537b19c73 |
| 09f7f62cfe |
| 08af311fc2 |
| eb7bf4202d |
| ff58ccec6c |
| f9b4856989 |
| 643aaea133 |
| 05f2cbfe19 |
| d7d9a71e19 |
| dd6e9df3d0 |
| 0bd863a62f |
| a0893475ba |
| 1bdbf12672 |
| 69aeb87eca |
| 5e79b61e8a |
| fe01af2242 |
| c96ed7e6f5 |
| 9d7945e382 |
| a7f8de2198 |
| 5a843f8973 |
| 97272e4b49 |
| 2e02c07a5d |
| f2221b2fce |
| f067eafabb |
| 42e468d9b0 |
| a9aae05a6b |
| f2ea77c099 |
| 71795f159e |
| 706c22549c |
| 68bbe20db7 |
| c95a6b416b |
| 05ac99042f |
| be4e6c1c8e |
| e162758051 |
| d4496346b9 |
| db6d72213b |
| e6839819c8 |
| 9e6b2ca58d |
| bea181ff7e |
| e567900998 |
| aed0b7a742 |
| b4745db904 |
| c179971bfc |
| 56b2e4b8f0 |
| e66ad221e9 |
| e8d36019d4 |
| 15cd6921a5 |
| 49570cb402 |
| 4cae8f48cc |
| 9a3d26cfcd |
| 4098a229a0 |
| e5fccb2bab |
| 38e81a5332 |
| 4f8391db55 |
| 0dcd482e54 |
| 5e1b715dda |
| 970fefcc53 |
| c73c72b1e1 |
| 77ea66695a |
| 7c87ec1b50 |
| b263b272fa |
| e6f560a262 |
| e84cc4c052 |
| 6856d81c60 |
| b9803a5c81 |
| 3e605fe46d |
| 65d19a5699 |
| f59064f2b7 |
| bdf57fb8f7 |
| a8b1767ae5 |
| df60500ab8 |
| 96a6a71ac7 |
| d90f9e9a34 |
| f4bffb7461 |
| 75c8b7d972 |
| ec93aa7f84 |
| 2a7d583452 |
| c208f21791 |
| 037d7af778 |
| 7cdbb913e7 |
| 3646d4dbc8 |
| 420a9be743 |
| f2d43d866c |
| 4a12777ffe |
| 1e37e5b836 |
| e51615cb73 |
| b1980b2405 |
| 38c5cf99b3 |
| 3f1769f785 |
| 0c8ec26d3b |
| ab45aaca97 |
| e3ebf61589 |
| b5191b9312 |
| b90698f5ba |
| 215f856142 |
| 66300d3d55 |
| 86bc154d61 |
| fb0e9cb0a0 |
| 29fd875bc1 |
| 01e9036bd2 |
| 923ce10f6c |
| 28b78800b9 |
| b040dc3a53 |
| 626a5e22eb |
| 9a0f65d3d3 |
| 488c4480f9 |
| 5ada4e6a53 |
| 8fa81a6066 |
| f349304c08 |
| 81aee3c9c4 |
| d547a56668 |
```diff
@@ -105,7 +105,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=11
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
@@ -119,7 +118,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=9
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
@@ -134,7 +132,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.12
     GCC_VERSION=9
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
@@ -149,7 +146,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.13
     GCC_VERSION=9
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
@@ -164,7 +160,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=9
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
@@ -178,7 +173,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=9
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
@@ -193,7 +187,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.12
     GCC_VERSION=9
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
@@ -208,7 +201,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.13
     GCC_VERSION=9
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
@@ -223,7 +215,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=9
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
@@ -235,7 +226,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.9
     CLANG_VERSION=10
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     CONDA_CMAKE=yes
     ONNX=yes
@@ -244,7 +234,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.9
     CLANG_VERSION=10
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     VULKAN_SDK_VERSION=1.2.162.1
     SWIFTSHADER=yes
@@ -255,7 +244,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.11
     CLANG_VERSION=10
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     VULKAN_SDK_VERSION=1.2.162.1
     SWIFTSHADER=yes
@@ -266,7 +254,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.9
     GCC_VERSION=9
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     CONDA_CMAKE=yes
     TRITON=yes
@@ -275,7 +262,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=11
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     ROCM_VERSION=6.2.4
     NINJA_VERSION=1.9.0
@@ -290,7 +276,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=11
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     ROCM_VERSION=6.3
     NINJA_VERSION=1.9.0
@@ -305,7 +290,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.9
     GCC_VERSION=11
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     XPU_VERSION=0.5
     NINJA_VERSION=1.9.0
@@ -316,7 +300,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.9
     GCC_VERSION=11
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     XPU_VERSION=2025.0
     NINJA_VERSION=1.9.0
@@ -327,7 +310,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.9
     GCC_VERSION=11
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     KATEX=yes
     CONDA_CMAKE=yes
@@ -341,7 +323,6 @@ case "$image" in
     CUDNN_VERSION=9
     CLANG_VERSION=12
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     TRITON=yes
     ;;
@@ -349,7 +330,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.9
     CLANG_VERSION=12
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     CONDA_CMAKE=yes
     TRITON=yes
@@ -370,7 +350,6 @@ case "$image" in
     ANACONDA_PYTHON_VERSION=3.9
     GCC_VERSION=11
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     KATEX=yes
     CONDA_CMAKE=yes
@@ -416,7 +395,6 @@ case "$image" in
     GCC_VERSION=11
     ACL=yes
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     CONDA_CMAKE=yes
     # snadampal: skipping llvm src build install because the current version
@@ -428,7 +406,6 @@ case "$image" in
     GCC_VERSION=11
     ACL=yes
     PROTOBUF=yes
-    DB=yes
     VISION=yes
     CONDA_CMAKE=yes
     # snadampal: skipping llvm src build install because the current version
@@ -439,7 +416,6 @@ case "$image" in
   *)
     # Catch-all for builds that are not hardcoded.
    PROTOBUF=yes
-    DB=yes
    VISION=yes
    echo "image '$image' did not match an existing build configuration"
    if [[ "$image" == *py* ]]; then
@@ -495,7 +471,6 @@ docker build \
   --build-arg "BUILD_ENVIRONMENT=${image}" \
   --build-arg "PROTOBUF=${PROTOBUF:-}" \
   --build-arg "LLVMDEV=${LLVMDEV:-}" \
-  --build-arg "DB=${DB:-}" \
   --build-arg "VISION=${VISION:-}" \
   --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \
   --build-arg "CENTOS_VERSION=${CENTOS_VERSION}" \
```
```diff
@@ -55,13 +55,6 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
 RUN rm install_protobuf.sh
 ENV INSTALLED_PROTOBUF ${PROTOBUF}
 
-# (optional) Install database packages like LMDB and LevelDB
-ARG DB
-COPY ./common/install_db.sh install_db.sh
-RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
-RUN rm install_db.sh
-ENV INSTALLED_DB ${DB}
-
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
```
```diff
@@ -1 +1 @@
-v2.25.1-1
+v2.26.2-1
```
```diff
@@ -240,7 +240,7 @@ function prune_126 {
 }
 
 function install_128 {
-  CUDNN_VERSION=9.7.1.26
+  CUDNN_VERSION=9.8.0.87
   echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
   rm -rf /usr/local/cuda-12.8 /usr/local/cuda
   # install CUDA 12.8.0 in the same container
@@ -161,7 +161,7 @@ function prune_126 {
 }
 
 function install_128 {
-  CUDNN_VERSION=9.7.1.26
+  CUDNN_VERSION=9.8.0.87
   echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
   rm -rf /usr/local/cuda-12.8 /usr/local/cuda
   # install CUDA 12.8.0 in the same container
```
```diff
@@ -5,7 +5,7 @@ if [[ -n "${CUDNN_VERSION}" ]]; then
   mkdir tmp_cudnn
   pushd tmp_cudnn
   if [[ ${CUDA_VERSION:0:4} == "12.8" ]]; then
-    CUDNN_NAME="cudnn-linux-x86_64-9.7.1.26_cuda12-archive"
+    CUDNN_NAME="cudnn-linux-x86_64-9.8.0.87_cuda12-archive"
   elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
     CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive"
   elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
```
```diff
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-install_ubuntu() {
-  apt-get update
-
-  # Cleanup
-  apt-get autoclean && apt-get clean
-  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-}
-
-install_centos() {
-  # Need EPEL for many packages we depend on.
-  # See http://fedoraproject.org/wiki/EPEL
-  yum --enablerepo=extras install -y epel-release
-
-  # Cleanup
-  yum clean all
-  rm -rf /var/cache/yum
-  rm -rf /var/lib/yum/yumdb
-  rm -rf /var/lib/yum/history
-}
-
-# Install base packages depending on the base OS
-ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
-case "$ID" in
-  ubuntu)
-    install_ubuntu
-    ;;
-  centos)
-    install_centos
-    ;;
-  *)
-    echo "Unable to determine OS..."
-    exit 1
-    ;;
-esac
```
```diff
@@ -25,7 +25,9 @@ python3 -m pip install meson ninja
 ###########################
 ### clone repo
 ###########################
-GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
+# TEMPORARY FIX: https://gitlab.freedesktop.org/mesa/drm.git is down until 2025/03/22
+# GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
+GIT_SSL_NO_VERIFY=true git clone git://anongit.freedesktop.org/mesa/drm
 pushd drm
 
 ###########################
```
```diff
@@ -41,11 +41,14 @@ fbscribelogger==0.1.7
 #Pinned versions: 0.1.6
 #test that import:
 
-flatbuffers==2.0
+flatbuffers==2.0 ; platform_machine != "s390x"
 #Description: cross platform serialization library
 #Pinned versions: 2.0
 #test that import:
 
+flatbuffers ; platform_machine == "s390x"
+#Description: cross platform serialization library; Newer version is required on s390x for new python version
+
 hypothesis==5.35.1
 # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
 #Description: advanced library for generating parametrized tests
@@ -102,10 +105,10 @@ networkx==2.8.8
 #Pinned versions: 2.8.8
 #test that import: functorch
 
-#ninja
-#Description: build system. Note that it install from
-#here breaks things so it is commented out
-#Pinned versions: 1.10.0.post1
+ninja==1.11.1.3
+#Description: build system. Used in some tests. Used in build to generate build
+#time tracing information
+#Pinned versions: 1.11.1.3
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
 
 numba==0.49.0 ; python_version < "3.9"
@@ -365,7 +368,6 @@ PyYAML
 pyzstd
 setuptools
 
-ninja==1.11.1 ; platform_machine == "aarch64"
 scons==4.5.2 ; platform_machine == "aarch64"
 
 pulp==2.9.0 ; python_version >= "3.8"
```
```diff
@@ -50,13 +50,6 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
 RUN rm install_protobuf.sh
 ENV INSTALLED_PROTOBUF ${PROTOBUF}
 
-# (optional) Install database packages like LMDB and LevelDB
-ARG DB
-COPY ./common/install_db.sh install_db.sh
-RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
-RUN rm install_db.sh
-ENV INSTALLED_DB ${DB}
-
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@@ -50,13 +50,6 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
 RUN rm install_protobuf.sh
 ENV INSTALLED_PROTOBUF ${PROTOBUF}
 
-# (optional) Install database packages like LMDB and LevelDB
-ARG DB
-COPY ./common/install_db.sh install_db.sh
-RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
-RUN rm install_db.sh
-ENV INSTALLED_DB ${DB}
-
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@@ -77,13 +77,6 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt
 
-# (optional) Install database packages like LMDB and LevelDB
-ARG DB
-COPY ./common/install_db.sh install_db.sh
-RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
-RUN rm install_db.sh
-ENV INSTALLED_DB ${DB}
-
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@@ -74,13 +74,6 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
 RUN rm install_protobuf.sh
 ENV INSTALLED_PROTOBUF ${PROTOBUF}
 
-# (optional) Install database packages like LMDB and LevelDB
-ARG DB
-COPY ./common/install_db.sh install_db.sh
-RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
-RUN rm install_db.sh
-ENV INSTALLED_DB ${DB}
-
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
```
```diff
@@ -73,26 +73,14 @@ fi
 # Check GCC ABI
 ###############################################################################
 
-# NOTE [ Building libtorch with old vs. new gcc ABI ]
-#
-# Packages built with one version of ABI could not be linked against by client
-# C++ libraries that were compiled using the other version of ABI. Since both
-# gcc ABIs are still common in the wild, we need to support both ABIs. Currently:
-#
-# - All the nightlies built on CentOS 7 + devtoolset7 use the old gcc ABI.
-# - All the nightlies built on Ubuntu 16.04 + gcc 5.4 use the new gcc ABI.
+# NOTE: As of https://github.com/pytorch/pytorch/issues/126551 we only produce
+# wheels with cxx11-abi
 
 echo "Checking that the gcc ABI is what we expect"
 if [[ "$(uname)" != 'Darwin' ]]; then
   function is_expected() {
-    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* || "$DESIRED_CUDA" == *"rocm"* ]]; then
-      if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
-        echo 1
-      fi
-    else
-      if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then
-        echo 1
-      fi
+    if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
+      echo 1
     fi
   }
```
```diff
@@ -121,9 +121,9 @@ def main() -> None:
     else:
         install_root = Path(distutils.sysconfig.get_python_lib()) / "torch"
 
-    libtorch_cpu_path = install_root / "lib" / "libtorch_cpu.so"
-    pre_cxx11_abi = "cxx11-abi" not in os.getenv("DESIRED_DEVTOOLSET", "")
-    check_lib_symbols_for_abi_correctness(libtorch_cpu_path, pre_cxx11_abi)
+    libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so")
+    # NOTE: All binaries are built with cxx11abi now
+    check_lib_symbols_for_abi_correctness(libtorch_cpu_path, False)
 
 
 if __name__ == "__main__":
```
```diff
@@ -76,10 +76,13 @@ def read_release_matrix():
 
 
 def test_numpy():
-    import numpy as np
-
-    x = np.arange(5)
-    torch.tensor(x)
+    try:
+        import numpy as np
+
+        x = np.arange(5)
+        torch.tensor(x)
+    except ImportError:
+        print("Numpy check skipped. Numpy is not installed.")
 
 
 def check_version(package: str) -> None:
@@ -410,6 +413,7 @@ def main() -> None:
     smoke_test_conv2d()
     test_linalg()
     test_numpy()
+
     if is_cuda_system:
         test_linalg("cuda")
         test_cuda_gds_errors_captured()
```
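The hunk above makes the NumPy interop check tolerate a missing NumPy instead of failing the whole smoke test. A minimal sketch of the same guard pattern, with illustrative names (`optional_check` and `numpy_interop` are not part of this diff):

```python
# Sketch of the optional-dependency guard; names are illustrative.
def optional_check(name: str, fn) -> None:
    try:
        fn()
    except ImportError:
        print(f"{name} check skipped. {name} is not installed.")

def numpy_interop() -> None:
    # Assumes the torch wheel under test is importable.
    import numpy as np
    import torch
    torch.tensor(np.arange(5))

optional_check("Numpy", numpy_interop)
```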
```diff
@@ -1619,6 +1619,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
   install_torchvision
   checkout_install_torchbench hf_T5 llama moco
   PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
+  test_inductor_aoti
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
   install_torchvision
   test_inductor_shard "${SHARD_NUMBER}"
```
```diff
@@ -55,12 +55,16 @@ s3_upload() {
     s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/"
   fi
   (
+    cache_control_flag=""
+    if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then
+      cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'"
+    fi
     for pkg in ${PKG_DIR}/*.${extension}; do
       (
         set -x
         shm_id=$(sha256sum "${pkg}" | awk '{print $1}')
         ${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \
-          --metadata "checksum-sha256=${shm_id}"
+          --metadata "checksum-sha256=${shm_id}" ${cache_control_flag}
       )
     done
   )
```
```diff
@@ -48,7 +48,6 @@ misc-*,
 -misc-no-recursion,
 -misc-non-private-member-variables-in-classes,
 -misc-unused-using-decls,
--misc-use-internal-linkage,
 modernize-*,
 -modernize-macro-to-enum,
 -modernize-return-braced-init-list,
```
**.github/actionlint.yaml**

```diff
@@ -3,8 +3,11 @@ self-hosted-runner:
   # GitHub hosted runner that actionlint doesn't recognize because actionlint version (1.6.21) is too old
   - ubuntu-24.04
   # GitHub hosted x86 Linux runners
+  # TODO: Cleanup mentions of linux.20_04 when upgrade to linux.24_04 is complete
   - linux.20_04.4x
   - linux.20_04.16x
+  - linux.24_04.4x
+  - linux.24_04.16x
   # Organization-wide AWS Linux Runners
   - linux.large
   - linux.2xlarge
@@ -49,6 +52,7 @@ self-hosted-runner:
   - linux.rocm.gpu
   - linux.rocm.gpu.2
   - linux.rocm.gpu.4
+  - rocm-docker
   # Repo-specific Apple hosted runners
   - macos-m1-ultra
   - macos-m2-14
```
**.github/actions/checkout-pytorch/action.yml**

```diff
@@ -24,8 +24,12 @@ runs:
         run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
 
     - name: Set up parallel fetch and clean workspace
+      id: first-clean
+      continue-on-error: true
       shell: bash
       if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
+      env:
+        NO_SUDO: ${{ inputs.no-sudo }}
       run: |
         # Use all available CPUs for fetching
         cd "${GITHUB_WORKSPACE}"
@@ -35,10 +39,16 @@ runs:
         # Clean workspace. The default checkout action should also do this, but
         # do it here as well just in case
         if [[ -d .git ]]; then
-          git clean -ffdx
+          if [ -z "${NO_SUDO}" ]; then
+            sudo git clean -ffdx
+          else
+            git clean -ffdx
+          fi
         fi
 
     - name: Checkout PyTorch
+      id: first-checkout-attempt
+      continue-on-error: true
       uses: actions/checkout@v4
       with:
         ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -46,3 +56,30 @@ runs:
         fetch-depth: ${{ inputs.fetch-depth }}
         submodules: ${{ inputs.submodules }}
         show-progress: false
+
+    - name: Clean workspace (try again)
+      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' &&
+        (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }}
+      shell: bash
+      env:
+        NO_SUDO: ${{ inputs.no-sudo }}
+      run: |
+        retry () {
+          $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+        }
+        echo "${GITHUB_WORKSPACE}"
+        if [ -z "${NO_SUDO}" ]; then
+          retry sudo rm -rf "${GITHUB_WORKSPACE}"
+        else
+          retry rm -rf "${GITHUB_WORKSPACE}"
+        fi
+        mkdir "${GITHUB_WORKSPACE}"
+
+    - name: Checkout PyTorch (try again)
+      uses: actions/checkout@v4
+      if: ${{ steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success' }}
+      with:
+        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+        fetch-depth: ${{ inputs.fetch-depth }}
+        submodules: ${{ inputs.submodules }}
+        show-progress: false
```
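The `retry ()` helper added above is a chain of `||` fallbacks: up to five attempts with 1-, 2-, 4-, and 8-second pauses between them. A rough Python rendering of the same backoff schedule (a sketch for clarity, not code from this diff):

```python
import subprocess
import time

def retry(cmd: list[str], delays: tuple[int, ...] = (1, 2, 4, 8)) -> None:
    """Run cmd; after each failure sleep 1, 2, 4, then 8 seconds and retry."""
    for delay in delays:
        if subprocess.run(cmd).returncode == 0:
            return
        time.sleep(delay)
    subprocess.run(cmd, check=True)  # fifth and final attempt; raise on failure

retry(["true"])  # succeeds on the first attempt on POSIX systems
```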
**.github/scripts/generate_binary_build_matrix.py**

```diff
@@ -17,6 +17,7 @@ from typing import Optional
 
 # NOTE: Also update the CUDA sources in tools/nightly.py when changing this list
 CUDA_ARCHES = ["11.8", "12.6", "12.8"]
+CUDA_STABLE = "12.6"
 CUDA_ARCHES_FULL_VERSION = {
     "11.8": "11.8.0",
     "12.6": "12.6.3",
@@ -67,7 +68,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
     "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-    "nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+    "nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@@ -76,14 +77,14 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
     "nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-    "nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+    "nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-    "nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+    "nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | "
     "nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'"
@@ -373,7 +374,7 @@ def generate_wheels_matrix(
             }
         )
         # Special build building to use on Colab. Python 3.11 for 12.6 CUDA
-        if python_version == "3.11" and arch_version == "12.6":
+        if python_version == "3.11" and arch_version == CUDA_STABLE:
             ret.append(
                 {
                     "python_version": python_version,
@@ -416,7 +417,7 @@ def generate_wheels_matrix(
                     "pytorch_extra_install_requirements": (
                         PYTORCH_EXTRA_INSTALL_REQUIREMENTS["xpu"]
                         if gpu_arch_type == "xpu"
-                        else PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.6"]
+                        else PYTORCH_EXTRA_INSTALL_REQUIREMENTS[CUDA_STABLE]
                         if os != "linux"
                         else ""
                     ),
```
**.github/scripts/get_ci_variable.py** (new executable file)

```diff
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+"""Helper script - Return CI variables such as stable cuda, min python version, etc."""
+
+import argparse
+import sys
+
+
+def main(args: list[str]) -> None:
+    import generate_binary_build_matrix
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--cuda-stable-version",
+        action="store_true",
+        help="get cuda stable version",
+    )
+    parser.add_argument(
+        "--min-python-version",
+        action="store_true",
+        help="get min supported python version",
+    )
+    options = parser.parse_args(args)
+    if options.cuda_stable_version:
+        return print(generate_binary_build_matrix.CUDA_STABLE)
+    if options.min_python_version:
+        return print(generate_binary_build_matrix.FULL_PYTHON_VERSIONS[0])
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
```
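The helper only prints constants imported from `generate_binary_build_matrix`. Given `CUDA_STABLE = "12.6"` from the hunk above, and assuming `FULL_PYTHON_VERSIONS` begins with `"3.9"` (that list is not shown in this capture), the expected behavior is roughly:

```python
# Miniature model of the helper. CUDA_STABLE mirrors the constant added
# above; FULL_PYTHON_VERSIONS is an assumed value, since the script only
# reads its first element.
CUDA_STABLE = "12.6"
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13"]  # assumption

def ci_variable(flag: str) -> str:
    if flag == "--cuda-stable-version":
        return CUDA_STABLE
    if flag == "--min-python-version":
        return FULL_PYTHON_VERSIONS[0]
    raise SystemExit(f"unknown flag: {flag}")

print(ci_variable("--cuda-stable-version"))  # 12.6
print(ci_variable("--min-python-version"))   # 3.9
```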
**.github/scripts/s390x-ci/tests_list.py** (new executable file)

```diff
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+
+import os
+import re
+import sys
+
+
+sys.path.insert(1, os.path.join(sys.path[0], "..", "..", ".."))
+
+from tools.testing.discover_tests import TESTS
+
+
+skip_list = [
+    # these tests fail due to various reasons
+    "dynamo/test_misc",
+    "inductor/test_aot_inductor",
+    "inductor/test_cpu_repro",
+    "inductor/test_cpu_select_algorithm",
+    "inductor/test_aot_inductor_arrayref",
+    "inductor/test_torchinductor_codegen_dynamic_shapes",
+    "lazy/test_meta_kernel",
+    "onnx/test_utility_funs",
+    "profiler/test_profiler",
+    "test_ao_sparsity",
+    "test_cpp_extensions_open_device_registration",
+    "test_jit",
+    "test_metal",
+    "test_mps",
+    "dynamo/test_torchrec",
+    "inductor/test_aot_inductor_utils",
+    "inductor/test_coordinate_descent_tuner",
+    "test_jiterator",
+    # these tests run long and fail in addition to that
+    "dynamo/test_dynamic_shapes",
+    "test_quantization",
+    "inductor/test_torchinductor",
+    "inductor/test_torchinductor_dynamic_shapes",
+    "inductor/test_torchinductor_opinfo",
+    "test_binary_ufuncs",
+    "test_unary_ufuncs",
+    # these tests fail when cuda is not available
+    "inductor/test_cudacodecache",
+    "inductor/test_inductor_utils",
+    "inductor/test_inplacing_pass",
+    "inductor/test_kernel_benchmark",
+    "inductor/test_max_autotune",
+    "inductor/test_move_constructors_to_cuda",
+    "inductor/test_multi_kernel",
+    "inductor/test_pattern_matcher",
+    "inductor/test_perf",
+    "inductor/test_select_algorithm",
+    "inductor/test_snode_runtime",
+    "inductor/test_triton_wrapper",
+    # these tests fail when mkldnn is not available
+    "inductor/test_custom_post_grad_passes",
+    "inductor/test_mkldnn_pattern_matcher",
+    # lacks quantization support
+    "onnx/test_models_quantized_onnxruntime",
+    "onnx/test_pytorch_onnx_onnxruntime",
+    # https://github.com/pytorch/pytorch/issues/102078
+    "test_decomp",
+    # https://github.com/pytorch/pytorch/issues/146698
+    "test_model_exports_to_core_aten",
+    # runs very long, skip for now
+    "inductor/test_layout_optim",
+    "test_fx",
+    # some false errors
+    "doctests",
+]
+
+skip_list_regex = [
+    # distributed tests fail randomly
+    "distributed/.*",
+]
+
+all_testfiles = sorted(TESTS)
+
+filtered_testfiles = []
+
+for filename in all_testfiles:
+    if filename in skip_list:
+        continue
+
+    regex_filtered = False
+
+    for regex_string in skip_list_regex:
+        if re.fullmatch(regex_string, filename):
+            regex_filtered = True
+            break
+
+    if regex_filtered:
+        continue
+
+    filtered_testfiles.append(filename)
+
+for filename in filtered_testfiles:
+    print(' "' + filename + '",')
```
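The regex skip list relies on `re.fullmatch`, which only matches when the pattern covers the entire file name, so `"distributed/.*"` skips everything under `distributed/` without catching similarly named tests elsewhere. A small demonstration (the file names are illustrative):

```python
import re

skip_list_regex = ["distributed/.*"]

def regex_skipped(filename: str) -> bool:
    # fullmatch is anchored at both ends, unlike re.match or re.search.
    return any(re.fullmatch(p, filename) for p in skip_list_regex)

print(regex_skipped("distributed/test_store"))  # True: the pattern spans the name
print(regex_skipped("test_distributed_utils"))  # False: no full-string match
```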
**.github/scripts/trymerge.py**

```diff
@@ -819,10 +819,9 @@ class GitHubPR:
                 cursor=info["reviews"]["pageInfo"]["startCursor"],
             )
             info = rc["data"]["repository"]["pullRequest"]
-        reviews = {}
-        for author, state in self._reviews:
-            if state != "COMMENTED":
-                reviews[author] = state
+        reviews = {
+            author: state for author, state in self._reviews if state != "COMMENTED"
+        }
         return list(reviews.items())
 
     def get_approved_by(self) -> list[str]:
@@ -2282,7 +2281,8 @@ def merge(
         except MandatoryChecksMissingError as ex:
             last_exception = str(ex)
             print(
-                f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min"
+                f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min",
+                flush=True,
             )
             time.sleep(5 * 60)
     # Finally report timeout back
```
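The rewritten `reviews` comprehension preserves the loop's semantics: `COMMENTED` entries are dropped, and since dict keys are unique, a later review by the same author overwrites an earlier one. A quick check with made-up data:

```python
# Illustrative (author, state) pairs in the shape self._reviews holds.
reviews_stream = [
    ("alice", "COMMENTED"),
    ("bob", "APPROVED"),
    ("alice", "APPROVED"),
    ("bob", "CHANGES_REQUESTED"),
]

reviews = {author: state for author, state in reviews_stream if state != "COMMENTED"}
print(list(reviews.items()))
# [('bob', 'CHANGES_REQUESTED'), ('alice', 'APPROVED')]
```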
**.github/workflows/_mac-build.yml**

```diff
@@ -33,10 +33,6 @@ on:
         default: "3.9"
         description: |
           The python version to be used. Will be 3.9 by default
-      environment-file:
-        required: false
-        type: string
-        description: Set the conda environment file used to setup macOS build.
       test-matrix:
         required: false
         type: string
@@ -86,23 +82,12 @@ jobs:
           fi
 
       - name: Setup miniconda
-        if: inputs.environment-file == ''
         uses: pytorch/test-infra/.github/actions/setup-miniconda@main
         with:
           python-version: ${{ inputs.python-version }}
           environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
           pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
-
-      # This option is used when cross-compiling arm64 from x86-64. Specifically, we need arm64 conda
-      # environment even though the arch is x86-64
-      - name: Setup miniconda using the provided environment file
-        if: inputs.environment-file != ''
-        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
-        with:
-          python-version: ${{ inputs.python-version }}
-          environment-file: ${{ inputs.environment-file }}
-          pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
 
       - name: Install sccache (only for non-forked PRs, and pushes to trunk)
         uses: nick-fields/retry@v3.0.0
         if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
```
**.github/workflows/check-labels.yml**

```diff
@@ -35,7 +35,7 @@ jobs:
     pull-requests: write
     name: Check labels
     if: github.repository_owner == 'pytorch'
-    runs-on: linux.20_04.4x
+    runs-on: linux.24_04.4x
     steps:
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
```
**.github/workflows/docker-cache-mi300.yml** (new file)

```diff
@@ -0,0 +1,55 @@
+name: docker-cache-mi300
+
+on:
+  # run every 6 hours
+  schedule:
+    - cron: 0 0,6,12,18 * * *
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  docker-cache:
+    if: github.repository_owner == 'pytorch'
+    runs-on: rocm-docker
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          no-sudo: true
+
+      - name: configure aws credentials
+        id: aws_creds
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          aws-region: us-east-1
+          role-duration-seconds: 18000
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        continue-on-error: false
+        uses: aws-actions/amazon-ecr-login@v2
+
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: pytorch-linux-focal-rocm-n-py3
+          push: false
+
+      - name: Pull docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+
+      - name: Tar and upload to S3 bucket
+        run: |
+          sudo docker save -o ~/docker-data/pytorch/pytorch_docker_image.tar ${{ steps.calculate-docker-image.outputs.docker-image }}
+          sudo rclone copy -P --s3-upload-concurrency 64 --s3-chunk-size 200M --s3-upload-cutoff 300M ~/docker-data/pytorch/pytorch_docker_image.tar oci:pytorchbucket0002/pytorch_docker_image --progress
```
**.github/workflows/docker-release.yml**

```diff
@@ -117,7 +117,10 @@ jobs:
           # To get QEMU binaries in our PATH
           echo "${RUNNER_TEMP}/bin" >> "${GITHUB_PATH}"
           # Generate PyTorch version to use
-          echo "PYTORCH_VERSION=$(python3 .github/scripts/generate_pytorch_version.py --no-build-suffix)" >> "${GITHUB_ENV}"
+          {
+            echo "PYTORCH_VERSION=$(python3 .github/scripts/generate_pytorch_version.py --no-build-suffix)";
+            echo "STABLE_CUDA_VERSION=$(python3 .github/scripts/get_ci_variable.py --stable-cuda-version)"
+          } >> "${GITHUB_ENV}"
       - name: Setup test specific variables
         if: ${{ startsWith(github.event.ref, 'refs/tags/v') }}
         run: |
@@ -154,7 +157,7 @@ jobs:
           docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}"
 
           # Please note, here we ned to pin specific verison of CUDA as with latest label
-          if [[ ${CUDA_VERSION_SHORT} == "12.4" ]]; then
+          if [[ ${CUDA_VERSION_SHORT} == "${STABLE_CUDA_VERSION}" ]]; then
            docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" \
              ghcr.io/pytorch/pytorch-nightly:latest
            docker push ghcr.io/pytorch/pytorch-nightly:latest
```
24
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
24
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
@ -64,7 +64,7 @@ jobs:
|
|||||||
ALPINE_IMAGE: "arm64v8/alpine"
|
ALPINE_IMAGE: "arm64v8/alpine"
|
||||||
build_name: manywheel-py3_9-cpu-aarch64
|
build_name: manywheel-py3_9-cpu-aarch64
|
||||||
build_environment: linux-aarch64-binary-manywheel
|
build_environment: linux-aarch64-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_9-cpu-aarch64-test: # Testing
|
manywheel-py3_9-cpu-aarch64-test: # Testing
|
||||||
@@ -134,7 +134,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_9-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -181,7 +181,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cpu-aarch64-test: # Testing
@@ -251,7 +251,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -298,7 +298,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cpu-aarch64-test: # Testing
@@ -368,7 +368,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -415,7 +415,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cpu-aarch64-test: # Testing
@@ -485,7 +485,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -532,7 +532,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cpu-aarch64-test: # Testing
@@ -602,7 +602,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -649,7 +649,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cpu-aarch64
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13t-cpu-aarch64-test: # Testing
@@ -719,7 +719,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
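
Since the same pins recur across every generated workflow file, a bump like this one (nvidia-nccl-cu12 2.25.1 → 2.26.2 everywhere, nvidia-cudnn-cu12 9.7.1.26 → 9.8.0.87 for the cu12.8 builds) has to land in all of them at once. A small hypothetical audit script (an assumption for illustration, not part of the repository) that flags a partially applied bump by reporting every distinct version pinned per dependency:

```python
# Hypothetical audit helper (assumption: not shipped with PyTorch): scan the
# generated workflow files for nccl/cudnn cu12 pins and report whether each
# dependency is pinned to a single version or a mixed set.
import re
from pathlib import Path

PIN = re.compile(r"(nvidia-(?:nccl|cudnn)-cu12)==([0-9][0-9a-z.]*)")

pins: dict[str, set[str]] = {}
for path in Path(".github/workflows").glob("generated-*.yml"):
    for name, version in PIN.findall(path.read_text()):
        pins.setdefault(name, set()).add(version)

for name, versions in sorted(pins.items()):
    status = "OK" if len(versions) == 1 else "MIXED"
    print(f"{status}  {name}: {', '.join(sorted(versions))}")
```

Run from the repository root, "MIXED" output would indicate a file the regeneration step missed.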
4
.github/workflows/generated-linux-binary-manywheel-main.yml
generated
vendored
@@ -105,7 +105,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_9-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_9-cuda12_6-test: # Testing
@@ -152,7 +152,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_9-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_9-cuda12_8-test: # Testing
24
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
@@ -262,7 +262,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_9-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_9-cuda12_6-test: # Testing
@@ -331,7 +331,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_9-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_9-cuda12_8-test: # Testing
@@ -891,7 +891,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_10-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cuda12_6-test: # Testing
@@ -960,7 +960,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_10-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cuda12_8-test: # Testing
@@ -1520,7 +1520,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_11-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cuda12_6-test: # Testing
@@ -1654,7 +1654,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_11-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cuda12_8-test: # Testing
@@ -2214,7 +2214,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_12-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cuda12_6-test: # Testing
@@ -2283,7 +2283,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_12-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cuda12_8-test: # Testing
@@ -2843,7 +2843,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cuda12_6-test: # Testing
@@ -2912,7 +2912,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cuda12_8-test: # Testing
@@ -3472,7 +3472,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13t-cuda12_6
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13t-cuda12_6-test: # Testing
@@ -3541,7 +3541,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build_name: manywheel-py3_13t-cuda12_8
       build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13t-cuda12_8-test: # Testing
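Every hunk above makes the same substitution inside PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-nccl-cu12 moves from 2.25.1 to 2.26.2, and the CUDA 12.8 jobs additionally move nvidia-cudnn-cu12 from 9.7.1.26 to 9.8.0.87. The value itself is a `|`-separated list of PEP 508 requirement strings. As a minimal sketch (illustration only, not code from this repo), the standard `packaging` library decomposes each entry into a name, version pin, and environment marker:

```python
# Illustration only: split a PYTORCH_EXTRA_INSTALL_REQUIREMENTS-style value on
# " | " and parse each entry as a PEP 508 requirement.
from packaging.requirements import Requirement

extra = (
    "nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64'"
    " | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64'"
)

for entry in extra.split(" | "):
    req = Requirement(entry)
    print(f"{req.name}  pin={req.specifier}  marker=({req.marker})")
```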
.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml (generated, vendored; 10 changes)
@@ -63,7 +63,7 @@ jobs:
       timeout-minutes: 420
       build_name: manywheel-py3_9-cpu-s390x
       build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_9-cpu-s390x-test: # Testing
@@ -128,7 +128,7 @@ jobs:
       timeout-minutes: 420
       build_name: manywheel-py3_10-cpu-s390x
       build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_10-cpu-s390x-test: # Testing
@@ -193,7 +193,7 @@ jobs:
       timeout-minutes: 420
       build_name: manywheel-py3_11-cpu-s390x
       build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_11-cpu-s390x-test: # Testing
@@ -258,7 +258,7 @@ jobs:
       timeout-minutes: 420
       build_name: manywheel-py3_12-cpu-s390x
       build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_12-cpu-s390x-test: # Testing
@@ -323,7 +323,7 @@ jobs:
       timeout-minutes: 420
       build_name: manywheel-py3_13-cpu-s390x
       build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
   manywheel-py3_13-cpu-s390x-test: # Testing
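The s390x jobs above carry the same requirement string even though none of these NVIDIA wheels exist for s390x; the trailing environment markers make that harmless, because pip evaluates `platform_machine == 'x86_64'` to false there and skips every entry. A small sketch of that evaluation (assumed behavior of the standard `packaging` library, not code from this repo):

```python
# The marker attached to every entry only matches Linux on x86_64, so the
# CUDA wheels are skipped on s390x and macOS even though the env var is set.
from packaging.markers import Marker

marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")

print(marker.evaluate({"platform_system": "Linux", "platform_machine": "x86_64"}))   # True
print(marker.evaluate({"platform_system": "Linux", "platform_machine": "s390x"}))    # False
print(marker.evaluate({"platform_system": "Darwin", "platform_machine": "arm64"}))   # False
```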
.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml (generated, vendored; 12 changes)
@@ -43,7 +43,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -167,7 +167,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -291,7 +291,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -415,7 +415,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -539,7 +539,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.13"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
@@ -663,7 +663,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.13t"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
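Every hunk in this compare is the same one-line change: the `nvidia-nccl-cu12` pin in `PYTORCH_EXTRA_INSTALL_REQUIREMENTS` is bumped from 2.25.1 to 2.26.2. The value itself is a `|`-separated list of PEP 508 requirement strings, each carrying an environment marker restricting it to Linux x86_64. As a minimal sketch (not the repo's actual tooling, and with the list abbreviated for readability), such a value can be split and parsed with the `packaging` library:

```python
# Minimal sketch, assuming only `packaging` is installed (pip install packaging).
# `extra_reqs` is abbreviated; the real variable lists every cu12 wheel.
from packaging.requirements import Requirement

extra_reqs = (
    "nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64'"
    " | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64'"
)

for raw in extra_reqs.split(" | "):
    req = Requirement(raw)  # parses the name, the version pin, and the marker
    applies = req.marker.evaluate() if req.marker else True
    print(f"{req.name}{req.specifier}  applies-on-this-machine={applies}")
```

Because of the markers, the pins are presumably inert on the Windows build machines these workflows run on; they matter for the requirement metadata carried into Linux x86_64 installs.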
2 changes: .github/workflows/generated-windows-arm64-binary-wheel-nightly.yml (generated, vendored)
@@ -54,7 +54,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       # NOTE: These environment variables are put here so that they can be applied on every job equally
       # They are also here because setting them at a workflow level doesn't give us access to the
48 changes: .github/workflows/generated-windows-binary-wheel-nightly.yml (generated, vendored)
@@ -54,7 +54,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -290,7 +290,7 @@ jobs:
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -528,7 +528,7 @@ jobs:
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -766,7 +766,7 @@ jobs:
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -1238,7 +1238,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -1474,7 +1474,7 @@ jobs:
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -1712,7 +1712,7 @@ jobs:
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -1950,7 +1950,7 @@ jobs:
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2422,7 +2422,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2658,7 +2658,7 @@ jobs:
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -2896,7 +2896,7 @@ jobs:
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3134,7 +3134,7 @@ jobs:
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3606,7 +3606,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -3842,7 +3842,7 @@ jobs:
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -4080,7 +4080,7 @@ jobs:
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -4318,7 +4318,7 @@ jobs:
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@@ -4790,7 +4790,7 @@ jobs:
       GPU_ARCH_TYPE: cpu
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.13"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
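The shared environment marker is also why one requirement string can be reused unchanged across every build flavor in these files. As a sketch (again illustrative, not project tooling), `packaging` can evaluate the marker against explicit target environments to confirm the NVIDIA pins are inert off Linux x86_64:

```python
# Sketch: evaluate the shared PEP 508 marker against hypothetical target platforms.
from packaging.markers import Marker

marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")

for env in (
    {"platform_system": "Linux", "platform_machine": "x86_64"},
    {"platform_system": "Windows", "platform_machine": "AMD64"},
    {"platform_system": "Linux", "platform_machine": "aarch64"},
):
    # evaluate() merges the given keys over the current default environment
    print(env, "->", marker.evaluate(env))  # True only for Linux x86_64
```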
@@ -5026,7 +5026,7 @@ jobs:
       GPU_ARCH_TYPE: cuda
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.13"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
       - name: Display EC2 information
         shell: bash
@ -5264,7 +5264,7 @@ jobs:
|
|||||||
GPU_ARCH_TYPE: cuda
|
GPU_ARCH_TYPE: cuda
|
||||||
SKIP_ALL_TESTS: 1
|
SKIP_ALL_TESTS: 1
|
||||||
DESIRED_PYTHON: "3.13"
|
DESIRED_PYTHON: "3.13"
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||||
steps:
|
steps:
|
||||||
- name: Display EC2 information
|
- name: Display EC2 information
|
||||||
shell: bash
|
shell: bash
|
||||||
@ -5502,7 +5502,7 @@ jobs:
|
|||||||
GPU_ARCH_TYPE: cuda
|
GPU_ARCH_TYPE: cuda
|
||||||
SKIP_ALL_TESTS: 1
|
SKIP_ALL_TESTS: 1
|
||||||
DESIRED_PYTHON: "3.13"
|
DESIRED_PYTHON: "3.13"
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||||
steps:
|
steps:
|
||||||
- name: Display EC2 information
|
- name: Display EC2 information
|
||||||
shell: bash
|
shell: bash
|
||||||
@ -5974,7 +5974,7 @@ jobs:
|
|||||||
GPU_ARCH_TYPE: cpu
|
GPU_ARCH_TYPE: cpu
|
||||||
SKIP_ALL_TESTS: 1
|
SKIP_ALL_TESTS: 1
|
||||||
DESIRED_PYTHON: "3.13t"
|
DESIRED_PYTHON: "3.13t"
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||||
steps:
|
steps:
|
||||||
- name: Display EC2 information
|
- name: Display EC2 information
|
||||||
shell: bash
|
shell: bash
|
||||||
@ -6210,7 +6210,7 @@ jobs:
|
|||||||
GPU_ARCH_TYPE: cuda
|
GPU_ARCH_TYPE: cuda
|
||||||
SKIP_ALL_TESTS: 1
|
SKIP_ALL_TESTS: 1
|
||||||
DESIRED_PYTHON: "3.13t"
|
DESIRED_PYTHON: "3.13t"
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||||
steps:
|
steps:
|
||||||
- name: Display EC2 information
|
- name: Display EC2 information
|
||||||
shell: bash
|
shell: bash
|
||||||
@ -6448,7 +6448,7 @@ jobs:
|
|||||||
GPU_ARCH_TYPE: cuda
|
GPU_ARCH_TYPE: cuda
|
||||||
SKIP_ALL_TESTS: 1
|
SKIP_ALL_TESTS: 1
|
||||||
DESIRED_PYTHON: "3.13t"
|
DESIRED_PYTHON: "3.13t"
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||||
steps:
|
steps:
|
||||||
- name: Display EC2 information
|
- name: Display EC2 information
|
||||||
shell: bash
|
shell: bash
|
||||||
@ -6686,7 +6686,7 @@ jobs:
|
|||||||
GPU_ARCH_TYPE: cuda
|
GPU_ARCH_TYPE: cuda
|
||||||
SKIP_ALL_TESTS: 1
|
SKIP_ALL_TESTS: 1
|
||||||
DESIRED_PYTHON: "3.13t"
|
DESIRED_PYTHON: "3.13t"
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||||
steps:
|
steps:
|
||||||
- name: Display EC2 information
|
- name: Display EC2 information
|
||||||
shell: bash
|
shell: bash
|
||||||
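The only change in each of these hunks is the NCCL pin, nvidia-nccl-cu12 2.25.1 to 2.26.2. The `; platform_system == 'Linux' and platform_machine == 'x86_64'` suffixes are standard PEP 508 environment markers: pip installs each pinned NVIDIA wheel only when the marker matches the installing machine. A minimal sketch of how such a marker is evaluated, assuming the third-party `packaging` library is available (this is illustrative, not part of PyTorch's tooling):

```python
# Evaluate a PEP 508 environment marker against the current interpreter,
# the same check pip performs before pulling one of the pinned wheels above.
from packaging.markers import Marker

marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
print(marker.evaluate())  # True on x86_64 Linux, False elsewhere
```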
.github/workflows/lint.yml (41 changes, vendored)
@@ -26,7 +26,7 @@ jobs:
       curr_branch: ${{ github.head_ref || github.ref_name }}

   lintrunner-clang:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     needs: get-label-type
     with:
       timeout: 120
@@ -43,7 +43,7 @@ jobs:
         .github/scripts/lintrunner.sh

   lintrunner-noclang:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     needs: get-label-type
     with:
       timeout: 120
@@ -59,7 +59,7 @@ jobs:
         .github/scripts/lintrunner.sh

   quick-checks:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     needs: get-label-type
     with:
       timeout: 120
@@ -116,7 +116,7 @@ jobs:
         bash .github/scripts/pr-sanity-check.sh

   workflow-checks:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     needs: get-label-type
     with:
       timeout: 120
@@ -154,7 +154,7 @@ jobs:
         exit $RC

   toc:
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     needs: get-label-type
     with:
       timeout: 120
@@ -194,7 +194,7 @@ jobs:
   test-tools:
     name: Test tools
     if: ${{ github.repository == 'pytorch/pytorch' }}
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     needs: get-label-type
     with:
       timeout: 120
@@ -215,7 +215,7 @@ jobs:
   test_run_test:
     name: Test `run_test.py` is usable without boto3
     if: ${{ github.repository == 'pytorch/pytorch' }}
-    runs-on: linux.20_04.4x
+    runs-on: linux.24_04.4x
     steps:
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
@@ -241,10 +241,18 @@ jobs:
   test_collect_env:
     if: ${{ github.repository == 'pytorch/pytorch' }}
     name: Test collect_env
-    runs-on: linux.20_04.4x
+    runs-on: ${{ matrix.runner }}
     strategy:
       matrix:
-        test_type: [with_torch, without_torch, older_python_version]
+        include:
+          - test_type: with_torch
+            runner: linux.24_04.4x
+          - test_type: without_torch
+            runner: linux.24_04.4x
+          # NOTE: The oldest supported version of python for 24.04 is 3.8
+          # so this cannot be updated if we want to keep this test at 3.6
+          - test_type: older_python_version
+            runner: linux.20_04.4x
     steps:
       # [see note: pytorch repo ref]
       # deep clone (fetch-depth 0) required, to allow us to use git log
@@ -253,21 +261,28 @@ jobs:
         with:
           submodules: false
           fetch-depth: 1
-      - name: Setup Python 3.6
+      - name: Get min python version
+        id: get-min-python-version
+        if: matrix.test_type == 'older_python_version'
+        run: |
+          set -eou pipefail
+          # Generate PyTorch version to use
+          echo "MIN_PYTHON_VERSION=$(python3 .github/scripts/get_ci_variable.py --min-python-version)" >> "${GITHUB_OUTPUT}"
+      - name: Setup Old Python version
         if: matrix.test_type == 'older_python_version'
         uses: actions/setup-python@v4
         with:
-          python-version: '3.6'
+          python-version: 3.6
           architecture: x64
           check-latest: false
           cache: pip
           cache-dependency-path: |
            **/requirements.txt
-      - name: Setup Python 3.9
+      - name: Setup Min Python version
         if: matrix.test_type != 'older_python_version'
         uses: actions/setup-python@v4
         with:
-          python-version: '3.9'
+          python-version: ${{ steps.get-min-python-version.outputs.MIN_PYTHON_VERSION }}
           architecture: x64
           check-latest: false
           cache: pip
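The new "Get min python version" step communicates with the later "Setup Min Python version" step through a step output: a script publishes a value by appending `KEY=VALUE` to the file named in `$GITHUB_OUTPUT`, and other steps read it as `steps.<step-id>.outputs.KEY`. A minimal sketch of the same mechanism in Python (only meaningful inside a GitHub Actions run, where `GITHUB_OUTPUT` is set; the version string is a stand-in for what `get_ci_variable.py` would print):

```python
# Publish a step output the way the shell step above does with echo.
import os

min_python_version = "3.9"  # hypothetical value for illustration
with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as out:
    out.write(f"MIN_PYTHON_VERSION={min_python_version}\n")
```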
.github/workflows/revert.yml (2 changes, vendored)
@@ -7,7 +7,7 @@ on:
 jobs:
   do_revert:
     name: try_revert_pr_${{ github.event.client_payload.pr_num }}
-    runs-on: linux.20_04.4x
+    runs-on: linux.24_04.4x
     environment: mergebot
     env:
       GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
.github/workflows/test-check-binary.yml (4 changes, vendored)
@@ -15,7 +15,7 @@ jobs:
   check_binary_linux_cpu:
     if: github.repository_owner == 'pytorch'
     name: Test check_binary.sh for Linux CPU
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       docker-image: python:3.11
       docker-build-dir: "skip-docker-build"
@@ -28,7 +28,7 @@ jobs:
   check_binary_linux_cuda:
     if: github.repository_owner == 'pytorch'
     name: Test check_binary.sh for Linux CUDA
-    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
       runner: linux.4xlarge.nvidia.gpu
       docker-image: python:3.11
.github/workflows/trymerge.yml (2 changes, vendored)
@@ -7,7 +7,7 @@ on:
 jobs:
   do_merge:
     name: try_merge_pr_${{ github.event.client_payload.pr_num }}
-    runs-on: linux.20_04.4x
+    runs-on: linux.24_04.4x
     environment: mergebot
     permissions:
       id-token: write
RELEASE.md (62 changes)
@@ -19,7 +19,7 @@
 - [Cherry Picking Fixes](#cherry-picking-fixes)
 - [How to do Cherry Picking](#how-to-do-cherry-picking)
 - [Cherry Picking Reverts](#cherry-picking-reverts)
-- [Preparing and Creating Final Release candidate](#preparing-and-creating-final-release-candidate)
+- [Preparing and Creating Final Release Candidate](#preparing-and-creating-final-release-candidate)
 - [Promoting RCs to Stable](#promoting-rcs-to-stable)
 - [Additional Steps to prepare for release day](#additional-steps-to-prepare-for-release-day)
 - [Modify release matrix](#modify-release-matrix)
@@ -63,7 +63,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:

 ## Release Cadence

-Following is the release cadence. All future dates below are tentative, for latest updates on the release scheduled please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional.
+Following is the release cadence. All future dates below are tentative. For latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional.

 | Minor Version | Release branch cut | Release date | First patch release date | Second patch release date|
 | --- | --- | --- | --- | --- |
@@ -91,20 +91,20 @@ Releasing a new version of PyTorch generally entails 3 major steps:

 ### Frequently Asked Questions

-* Q: What is release branch cut ?
+* Q: What is a release branch cut ?
 * A: When bulk of the tracked features merged into the main branch, the primary release engineer starts the release process of cutting the release branch by creating a new git branch based off of the current `main` development branch of PyTorch. This allows PyTorch development flow on `main` to continue uninterrupted, while the release engineering team focuses on stabilizing the release branch in order to release a series of release candidates (RC). The activities in the release branch include both regression and performance testing as well as polishing new features and fixing release-specific bugs. In general, new features *are not* added to the release branch after it was created.

-* Q: What is cherry-pick ?
+* Q: What is a cherry-pick ?
 * A: A cherry pick is a process of propagating commits from the main into the release branch, utilizing git's built in [cherry-pick feature](https://git-scm.com/docs/git-cherry-pick). These commits are typically limited to small fixes or documentation updates to ensure that the release engineering team has sufficient time to complete a thorough round of testing on the release branch. To nominate a fix for cherry-picking, a separate pull request must be created against the respective release branch and then mentioned in the Release Tracker issue (example: https://github.com/pytorch/pytorch/issues/94937) following the template from the issue description. The comment nominating a particular cherry-pick for inclusion in the release should include the committed PR against main branch, the newly created cherry-pick PR, as well as the acceptance criteria for why the cherry-pick is needed in the first place.

 ## Cutting a release branch preparations

-Following Requirements needs to be met prior to cutting a release branch:
+Following requirements need to be met prior to cutting a release branch:

-* Resolve all outstanding issues in the milestones(for example [1.11.0](https://github.com/pytorch/pytorch/milestone/28))before first RC cut is completed. After RC cut is completed following script should be executed from test-infra repo in order to validate the presence of the fixes in the release branch :
+* Resolve all outstanding issues in the milestones (for example [1.11.0](https://github.com/pytorch/pytorch/milestone/28)) before first RC cut is completed. After RC cut is completed, the following script should be executed from test-infra repo in order to validate the presence of the fixes in the release branch:
 ``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/1.11 --milestone-id 26 --missing-in-branch ```
-* Validate that all new workflows have been created in the PyTorch and domain libraries included in the release. Validate it against all dimensions of release matrix, including operating systems(Linux, MacOS, Windows), Python versions as well as CPU architectures(x86 and arm) and accelerator versions(CUDA, ROCm, XPU).
-* All the nightly jobs for pytorch and domain libraries should be green. Validate this using following HUD links:
+* Validate that all new workflows have been created in the PyTorch and domain libraries included in the release. Validate it against all dimensions of release matrix, including operating systems (Linux, MacOS, Windows), Python versions as well as CPU architectures (x86 and arm) and accelerator versions (CUDA, ROCm, XPU).
+* All the nightly jobs for pytorch and domain libraries should be green. Validate this using the following HUD links:
 * [Pytorch](https://hud.pytorch.org/hud/pytorch/pytorch/nightly)
 * [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/nightly)
 * [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/nightly)
@@ -224,12 +224,12 @@ Backups are stored in a non-public S3 bucket at [`s3://pytorch-backup`](https://

 ### Release Candidate health validation

-Validate the release jobs for pytorch and domain libraries should be green. Validate this using following HUD links:
+Validate that the release jobs for pytorch and domain libraries are green. Validate this using the following HUD links:
 * [Pytorch](https://hud.pytorch.org/hud/pytorch/pytorch/release%2F1.12)
 * [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/release%2F1.12)
 * [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/release%2F1.12)

-Validate that the documentation build has completed and generated entry corresponding to the release in [docs repository](https://github.com/pytorch/docs/tree/main/).
+Validate that the documentation build has completed and generated an entry corresponding to the release in the [docs repository](https://github.com/pytorch/docs/tree/main/).

 ### Cherry Picking Fixes

@@ -274,15 +274,15 @@ requires `pytorchbot`, so it's only available in PyTorch atm.

 ### Cherry Picking Reverts

-If PR that has been cherry-picked into release branch has been reverted, its cherry-pick must be reverted as well.
+If a PR that has been cherry-picked into the release branch has been reverted, its cherry-pick must be reverted as well.

-Reverts for changes that was committed into the main branch prior to the branch cut, must be propagated into release branch as well.
+Reverts for changes that were committed into the main branch prior to the branch cut must be propagated into the release branch as well.

-## Preparing and Creating Final Release candidate
+## Preparing and Creating Final Release Candidate

-The following requirements need to be met prior to creating final Release Candidate :
+The following requirements need to be met prior to creating the final Release Candidate:

-* Resolve all outstanding open issues in the milestone. There should be no open issues/PRs (for example [2.1.2](https://github.com/pytorch/pytorch/milestone/39)). The issue should either be closed or de-milestoned.
+* Resolve all outstanding open issues in the milestone. There should be no open issues/PRs (for example [2.1.2](https://github.com/pytorch/pytorch/milestone/39)). Each issue should either be closed or de-milestoned.

 * Validate that all closed milestone PRs are present in the release branch. Confirm this by running:
 ``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/2.2 --milestone-id 40 --missing-in-branch ```
@@ -291,7 +291,7 @@ The following requirements need to be met prior to creating final Release Candid

 * Perform [Release Candidate health validation](#release-candidate-health-validation). CI should have the green signal.

-After the final RC is created. The following tasks should be performed :
+After the final RC is created, the following tasks should be performed:

 * Perform [Release Candidate health validation](#release-candidate-health-validation). CI should have the green signal.

@@ -323,25 +323,25 @@ Promotion should occur in two steps:

 ## Additional Steps to prepare for release day

-The following should be prepared for the release day
+The following should be prepared for the release day:

 ### Modify release matrix

-Need to modify release matrix for get started page. See following [PR](https://github.com/pytorch/test-infra/pull/4611) as reference.
+Modify the release matrix for the get started page. See the following [PR](https://github.com/pytorch/test-infra/pull/4611) as reference.

-The PR to update published_versions.json and quick-start-module.js is auto generated. See following [PR](https://github.com/pytorch/pytorch.github.io/pull/1467) as reference.
+The PR to update published_versions.json and quick-start-module.js is auto generated. See the following [PR](https://github.com/pytorch/pytorch.github.io/pull/1467) as reference.

-Please note: This PR needs to be merged on the release day and hence it should be absolutely free of any failures. To test this PR, open another test PR but pointing to the Release candidate location as above [Release Candidate Storage](RELEASE.md#release-candidate-storage)
+Please note: This PR needs to be merged on the release day and hence it should be absolutely free of any failures. To test this PR, open another test PR pointing to the Release Candidate location as described in the [Release Candidate Storage](#release-candidate-storage) section.

 ### Open Google Colab issue

-This is normally done right after the release is completed. We would need to create Google Colab Issue see following [PR](https://github.com/googlecolab/colabtools/issues/2372)
+This is normally done right after the release is completed. We need to create a Google Colab issue. See the following example [issue](https://github.com/googlecolab/colabtools/issues/2372)

 # Patch Releases

 A patch release is a maintenance release of PyTorch that includes fixes for regressions found in a previous minor release. Patch releases typically will bump the `patch` version from semver (i.e. `[major].[minor].[patch]`).

-Please note: Starting from 2.1 one can expect up to 2 patch releases after every minor ones. Patch releases would only be published for latest minor release.
+Please note: Starting from 2.1, one can expect up to 2 patch releases after every minor release. Patch releases are only published for the latest minor release.

 ## Patch Release Criteria

@@ -363,29 +363,29 @@ Patch releases should be considered if a regression meets the following criteria
 > Main POC: Patch Release Managers, Triage Reviewers

 Patch releases should follow these high-level phases. This process starts immediately after the previous release has completed.
-Patch release process takes around 4-5 weeks to complete.
+The patch release process takes around 4-5 weeks to complete.

-1. Triage, is a process where issues are identified, graded, compared to Patch Release Criteria and added to Patch Release milestone. This process normally takes 2 weeks after the release completion.
+1. Triage is a process where issues are identified, graded, compared to Patch Release Criteria and added to Patch Release milestone. This process normally takes 2 weeks after the release completion.
 2. Go/No Go meeting between PyTorch Releng, PyTorch Core and Project Managers where potential issues triggering a release in milestones are reviewed, and following decisions are made:
-* Should the new patch Release be created ?
+* Should the new patch release be created?
 * Timeline execution for the patch release
-3. Cherry picking phase starts after the decision is made to create patch release. At this point a new release tracker for the patch release is created, and an announcement will be made on official channels [example announcement](https://dev-discuss.pytorch.org/t/pytorch-release-2-0-1-important-information/1176). The authors of the fixes to regressions will be asked to create their own cherry picks. This process normally takes 2 weeks.
-4. Building Binaries, Promotion to Stable and testing. After all cherry picks have been merged, Release Managers trigger new build and produce new release candidate. Announcement is made on the official channel about the RC availability at this point. This process normally takes 2 weeks.
+3. Cherry picking phase starts after the decision is made to create a patch release. At this point, a new release tracker for the patch release is created, and an announcement will be made on official channels [example announcement](https://dev-discuss.pytorch.org/t/pytorch-release-2-0-1-important-information/1176). The authors of the fixes to regressions will be asked to create their own cherry picks. This process normally takes 2 weeks.
+4. Building Binaries, Promotion to Stable and testing. After all cherry picks have been merged, Release Managers trigger a new build and produce a new release candidate. An announcement is made on the official channel about the RC availability at this point. This process normally takes 2 weeks.
 5. General Availability

 ### Triage

 > Main POC: Triage Reviewers

-1. Tag issues / pull requests that are candidates for a potential patch release with `triage review`
+1. Tag issues/pull requests that are candidates for a potential patch release with `triage review`
 * 
-2. Triage reviewers will then check if the regression / fix identified fits within above mentioned [Patch Release Criteria](#patch-release-criteria)
-3. Triage reviewers will then add the issue / pull request to the related milestone (i.e. `1.9.1`) if the regressions is found to be within the [Patch Release Criteria](#patch-release-criteria)
+2. Triage reviewers will then check if the regression/fix identified fits within the above mentioned [Patch Release Criteria](#patch-release-criteria)
+3. Triage reviewers will then add the issue/pull request to the related milestone (i.e. `1.9.1`) if the regression is found to be within the [Patch Release Criteria](#patch-release-criteria)
 * 

 ### Issue Tracker for Patch releases

-For patch releases issue tracker needs to be created. For patch release, we require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from previous RC. An example of this would look like:
+For patch releases, an issue tracker needs to be created. For a patch release, we require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from previous RC. An example of this would look like:
 * https://github.com/pytorch/pytorch/issues/128436

 Only following issues are accepted:
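For the cherry-picking phases described above, the mechanics on the contributor side usually reduce to replaying the main-branch commit onto the release branch before opening the cherry-pick PR. A rough sketch under that assumption (the helper name and flow are illustrative, not a PyTorch tool):

```python
import subprocess

def cherry_pick_to_release(sha: str, release_branch: str) -> None:
    """Replay a main-branch commit onto a release branch.

    git cherry-pick -x appends "(cherry picked from commit <sha>)" to the
    commit message, preserving the link back to main that the release
    tracker nomination asks for.
    """
    subprocess.run(["git", "switch", release_branch], check=True)
    subprocess.run(["git", "cherry-pick", "-x", sha], check=True)

# Example (hypothetical commit and branch):
# cherry_pick_to_release("abc1234", "release/2.7")
```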
@@ -343,9 +343,32 @@ if(USE_CUDA)
 endif()

 if(USE_ROCM)
+  # NOTE: The PyTorch build does not actually add_subdirectory
+  # third_party/composable_kernel or use it as a CMake library. What is used
+  # is header only, so this should be ok, except that the CMake build generates
+  # a ck/config.h. We just do that part here. Without this, the ck.h from the
+  # ROCM SDK may get accidentally used instead.
+  function(_pytorch_rocm_generate_ck_conf)
+    set(CK_ENABLE_INT8 "ON")
+    set(CK_ENABLE_FP16 "ON")
+    set(CK_ENABLE_FP32 "ON")
+    set(CK_ENABLE_FP64 "ON")
+    set(CK_ENABLE_BF16 "ON")
+    set(CK_ENABLE_FP8 "ON")
+    set(CK_ENABLE_BF8 "ON")
+    set(CK_USE_XDL "ON")
+    set(CK_USE_WMMA "ON")
+    configure_file(
+      "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in"
+      "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h"
+      )
+  endfunction()
   list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
   list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
   list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
+  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
+  _pytorch_rocm_generate_ck_conf()

   # Next two lines are needed because TunableOp uses third-party/fmt
   list(APPEND ATen_HIP_INCLUDE $<TARGET_PROPERTY:fmt::fmt-header-only,INTERFACE_INCLUDE_DIRECTORIES>)
   list(APPEND ATen_HIP_DEPENDENCY_LIBS fmt::fmt-header-only)
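The new `_pytorch_rocm_generate_ck_conf` helper relies on CMake's `configure_file()`, which copies a template into the build tree while substituting variables set just before the call. As a rough conceptual sketch only, here is the `@VAR@` substitution form emulated in Python (real `configure_file()` also handles `#cmakedefine` lines, and whether ck's `config.h.in` uses one form or the other is an assumption here):

```python
import re

def cmake_configure(template: str, variables: dict) -> str:
    # Substitute @VAR@ placeholders the way configure_file() does,
    # using the CK_ENABLE_* style variables set before the call.
    return re.sub(
        r"@([A-Za-z_][A-Za-z0-9_]*)@",
        lambda m: str(variables.get(m.group(1), "")),
        template,
    )

print(cmake_configure("#define CK_ENABLE_FP16 @CK_ENABLE_FP16@",
                      {"CK_ENABLE_FP16": "ON"}))
# -> #define CK_ENABLE_FP16 ON
```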
@@ -69,7 +69,7 @@ Generator createCPUGenerator(uint64_t seed_val) {
 * Helper function to concatenate two 32 bit unsigned int
 * and return them as a 64 bit unsigned int
 */
-inline uint64_t make64BitsFrom32Bits(uint32_t hi, uint32_t lo) {
+inline static uint64_t make64BitsFrom32Bits(uint32_t hi, uint32_t lo) {
   return (static_cast<uint64_t>(hi) << 32) | lo;
 }
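The helper being marked `static` here packs two 32-bit halves into one 64-bit value by shifting the high word up 32 bits and OR-ing in the low word. The same arithmetic in a quick sketch:

```python
def make64_bits_from_32(hi: int, lo: int) -> int:
    # hi occupies bits 63..32, lo occupies bits 31..0, mirroring the C++
    # expression (static_cast<uint64_t>(hi) << 32) | lo above.
    return ((hi & 0xFFFFFFFF) << 32) | (lo & 0xFFFFFFFF)

assert make64_bits_from_32(0x12345678, 0x9ABCDEF0) == 0x123456789ABCDEF0
```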
@@ -588,7 +588,7 @@ Allocator* getCPUAllocator() {
 // means the allow_tf32 flags are overridden and tf32 is force disabled
 // override_allow_tf32_flag = false
 // means the original allow_tf32 flags are followed
-thread_local bool override_allow_tf32_flag = false;
+thread_local static bool override_allow_tf32_flag = false;

 NoTF32Guard::NoTF32Guard() {
   if (!override_allow_tf32_flag) {
@@ -611,7 +611,7 @@ bool NoTF32Guard::should_disable_tf32() {
 // This information can be used, for example, to select implementations
 // with different numerical or performance characteristics.
 // See https://pytorch.org/docs/stable/notes/numerical_accuracy.html for details.
-thread_local bool rocm_is_backward_pass;
+thread_local static bool rocm_is_backward_pass;

 ROCmBackwardPassGuard::ROCmBackwardPassGuard() {
   rocm_is_backward_pass = true;
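Both flags above are `thread_local`, so each thread flips its own copy; the added `static` only narrows them to internal linkage within the translation unit. For intuition about the thread-local part, Python's closest analogue is `threading.local()` (an illustrative sketch, not PyTorch code):

```python
import threading

_state = threading.local()  # each thread sees an independent attribute set

def set_backward_pass(value: bool) -> None:
    _state.rocm_is_backward_pass = value

def in_backward_pass() -> bool:
    return getattr(_state, "rocm_is_backward_pass", False)

def worker() -> None:
    set_backward_pass(True)
    assert in_backward_pass()

t = threading.Thread(target=worker)
t.start(); t.join()
assert not in_backward_pass()  # the main thread's copy is untouched
```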
@@ -110,6 +110,11 @@ class TORCH_API Context {

   Allocator* getPinnedMemoryAllocator(
       std::optional<c10::DeviceType> device_type = std::nullopt) {
+    auto opt_device_type =
+        device_type.has_value() ? device_type : at::getAccelerator();
+    if (opt_device_type) {
+      lazyInitDevice(opt_device_type.value());
+    }
     return getAcceleratorHooksInterface(device_type).getPinnedMemoryAllocator();
   }
@@ -28,10 +28,8 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
     opt_device_type = at::getAccelerator(false);
   }
   if (opt_device_type.has_value()) {
-    at::globalContext().lazyInitDevice(opt_device_type.value());
-    return at::globalContext()
-        .getAcceleratorHooksInterface(opt_device_type)
-        .getPinnedMemoryAllocator();
+    return at::globalContext().getPinnedMemoryAllocator(
+        opt_device_type.value());
   } else {
     TORCH_CHECK(
         false, "Need to provide pin_memory allocator to use pin memory.")
@@ -172,7 +170,7 @@ SymInt computeStorageNbytes(
 }

 template <typename T>
-TensorBase _empty_generic(
+static TensorBase _empty_generic(
     ArrayRef<T> size,
     c10::Allocator* allocator,
     c10::DispatchKeySet ks,
@@ -225,7 +223,7 @@ TensorBase empty_generic_symint(
 }

 template <typename T>
-TensorBase _empty_strided_generic(
+static TensorBase _empty_strided_generic(
     T size,
     T stride,
     c10::Allocator* allocator,
@ -59,7 +59,7 @@ SymDimVector infer_size_symdimvector(SymIntArrayRef a, SymIntArrayRef b) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<typename Container>
|
template<typename Container>
|
||||||
C10_ALWAYS_INLINE InferExpandGeometryResult<Container> inferExpandGeometryImpl(
|
C10_ALWAYS_INLINE static InferExpandGeometryResult<Container> inferExpandGeometryImpl(
|
||||||
IntArrayRef tensor_sizes,
|
IntArrayRef tensor_sizes,
|
||||||
IntArrayRef tensor_strides,
|
IntArrayRef tensor_strides,
|
||||||
IntArrayRef sizes) {
|
IntArrayRef sizes) {
|
||||||
|
|||||||
@ -737,7 +737,7 @@ bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
bool isFunctionalTensorIListRef(c10::IListRef<T> list) {
|
static bool isFunctionalTensorIListRef(c10::IListRef<T> list) {
|
||||||
if (list.size() == 0) return false;
|
if (list.size() == 0) return false;
|
||||||
auto functional_count = 0;
|
auto functional_count = 0;
|
||||||
for (const auto& tensor : list) {
|
for (const auto& tensor : list) {
|
||||||
@ -803,7 +803,7 @@ void set_sizes_strides_offset(const std::vector<Tensor>& outs, const std::vector
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
thread_local bool _functionalizationReapplyViews;
|
thread_local static bool _functionalizationReapplyViews;
|
||||||
|
|
||||||
bool getFunctionalizationReapplyViewsTLS() {
|
bool getFunctionalizationReapplyViewsTLS() {
|
||||||
return _functionalizationReapplyViews;
|
return _functionalizationReapplyViews;
|
||||||
|
|||||||
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
namespace at::impl {
|
namespace at::impl {
|
||||||
|
|
||||||
thread_local int64_t VmapMode_current_vmap_level = 0;
|
thread_local static int64_t VmapMode_current_vmap_level = 0;
|
||||||
|
|
||||||
int64_t VmapMode::current_vmap_level() {
|
int64_t VmapMode::current_vmap_level() {
|
||||||
return VmapMode_current_vmap_level;
|
return VmapMode_current_vmap_level;
|
||||||
|
|||||||
@ -71,7 +71,7 @@ c10::DispatchKeySet get_view_key_set(const at::Tensor& base) {
|
|||||||
|
|
||||||
namespace at::native {
|
namespace at::native {
|
||||||
|
|
||||||
inline std::vector<int64_t> construct_opt_sizes(const at::Tensor& sizes) {
|
inline static std::vector<int64_t> construct_opt_sizes(const at::Tensor& sizes) {
|
||||||
// torch.tensor([]) is considered to have `dim() = 1` and `size(0) = 0`
|
// torch.tensor([]) is considered to have `dim() = 1` and `size(0) = 0`
|
||||||
// torch.nested_tensor([]) should also has `dim() = 1` and `size(0) = 0`
|
// torch.nested_tensor([]) should also has `dim() = 1` and `size(0) = 0`
|
||||||
if (sizes.dim() == 0) {
|
if (sizes.dim() == 0) {
|
||||||
|
|||||||
@ -5,7 +5,7 @@ namespace at {
|
|||||||
|
|
||||||
// See TensorGeometry.h on why this is useful now that we cache is_contiguous.
|
// See TensorGeometry.h on why this is useful now that we cache is_contiguous.
|
||||||
template <typename T>
|
template <typename T>
|
||||||
bool _geometry_is_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides) {
|
static bool _geometry_is_contiguous(ArrayRef<T> sizes, ArrayRef<T> strides) {
|
||||||
assert(!overflows<std::int64_t>(sizes.size()));
|
assert(!overflows<std::int64_t>(sizes.size()));
|
||||||
auto dim = static_cast<std::int64_t>(sizes.size());
|
auto dim = static_cast<std::int64_t>(sizes.size());
|
||||||
T expected_stride = 1;
|
T expected_stride = 1;
|
||||||
|
|||||||
@ -327,7 +327,7 @@ std::vector<int64_t> defaultStrides(IntArrayRef sizes) {
|
|||||||
// see overloads of computeStride() below.
|
// see overloads of computeStride() below.
|
||||||
//
|
//
|
||||||
template <typename ResultVec, typename NewShapeVec, typename Numel>
|
template <typename ResultVec, typename NewShapeVec, typename Numel>
|
||||||
inline std::optional<ResultVec> computeStride_impl(
|
inline static std::optional<ResultVec> computeStride_impl(
|
||||||
const NewShapeVec& oldshape,
|
const NewShapeVec& oldshape,
|
||||||
const NewShapeVec& oldstride,
|
const NewShapeVec& oldstride,
|
||||||
const NewShapeVec& newshape,
|
const NewShapeVec& newshape,
|
||||||
|
|||||||
@ -20,12 +20,12 @@ namespace at {
|
|||||||
// We haven't made a decision on that yet so we are temporarily banning random
|
// We haven't made a decision on that yet so we are temporarily banning random
|
||||||
// operations inside of vmap while we gather user feedback.
|
// operations inside of vmap while we gather user feedback.
|
||||||
|
|
||||||
template <typename... Args> Tensor unsupportedRandomOp(Args... args) {
|
template <typename... Args> static Tensor unsupportedRandomOp(Args... args) {
|
||||||
TORCH_CHECK(false, "vmap: We do not yet support calling random operations inside of vmap. ",
|
TORCH_CHECK(false, "vmap: We do not yet support calling random operations inside of vmap. ",
|
||||||
"Please perform random operations outside of vmap as a workaround");
|
"Please perform random operations outside of vmap as a workaround");
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename... Args> Tensor& unsupportedRandomOp_(Args... args) {
|
template <typename... Args> static Tensor& unsupportedRandomOp_(Args... args) {
|
||||||
TORCH_CHECK(false, "vmap: We do not yet support calling random operations inside of vmap. ",
|
TORCH_CHECK(false, "vmap: We do not yet support calling random operations inside of vmap. ",
|
||||||
"Please perform random operations outside of vmap as a workaround");
|
"Please perform random operations outside of vmap as a workaround");
|
||||||
}
|
}
|
||||||
|
|||||||
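
The hunks above share one theme: file-local helpers and globals gain `static` (or `thread_local static`), giving them internal linkage. Each translation unit then owns a private copy, so identically named helpers in different .cpp files can no longer collide at link time, and internal-linkage lint checks are satisfied. A minimal illustration (file and function names are hypothetical):

    // a.cpp
    static int helper() { return 1; }  // internal linkage: visible only in a.cpp

    // b.cpp
    static int helper() { return 2; }  // a distinct function; no ODR violation
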
@@ -64,7 +64,7 @@ thread_local std::array<at::ScalarType, at::COMPILE_TIME_MAX_DEVICE_TYPES>
     at::ScalarType::Undefined, // IDEEP.
     at::kHalf, // AMD HIP
     at::ScalarType::Undefined, // FPGA
-    at::ScalarType::Undefined, // ONNX Runtime / Microsoft
+    at::kBFloat16, // ONNX Runtime / Microsoft
     at::kBFloat16, // XLA / TPU
     at::ScalarType::Undefined, // Vulkan
     at::ScalarType::Undefined, // Metal
@@ -500,6 +500,44 @@ TORCH_LIBRARY_IMPL(aten, AutocastMTIA, m) {
          TORCH_FN((&at::autocast::binary_cross_entropy_banned)));
 }

+// MAIA
+TORCH_LIBRARY_IMPL(_, AutocastMAIA, m) {
+  m.fallback(torch::CppFunction::makeFallthrough());
+}
+
+TORCH_LIBRARY_IMPL(aten, AutocastMAIA, m) {
+  // lower_precision_fp
+#define _KERNEL_MAIA_LOW_PRECISION_FP(...) \
+  KERNEL_MAIA(__VA_ARGS__, lower_precision_fp)
+
+  AT_FORALL_LOWER_PRECISION_FP(_KERNEL_MAIA_LOW_PRECISION_FP)
+
+  // fp32
+#define _KERNEL_MAIA_FP32(...) KERNEL_MAIA(__VA_ARGS__, fp32)
+
+  AT_FORALL_FP32(_KERNEL_MAIA_FP32)
+
+  // fp32_set_opt_dtype
+#define _KERNEL_MAIA_FP32_SET_OPT_DTYPE(...) \
+  KERNEL_MAIA(__VA_ARGS__, fp32_set_opt_dtype)
+
+  AT_FORALL_FP32_SET_OPT_DTYPE(_KERNEL_MAIA_FP32_SET_OPT_DTYPE)
+
+  // fp32_append_dtype
+  // The fp32_append_dtype wrapper overrides implicit promotion behavior.
+  // norm does not implicitly promote, but be aware when adding new ops to this policy.
+  AT_FORALL_DIFFERENT_REDISPATCH_SIGNATURE(
+      KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA)
+
+  // promote
+#define _KERNEL_MAIA_PROMOTE(...) KERNEL_MAIA(__VA_ARGS__, promote)
+
+  AT_FORALL_PROMOTE(_KERNEL_MAIA_PROMOTE)
+
+  m.impl(TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"),
+         TORCH_FN((&at::autocast::binary_cross_entropy_banned)));
+}
+
 // XPU
 TORCH_LIBRARY_IMPL(_, AutocastXPU, m) {
   m.fallback(torch::CppFunction::makeFallthrough());

@@ -123,12 +123,14 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
   _(privateuseone, at::kPrivateUse1)

 // deprecated other backend specific autocast APIs
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 AT_FORALL_DEPRECATED_AUTOCAST_BACKENDS(DECLARE_DEPRECATED_AUTOCAST_APIS)

-const std::array<at::DeviceType, 9> _AUTOCAST_SUPPORTED_DEVICES{
+const std::array<at::DeviceType, 10> _AUTOCAST_SUPPORTED_DEVICES{
     at::kCPU,
     at::kCUDA,
     at::kMTIA,
+    at::kMAIA,
     at::kXPU,
     at::kIPU,
     at::kHPU,

@@ -149,6 +151,8 @@ inline bool is_autocast_eligible(
          tensor.is_floating_point();
     case c10::DeviceType::MTIA:
       return tensor.is_mtia() && tensor.is_floating_point();
+    case c10::DeviceType::MAIA:
+      return tensor.is_maia() && tensor.is_floating_point();
     case c10::DeviceType::XPU:
       return tensor.is_xpu() && tensor.is_floating_point();
     case c10::DeviceType::IPU:

@@ -176,6 +180,8 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type(
       return DispatchKey::AutocastCPU;
     case c10::DeviceType::MTIA:
       return DispatchKey::AutocastMTIA;
+    case c10::DeviceType::MAIA:
+      return DispatchKey::AutocastMAIA;
     case c10::DeviceType::XPU:
       return DispatchKey::AutocastXPU;
     case c10::DeviceType::IPU:

@@ -747,6 +753,24 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions.
       REDISPATCH_SIGNATURE, \
       POLICY)

+// KERNEL_MAIA/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA
+// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastMAIA
+#define KERNEL_MAIA(...) KERNEL(c10::DeviceType::MAIA, __VA_ARGS__)
+
+#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA( \
+    REDISPATCH_FUNC, \
+    REGISTER_NAME, \
+    REGISTER_SIGNATURE, \
+    REDISPATCH_SIGNATURE, \
+    POLICY) \
+  KERNEL_DIFFERENT_REDISPATCH_SIGNATURE( \
+      c10::DeviceType::MAIA, \
+      REDISPATCH_FUNC, \
+      REGISTER_NAME, \
+      REGISTER_SIGNATURE, \
+      REDISPATCH_SIGNATURE, \
+      POLICY)
+
 // KERNEL_XPU/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_XPU
 // registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastXPU
 #define KERNEL_XPU(...) KERNEL(c10::DeviceType::XPU, __VA_ARGS__)
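
The MAIA hunks above follow the established per-backend pattern exactly: a fallthrough fallback for unlisted ops, plus policy tables built from a device-bound wrapper macro. Given the `#define` in the last hunk, a registration such as the following (op name hypothetical) expands mechanically:

    // Illustrative expansion of the device-bound wrapper above:
    KERNEL_MAIA(mm, lower_precision_fp)
    // expands to
    KERNEL(c10::DeviceType::MAIA, mm, lower_precision_fp)
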
@@ -43,7 +43,7 @@ std::string toString(const Scalar& s) {
 namespace at {

 //not all C++ compilers have default float so we define our own here
-inline std::ios_base& defaultfloat(std::ios_base& __base) {
+inline static std::ios_base& defaultfloat(std::ios_base& __base) {
   __base.unsetf(std::ios_base::floatfield);
   return __base;
 }

@@ -42,7 +42,7 @@ static std::vector<at::OptionalTensorRef> get_unboxed_opt_tensor_vector() {
 }

 template <typename T>
-void check_elements_same(at::ITensorListRef list, const T& thing, int use_count) {
+static void check_elements_same(at::ITensorListRef list, const T& thing, int use_count) {
   EXPECT_EQ(thing.size(), list.size());
   size_t i = 0;
   for (const auto& t : list) {

@@ -5,7 +5,7 @@

 namespace at {

-thread_local bool NamesMode_enabled = true;
+thread_local static bool NamesMode_enabled = true;

 bool NamesMode::is_enabled() {
   return NamesMode_enabled;
@@ -80,6 +80,10 @@ TORCH_LIBRARY_IMPL(_, AutogradMTIA, m) {
   m.fallback(AUTOGRAD_FALLBACK);
 }

+TORCH_LIBRARY_IMPL(_, AutogradMAIA, m) {
+  m.fallback(AUTOGRAD_FALLBACK);
+}
+
 TORCH_LIBRARY_IMPL(_, AutogradXLA, m) {
   m.fallback(AUTOGRAD_FALLBACK);
 }
@@ -329,7 +329,7 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<


 template <typename Dtype>
-inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
+static inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
   cudaDataType_t abcType = CUDA_R_32F;
   cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
   cudaDataType_t scaleType = CUDA_R_32F;

@@ -1079,7 +1079,13 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
   }
 #ifdef USE_ROCM
   else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
-    at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));
+    auto dprops = at::cuda::getCurrentDeviceProperties();
+    c10::string_view arch(dprops->gcnArchName);
+    if (arch == "gfx1100") { //no CK GEMM version for gfx1100
+      gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
+    } else{
+      at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));
+    }
   }
 #endif
   else {
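
The gfx1100 special case above is the usual ROCm pattern of gating a backend on the device's reported GFX architecture. A standalone sketch of that runtime check, using plain HIP API calls rather than the ATen wrappers (helper name is hypothetical):

    #include <hip/hip_runtime.h>
    #include <cstring>

    // Returns true when the current device's gcnArchName starts with the given
    // prefix, e.g. "gfx1100"; mirrors the dprops->gcnArchName test above.
    bool device_arch_is(const char* arch) {
      int dev = 0;
      if (hipGetDevice(&dev) != hipSuccess) return false;
      hipDeviceProp_t props{};
      if (hipGetDeviceProperties(&props, dev) != hipSuccess) return false;
      return std::strncmp(props.gcnArchName, arch, std::strlen(arch)) == 0;
    }
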
@@ -156,6 +156,7 @@ NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*)
 NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *)
 NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **)

+CUDA_STUB2(cuModuleLoad, CUmodule*, const char*)
 CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *)
 CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *)
 CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t)

@@ -169,6 +170,8 @@ CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *)
 CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *)
 CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
 CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
+CUDA_STUB3(cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
+

 #if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
 CUresult CUDAAPI

@@ -43,6 +43,7 @@ namespace at::cuda {
   _(nvrtcGetProgramLogSize) \
   _(nvrtcGetProgramLog) \
   _(nvrtcGetLoweredName) \
+  _(cuModuleLoad) \
   _(cuModuleLoadData) \
   _(cuModuleLoadDataEx) \
   _(cuModuleGetFunction) \

@@ -60,6 +61,7 @@ namespace at::cuda {
   _(cuLinkComplete) \
   _(cuFuncSetAttribute) \
   _(cuFuncGetAttribute) \
+  _(cuPointerGetAttribute) \

 #if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
 #define AT_FORALL_NVRTC_EXTENDED(_) \
@@ -575,11 +575,20 @@ struct ScaledGemmParams : OpParams {

   std::string BLASSignature() const override {
     // Excluding use_fast_accum and use_rowise booleans for now
-    return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
-        "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, bias_type: %s, scale_type: %s, compute_type: %s }",
-        m, n, k, lda, ldb, ldc, ldc, transa, transb,
-        ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(bias_dtype),
-        ComputeTypeFor<T>(), ComputeTypeFor<T>());
+    if (bias_ptr == nullptr) {
+      return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
+          "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }",
+          m, n, k, lda, ldb, ldc, ldc, transa, transb,
+          ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype),
+          ComputeTypeFor<T>(), ComputeTypeFor<T>());
+    }
+    else {
+      return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
+          "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, bias_type: %s, scale_type: %s, compute_type: %s }",
+          m, n, k, lda, ldb, ldc, ldc, transa, transb,
+          ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(bias_dtype),
+          ComputeTypeFor<T>(), ComputeTypeFor<T>());
+    }
   }

   std::string Signature() const override {
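
The two branches above differ only in whether the emitted YAML-style record carries a `bias_type: %s` field (and its matching `ScalarTypeToBLASType(bias_dtype)` argument); every other field is identical. A reduced sketch of the conditional-signature pattern, with placeholder values rather than real BLAS type strings:

    #include <string>

    // Include an optional field in a logged signature only when the pointer
    // that backs it is set; field names mirror the hunk, values are placeholders.
    std::string bias_field(bool has_bias) {
      return has_bias ? std::string("bias_type: <bias_dtype>, ") : std::string();
    }
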
@@ -498,7 +498,11 @@ class HipblasltGemmOp : public Callable<ParamsT> {
           mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c)));
     }

-    HipBlasLtMatmulDescriptor matmul(HIPBLAS_COMPUTE_32F, HIP_R_32F);
+    hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
+    if (at::globalContext().allowTF32CuBLAS()) {
+      computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
+    }
+    HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F);
     matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSA, opa);
     matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSB, opb);

@@ -611,6 +615,11 @@ auto GetHipBlasLtTypeStringAndOps() {
   auto in_out_datatype = HipDataTypeFor<CT>();
   std::vector<hipblasLtMatmulHeuristicResult_t> heuristic_result;

+  hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
+  if (at::globalContext().allowTF32CuBLAS()) {
+    computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
+  }
+
   hipblasLtHandle_t handle;
   TORCH_HIPBLASLT_CHECK(hipblasLtCreate(&handle));
   TORCH_HIPBLASLT_CHECK(hipblaslt_ext::getAllAlgos(handle,

@@ -621,7 +630,7 @@ auto GetHipBlasLtTypeStringAndOps() {
                                                    b_datatype,
                                                    in_out_datatype,
                                                    in_out_datatype,
-                                                   HIPBLAS_COMPUTE_32F,
+                                                   computeType,
                                                    heuristic_result));
   TORCH_HIPBLASLT_CHECK(hipblasLtDestroy(handle));

@@ -141,6 +141,8 @@ class RocblasGemmOp : public Callable<GemmParams<T>> {

   TuningStatus Call(const GemmParams<T>* params) override {
     auto input_output_type = RocBlasDataTypeFor<T>();
+    if (at::globalContext().allowTF32CuBLAS() && input_output_type == rocblas_datatype_f32_r)
+      return FAIL; // no support for TF32 in rocBLAS
     auto compute_type = RocBlasComputeTypeFor<T>();
     auto h_a = DoCastForHalfOrBfloat16(params->alpha);
     auto h_b = DoCastForHalfOrBfloat16(params->beta);

@@ -207,6 +209,8 @@ class RocblasGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>>

   TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
     auto input_output_type = RocBlasDataTypeFor<T>();
+    if (at::globalContext().allowTF32CuBLAS() && input_output_type == rocblas_datatype_f32_r)
+      return FAIL; // no support for TF32 in rocBLAS
     auto compute_type = RocBlasComputeTypeFor<T>();
     auto h_a = DoCastForHalfOrBfloat16(params->alpha);
     auto h_b = DoCastForHalfOrBfloat16(params->beta);
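
All of the TunableOp backends above key off the same global TF32 switch: hipBLASLt upgrades its compute type to HIPBLAS_COMPUTE_32F_FAST_TF32, while the rocBLAS candidates report FAIL and drop out of tuning. That switch is the one users flip through the ATen context; a minimal sketch:

    #include <ATen/Context.h>

    // Opt in to TF32 for FP32 GEMMs, which the hunks above then honor when
    // selecting hipBLASLt compute types and filtering rocBLAS candidates.
    void enable_tf32_matmul() {
      at::globalContext().setAllowTF32CuBLAS(true);
    }
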
@@ -12,7 +12,7 @@
 namespace at::functorch {

 template <typename Func>
-std::tuple<Tensor, std::optional<int64_t>,Tensor, std::optional<int64_t>>
+static std::tuple<Tensor, std::optional<int64_t>,Tensor, std::optional<int64_t>>
 max_pool_with_indices_batch_rule_helper(
     const Tensor& self, std::optional<int64_t> self_bdim,
     IntArrayRef kernel_size, IntArrayRef stride,

@@ -20,7 +20,7 @@
 namespace at::functorch {

 template <typename F, F Func, typename... ExtraArgs>
-Tensor random_batching_rule(SymIntArrayRef shape, ExtraArgs... extra_args) {
+static Tensor random_batching_rule(SymIntArrayRef shape, ExtraArgs... extra_args) {
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode);
   auto maybe_layer = maybeCurrentDynamicLayer();
   TORCH_INTERNAL_ASSERT(maybe_layer.has_value());

@@ -37,7 +37,7 @@ Tensor random_batching_rule(SymIntArrayRef shape, ExtraArgs... extra_args) {
 }

 template <typename F, F Func, typename... ExtraArgs>
-Tensor& random_inplace_batching_rule(Tensor& self, ExtraArgs... extra_args) {
+static Tensor& random_inplace_batching_rule(Tensor& self, ExtraArgs... extra_args) {
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode);
   auto maybe_layer = maybeCurrentDynamicLayer();
   TORCH_INTERNAL_ASSERT(maybe_layer.has_value());

@@ -108,7 +108,7 @@ static Tensor& bernoulli_inplace_Tensor_batching_rule(Tensor& self, const Tensor
 }

 template <typename F, F Func, typename... ExtraArgs>
-Tensor randperm_batching_rule(int64_t n, ExtraArgs... extra_args) {
+static Tensor randperm_batching_rule(int64_t n, ExtraArgs... extra_args) {
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode);
   auto maybe_layer = maybeCurrentDynamicLayer();
   auto const batch_size = maybe_layer->batchSize();

@@ -127,7 +127,7 @@ Tensor randperm_batching_rule(int64_t n, ExtraArgs... extra_args) {
 }

 template <typename F, F Func, typename... ExtraArgs>
-Tensor unary_pointwise_random_batch_rule(const Tensor& tensor, ExtraArgs... extra_args) {
+static Tensor unary_pointwise_random_batch_rule(const Tensor& tensor, ExtraArgs... extra_args) {
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode);
   auto maybe_layer = maybeCurrentDynamicLayer();
   const auto cur_level = maybe_layer->layerId();

@@ -153,7 +153,7 @@ Tensor unary_pointwise_random_batch_rule(const Tensor& tensor, ExtraArgs... extr
 }

 template<typename F, F Func, typename... ExtraArgs>
-Tensor tensor_like_random_batch_rule(const Tensor& self, ExtraArgs... extra_args) {
+static Tensor tensor_like_random_batch_rule(const Tensor& self, ExtraArgs... extra_args) {
   c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode);
   auto maybe_layer = maybeCurrentDynamicLayer();
   const auto cur_level = maybe_layer->layerId();

@@ -272,7 +272,7 @@ struct RandomBatchRuleHelper<F, Func, typelist<T1, T...>> {
 };

 template <typename F, F Func, typename... T>
-Tensor rand_int_wrapper(SymIntArrayRef shape, c10::SymInt high, T... extra_args) {
+static Tensor rand_int_wrapper(SymIntArrayRef shape, c10::SymInt high, T... extra_args) {
   return Func(high, shape, std::forward<T>(extra_args)...);
 }

@@ -299,7 +299,7 @@ struct RandIntBatchRuleHelper<F, Func, typelist<T1, T2, T...>> {
 };

 template <typename F, F Func, typename T0, typename T1, typename... T>
-Tensor rand_int_low_wrapper(SymIntArrayRef shape, T0 scalar0, T1 scalar1, T... extra_args) {
+static Tensor rand_int_low_wrapper(SymIntArrayRef shape, T0 scalar0, T1 scalar1, T... extra_args) {
   return Func(scalar0, scalar1, shape, std::forward<T>(extra_args)...);
 }

@@ -346,7 +346,7 @@ struct NormalPointwiseBatchRule<F, Func, typelist<A0, T...>> {
 };

 template<typename F, F Func, typename... T>
-Tensor normal_wrapper(const Tensor& tensor, double scalar, T... extra_args) {
+static Tensor normal_wrapper(const Tensor& tensor, double scalar, T... extra_args) {
   return Func(scalar, tensor, extra_args...);
 }
@@ -19,7 +19,7 @@

 namespace at::functorch {

-bool kVmapFallbackWarningEnabled = true;
+static bool kVmapFallbackWarningEnabled = true;

 bool isVmapFallbackWarningEnabled() {
   return kVmapFallbackWarningEnabled;

@@ -29,7 +29,7 @@ void setVmapFallbackWarningEnabled(bool enabled) {
   kVmapFallbackWarningEnabled = enabled;
 }

-bool kVmapFallbackEnabled = true;
+static bool kVmapFallbackEnabled = true;

 bool isVmapFallbackEnabled() {
   return kVmapFallbackEnabled;
@@ -322,6 +322,24 @@ void gemm(
     const float beta,
     at::BFloat16 *c, int64_t ldc) {
   internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc);
+#if AT_MKLDNN_ENABLED()
+#ifdef __aarch64__
+  // MKLDNN also supports ARM for bf16, and the bypass is only
+  // currently intended for x86/x86_64.
+  const bool use_bf16_gemv_trans = false;
+#elif defined(__powerpc__)
+  const bool use_bf16_gemv_trans = false;
+#else
+  const bool bf16_gemv_trans_would_be_faster = cpuinfo_initialize() &&
+      !cpuinfo_has_x86_avx512bf16();
+  const bool use_bf16_gemv_trans = bf16_gemv_trans_would_be_faster &&
+      transa == TransposeType::Transpose &&
+      transb == TransposeType::NoTranspose && n == 1 && alpha == 1.0;
+#endif
+  if (!use_bf16_gemv_trans && mkldnn_bf16_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) {
+    return;
+  }
+#endif
 #if AT_BUILD_WITH_BLAS() && defined(BLAS_HAS_SBGEMM)
   if (use_blas_gemm(transa, transb, m, n, k, lda, ldb, ldc)) {
     int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc;

@@ -342,24 +360,6 @@ void gemm(
     }
     return;
   }
-#endif
-#if AT_MKLDNN_ENABLED()
-#ifdef __aarch64__
-  // MKLDNN also supports ARM for bf16, and the bypass is only
-  // currently intended for x86/x86_64.
-  const bool use_bf16_gemv_trans = false;
-#elif defined(__powerpc__)
-  const bool use_bf16_gemv_trans = false;
-#else
-  const bool bf16_gemv_trans_would_be_faster = cpuinfo_initialize() &&
-      !cpuinfo_has_x86_avx512bf16();
-  const bool use_bf16_gemv_trans = bf16_gemv_trans_would_be_faster &&
-      transa == TransposeType::Transpose &&
-      transb == TransposeType::NoTranspose && n == 1 && alpha == 1.0;
-#endif
-  if (!use_bf16_gemv_trans && mkldnn_bf16_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) {
-    return;
-  }
 #endif
   gemm_stub(
       at::kCPU, at::kBFloat16,
@@ -3610,11 +3610,11 @@ Tensor& transpose_(Tensor& self, int64_t dim0, int64_t dim1) {
     return at::_mkldnn_transpose_(self, dim0, dim1);
   }

-  DimVector sizes(self.sizes().begin(), self.sizes().end());
-  DimVector strides(self.strides().begin(), self.strides().end());
-  std::swap(strides[dim0], strides[dim1]);
-  std::swap(sizes[dim0], sizes[dim1]);
-  self.as_strided_(sizes, strides);
+  SymDimVector sizes(self.sym_sizes().begin(), self.sym_sizes().end());
+  std::swap(sizes[dim0], sizes[dim1]);
+  SymDimVector strides(self.sym_strides().begin(), self.sym_strides().end());
+  std::swap(strides[dim0], strides[dim1]);
+  auto result = self.as_strided__symint(std::move(sizes), std::move(strides));
   return self;
 }

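
The hunk above switches transpose_ from concrete DimVector metadata to SymInt-aware metadata, so the in-place transpose also works under symbolic shapes. The metadata permutation itself is unchanged: transposing only swaps sizes and strides. A worked example with concrete values:

    #include <algorithm>
    #include <array>
    #include <cstdio>

    int main() {
      // A contiguous 2x3 tensor has sizes {2, 3} and strides {3, 1}.
      std::array<long, 2> sizes{2, 3}, strides{3, 1};
      std::swap(sizes[0], sizes[1]);
      std::swap(strides[0], strides[1]);
      // transpose(0, 1) yields sizes {3, 2} and strides {1, 3}, exactly what
      // the two std::swap calls arrange before as_strided__symint re-stamps
      // the view metadata.
      std::printf("sizes {%ld,%ld} strides {%ld,%ld}\n",
                  sizes[0], sizes[1], strides[0], strides[1]);
    }
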
@@ -832,9 +832,9 @@ void hardswish_backward_kernel(TensorIterator& iter) {
     cpu_kernel_vec(
         iter,
         [&](scalar_t grad_val, scalar_t self_val) -> scalar_t {
-          if (float(self_val) < neg_three) {
+          if (float(self_val) <= neg_three) {
             return zero;
-          } else if (float(self_val) <= three) {
+          } else if (float(self_val) < three) {
             return float(grad_val) * ((float(self_val) / three) + one_half);
           } else {
             return grad_val;

@@ -847,19 +847,19 @@ void hardswish_backward_kernel(TensorIterator& iter) {
               Vec::blendv(
                   grad_val0 * ((self_val0 / kThreeVec) + kOneHalfVec),
                   grad_val0,
-                  self_val0 > kThreeVec
+                  self_val0 >= kThreeVec
               ),
               kZeroVec,
-              self_val0 < kNegThreeVec
+              self_val0 <= kNegThreeVec
           );
           self_val1 = Vec::blendv(
               Vec::blendv(
                   grad_val1 * ((self_val1 / kThreeVec) + kOneHalfVec),
                   grad_val1,
-                  self_val1 > kThreeVec
+                  self_val1 >= kThreeVec
               ),
               kZeroVec,
-              self_val1 < kNegThreeVec
+              self_val1 <= kNegThreeVec
           );
           return convert_from_float<scalar_t>(self_val0, self_val1);
         });

@@ -878,9 +878,9 @@ void hardswish_backward_kernel(TensorIterator& iter) {
     cpu_kernel_vec(
         iter,
         [&](scalar_t grad_val, scalar_t self_val) {
-          if (self_val < neg_three) {
+          if (self_val <= neg_three) {
             return zero;
-          } else if (self_val <= three) {
+          } else if (self_val < three) {
             return grad_val * ((self_val / three) + one_half);
           } else {
             return grad_val;

@@ -891,10 +891,10 @@ void hardswish_backward_kernel(TensorIterator& iter) {
           Vec::blendv(
               grad_val * ((self_val / kThreeVec) + kOneHalfVec),
               grad_val,
-              self_val > kThreeVec
+              self_val >= kThreeVec
           ),
           kZeroVec,
-          self_val < kNegThreeVec
+          self_val <= kNegThreeVec
           );
         }
     );
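
The comparison changes above move the boundary points x = ±3 into the saturated branches, so the scalar and vectorized paths agree on them. As a reference, hardswish(x) = x * clamp(x + 3, 0, 6) / 6, whose derivative is 0 for x <= -3, x/3 + 1/2 on the interior (since d/dx [x(x+3)/6] = x/3 + 1/2), and 1 for x >= 3. A scalar sketch of the convention now used by these kernels:

    // Scalar reference for the hardswish backward boundary handling above.
    float hardswish_backward_ref(float grad, float x) {
      if (x <= -3.0f) return 0.0f;        // saturated low region (includes -3)
      if (x >= 3.0f) return grad;         // identity region (includes +3)
      return grad * (x / 3.0f + 0.5f);    // interior region
    }
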
@@ -1,5 +1,12 @@
 #pragma once

+// On Windows, math.h needs to be included with _USE_MATH_DEFINES defined to
+// access constants such as M_SQRT2 and M_2_SQRTPI.
+#ifdef _WIN32
+#define _USE_MATH_DEFINES
+#include <cmath>
+#endif // _WIN32
+
 #include <ATen/cpu/vec/vec.h>
 #include <c10/util/BFloat16.h> // For c10::is_reduced_floating_point_v.

@@ -45,9 +45,9 @@ void hardswish_backward_kernel(TensorIterator& iter) {
       [zero, three, neg_three, one_half]GPU_LAMBDA(scalar_t grad_val_, scalar_t self_val_) -> scalar_t {
         opmath_t grad_val = static_cast<opmath_t>(grad_val_);
         opmath_t self_val = static_cast<opmath_t>(self_val_);
-        if (self_val < neg_three) {
+        if (self_val <= neg_three) {
           return zero;
-        } else if (self_val <= three) {
+        } else if (self_val < three) {
           return grad_val * ((self_val / three) + one_half);
         } else {
           return grad_val;
@@ -51,6 +51,23 @@

 namespace at::native {

+#ifdef USE_ROCM
+// Custom configuration for vectorized elementwise kernel
+// with template instantiation.
+namespace vectorized_templated_config {
+constexpr int num_threads() {
+  return 512;
+}
+
+constexpr int elems_per_thread() {
+  return 32;
+}
+
+constexpr int block_work_size() {
+  return elems_per_thread() * num_threads();
+}
+} // namespace vectorized_templated_config
+#endif
+
 template <typename args_t, size_t... Is>
 constexpr auto sum_of_sizes(args_t args, std::index_sequence<Is...>) {
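
With this configuration each ROCm block covers num_threads() * elems_per_thread() = 512 * 32 = 16384 elements, and the launcher added below sizes its grid as ceil(N / block_work_size()). A small sketch of that arithmetic:

    #include <cstdint>

    constexpr int64_t kBlockWorkSize = 512 * 32;  // num_threads() * elems_per_thread()

    // Number of blocks needed to cover N elements, matching the grid
    // computation in launch_vectorized_templated_kernel below.
    constexpr int64_t grid_for(int64_t n) {
      return (n + kBlockWorkSize - 1) / kBlockWorkSize;
    }

    static_assert(grid_for(16384) == 1);
    static_assert(grid_for(16385) == 2);
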
@@ -255,6 +272,139 @@ static inline void launch_vectorized_kernel(
   }
 }

+#ifdef USE_ROCM
+template <
+    int vec_size,
+    typename func_t,
+    typename array_t,
+    typename inp_calc_t,
+    typename out_calc_t,
+    typename loader_t,
+    typename storer_t,
+    typename OutputType,
+    typename... InputTypes>
+C10_LAUNCH_BOUNDS_1(vectorized_templated_config::num_threads())
+__global__ void vectorized_templated_elementwise_kernel(
+    int N,
+    func_t f,
+    array_t data,
+    inp_calc_t inp_calc,
+    out_calc_t out_calc,
+    loader_t loader,
+    storer_t storer) {
+  int remaining =
+      N - vectorized_templated_config::block_work_size() * blockIdx.x;
+  if (remaining <
+      vectorized_templated_config::block_work_size()) { // if this block handles
+                                                        // the reminder,
+                                                        // just do a naive unrolled loop
+    auto policy = memory::policies::unroll_base<
+        vectorized_templated_config::num_threads(),
+        array_t,
+        inp_calc_t,
+        out_calc_t,
+        loader_t,
+        storer_t,
+        vectorized_templated_config::elems_per_thread()>(
+        data, remaining, inp_calc, out_calc, loader, storer);
+    elementwise_kernel_helper(f, policy);
+  } else { // if this block has a full `block_work_size` data to handle, use
+           // vectorized memory access
+    elementwise_kernel_helper(
+        f,
+        memory::policies::vectorized_templated<
+            vec_size,
+            array_t,
+            vectorized_templated_config::elems_per_thread(),
+            vectorized_templated_config::num_threads(),
+            OutputType,
+            InputTypes...>(data));
+  }
+}
+
+// This function assume trivial 1d and supports template specialization
+// to avoid dynamic casting.
+// Input vectorization size is based on runtime information, i.e.
+// the actual data types of the input and output tensor and cannot
+// be determined using the functor type, as in regular non-templated
+// vectorized kernels. The caller is in charge of selecting the correct input
+// vectorization length.
+template <
+    typename func_t,
+    typename array_t,
+    typename inp_calc_t,
+    typename out_calc_t,
+    typename loader_t,
+    typename storer_t,
+    typename OutputType,
+    typename... InputTypes>
+static inline void launch_vectorized_templated_kernel(
+    int64_t N,
+    const func_t& f,
+    array_t data,
+    inp_calc_t ic,
+    out_calc_t oc,
+    loader_t l,
+    storer_t s) {
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
+  using traits = function_traits<func_t>;
+  int64_t grid = (N + vectorized_templated_config::block_work_size() - 1) /
+      vectorized_templated_config::block_work_size();
+  auto stream = at::cuda::getCurrentCUDAStream();
+  int vec_size = memory::can_vectorize_up_to<func_t>(data);
+  switch (vec_size) {
+    case 8:
+      vectorized_templated_elementwise_kernel<
+          8,
+          func_t,
+          array_t,
+          inp_calc_t,
+          out_calc_t,
+          loader_t,
+          storer_t,
+          OutputType,
+          InputTypes...>
+          <<<grid, vectorized_templated_config::num_threads(), 0, stream>>>(
+              N, f, data, ic, oc, l, s);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+      break;
+    case 4:
+      vectorized_templated_elementwise_kernel<
+          4,
+          func_t,
+          array_t,
+          inp_calc_t,
+          out_calc_t,
+          loader_t,
+          storer_t,
+          OutputType,
+          InputTypes...>
+          <<<grid, vectorized_templated_config::num_threads(), 0, stream>>>(
+              N, f, data, ic, oc, l, s);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+      break;
+    case 2:
+      vectorized_templated_elementwise_kernel<
+          2,
+          func_t,
+          array_t,
+          inp_calc_t,
+          out_calc_t,
+          loader_t,
+          storer_t,
+          OutputType,
+          InputTypes...>
+          <<<grid, vectorized_templated_config::num_threads(), 0, stream>>>(
+              N, f, data, ic, oc, l, s);
+      C10_CUDA_KERNEL_LAUNCH_CHECK();
+      break;
+    default:
+      // vector size 1 is not handled as part of vectorize_templated kernel
+      TORCH_INTERNAL_ASSERT(false, "Unexpected vectorization size");
+  }
+}
+#endif
+
 template <
     typename func_t,
     typename array_t,
@@ -392,6 +542,46 @@ void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
   });
 }

+#ifdef USE_ROCM
+namespace {
+template <typename TupleLike, size_t arity, size_t arg_num = 0>
+struct check_types {
+  constexpr static inline bool check() {
+    if constexpr (arity != 2)
+      return false;
+    if constexpr (arg_num == 0) {
+      using SelectedType = std::tuple_element_t<arg_num, TupleLike>;
+      if constexpr (std::is_same_v<float, SelectedType>)
+        return check_types<TupleLike, arity, arg_num + 1>::check();
+    } else if constexpr (arg_num == 1) {
+      using SelectedType2 = std::tuple_element_t<arg_num, TupleLike>;
+      if constexpr (std::is_same_v<float, SelectedType2>)
+        return check_types<TupleLike, arity, arg_num + 1>::check();
+    }
+    return false;
+  }
+};
+
+// Bottom case: if we got this far, assume correct type matching except
+// when there are no arguments (arity == 0).
+template <typename TupleLike, size_t arity>
+struct check_types<TupleLike, arity, arity> {
+  constexpr static inline bool check() {
+    if constexpr (arity != 0)
+      return true;
+    return false;
+  }
+};
+
+template <typename TupleLike>
+struct check_types<TupleLike, 0, 0> {
+  constexpr static inline bool check() {
+    return false;
+  }
+};
+} // namespace
+#endif
+
 template <typename func_t>
 void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
   if (!needs_dynamic_casting<func_t>::check(iter)) {
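
check_types walks the functor's argument tuple at compile time and only reports true for the one shape the specialization below supports: arity 2 with both compute arguments being float. Illustrative probes (the tuples are hypothetical; the trait itself is the one defined above):

    #include <tuple>

    // Matches the supported shape: arity 2, both compute args float.
    static_assert(check_types<std::tuple<float, float>, 2>::check());
    // Rejected: wrong arity, or a non-float argument.
    static_assert(!check_types<std::tuple<float, float, float>, 3>::check());
    static_assert(!check_types<std::tuple<float, double>, 2>::check());
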
@@ -416,6 +606,45 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {

   if (contiguous) {
 #ifdef USE_ROCM
+    // Attempt to call specialized vectorized elementwise kernel
+    // that enables interleaving.
+    using float_map = c10::CppTypeToScalarType<float>;
+    using bfloat16_map = c10::CppTypeToScalarType<BFloat16>;
+    if (iter.ninputs() == 2 && iter.input_dtype(0) == float_map::value &&
+        iter.input_dtype(1) == bfloat16_map::value &&
+        memory::can_vectorize_up_to<func_t>(data) > 1) {
+      // constexpr to reduce the amount of kernels (empty) generated for
+      // vectorized templated elementwise and limit which functors are actually
+      // applied to the load and store at compile time.
+      using func_tuple = typename traits::ArgsTuple;
+      if constexpr (
+          std::is_same_v<float, arg0_t> && traits::arity == 2 &&
+          check_types<func_tuple, traits::arity, 0>::check()) {
+        auto input_offset_calculator = TrivialOffsetCalculator<traits::arity>();
+        auto output_offset_calculator = TrivialOffsetCalculator<1>();
+        auto loader = memory::LoadWithCast<traits::arity>(iter);
+        auto storer = memory::StoreWithCast<1>(iter);
+        launch_vectorized_templated_kernel<
+            func_t,
+            std::array<char*, ntensors>,
+            decltype(input_offset_calculator),
+            decltype(output_offset_calculator),
+            decltype(loader),
+            decltype(storer),
+            float,
+            float,
+            BFloat16>(
+            numel,
+            f,
+            data,
+            input_offset_calculator,
+            output_offset_calculator,
+            loader,
+            storer);
+        return;
+      }
+    }
+
     std::array<ScalarType, ntensors> dtypes;
     auto inner_strides = iter.get_inner_strides();
     std::array<int, ntensors> strides;
@@ -67,6 +67,28 @@ struct vectorized_load_helper {
   }
 };

+#ifdef USE_ROCM
+// Templated version of vectorized load helper.
+// It can be used on heterogeneous input tensor element types.
+template <int arg_index>
+struct vectorized_templated_load_helper {
+  template <typename args_t, typename policy_t>
+  static __device__ void apply(policy_t& self, args_t* args, int idx) {
+    using arg_t = std::tuple_element_t<arg_index, args_t>;
+    // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we
+    // need a +1 offset to get the input
+
+    // Delay pointer arithmetic to the policy loader where we know the actual
+    // type of the current argument.
+    char* ptr = (self.data[arg_index + 1]);
+    auto args_accessor = [&args] __device__(int thread_unroll_idx) -> arg_t& {
+      return std::get<arg_index>(args[thread_unroll_idx]);
+    };
+    self.template load_single_arg<arg_index>(args_accessor, ptr, idx);
+  }
+};
+#endif
+
 template<int arg_index>
 struct unroll_load_helper {
   template <typename args_t, typename policy_t, typename offset_t, typename loader_t>
@@ -181,9 +203,16 @@ __device__ aligned_vector<bool, vec_size> load_vector(const bool *base_ptr, uint

 namespace policies {

-template<typename data_t, typename inp_calc_t, typename out_calc_t, typename loader_t, typename storer_t, int elems_per_thread, int num_outputs=1>
-struct unroll {
+template <
+    int num_threads,
+    typename data_t,
+    typename inp_calc_t,
+    typename out_calc_t,
+    typename loader_t,
+    typename storer_t,
+    int elems_per_thread,
+    int num_outputs = 1>
+struct unroll_base {
   data_t data;
   int remaining;
   inp_calc_t input_offset_calculator;

@@ -191,12 +220,24 @@ struct unroll {
   loader_t loader;
   storer_t storer;
   static constexpr int tws = elems_per_thread;
+  static constexpr int block_work_size = elems_per_thread * num_threads;

-  __device__ unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s):
-    data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {}
+  __device__ unroll_base(
+      data_t data,
+      int remaining,
+      inp_calc_t ic,
+      out_calc_t oc,
+      loader_t l,
+      storer_t s)
+      : data(data),
+        remaining(remaining),
+        input_offset_calculator(ic),
+        output_offset_calculator(oc),
+        loader(l),
+        storer(s) {}

   __device__ inline bool check_inbounds(int thread_work_elem) {
-    return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining);
+    return ((int)(threadIdx.x + thread_work_elem * num_threads) < remaining);
   }

   template<typename args_t>

@@ -205,13 +246,13 @@ struct unroll {
     int thread_idx = threadIdx.x;
     #pragma unroll
     for (int i = 0; i < elems_per_thread; i++) {
-      if (thread_idx >= remaining) {
-        return;
+      if (thread_idx < remaining) {
+        int linear_idx = thread_idx + block_work_size * idx;
+        auto offset = input_offset_calculator.get(linear_idx);
+        detail::static_unroll<detail::unroll_load_helper, arity>::with_args(
+            *this, args, offset, loader, i, num_outputs);
+        thread_idx += num_threads;
       }
-      int linear_idx = thread_idx + elems_per_thread * num_threads() * idx;
-      auto offset = input_offset_calculator.get(linear_idx);
-      detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, loader, i, num_outputs);
-      thread_idx += num_threads();
     }
   }

@@ -220,22 +261,36 @@ struct unroll {
     int thread_idx = threadIdx.x;
     #pragma unroll
     for (int i = 0; i < elems_per_thread; i++) {
-      if (thread_idx >= remaining) {
-        return;
+      if (thread_idx < remaining) {
+        int linear_idx = thread_idx + block_work_size * idx;
+        int offset = output_offset_calculator.get(linear_idx)[0];
+        storer.store(from[i], data[0], offset);
+        thread_idx += num_threads;
       }
-      int linear_idx = thread_idx + elems_per_thread * num_threads() * idx;
-      int offset = output_offset_calculator.get(linear_idx)[0];
-      storer.store(from[i], data[0], offset);
-      thread_idx += num_threads();
     }
   }
 };

-// Assumption:
-// all tensors are contiguous, that is: stride == sizeof(type) for all tensors
-// Note:
-// Functions in vectorized policy does not do boundary check. It assumes the whole block
-// has its job to do. So the reminders should be handled by the caller manually.
+// Utility type for all users of unroll that extract the num_threads value from
+// the caller scope.
+template <
+    typename data_t,
+    typename inp_calc_t,
+    typename out_calc_t,
+    typename loader_t,
+    typename storer_t,
+    int elems_per_thread,
+    int num_outputs = 1>
+using unroll = unroll_base<
+    num_threads(),
+    data_t,
+    inp_calc_t,
+    out_calc_t,
+    loader_t,
+    storer_t,
+    elems_per_thread,
+    num_outputs>;

 template <int vec_size, typename data_t, int elems_per_thread> // vec_size: number of scalars, can be 1, 2, or 4.
 struct vectorized {

@@ -289,6 +344,86 @@ struct vectorized {
   }
 };

+#ifdef USE_ROCM
+// This is similar to the vectorized policy above, but this one supports
+// heterogeneous input tensor types as templated parameters.
+// Its use should be limited to frequently used heterogeneous data types,
+// as each instantiation will generate a separate kernel, leading to code
+// bloat if applied to all combinations supported in PyTorch. Assumption: all
+// tensors are contiguous, that is: stride == sizeof(type) for all tensors.
+template <
+    int vec_size,
+    typename data_t,
+    int elems_per_thread,
+    int num_threads,
+    typename CastToT,
+    typename... CastFromTs> // vec_size: number of scalars, can be 1, 2, or 4.
+struct vectorized_templated {
+  static_assert(
+      elems_per_thread % vec_size == 0,
+      "The workload per thread must be a multiple of vec_size");
+  static constexpr int loop_size = elems_per_thread / vec_size;
+  static constexpr int tws = elems_per_thread;
+  static constexpr int block_work_size = elems_per_thread * num_threads;
+  data_t data;
+
+  __device__ vectorized_templated(data_t data) : data(data) {}
+
+  __device__ inline constexpr bool check_inbounds(int thread_work_elem) {
+    return true;
+  }
+
+  template <int arg_index, typename accessor_t>
+  __device__ inline void load_single_arg(accessor_t to, char* ptr, int idx) {
+    // Extract the arg_index-th input tensor element type from the
+    // variadic template argument.
+    using CastFromT =
+        std::tuple_element_t<arg_index, std::tuple<CastFromTs...>>;
+    // Delayed pointer arithmetic from the caller: this is the place
+    // where we know the type of the argument.
+    CastFromT* block_ptr =
+        reinterpret_cast<CastFromT*>(ptr) + block_work_size * idx;
+    int thread_idx = threadIdx.x;
+#pragma unroll
+    for (int i = 0; i < loop_size; i++) {
+      int index = thread_idx + i * num_threads;
+      auto v = load_vector<vec_size>(block_ptr, index);
+#pragma unroll
+      for (int j = 0; j < vec_size; j++) {
+        to(vec_size * i + j) = c10::convert<CastToT>(v.val[j]);
+      }
+    }
+  }
+
+  template <typename args_t>
+  __device__ inline void load(args_t* args, int idx) {
+    constexpr int arity = std::tuple_size<args_t>::value;
+    detail::static_unroll<detail::vectorized_templated_load_helper, arity>::
+        with_args(*this, args, idx);
+  }
+
+  // Assume for now that from (temporary array per thread) is of the same
+  // type as to (destination tensor), which is the case for
+  // float(float,bfloat16) and functor add on float(float,float).
+  template <typename scalar_t>
+  __device__ inline void store(scalar_t* from, int idx) {
+    using vec_t = aligned_vector<scalar_t, vec_size>;
+    scalar_t* to = reinterpret_cast<scalar_t*>(data[0]) + block_work_size * idx;
+    vec_t* to_ = reinterpret_cast<vec_t*>(to);
+    int thread_idx = threadIdx.x;
+#pragma unroll
+    for (int i = 0; i < loop_size; i++) {
+      int index = thread_idx + i * num_threads;
+      vec_t v;
+      for (int j = 0; j < vec_size; j++) {
+        v.val[j] = from[vec_size * i + j];
+      }
+      to_[index] = v;
+    }
+  }
+};
+#endif

 template <typename data_t, typename inp_calc_t, typename out_calc_t, int num_outputs>
 struct multi_outputs_unroll {
 //multi_outputs_unroll struct members and check_inbounds and load methods are copy-pasted from the unroll struct
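For orientation, a minimal sketch of how a policy like this might be instantiated. The alias names and the float(float, bfloat16) combination are illustrative assumptions, not part of the diff:

    // Hypothetical instantiation (sketch): one output plus two inputs, where the
    // inputs arrive as float and bfloat16 and are converted to a common float
    // compute type. data_t follows the loader convention above: an array of
    // char* with the output pointer at index 0.
    using data_t = at::detail::Array<char*, 3>;
    using policy_t = vectorized_templated<
        /*vec_size=*/4,
        data_t,
        /*elems_per_thread=*/8,   // must be a multiple of vec_size
        /*num_threads=*/256,
        /*CastToT=*/float,
        /*CastFromTs...=*/float, c10::BFloat16>;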
@@ -89,6 +89,20 @@ struct SoftMaxBackwardEpilogue {
   const AccumT sum;
 };

+template<typename T, typename AccumT, typename OutT>
+struct SoftMaxForwardWithMulEpilogue {
+  __device__ __forceinline__ SoftMaxForwardWithMulEpilogue(AccumT max_input, AccumT sum)
+    : max_input(max_input)
+    , sum(sum) {}
+
+  __device__ __forceinline__ OutT operator()(T input) const {
+    return static_cast<OutT>(__expf(input - max_input) * sum);
+  }
+
+  const AccumT max_input;
+  const AccumT sum;
+};
+
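The `sum` this epilogue receives is already the reciprocal of the exp-sum (see blockReduceWarpInverse below), so each output element costs one fast-math exponential and one multiply, with no division in the inner loop. A small host-side sketch (illustrative, not part of the diff) of the identity it relies on:

    #include <cassert>
    #include <cmath>

    int main() {
      const float x[3] = {1.f, 2.f, 3.f};
      const float max_input = 3.f;
      float s = 0.f;
      for (float v : x) s += std::exp(v - max_input);
      const float inv = 1.f / s;  // what the kernel caches once per block
      for (float v : x) {
        // divide-by-sum and multiply-by-reciprocal agree up to rounding
        assert(std::abs(std::exp(v - max_input) / s -
                        std::exp(v - max_input) * inv) < 1e-6f);
      }
    }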
@@ -387,6 +401,19 @@ struct SumExpFloat
   const AccumT max_k;
 };

+template<typename T, typename AccumT>
+struct SumExpfFloat
+{
+  __device__ __forceinline__ SumExpfFloat(AccumT v)
+    : max_k(v) {}
+
+  __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
+    return sum + __expf(v - max_k);
+  }
+
+  const AccumT max_k;
+};
+
 template <template<typename> class Reduction, typename AccumT>
 __device__ __forceinline__ AccumT
 blockReduce(AccumT* smem, AccumT val,
@@ -449,6 +476,19 @@ T blockReduceWarp(T* smem_cache, T value, const Reduction<T>& op, T defaultVal)
   return smem_cache[0];
 }

+template <template<typename> class Reduction, typename T>
+__device__ __forceinline__
+T blockReduceWarpInverse(T* smem_cache, T value, const Reduction<T>& op, T defaultVal)
+{
+  T result = cuda_utils::BlockReduce<T, Reduction<T>>(value, op, defaultVal, smem_cache);
+  if (threadIdx.x == 0) {
+    smem_cache[0] = 1 / result;
+  }
+  __syncthreads();
+  return smem_cache[0];
+}
+
 template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT, typename index_t=int>
 __device__ __forceinline__ AccumT
 ilpReduce(index_t shift,
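Note the design choice: thread 0 divides once and broadcasts the reciprocal through shared memory, so every consumer of the result can multiply instead of divide. An equivalent but slower formulation, sketched under the same assumptions for contrast (hypothetical helper, not part of the diff):

    template <template<typename> class Reduction, typename T>
    __device__ __forceinline__
    T blockReduceWarpInverseNaive(T* smem_cache, T value, const Reduction<T>& op, T defaultVal)
    {
      // Same block-wide reduction as above...
      T result = cuda_utils::BlockReduce<T, Reduction<T>>(value, op, defaultVal, smem_cache);
      if (threadIdx.x == 0) {
        smem_cache[0] = result;   // ...but broadcast the raw sum instead
      }
      __syncthreads();
      return 1 / smem_cache[0];   // one divide per thread, not one per block
    }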
@@ -664,6 +704,38 @@ WriteBpropResults(
   }
 }

+template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template <typename, typename, typename> class EpilogueWithMul>
+__global__ void
+cunn_SoftMaxForwardFast(outscalar_t *output, const scalar_t *input, int classes)
+{
+  extern __shared__ unsigned char smem[];
+  auto sdata = reinterpret_cast<accscalar_t*>(smem);
+
+  // each block handles a sample in the mini-batch
+  input += static_cast<int64_t>(blockIdx.x) * classes;
+  output += static_cast<int64_t>(blockIdx.x) * classes;
+
+  const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);
+
+  // find the max
+  accscalar_t threadMax = ilpReduce<MaxFloat, ILP, scalar_t, accscalar_t>(
+      shift, input, classes, MaxFloat<scalar_t, accscalar_t>(), -at::numeric_limits<accscalar_t>::max());
+  accscalar_t max_k = blockReduceWarp<Max, accscalar_t>(sdata, threadMax,
+      Max<accscalar_t>(), -at::numeric_limits<accscalar_t>::max());
+
+  // reduce all values
+  accscalar_t threadExp = ilpReduce<SumExpfFloat, ILP, scalar_t, accscalar_t>(
+      shift, input, classes, SumExpfFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
+  accscalar_t sumAll = blockReduceWarpInverse<Add, accscalar_t>(sdata, threadExp,
+      Add<accscalar_t>(), static_cast<accscalar_t>(0));
+
+  EpilogueWithMul<scalar_t, accscalar_t, outscalar_t> epilogue(max_k, sumAll);
+
+  for (int offset = threadIdx.x; offset < classes; offset += blockDim.x) {
+    output[offset] = epilogue(input[offset]);
+  }
+}
+
 template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template <typename, typename, typename> class Epilogue>
 __global__ void
 cunn_SoftMaxForward(outscalar_t *output, const scalar_t *input, int classes)
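A launch sketch for this scalar fast path, under the assumption (borne out by the host_softmax hunk further down) that one block serves one row and only the warp-reduction cache needs dynamic shared memory; all names here are illustrative:

    // Hypothetical launch: outer_size rows of dim_size floats; ILP = 4 for float
    // since sizeof(float4) / sizeof(float) == 4.
    dim3 grid(outer_size);
    dim3 block(512);
    size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(float);
    cunn_SoftMaxForwardFast<4, float, float, float, SoftMaxForwardWithMulEpilogue>
        <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);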
@@ -755,6 +827,68 @@ cunn_SoftMaxForwardReg(outscalar_t *output, const scalar_t *input, index_t class
   }
 }

+template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t,
+          template <typename, typename, typename> class EpilogueWithMul, typename index_t = int32_t>
+__global__ void
+cunn_SoftMaxForwardGmem(outscalar_t *output, const scalar_t *input, index_t classes)
+{
+  // Each thread block processes a sample in the batch
+  input += static_cast<int64_t>(blockIdx.x) * classes;
+  output += static_cast<int64_t>(blockIdx.x) * classes;
+
+  accscalar_t threadMax = -at::numeric_limits<accscalar_t>::max();
+  accscalar_t threadExp = static_cast<accscalar_t>(0);
+
+  // The first smem segment is used to cache input values and the last
+  // segment is used for thread block reductions
+  extern __shared__ unsigned char smem[];
+  auto smem_reduction_cache = reinterpret_cast<accscalar_t*>(smem);
+
+  using LoadT = at::native::memory::aligned_vector<scalar_t, ILP>;
+  const LoadT* const input_vec_ptr = reinterpret_cast<const LoadT*>(input);
+
+  // Do the first step in max calculation:
+  MaxFloat<scalar_t, accscalar_t> maxFunc;
+  for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) {
+    LoadT crnt_vec = input_vec_ptr[offset];
+#pragma unroll
+    for (int i = 0; i < ILP; ++i) {
+      threadMax = maxFunc(threadMax, crnt_vec.val[i]);
+    }
+  }
+
+  accscalar_t max_k = blockReduceWarp<Max, accscalar_t>(smem_reduction_cache, threadMax,
+      Max<accscalar_t>(), -at::numeric_limits<accscalar_t>::max());
+
+  // Do the second step in sum exp calculation:
+  SumExpfFloat<scalar_t, accscalar_t> sumExpFunc(max_k);
+  for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) {
+    LoadT crnt_vec = input_vec_ptr[offset];
+#pragma unroll
+    for (int i = 0; i < ILP; ++i) {
+      threadExp = sumExpFunc(threadExp, crnt_vec.val[i]);
+    }
+  }
+
+  accscalar_t sumAll = blockReduceWarpInverse<Add, accscalar_t>(smem_reduction_cache, threadExp,
+      Add<accscalar_t>(), static_cast<accscalar_t>(0));
+
+  EpilogueWithMul<scalar_t, accscalar_t, outscalar_t> epilogue(max_k, sumAll);
+
+  using StoreT = at::native::memory::aligned_vector<outscalar_t, ILP>;
+  StoreT* output_vec_ptr = reinterpret_cast<StoreT*>(output);
+  for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) {
+    LoadT crnt_vec = input_vec_ptr[offset];
+    StoreT out_vec;
+#pragma unroll
+    for (int i = 0; i < ILP; ++i) {
+      out_vec.val[i] = epilogue(crnt_vec.val[i]);
+    }
+    output_vec_ptr[offset] = out_vec;
+  }
+}
+
 template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t,
           template <typename, typename, typename> class Epilogue, typename index_t = int32_t>
 __global__ void
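The Gmem variant reads and writes whole aligned_vector packets, so it is only well-formed when every row is a whole number of ILP-wide packets; the host side guards this and falls back to the scalar kernel otherwise. A sketch of the guard (the real dispatch appears verbatim in the host_softmax hunk below):

    // dim_size % ILP == 0 also keeps each row's base pointer packet-aligned,
    // since consecutive rows are exactly dim_size elements apart.
    if (dim_size % ILP == 0) {
      // vectorized path: cunn_SoftMaxForwardGmem
    } else {
      // scalar path: cunn_SoftMaxForwardFast (handles the unaligned shift)
    }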
@@ -935,7 +1069,9 @@ cunn_SoftMaxBackwardSmem(scalar_t *gradInput, const outscalar_t *output, const o
   }
 }

-template<template<typename, typename, typename> class Epilogue, bool is_log_softmax>
+template<template<typename, typename, typename> class Epilogue,
+         template<typename, typename, typename> class EpilogueWithMul, bool is_log_softmax, bool use_fast_softmax>
 Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_to_float, const Tensor& output){
   if (half_to_float) {
     TORCH_CHECK(input_.scalar_type() == ScalarType::Half, "conversion is supported for Half type only");
@@ -977,66 +1113,78 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
     }
   } else {
     constexpr int ILP = sizeof(float4) / sizeof(scalar_t);
-    dim3 block = SoftMaxForward_getBlockSize(dim_size);
-    size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
-    auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
-      smem_reduction_sz) / sizeof(scalar_t);
-
-    bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
-    can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
-    can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
-    can_use_smem &= !(dim_size % ILP);
-
-    int32_t potential_reg_cnt = potential_register_count(dim_size, block.x);
-    if (potential_reg_cnt < 10) {
-      TORCH_INTERNAL_ASSERT(potential_reg_cnt > 0, "potential_reg_cnt for softmax with register should be greater than 0.");
-      switch (potential_reg_cnt) {
-      // TODO(Wenqin): try to investigate why we couldn't use a macro for the code
-      // below; on MSVS the macro way didn't seem to expand correctly.
-      case 1:
-        cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 1>
-          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-        break;
-      case 2:
-        cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 2>
-          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-        break;
-      case 3:
-        cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 3>
-          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-        break;
-      case 4:
-        cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 4>
-          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-        break;
-      case 5:
-        cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 5>
-          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-        break;
-      case 6:
-        cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 6>
-          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-        break;
-      case 7:
-        cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 7>
-          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-        break;
-      case 8:
-        cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 8>
-          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-        break;
-      case 9:
-        cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 9>
-          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-        break;
-      }
-    } else if (can_use_smem) {
-      size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz;
-      cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
-        <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size);
-    } else {
-      cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
-        <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-    }
+    if constexpr (use_fast_softmax) {
+      dim3 block(512);
+      size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
+      if (dim_size % ILP == 0) {
+        cunn_SoftMaxForwardGmem<ILP, scalar_t, accscalar_t, scalar_t, EpilogueWithMul>
+          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+      } else {
+        cunn_SoftMaxForwardFast<ILP, scalar_t, accscalar_t, scalar_t, EpilogueWithMul>
+          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+      }
+    } else {
+      dim3 block = SoftMaxForward_getBlockSize(dim_size);
+      size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
+      auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
+        smem_reduction_sz) / sizeof(scalar_t);
+
+      bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
+      can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
+      can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
+      can_use_smem &= !(dim_size % ILP);
+
+      int32_t potential_reg_cnt = potential_register_count(dim_size, block.x);
+      if (potential_reg_cnt < 10) {
+        TORCH_INTERNAL_ASSERT(potential_reg_cnt > 0, "potential_reg_cnt for softmax with register should be greater than 0.");
+        switch (potential_reg_cnt) {
+        // TODO(Wenqin): try to investigate why we couldn't use a macro for the code
+        // below; on MSVS the macro way didn't seem to expand correctly.
+        case 1:
+          cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 1>
+            <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+          break;
+        case 2:
+          cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 2>
+            <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+          break;
+        case 3:
+          cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 3>
+            <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+          break;
+        case 4:
+          cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 4>
+            <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+          break;
+        case 5:
+          cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 5>
+            <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+          break;
+        case 6:
+          cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 6>
+            <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+          break;
+        case 7:
+          cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 7>
+            <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+          break;
+        case 8:
+          cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 8>
+            <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+          break;
+        case 9:
+          cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 9>
+            <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+          break;
+        }
+      } else if (can_use_smem) {
+        size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz;
+        cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
+          <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size);
+      } else {
+        cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
+          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+      }
+    }
   }

   C10_CUDA_KERNEL_LAUNCH_CHECK();
@@ -1056,23 +1204,35 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
     }
   } else {
     constexpr int ILP = sizeof(float4) / sizeof(scalar_t);
-    dim3 block = SoftMaxForward_getBlockSize(dim_size);
-    size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
-    auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
-      smem_reduction_sz) / sizeof(scalar_t);
-
-    bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
-    can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
-    can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
-    can_use_smem &= !(dim_size % ILP);
-
-    if (can_use_smem) {
-      size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz;
-      cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue>
-        <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size);
-    } else {
-      cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue>
-        <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-    }
+    if constexpr (use_fast_softmax) {
+      dim3 block(512);
+      size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
+      if (dim_size % ILP == 0) {
+        cunn_SoftMaxForwardGmem<ILP, scalar_t, accscalar_t, accscalar_t, EpilogueWithMul>
+          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+      } else {
+        cunn_SoftMaxForwardFast<ILP, scalar_t, accscalar_t, accscalar_t, EpilogueWithMul>
+          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+      }
+    } else {
+      dim3 block = SoftMaxForward_getBlockSize(dim_size);
+      size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
+      auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
+        smem_reduction_sz) / sizeof(scalar_t);
+
+      bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
+      can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
+      can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
+      can_use_smem &= !(dim_size % ILP);
+
+      if (can_use_smem) {
+        size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz;
+        cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue>
+          <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size);
+      } else {
+        cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue>
+          <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+      }
+    }
   }

   C10_CUDA_KERNEL_LAUNCH_CHECK();
@@ -1252,7 +1412,7 @@ TORCH_IMPL_FUNC(log_softmax_cuda_out) (
   const int64_t dim,
   const bool half_to_float,
   const Tensor &output) {
-  host_softmax<LogSoftMaxForwardEpilogue,true>(input, dim, half_to_float, output);
+  host_softmax<LogSoftMaxForwardEpilogue, LogSoftMaxForwardEpilogue, true, false>(input, dim, half_to_float, output);
 }

 TORCH_IMPL_FUNC(log_softmax_backward_cuda_out) (
@@ -1276,7 +1436,11 @@ TORCH_IMPL_FUNC(softmax_cuda_out) (
   const int64_t dim,
   const bool half_to_float,
   const Tensor &output) {
-  host_softmax<SoftMaxForwardEpilogue,false>(input, dim, half_to_float, output);
+#if defined(USE_ROCM)
+  host_softmax<SoftMaxForwardEpilogue, SoftMaxForwardWithMulEpilogue, false, true>(input, dim, half_to_float, output);
+#else
+  host_softmax<SoftMaxForwardEpilogue, SoftMaxForwardWithMulEpilogue, false, false>(input, dim, half_to_float, output);
+#endif
 }

 TORCH_IMPL_FUNC(softmax_backward_cuda_out)
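Because use_fast_softmax is a template parameter, the choice between the fused __expf path and the existing kernels is made entirely at compile time: `if constexpr` in host_softmax discards the untaken branch, so CUDA builds carry no trace of the ROCm fast path and vice versa. A reduced sketch of the mechanism (illustrative, not part of the diff):

    template <bool use_fast_softmax>
    void launch_softmax_example() {
      if constexpr (use_fast_softmax) {
        // ROCm: EpilogueWithMul kernels (Gmem/Fast), multiply by cached 1/sum
      } else {
        // CUDA: existing register/shared-memory/global kernels, divide by sum
      }
    }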
@@ -469,11 +469,315 @@ void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
   }
 }

+void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
+  // If any of the shapes can't be tiled, we must use padding.
+  bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
+  // Dispatch to best implementation.
+  // TODO add more configurations. Optimize.
+
+  bool transa_ = std::tolower(transa) != 'n';
+  bool transb_ = std::tolower(transb) != 'n';
+
+  if (use_padding) {
+    if (transa_ && transb_) { // col, col
+      gemm_impl_wmma<at::BFloat16,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          true, true, true>(CUDABLAS_GEMM_ARGS(at::BFloat16));
+    } else if (transa_ && !transb_) { // row, col
+      gemm_impl_wmma<at::BFloat16,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          true, true, false>(CUDABLAS_GEMM_ARGS(at::BFloat16));
+    } else if (!transa_ && transb_) { // col, row
+      gemm_impl_wmma<at::BFloat16,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          true, false, true>(CUDABLAS_GEMM_ARGS(at::BFloat16));
+    } else if (!transa_ && !transb_) { // row, row
+      gemm_impl_wmma<at::BFloat16,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          true, false, false>(CUDABLAS_GEMM_ARGS(at::BFloat16));
+    } else {
+      TORCH_CHECK(false, "unreachable");
+    }
+  } else {
+    if (transa_ && transb_) { // col, col
+      gemm_impl_wmma<at::BFloat16,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          false, true, true>(CUDABLAS_GEMM_ARGS(at::BFloat16));
+    } else if (transa_ && !transb_) { // row, col
+      gemm_impl_wmma<at::BFloat16,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          false, true, false>(CUDABLAS_GEMM_ARGS(at::BFloat16));
+    } else if (!transa_ && transb_) { // col, row
+      gemm_impl_wmma<at::BFloat16,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          false, false, true>(CUDABLAS_GEMM_ARGS(at::BFloat16));
+    } else if (!transa_ && !transb_) { // row, row
+      gemm_impl_wmma<at::BFloat16,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          false, false, false>(CUDABLAS_GEMM_ARGS(at::BFloat16));
+    } else {
+      TORCH_CHECK(false, "unreachable");
+    }
+  }
+}
+
 template <>
 void gemm_internal_ck<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
-  dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGS(at::BFloat16));
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  c10::string_view arch(dprops->gcnArchName);
+  if (arch == "gfx1100") {
+    dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGS(at::BFloat16));
+  } else {
+    dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGS(at::BFloat16));
+  }
 }

 } // namespace at::native
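A worked example of the padding predicate, under the tile sizes hard-coded above (M tiles of 256, N tiles of 128, K tiles of 64); the helper below is illustrative, not part of the diff:

    #include <cstdio>

    int main() {
      auto needs_padding = [](int m, int n, int k) {
        return (m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0);
      };
      std::printf("%d\n", needs_padding(512, 128, 64));   // 0: every dim tiles exactly
      std::printf("%d\n", needs_padding(1000, 128, 64));  // 1: 1000 is not a multiple of 256
    }

When the predicate is true, the dispatcher picks the instantiations with PADDING=true, which select the MNKPadding GemmSpecialization in gemm_impl_wmma further below at the cost of some tail-handling overhead.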
@@ -297,10 +297,314 @@ void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
   }
 #endif
 }
+
+void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
+  // If any of the shapes can't be tiled, we must use padding.
+  bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
+  // Dispatch to best implementation.
+  // TODO add more configurations. Optimize.
+
+  bool transa_ = std::tolower(transa) != 'n';
+  bool transb_ = std::tolower(transb) != 'n';
+
+  if (use_padding) {
+    if (transa_ && transb_) { // col, col
+      gemm_impl_wmma<at::Half,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          true, true, true>(CUDABLAS_GEMM_ARGS(at::Half));
+    } else if (transa_ && !transb_) { // row, col
+      gemm_impl_wmma<at::Half,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          true, true, false>(CUDABLAS_GEMM_ARGS(at::Half));
+    } else if (!transa_ && transb_) { // col, row
+      gemm_impl_wmma<at::Half,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          true, false, true>(CUDABLAS_GEMM_ARGS(at::Half));
+    } else if (!transa_ && !transb_) { // row, row
+      gemm_impl_wmma<at::Half,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          true, false, false>(CUDABLAS_GEMM_ARGS(at::Half));
+    } else {
+      TORCH_CHECK(false, "unreachable");
+    }
+  } else {
+    if (transa_ && transb_) { // col, col
+      gemm_impl_wmma<at::Half,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          false, true, true>(CUDABLAS_GEMM_ARGS(at::Half));
+    } else if (transa_ && !transb_) { // row, col
+      gemm_impl_wmma<at::Half,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          false, true, false>(CUDABLAS_GEMM_ARGS(at::Half));
+    } else if (!transa_ && transb_) { // col, row
+      gemm_impl_wmma<at::Half,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          false, false, true>(CUDABLAS_GEMM_ARGS(at::Half));
+    } else if (!transa_ && !transb_) { // row, row
+      gemm_impl_wmma<at::Half,
+          256, 128, 256, 64, 8, 16, 16, 4, 4,
+          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
+          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
+          1, 1, S<1, 32, 1, 8>, 8,
+          false, false, false>(CUDABLAS_GEMM_ARGS(at::Half));
+    } else {
+      TORCH_CHECK(false, "unreachable");
+    }
+  }
+}

 template <>
 void gemm_internal_ck<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
-  dispatch_half_gemm(CUDABLAS_GEMM_ARGS(at::Half));
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  c10::string_view arch(dprops->gcnArchName);
+  if (arch == "gfx1100") {
+    dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGS(at::Half));
+  } else {
+    dispatch_half_gemm(CUDABLAS_GEMM_ARGS(at::Half));
+  }
 }

 } // namespace at::native
@@ -30,6 +30,7 @@
 #include <ck/library/utility/literals.hpp>

 #include <ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp>
+#include <ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp>

 // Define commonly used types.
 template <ck::index_t... Is>
@@ -236,4 +237,180 @@ void gemm_impl(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
   invoker.Run(argument, StreamConfig{stream, false});
 }

+template <
+    typename Dtype,
+    int BLOCK_SIZE,
+    int MBLOCK,
+    int NBLOCK,
+    int KBLOCK,
+    int K1,
+    int MPER_WMMA,
+    int NPER_WMMA,
+    int MPER_WAVE,
+    int NPER_WAVE,
+    typename ABLOCK_CLUSTER_LENS,
+    typename ABLOCK_CLUSTER_ORDER,
+    typename ABLOCK_SRC_ORDER,
+    int ABLOCK_VECTOR_DIM,
+    int ABLOCK_SCALAR_VEC,
+    int ABLOCK_SCALAR_VEC_K1,
+    bool ABLOCK_LDS_EXTRAM,
+    typename BBLOCK_CLUSTER_LENS,
+    typename BBLOCK_CLUSTER_ORDER,
+    typename BBLOCK_SRC_ORDER,
+    int BBLOCK_VECTOR_DIM,
+    int BBLOCK_SCALAR_VEC,
+    int BBLOCK_SCALAR_VEC_AK1,
+    bool BBLOCK_LDS_EXTRAN,
+    int CMPER_WAVE,
+    int CNPER_WAVE,
+    typename CBLOCK_CLUSTER_LENS,
+    int CNPER_BLOCK,
+    bool PADDING = false,
+    bool TRANSA = false,
+    bool TRANSB = false>
+void gemm_impl_wmma(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
+  // Get input information.
+  int M = m;
+  int N = n;
+  int K = k;
+
+  int StrideA = lda;
+  int StrideB = ldb;
+  int StrideC = ldc;
+
+  int KBatch = 1;
+
+  float falpha = alpha;
+  float fbeta = beta;
+
+  using ADataType = typename CkMathType<Dtype>::dtype;
+  using BDataType = typename CkMathType<Dtype>::dtype;
+  using CDataType = typename CkMathType<Dtype>::dtype;
+  using DDataType = typename CkMathType<Dtype>::dtype;
+
+  using AccDataType = float;
+  using CShuffleDataType = typename CkMathType<Dtype>::dtype;
+
+  using ALayout = typename CkTensorLayout<TRANSA, TRANSB>::a_layout;
+  using BLayout = typename CkTensorLayout<TRANSA, TRANSB>::b_layout;
+
+  using DLayout = Row;
+  using CLayout = Row;
+
+  using AElementOp = PassThrough;
+  using BElementOp = PassThrough;
+  using CElementOp = PassThrough;
+
+  static constexpr auto GemmDefault =
+      ck::tensor_operation::device::GemmSpecialization::Default;
+  static constexpr auto GemmMNKPadding =
+      ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+  static constexpr auto GemmSpec = PADDING ? GemmMNKPadding : GemmDefault;
+
+  using DeviceGemmInstance =
+      ck::tensor_operation::device::DeviceGemmWmma_CShuffle<ALayout,
+          BLayout,
+          CLayout,
+          ADataType,
+          BDataType,
+          CDataType,
+          AccDataType,
+          CShuffleDataType,
+          AElementOp,
+          BElementOp,
+          CElementOp,
+          GemmSpec,
+          1, // NumPrefetch
+          BLOCK_SIZE,
+          MBLOCK,
+          NBLOCK,
+          KBLOCK,
+          K1,
+          MPER_WMMA,
+          NPER_WMMA,
+          MPER_WAVE,
+          NPER_WAVE,
+          ABLOCK_CLUSTER_LENS,
+          ABLOCK_CLUSTER_ORDER,
+          ABLOCK_SRC_ORDER,
+          ABLOCK_VECTOR_DIM,
+          ABLOCK_SCALAR_VEC,
+          ABLOCK_SCALAR_VEC_K1,
+          ABLOCK_LDS_EXTRAM,
+          BBLOCK_CLUSTER_LENS,
+          BBLOCK_CLUSTER_ORDER,
+          BBLOCK_SRC_ORDER,
+          BBLOCK_VECTOR_DIM,
+          BBLOCK_SCALAR_VEC,
+          BBLOCK_SCALAR_VEC_AK1,
+          BBLOCK_LDS_EXTRAN,
+          CMPER_WAVE,
+          CNPER_WAVE,
+          CBLOCK_CLUSTER_LENS,
+          CNPER_BLOCK>;
+
+  auto gemm = DeviceGemmInstance{};
+  auto invoker = gemm.MakeInvoker();
+
+  auto a_element_op = AElementOp{};
+  auto b_element_op = BElementOp{};
+  auto c_element_op = CElementOp{};
+
+  using DDataArrayType = std::array<const void*, 0>;
+  DDataArrayType DDataArray;
+
+  // We swap A and B inputs here as a temporary workaround
+  auto argument = gemm.MakeArgument(
+      reinterpret_cast<const ADataType*>(b),
+      reinterpret_cast<const BDataType*>(a),
+      reinterpret_cast<CDataType*>(c),
+      N,
+      M,
+      K,
+      StrideB,
+      StrideA,
+      StrideC,
+      b_element_op,
+      a_element_op,
+      c_element_op);
+
+  if (!gemm.IsSupportedArgument(argument))
+  {
+    printf("error shape = %d %d %d TRANSA=%d TRANSB=%d \n",
+           n, m, k, TRANSA, TRANSB);
+    throw std::runtime_error(
+        "wrong! device_gemm with the specified compilation parameters does "
+        "not support this GEMM problem");
+  }
+
+  auto stream = at::cuda::getCurrentHIPStream().stream();
+#if 1
+  invoker.Run(argument, StreamConfig{stream, false});
+#else
+  float ave_time = invoker.Run(argument, StreamConfig{stream, true});
+  std::size_t flop = std::size_t(2) * M * N * K;
+
+  std::size_t num_btype =
+      sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
+  float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+  float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+  std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+            << gb_per_sec << " GB/s, " << N << " " << M << " " << k << " "
+            << "stride: " << StrideA << " " << StrideB << " " << StrideC << " "
+            << gemm.GetTypeString()
+            << std::endl;
+#endif
+}

 } // namespace at::native
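The operand swap in MakeArgument exploits the transpose identity between row- and column-major layouts: reading a column-major C as row-major yields C^T, and C^T = (A*B)^T = B^T * A^T, so swapping A and B (together with their dims and strides) produces exactly the bytes a BLAS-style caller expects, with no explicit transpose pass. A small numeric check of the identity (illustrative, not part of the diff):

    #include <cassert>

    int main() {
      const int A[2][2] = {{1, 2}, {3, 4}}, B[2][2] = {{5, 6}, {7, 8}};
      int C[2][2] = {};   // C = A * B, row-major
      int D[2][2] = {};   // D = B^T * A^T
      for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
          for (int p = 0; p < 2; ++p) {
            C[i][j] += A[i][p] * B[p][j];
            D[i][j] += B[p][i] * A[j][p];
          }
      for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j)
          assert(D[i][j] == C[j][i]);   // B^T A^T == (A B)^T
    }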
@@ -311,9 +311,8 @@ void gpu_float_sdpa(
     bool is_causal,
     float softmax_scale,
     const Tensor& output) {
-  auto eng = GpuEngineManager::Instance().get_engine(
-      {c10::kXPU, c10::xpu::current_device()});
-  auto strm = GpuStreamManager::Instance().get_stream();
+  auto& eng = GpuEngineManager::Instance().get_engine();
+  auto& strm = GpuStreamManager::Instance().get_stream();

   const auto get_tril_mask = [&]() {
     auto opts = query.options();
@@ -338,8 +338,7 @@ class Attr {
   // [1, C, 1, 1], channel broadcast
   // [dst.shape], no broadcast and eltwise-wise binary operations on dst

-  auto engine = GpuEngineManager::Instance().get_engine(
-      {c10::kXPU, c10::xpu::current_device()});
+  auto& engine = GpuEngineManager::Instance().get_engine();
   for (size_t i = 0; i < ops_params_.size(); ++i) {
     kind_t kind = ops_params_[i].kind_;
     if (kind == kind_t::binary) {
@@ -83,9 +83,8 @@ sycl::event convolution(
     int64_t groups,
     Attr& attr,
     const std::vector<sycl::event>& deps) {
-  auto engine = GpuEngineManager::Instance().get_engine(
-      {c10::kXPU, c10::xpu::current_device()});
-  auto stream = GpuStreamManager::Instance().get_stream();
+  auto& engine = GpuEngineManager::Instance().get_engine();
+  auto& stream = GpuStreamManager::Instance().get_stream();

   bool is_channels_last = use_channels_last_for_conv(src, weight);

@@ -184,9 +183,8 @@ sycl::event convolution_backward_weights(
     IntArrayRef dilation,
     int64_t groups,
    const std::vector<sycl::event>& deps) {
-  auto engine = GpuEngineManager::Instance().get_engine(
-      {c10::kXPU, c10::xpu::current_device()});
-  auto stream = GpuStreamManager::Instance().get_stream();
+  auto& engine = GpuEngineManager::Instance().get_engine();
+  auto& stream = GpuStreamManager::Instance().get_stream();

   bool is_channels_last = use_channels_last_for_conv(src, diff_dst);

@@ -292,9 +290,8 @@ sycl::event convolution_backward_data(
     int64_t groups,
     bool bias_defined,
     const std::vector<sycl::event>& deps) {
-  auto engine = GpuEngineManager::Instance().get_engine(
-      {c10::kXPU, c10::xpu::current_device()});
-  auto stream = GpuStreamManager::Instance().get_stream();
+  auto& engine = GpuEngineManager::Instance().get_engine();
+  auto& stream = GpuStreamManager::Instance().get_stream();

   bool is_channels_last = use_channels_last_for_conv(diff_dst, weight);
@@ -158,9 +158,8 @@ sycl::event deconvolution(
     int64_t groups,
     Attr& attr,
     const std::vector<sycl::event>& deps) {
-  auto engine = GpuEngineManager::Instance().get_engine(
-      {c10::kXPU, c10::xpu::current_device()});
-  auto stream = GpuStreamManager::Instance().get_stream();
+  auto& engine = GpuEngineManager::Instance().get_engine();
+  auto& stream = GpuStreamManager::Instance().get_stream();

   bool is_channels_last_suggested = use_channels_last_for_conv(src, weight);

@@ -249,9 +248,8 @@ sycl::event deconvolution_backward_data(
     int64_t groups,
     bool bias_defined,
     const std::vector<sycl::event>& deps) {
-  auto engine = GpuEngineManager::Instance().get_engine(
-      {c10::kXPU, c10::xpu::current_device()});
-  auto stream = GpuStreamManager::Instance().get_stream();
+  auto& engine = GpuEngineManager::Instance().get_engine();
+  auto& stream = GpuStreamManager::Instance().get_stream();

   bool is_channels_last_suggested =
       use_channels_last_for_conv(diff_dst, weight);

@@ -347,9 +345,8 @@ sycl::event deconvolution_backward_weights(
     IntArrayRef dilation,
     int64_t groups,
     const std::vector<sycl::event>& deps) {
-  auto engine = GpuEngineManager::Instance().get_engine(
-      {c10::kXPU, c10::xpu::current_device()});
-  auto stream = GpuStreamManager::Instance().get_stream();
+  auto& engine = GpuEngineManager::Instance().get_engine();
+  auto& stream = GpuStreamManager::Instance().get_stream();

   bool is_channels_last_suggested = use_channels_last_for_conv(src, diff_dst);
@@ -30,9 +30,8 @@ sycl::event matmul(
       "oneDNN input matrixes must have the same ranks");
   TORCH_CHECK(result.defined(), "oneDNN matmul result should be defined");

-  at::Device cur_device = at::Device(at::kXPU, c10::xpu::current_device());
-  auto engine = GpuEngineManager::Instance().get_engine(cur_device);
-  auto stream = GpuStreamManager::Instance().get_stream();
+  auto& engine = GpuEngineManager::Instance().get_engine();
+  auto& stream = GpuStreamManager::Instance().get_stream();

   at::Tensor m1 = mat1;
   at::Tensor m2 = mat2;
@@ -5,6 +5,7 @@

 #include <ATen/native/mkldnn/xpu/detail/Attr.h>
 #include <ATen/native/mkldnn/xpu/detail/Utils.h>
+#include <ATen/native/mkldnn/xpu/detail/oneDNN.h>
 #include <ATen/native/mkldnn/xpu/detail/oneDNNContext.h>

 #include <oneapi/dnnl/dnnl.hpp>

@@ -106,9 +107,8 @@ at::Tensor quantized_convolution(
       output.defined(),
       "A valid output is required for quantized convolution.");

-  auto engine = GpuEngineManager::Instance().get_engine(
-      {c10::kXPU, c10::xpu::current_device()});
-  auto stream = GpuStreamManager::Instance().get_stream();
+  auto& engine = GpuEngineManager::Instance().get_engine();
+  auto& stream = GpuStreamManager::Instance().get_stream();

   // input tensors config
   dnnl::memory::dims src_dims = act.sizes().vec();
@@ -125,9 +125,8 @@ void quantized_matmul(
       attr);

   size_t dims = result.dim();
-  at::Device cur_device = at::Device(at::kXPU, c10::xpu::current_device());
-  auto engine = GpuEngineManager::Instance().get_engine(cur_device);
-  auto stream = GpuStreamManager::Instance().get_stream();
+  auto& engine = GpuEngineManager::Instance().get_engine();
+  auto& stream = GpuStreamManager::Instance().get_stream();

   at::Tensor m1 = is_onednn_matmul_strides(mat1) ? mat1 : mat1.contiguous();
   at::Tensor m2 = is_onednn_matmul_strides(mat2) ? mat2 : mat2.contiguous();
@@ -29,8 +29,7 @@ static inline void dnnl_delete(
 }

 GpuEngineManager::GpuEngineManager() {
-  c10::DeviceIndex device_count = c10::xpu::device_count();
-  TORCH_INTERNAL_ASSERT(device_count > 0);
+  c10::DeviceIndex device_count = c10::xpu::device_count_ensure_non_zero();
   for (const auto i : c10::irange(device_count)) {
     static dnnl::graph::allocator alloc =
         dnnl::graph::sycl_interop::make_allocator(dnnl_alloc, dnnl_delete);
@@ -25,10 +25,15 @@ bool set_onednn_verbose(int level);
 struct TORCH_XPU_API GpuEngineManager {
   static GpuEngineManager& Instance(); // Singleton

+  dnnl::engine& get_engine(
+      DeviceIndex device_index = c10::xpu::current_device()) {
+    c10::xpu::check_device_index(device_index);
+    return *engine_pool[device_index];
+  }
+
   dnnl::engine& get_engine(const Device& device) {
     TORCH_INTERNAL_ASSERT(device.type() == kXPU);
-    TORCH_INTERNAL_ASSERT(device.index() < c10::xpu::device_count());
-    return *engine_pool[device.index()];
+    return get_engine(device.index());
   }

   GpuEngineManager(GpuEngineManager const&) = delete;

@@ -48,16 +53,15 @@ struct TORCH_XPU_API GpuEngineManager {
 struct TORCH_XPU_API GpuStreamManager {
   static GpuStreamManager& Instance(); // Singleton

-  dnnl::stream get_stream() {
-    auto stream = c10::xpu::getCurrentXPUStream();
+  dnnl::stream& get_stream(
+      DeviceIndex device_index = c10::xpu::current_device()) {
+    auto stream = c10::xpu::getCurrentXPUStream(device_index);
     auto priority = stream.priority();
-    auto device_index = stream.device_index();
     if (stream_pool[device_index][priority].find(stream) ==
         stream_pool[device_index][priority].end()) {
       stream_pool[device_index][priority][stream] =
           std::make_shared<dnnl::stream>(dnnl::sycl_interop::make_stream(
-              GpuEngineManager::Instance().get_engine(
-                  {c10::kXPU, device_index}),
+              GpuEngineManager::Instance().get_engine(device_index),
               stream.queue()));
     }
     return *stream_pool[device_index][priority][stream];

@@ -70,8 +74,7 @@ struct TORCH_XPU_API GpuStreamManager {

 protected:
   GpuStreamManager() {
-    c10::DeviceIndex device_count = c10::xpu::device_count();
-    TORCH_INTERNAL_ASSERT(device_count > 0);
+    c10::DeviceIndex device_count = c10::xpu::device_count_ensure_non_zero();
     stream_pool.resize(device_count);
   }
   ~GpuStreamManager() = default;
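Call-site sketch of the reworked accessors (illustrative): both now default to the current device and return references, which is what the sweeping `auto` to `auto&` changes in the earlier hunks depend on to avoid copying the dnnl handles on every primitive launch.

    auto& engine  = GpuEngineManager::Instance().get_engine();    // current device
    auto& engine0 = GpuEngineManager::Instance().get_engine(0);   // explicit index
    auto& stream  = GpuStreamManager::Instance().get_stream();    // cached dnnl::stream&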
@ -19,7 +19,7 @@ static inline c10::ScalarType qconv_decide_out_dtype(
|
|||||||
return dst_dtype;
|
return dst_dtype;
|
||||||
}
|
}
|
||||||
|
|
||||||
at::Tensor qconv_prepack_xpu(
|
static at::Tensor qconv_prepack_xpu(
|
||||||
at::Tensor weight,
|
at::Tensor weight,
|
||||||
at::Tensor weight_scales,
|
at::Tensor weight_scales,
|
||||||
double input_scale,
|
double input_scale,
|
||||||
|
|||||||
@ -19,7 +19,7 @@ static inline c10::ScalarType qlinear_decide_out_dtype(
|
|||||||
return dst_dtype;
|
return dst_dtype;
|
||||||
}
|
}
|
||||||
|
|
||||||
Tensor q_linear_pointwise(
|
static Tensor q_linear_pointwise(
|
||||||
Tensor act,
|
Tensor act,
|
||||||
double act_scale,
|
double act_scale,
|
||||||
int64_t act_zero_point,
|
int64_t act_zero_point,
|
||||||
@ -78,7 +78,7 @@ Tensor q_linear_pointwise(
|
|||||||
return qout;
|
return qout;
|
||||||
}
|
}
|
||||||
|
|
||||||
Tensor q_linear_pointwise_tensor(
|
static Tensor q_linear_pointwise_tensor(
|
||||||
Tensor act,
|
Tensor act,
|
||||||
Tensor act_scale,
|
Tensor act_scale,
|
||||||
Tensor act_zero_point,
|
Tensor act_zero_point,
|
||||||
@ -137,7 +137,7 @@ Tensor q_linear_pointwise_tensor(
|
|||||||
return qout;
|
return qout;
|
||||||
}
|
}
|
||||||
|
|
||||||
Tensor q_linear_pointwise_binary(
|
static Tensor q_linear_pointwise_binary(
|
||||||
Tensor act,
|
Tensor act,
|
||||||
double act_scale,
|
double act_scale,
|
||||||
int64_t act_zero_point,
|
int64_t act_zero_point,
|
||||||
@ -208,7 +208,7 @@ Tensor q_linear_pointwise_binary(
|
|||||||
return dim == 3 ? qout.reshape({act.size(0), -1, N}) : qout;
|
return dim == 3 ? qout.reshape({act.size(0), -1, N}) : qout;
|
||||||
}
|
}
|
||||||
|
|
||||||
Tensor q_linear_pointwise_binary_tensor(
|
static Tensor q_linear_pointwise_binary_tensor(
|
||||||
Tensor act,
|
Tensor act,
|
||||||
Tensor act_scale,
|
Tensor act_scale,
|
||||||
Tensor act_zero_point,
|
Tensor act_zero_point,
|
||||||
@ -248,7 +248,7 @@ Tensor q_linear_pointwise_binary_tensor(
|
|||||||
unary_post_op_algorithm);
|
unary_post_op_algorithm);
|
||||||
}
|
}
|
||||||
|
|
||||||
at::Tensor q_linear_prepack_onednn(
|
static at::Tensor q_linear_prepack_onednn(
|
||||||
at::Tensor weight,
|
at::Tensor weight,
|
||||||
std::optional<torch::List<int64_t>> input_shape) {
|
std::optional<torch::List<int64_t>> input_shape) {
|
||||||
at::Tensor weight_transposed = weight.transpose(0, 1);
|
at::Tensor weight_transposed = weight.transpose(0, 1);
|
||||||
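All seven hunks above make the same mechanical change: file-local operator implementations gain internal linkage. A self-contained illustration of what `static` at namespace scope buys (hypothetical file and function names):

// q_helpers_demo.cpp -- illustration only.
// A namespace-scope `static` function is private to its translation unit,
// so another .cpp file may define its own `prepack` without a
// one-definition-rule clash or a missing-prototype warning.
static int prepack(int x) { return x * 2; }

int main() { return prepack(21) == 42 ? 0 : 1; }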
@@ -133,6 +133,10 @@ class MetalShaderLibrary {
       TensorIteratorBase& iter,
       const std::string& name,
       std::optional<int64_t> extra = std::nullopt);
+  void exec_binary_kernel(
+      TensorIteratorBase& iter,
+      const std::string& name,
+      const bool supports_dense = true);

  protected:
   virtual MTLLibrary_t getLibrary();
@@ -1010,6 +1010,49 @@ void MetalShaderLibrary::exec_unary_kernel(TensorIteratorBase& iter,
   }
 }

+void MetalShaderLibrary::exec_binary_kernel(TensorIteratorBase& iter,
+                                            const std::string& name,
+                                            const bool supports_dense) {
+  TORCH_CHECK(iter.common_dtype() != at::kDouble, "float64 is not supported on MPS");
+
+  Tensor input = iter.input(0);
+  Tensor other = iter.input(1);
+  Tensor out = iter.output();
+
+  id<MTLDevice> device = MPSDevice::getInstance()->device();
+  MPSStream* mpsStream = getCurrentMPSStream();
+  const uint32_t nDim = iter.ndim();
+  constexpr uint32_t nOffsets = 3;
+  const uint32_t numThreads = iter.numel();
+  dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
+    @autoreleasepool {
+      auto computeEncoder = mpsStream->commandEncoder();
+      if (supports_dense && iter.is_contiguous()) {
+        const auto kernel_name = fmt::format("{}_dense_{}", name, scalarToMetalTypeString(input));
+        auto binaryPSO = getPipelineStateForFunc(kernel_name);
+        [computeEncoder setComputePipelineState:binaryPSO];
+        mtl_setArgs(computeEncoder, input, other, out);
+        mtl_dispatch1DJob(computeEncoder, binaryPSO, numThreads);
+        return;
+      }
+      const auto kernel = fmt::format("{}_{}", name, scalarToMetalTypeString(input));
+      auto kernelDataOffsets = generateKernelDataOffsets(computeEncoder, iter);
+
+      auto binaryPSO = getPipelineStateForFunc(kernel);
+
+      // this function call is a no-op if MPS Profiler is not enabled
+      getMPSProfiler().beginProfileKernel(binaryPSO, kernel, {input, other});
+
+      [computeEncoder setComputePipelineState:binaryPSO];
+      mtl_setArgs(computeEncoder, input, other, out);
+      [computeEncoder setBuffer:kernelDataOffsets offset:0 atIndex:3];
+      mtl_dispatch1DJob(computeEncoder, binaryPSO, numThreads);
+
+      getMPSProfiler().endProfileKernel(binaryPSO);
+    }
+  });
+}
+
 MetalShaderLibrary& MetalShaderLibrary::getBundledLibrary() {
   static BundledShaderLibary l;
   return l;
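A hypothetical caller, as it might appear in a .mm file that already defines `lib` (see the BinaryKernel hunks later in this diff): build a TensorIterator over the operands and let exec_binary_kernel pick the "<name>_dense_<dtype>" pipeline for contiguous tensors or the offset-driven "<name>_<dtype>" variant otherwise. The kernel name here is illustrative:

static void atan2_mps_kernel(TensorIteratorBase& iter) {
  lib.exec_binary_kernel(iter, "atan2");  // resolves to e.g. "atan2_dense_float"
}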
@@ -1,3 +1,4 @@
+#include <c10/metal/indexing.h>
 #include <c10/metal/special_math.h>
 #include <c10/metal/utils.h>
 #include <metal_stdlib>
@@ -91,59 +92,6 @@ struct polar_functor {
   }
 };

-// Future BinaryTensorIterator
-template <typename T, typename F>
-using result_of = decltype(::metal::declval<F>()(
-    ::metal::declval<T>(),
-    ::metal::declval<T>()));
-
-template <typename T, typename F>
-kernel void binary_indexing(
-    constant void* input_ [[buffer(0)]],
-    constant void* other_ [[buffer(1)]],
-    device void* out_ [[buffer(2)]],
-    constant uint3* offsets [[buffer(3)]],
-    uint tid [[thread_position_in_grid]]) {
-  auto out = (device result_of<T, F>*)((device uint8_t*)out_ + offsets[tid].x);
-  auto input = (constant T*)((constant uint8_t*)input_ + offsets[tid].y);
-  auto other = (constant T*)((constant uint8_t*)other_ + offsets[tid].z);
-  F f;
-  *out = f(*input, *other);
-}
-
-template <typename T, typename F>
-kernel void binary_dense(
-    constant T* input [[buffer(0)]],
-    constant T* other [[buffer(1)]],
-    device result_of<T, F>* out [[buffer(2)]],
-    uint tid [[thread_position_in_grid]]) {
-  F f;
-  out[tid] = f(input[tid], other[tid]);
-}
-
-#define REGISTER_BINARY_INDEXING_OP(NAME, DTYPE)             \
-  template [[host_name(#NAME "_" #DTYPE)]] kernel void       \
-  binary_indexing<DTYPE, NAME##_functor>(                    \
-      constant void* input_,                                 \
-      constant void* other_,                                 \
-      device void* out_,                                     \
-      constant uint3* offsets,                               \
-      uint tid);                                             \
-  template [[host_name(#NAME "_dense_" #DTYPE)]] kernel void \
-  binary_dense<DTYPE, NAME##_functor>(                       \
-      constant DTYPE * input_,                               \
-      constant DTYPE * other_,                               \
-      device result_of<DTYPE, NAME##_functor> * out_,        \
-      uint tid)
-
-#define REGISTER_BINARY_OP(NAME, DTYPE)                             \
-  template [[host_name(#NAME "_" #DTYPE)]] kernel void NAME<DTYPE>( \
-      constant void* input_,                                        \
-      constant void* other_,                                        \
-      device void* out_,                                            \
-      constant uint3* offsets,                                      \
-      uint tid)
-
 REGISTER_BINARY_INDEXING_OP(copysign, long);
 REGISTER_BINARY_INDEXING_OP(copysign, int);
 REGISTER_BINARY_INDEXING_OP(copysign, float);
@@ -190,9 +138,7 @@ kernel void complex_mul(
   out[1] = input[0] * other[1] + input[1] * other[0];
 }

-REGISTER_BINARY_OP(complex_mul, float);
-REGISTER_BINARY_OP(complex_mul, half);
-
+// Constructs complex tensor from real and imaginary planes
 template <typename T>
 kernel void complex_kernel(
     constant void* real_ [[buffer(0)]],
@@ -207,5 +153,15 @@ kernel void complex_kernel(
   out[1] = imag[0];
 }

+#define REGISTER_BINARY_OP(NAME, DTYPE)                             \
+  template [[host_name(#NAME "_" #DTYPE)]] kernel void NAME<DTYPE>( \
+      constant void* input_,                                        \
+      constant void* other_,                                        \
+      device void* out_,                                            \
+      constant uint3* offsets,                                      \
+      uint tid)
+
+REGISTER_BINARY_OP(complex_mul, float);
+REGISTER_BINARY_OP(complex_mul, half);
 REGISTER_BINARY_OP(complex_kernel, float);
 REGISTER_BINARY_OP(complex_kernel, half);
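The `result_of` alias being moved above is the Metal spelling of a standard C++ detection idiom. A host-side analogue, self-contained and compilable:

#include <cmath>
#include <type_traits>
#include <utility>

// Deduce a binary functor's result type for two arguments of type T.
template <typename T, typename F>
using result_of =
    decltype(std::declval<F>()(std::declval<T>(), std::declval<T>()));

struct copysign_functor {
  float operator()(float a, float b) const { return std::copysign(a, b); }
};

static_assert(std::is_same_v<result_of<float, copysign_functor>, float>);

int main() { return copysign_functor{}(3.f, -1.f) == -3.f ? 0 : 1; }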
@@ -1,16 +1,63 @@
 #include <c10/metal/indexing.h>
 #include <c10/metal/special_math.h>
 using namespace c10::metal;
+using namespace metal;
+
+DEFINE_UNARY_FLOATING_FUNCTOR(bessel_j0_forward);
+DEFINE_UNARY_FLOATING_FUNCTOR(bessel_j1_forward);
+DEFINE_UNARY_FLOATING_FUNCTOR(modified_bessel_i0_forward);
+DEFINE_UNARY_FLOATING_FUNCTOR(modified_bessel_i1_forward);
 DEFINE_UNARY_FLOATING_FUNCTOR(i0);
+DEFINE_UNARY_FLOATING_FUNCTOR(i0e);
 DEFINE_UNARY_FLOATING_FUNCTOR(i1);
+DEFINE_UNARY_FLOATING_FUNCTOR(i1e);
 DEFINE_UNARY_FLOATING_FUNCTOR(spherical_bessel_j0);
 DEFINE_UNARY_FLOATING_FUNCTOR(entr);

-#define REGISTER_SPECIAL(DTI, DTO)                  \
-  REGISTER_UNARY_OP(i0, DTI, DTO);                  \
-  REGISTER_UNARY_OP(i1, DTI, DTO);                  \
-  REGISTER_UNARY_OP(spherical_bessel_j0, DTI, DTO); \
+// TODO: Replace me with DEFINE_UNARY_FLOATING_FUNCTOR
+// But for some reason instantiating bessel_y[01] on M1/M2 results in
+// Failed to create pipeline state object, error: Error Domain=AGXMetalG14X
+// Code=3 "Compiler encountered an internal error"
+struct bessel_y0_forward_functor {
+  template <typename T>
+  inline enable_if_t<is_floating_point_v<T>, T> operator()(const T x) {
+    return static_cast<T>(bessel_y0_forward(x));
+  }
+  template <typename T>
+  inline enable_if_t<is_integral_v<T>, float> operator()(const T x) {
+    return bessel_y0_forward(static_cast<float>(x));
+  }
+  inline float operator()(const bool x) {
+    return x ? 0.08825694769620895 : -INFINITY;
+  }
+};
+
+struct bessel_y1_forward_functor {
+  template <typename T>
+  inline enable_if_t<is_floating_point_v<T>, T> operator()(const T x) {
+    return static_cast<T>(bessel_y1_forward(x));
+  }
+  template <typename T>
+  inline enable_if_t<is_integral_v<T>, float> operator()(const T x) {
+    return bessel_y1_forward(static_cast<float>(x));
+  }
+  inline float operator()(const bool x) {
+    return x ? -0.7812128067016602 : -INFINITY;
+  }
+};
+
+#define REGISTER_SPECIAL(DTI, DTO)                         \
+  REGISTER_UNARY_OP(bessel_j0_forward, DTI, DTO);          \
+  REGISTER_UNARY_OP(bessel_j1_forward, DTI, DTO);          \
+  REGISTER_UNARY_OP(modified_bessel_i0_forward, DTI, DTO); \
+  REGISTER_UNARY_OP(modified_bessel_i1_forward, DTI, DTO); \
+  REGISTER_UNARY_OP(bessel_y0_forward, DTI, DTO);          \
+  REGISTER_UNARY_OP(bessel_y1_forward, DTI, DTO);          \
+  REGISTER_UNARY_OP(i0, DTI, DTO);                         \
+  REGISTER_UNARY_OP(i0e, DTI, DTO);                        \
+  REGISTER_UNARY_OP(i1, DTI, DTO);                         \
+  REGISTER_UNARY_OP(i1e, DTI, DTO);                        \
+  REGISTER_UNARY_OP(spherical_bessel_j0, DTI, DTO);        \
   REGISTER_UNARY_OP(entr, DTI, DTO)

 REGISTER_SPECIAL(float, float);
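The two boolean special cases hard-coded above are Y0(1) and Y1(1), since `true` promotes to 1.0 and Y0/Y1 diverge to -inf at 0. A quick host-side check (assuming a standard library that ships the C++17 mathematical special functions):

#include <cmath>
#include <cstdio>

int main() {
  // std::cyl_neumann(nu, x) is the Bessel function of the second kind Y_nu(x).
  std::printf("Y0(1) = %.16f\n", std::cyl_neumann(0, 1.0));  // ~0.08825696..., cf. the constant above
  std::printf("Y1(1) = %.16f\n", std::cyl_neumann(1, 1.0));  // ~-0.78121282..., cf. the constant above
}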
@@ -268,12 +268,31 @@ kernel void upsample_bilinear2d(
   }
 }

-inline float bilinear_functor(float x) {
-  return abs(x) < 1.0 ? 1.0 - abs(x) : abs(x);
-}
-
-template <typename T>
-kernel void upsample_bilinear2d_aa(
+struct BilinearFunctor {
+  inline float operator()(float x) {
+    x = abs(x);
+    return x < 1.0 ? 1.0 - x : x;
+  }
+  static constant constexpr float area_factor = 1.0;
+};
+
+struct BicubicFunctor {
+  inline float operator()(float x) {
+    // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
+    x = abs(x);
+    if (x < 1.0) {
+      return 1.0 + (1.5 * x - 2.5) * x * x;
+    }
+    if (x < 2.0) {
+      return 2.0 - 0.5 * ((x - 5.0) * x + 8.0) * x;
+    }
+    return 0;
+  }
+  static constant constexpr float area_factor = 2.0;
+};
+
+template <typename T, typename F>
+kernel void upsample_2d_aa(
     constant T* inputData [[buffer(0)]],
     device T* outputData [[buffer(1)]],
     constant ulong4& input_strides [[buffer(2)]],
@@ -286,15 +305,26 @@ kernel void upsample_bilinear2d_aa(
   auto output_x = thread_index % static_cast<uint>(output_sizes.w);
   auto output_y = thread_index / static_cast<uint>(output_sizes.w);
   (void)align_corners; // Align corners is unused for AA algorithm
+  F f;
   auto x_center = area_pixel_compute_source_index(
-      scales.x, output_x, /*align_corners=*/false, /*cubic=*/false);
+      scales.x,
+      output_x,
+      /*align_corners=*/false,
+      /*cubic=*/F::area_factor == 2.0);
   auto y_center = area_pixel_compute_source_index(
-      scales.y, output_y, /*align_corners=*/false, /*cubic=*/false);
+      scales.y,
+      output_y,
+      /*align_corners=*/false,
+      /*cubic=*/F::area_factor == 2.0);
   auto clamped_scales = max(1.0, scales);
-  auto x_min = max(0L, long(floor(x_center - clamped_scales.x + 1)));
-  auto x_max = min(input_sizes.w, long(ceil(x_center + clamped_scales.x)));
-  auto y_min = max(0L, long(floor(y_center - clamped_scales.y + 1)));
-  auto y_max = min(input_sizes.z, long(ceil(y_center + clamped_scales.y)));
+  auto x_min =
+      max(0L, long(floor(x_center - f.area_factor * clamped_scales.x + 1)));
+  auto x_max = min(
+      input_sizes.w, long(ceil(x_center + f.area_factor * clamped_scales.x)));
+  auto y_min =
+      max(0L, long(floor(y_center - f.area_factor * clamped_scales.y + 1)));
+  auto y_max = min(
+      input_sizes.z, long(ceil(y_center + f.area_factor * clamped_scales.y)));
   for (int n = 0; n < output_sizes.x; n++) {
     for (int c = 0; c < output_sizes.y; c++) {
       float res = 0.0;
@@ -302,9 +332,9 @@ kernel void upsample_bilinear2d_aa(
       constant auto* input =
           inputData + n * input_strides.x + c * input_strides.y;
       for (auto y = y_min; y < y_max; ++y) {
-        auto dy = bilinear_functor((y - y_center) / clamped_scales.y);
+        auto dy = f((y - y_center) / clamped_scales.y);
         for (auto x = x_min; x < x_max; ++x) {
-          auto dx = bilinear_functor((x - x_center) / clamped_scales.x);
+          auto dx = f((x - x_center) / clamped_scales.x);
           auto val = input[x * input_strides.w + y * input_strides.z];
           res += val * dx * dy;
           ws += dx * dy;
@@ -456,6 +486,19 @@ kernel void upsample_bicubic2d_backward(
       constant bool& align_corners [[buffer(7)]],                   \
       uint thread_index [[thread_position_in_grid]])

+#define INSTANTIATE_UPSAMPLE_2D_AA(NAME, FUNCTOR, DTYPE)            \
+  template [[host_name("upsample_" #NAME "_" #DTYPE)]] kernel void  \
+  upsample_2d_aa<DTYPE, FUNCTOR>(                                   \
+      constant DTYPE * inputData [[buffer(0)]],                     \
+      device DTYPE * outputData [[buffer(1)]],                      \
+      constant ulong4 & input_strides [[buffer(2)]],                \
+      constant ulong4 & output_strides [[buffer(3)]],               \
+      constant long4 & input_sizes [[buffer(4)]],                   \
+      constant long4 & output_sizes [[buffer(5)]],                  \
+      constant float2 & scales [[buffer(6)]],                       \
+      constant bool& align_corners [[buffer(7)]],                   \
+      uint thread_index [[thread_position_in_grid]])
+
 #define INSTANTIATE_UPSAMPLE_2D_BACKWARD(NAME, DTYPE)                       \
   template [[host_name("upsample_" #NAME "_backward_" #DTYPE)]] kernel void \
   upsample_##NAME##_backward<DTYPE>(                                        \
@@ -482,11 +525,12 @@ kernel void upsample_bicubic2d_backward(
       constant bool& align_corners [[buffer(7)]],                   \
       uint thread_index [[thread_position_in_grid]])

-#define INSTANTIATE_UPSAMPLE_ALL(DTYPE)               \
-  INSTANTIATE_UPSAMPLE_2D(bicubic2d, DTYPE);          \
-  INSTANTIATE_UPSAMPLE_2D_BACKWARD(bicubic2d, DTYPE); \
-  INSTANTIATE_UPSAMPLE_2D(bilinear2d, DTYPE);         \
-  INSTANTIATE_UPSAMPLE_2D(bilinear2d_aa, DTYPE);      \
+#define INSTANTIATE_UPSAMPLE_ALL(DTYPE)                             \
+  INSTANTIATE_UPSAMPLE_2D(bicubic2d, DTYPE);                        \
+  INSTANTIATE_UPSAMPLE_2D_AA(bicubic2d_aa, BicubicFunctor, DTYPE);  \
+  INSTANTIATE_UPSAMPLE_2D_BACKWARD(bicubic2d, DTYPE);               \
+  INSTANTIATE_UPSAMPLE_2D(bilinear2d, DTYPE);                       \
+  INSTANTIATE_UPSAMPLE_2D_AA(bilinear2d_aa, BilinearFunctor, DTYPE); \
   INSTANTIATE_UPSAMPLE_LINEAR(DTYPE);

 INSTANTIATE_UPSAMPLE_2D(bilinear2d, uchar);
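BicubicFunctor above is the Keys bicubic convolution kernel with a = -0.5: the first branch is (a+2)|x|^3 - (a+3)|x|^2 + 1 and the second is a(|x|^3 - 5|x|^2 + 8|x| - 4). A CPU reference that shows the weights are continuous at |x| = 1 and vanish at |x| = 2, which is why the sampling window uses area_factor = 2:

#include <cmath>
#include <cstdio>

static float bicubic(float x) {
  x = std::fabs(x);
  if (x < 1.f)
    return 1.f + (1.5f * x - 2.5f) * x * x;
  if (x < 2.f)
    return 2.f - 0.5f * ((x - 5.f) * x + 8.f) * x;
  return 0.f;
}

int main() {
  for (float x = 0.f; x <= 2.f; x += 0.25f)
    std::printf("w(%.2f) = %+.4f\n", x, bicubic(x));  // w(1.00) = +0.0000, w(2.00) = +0.0000
}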
@@ -44,7 +44,8 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math_mps(const Tensor&
     TORCH_CHECK(!attn_mask.has_value(),
                 "_scaled_dot_product_attention: Explicit attn_mask should not be set when is_causal=True");
   }
+  TORCH_CHECK(query.size(-3) == key.size(-3) && key.size(-3) == value.size(-3),
+              "number of heads in query/key/value should match");
   TORCH_CHECK(dropout_p == 0.0, "_scaled_dot_product_attention_math_for_mps: dropout_p != 0.0 is not supported");
   TORCH_CHECK(macOS15_0_plus || (query.is_contiguous() && key.is_contiguous() && value.is_contiguous()),
               "_scaled_dot_product_attention_math_for_mps: query, key, and value must be contiguous");
@@ -55,6 +56,7 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math_mps(const Tensor&
   auto [q_, sq] = ensure_4d(query);
   auto [k_, sk] = ensure_4d(key);
   auto [v_, sv] = ensure_4d(value);
+
   std::optional<Tensor> mask_;
   if (attn_mask) {
     auto maskExpandedDims = query.sizes().vec();
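The new TORCH_CHECK assumes the usual SDPA layout [batch, num_heads, seq_len, head_dim], so size(-3) is the head count. A libtorch sketch of the shapes it now rejects (illustrative only):

#include <torch/torch.h>
#include <iostream>

int main() {
  auto q = torch::randn({2, 8, 128, 64});
  auto k = torch::randn({2, 8, 128, 64});
  auto v = torch::randn({2, 4, 128, 64});  // 4 heads vs. 8: mismatch
  std::cout << (q.size(-3) == k.size(-3) && k.size(-3) == v.size(-3))
            << std::endl;  // prints 0; the MPS math path would now raise
}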
@@ -23,54 +23,13 @@
 #endif

 namespace at::native {
-namespace mps {

 #ifndef PYTORCH_JIT_COMPILE_SHADERS
-static auto& lib = MetalShaderLibrary::getBundledLibrary();
+static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
 #else
 #include <ATen/native/mps/BinaryKernel_metallib.h>
 #endif

-static void binary_mps_impl(TensorIteratorBase& iter, const std::string func_name, bool supports_dense = true) {
-  TORCH_CHECK(iter.common_dtype() != at::kDouble, "float64 is not supported on MPS");
-
-  Tensor input = iter.input(0);
-  Tensor other = iter.input(1);
-  Tensor out = iter.output();
-
-  id<MTLDevice> device = MPSDevice::getInstance()->device();
-  MPSStream* mpsStream = getCurrentMPSStream();
-  const uint32_t nDim = iter.ndim();
-  constexpr uint32_t nOffsets = 3;
-  const uint32_t numThreads = iter.numel();
-  dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
-    @autoreleasepool {
-      auto computeEncoder = mpsStream->commandEncoder();
-      if (supports_dense && iter.is_contiguous()) {
-        const auto kernel_name = fmt::format("{}_dense_{}", func_name, scalarToMetalTypeString(input));
-        auto binaryPSO = lib.getPipelineStateForFunc(kernel_name);
-        [computeEncoder setComputePipelineState:binaryPSO];
-        mtl_setArgs(computeEncoder, input, other, out);
-        mtl_dispatch1DJob(computeEncoder, binaryPSO, numThreads);
-        return;
-      }
-      const std::string kernel = func_name + "_" + scalarToMetalTypeString(input);
-      auto kernelDataOffsets = generateKernelDataOffsets(computeEncoder, iter);
-
-      id<MTLComputePipelineState> binaryPSO = lib.getPipelineStateForFunc(kernel);
-
-      // this function call is a no-op if MPS Profiler is not enabled
-      getMPSProfiler().beginProfileKernel(binaryPSO, kernel, {input, other});
-
-      [computeEncoder setComputePipelineState:binaryPSO];
-      mtl_setArgs(computeEncoder, input, other, out);
-      [computeEncoder setBuffer:kernelDataOffsets offset:0 atIndex:3];
-      mtl_dispatch1DJob(computeEncoder, binaryPSO, numThreads);
-
-      getMPSProfiler().endProfileKernel(binaryPSO);
-    }
-  });
-}
-
+namespace mps {
 void complex_mul_out(const Tensor& input, const Tensor& other, const Tensor& output) {
   TORCH_INTERNAL_ASSERT(c10::isComplexType(input.scalar_type()) || c10::isComplexType(other.scalar_type()));
@@ -89,43 +48,43 @@ void complex_mul_out(const Tensor& input, const Tensor& other, const Tensor& out
   auto iter =
       TensorIteratorConfig().add_output(output_as_real).add_input(input_as_real).add_input(other_as_real).build();

-  mps::binary_mps_impl(iter, "complex_mul", false);
+  lib.exec_binary_kernel(iter, "complex_mul", /*supports_dense=*/false);
 }

 } // namespace mps

 static void fmax_mps_kernel(TensorIteratorBase& iter) {
   if (isFloatingType(iter.common_dtype())) {
-    mps::binary_mps_impl(iter, "fmax");
+    lib.exec_binary_kernel(iter, "fmax");
   } else {
     at::maximum_out(const_cast<Tensor&>(iter.output()), iter.input(0), iter.input(1));
   }
 }
 static void fmin_mps_kernel(TensorIteratorBase& iter) {
   if (isFloatingType(iter.common_dtype())) {
-    mps::binary_mps_impl(iter, "fmin");
+    lib.exec_binary_kernel(iter, "fmin");
   } else {
     at::minimum_out(const_cast<Tensor&>(iter.output()), iter.input(0), iter.input(1));
   }
 }

 static void copysign_mps_kernel(TensorIteratorBase& iter) {
-  mps::binary_mps_impl(iter, "copysign");
+  lib.exec_binary_kernel(iter, "copysign");
 }

 static void nextafter_mps_kernel(TensorIteratorBase& iter) {
   TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "nextafter_mps not implemented for non-floating types");
-  mps::binary_mps_impl(iter, "nextafter");
+  lib.exec_binary_kernel(iter, "nextafter");
 }

 static void zeta_mps_kernel(TensorIteratorBase& iter) {
   TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "zeta_mps not implemented for non-floating types");
-  mps::binary_mps_impl(iter, "zeta");
+  lib.exec_binary_kernel(iter, "zeta");
 }

 static void xlog1py_mps_kernel(TensorIteratorBase& iter) {
   TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "xlog1py_mps not implemented for non-floating types");
-  mps::binary_mps_impl(iter, "xlog1py");
+  lib.exec_binary_kernel(iter, "xlog1py");
 }

 REGISTER_DISPATCH(fmax_stub, &fmax_mps_kernel)
@@ -147,7 +106,7 @@ Tensor& polar_out_mps(const Tensor& abs, const Tensor& angle, Tensor& output) {
   auto output_as_real = at::view_as_real(output).select(output.dim(), 0);
   auto iter = TensorIteratorConfig().add_output(output_as_real).add_input(abs).add_input(angle).build();

-  mps::binary_mps_impl(iter, "polar");
+  lib.exec_binary_kernel(iter, "polar");
   return output;
 }
@@ -163,7 +122,7 @@ Tensor& complex_out_mps(const Tensor& real, const Tensor& imag, Tensor& output)
   auto output_as_real = at::view_as_real(output).select(output.dim(), 0);
   auto iter = TensorIteratorConfig().add_output(output_as_real).add_input(real).add_input(imag).build();

-  mps::binary_mps_impl(iter, "complex_kernel", false);
+  lib.exec_binary_kernel(iter, "complex_kernel", /*supports_dense=*/false);
   return output;
 }
 } // namespace at::native
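The kernels above plug into ATen's stub mechanism: REGISTER_DISPATCH stores a backend function pointer in a per-op stub that the device-generic operator calls through. A minimal sketch of that pattern with simplified types (the real machinery lives in ATen's DispatchStub):

#include <cstdio>

using binary_fn = void (*)(float*, const float*, const float*, int);

struct DispatchStub { binary_fn mps = nullptr; };  // one slot per backend
static DispatchStub fmax_stub;

static void fmax_mps_kernel(float* out, const float* a, const float* b, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = a[i] > b[i] ? a[i] : b[i];  // stand-in for the Metal launch
}

// analogue of REGISTER_DISPATCH(fmax_stub, &fmax_mps_kernel)
static const bool registered = (fmax_stub.mps = &fmax_mps_kernel, true);

int main() {
  (void)registered;
  float a[2] = {1.f, 4.f}, b[2] = {3.f, 2.f}, out[2];
  fmax_stub.mps(out, a, b, 2);
  std::printf("%g %g\n", out[0], out[1]);  // 3 4
}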
@@ -14,7 +14,6 @@
 #include <ATen/ops/atan2_native.h>
 #include <ATen/ops/div_native.h>
 #include <ATen/ops/eq_native.h>
-#include <ATen/ops/floor_divide_native.h>
 #include <ATen/ops/fmod_native.h>
 #include <ATen/ops/ge_native.h>
 #include <ATen/ops/gt_native.h>
@@ -447,19 +446,8 @@ TORCH_IMPL_FUNC(pow_Scalar_out_mps)(const Scalar& base, const Tensor& exp, const
   }
 }

-Tensor& floor_divide_out_mps(const Tensor& self, const Tensor& other, Tensor& result) {
-  mps::div_mode_template(self, other, "floor", result, "floor_divide_out");
-  return result;
-}
-
-Tensor floor_divide_mps(const Tensor& self, const Tensor& other) {
-  Tensor output = at::empty_like(self);
-  mps::div_mode_template(self, other, "floor", output, "floor_divide");
-  return output;
-}
-
-Tensor& floor_divide_mps_(Tensor& self, const Tensor& other) {
-  return floor_divide_out_mps(self, other, self);
+static void div_floor_kernel_mps(TensorIteratorBase& iter) {
+  mps::div_mode_template(iter.input(0), iter.input(1), "floor", iter.output(0), "floor_divide_out");
 }

 TORCH_IMPL_FUNC(remainder_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
@@ -538,4 +526,6 @@ TORCH_IMPL_FUNC(xlogy_out_mps)(const Tensor& self, const Tensor& other, const Te
 TORCH_IMPL_FUNC(lerp_Scalar_mps)(const Tensor& self, const Tensor& end, const Scalar& weight, const Tensor& out) {
   mps::add_sub_lerp_template(self, end, weight, out, "lerp");
 }

+REGISTER_DISPATCH(div_floor_stub, &div_floor_kernel_mps);
 } // namespace at::native
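The three hand-written floor_divide entry points collapse into a single div_floor stub kernel. Floor division needs its own kernel because plain C/C++ integer division truncates toward zero while floor mode rounds toward negative infinity; the two disagree on negative operands:

#include <cmath>
#include <cstdio>

int main() {
  int a = -7, b = 2;
  int trunc_div = a / b;                                        // -3 (truncates)
  int floor_div = static_cast<int>(std::floor(double(a) / b));  // -4 (floors)
  std::printf("trunc=%d floor=%d\n", trunc_div, floor_div);
}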
@@ -60,9 +60,25 @@ static void _fused_sgd_with_momentum_kernel_mps_(TensorList params,
                                                  const bool is_first_step,
                                                  const std::optional<Tensor>& grad_scale,
                                                  const std::optional<Tensor>& found_inf) {
+  if (lr_tensor.is_cpu()) {
+    return _fused_sgd_with_momentum_kernel_mps_(params,
+                                                grads,
+                                                momentum_buffer_list,
+                                                weight_decay,
+                                                momentum,
+                                                lr_tensor.item<double>(),
+                                                dampening,
+                                                nesterov,
+                                                maximize,
+                                                is_first_step,
+                                                grad_scale,
+                                                found_inf);
+  }
   TORCH_CHECK_GT(momentum, 0);
   TORCH_CHECK(native::check_fast_path_restrictions({params, grads, momentum_buffer_list}));

+  TORCH_CHECK(lr_tensor.device() == params[0].device(), "lr must be on the same GPU device as the params");
+
   std::vector<std::vector<Tensor>> tensor_lists{params.vec(), grads.vec(), momentum_buffer_list.vec()};

   const auto kernel_name = "fused_sgd_momentum_" + scalarToMetalTypeString(params[0].scalar_type());
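When the learning rate arrives as a 0-dim CPU tensor, the new early return unwraps it with item<double>() and forwards to the scalar-lr overload; otherwise the tensor must already live on the same device as the params. A small libtorch illustration of the unwrap step (sketch only):

#include <torch/torch.h>
#include <cstdio>

int main() {
  torch::Tensor lr = torch::tensor(0.01, torch::kDouble);  // 0-dim CPU tensor
  double lr_scalar = lr.item<double>();  // host copy of the single value
  std::printf("lr = %f\n", lr_scalar);   // 0.010000
}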
@@ -16,10 +16,18 @@ static void i0_kernel_mps(TensorIteratorBase& iter) {
   lib.exec_unary_kernel(iter, "i0");
 }

+static void i0e_kernel_mps(TensorIteratorBase& iter) {
+  lib.exec_unary_kernel(iter, "i0e");
+}
+
 static void i1_kernel_mps(TensorIteratorBase& iter) {
   lib.exec_unary_kernel(iter, "i1");
 }

+static void i1e_kernel_mps(TensorIteratorBase& iter) {
+  lib.exec_unary_kernel(iter, "i1e");
+}
+
 static void spherical_bessel_j0_kernel_mps(TensorIteratorBase& iter) {
   lib.exec_unary_kernel(iter, "spherical_bessel_j0");
 }
@@ -28,8 +36,40 @@ static void entr_kernel_mps(TensorIteratorBase& iter) {
   lib.exec_unary_kernel(iter, "entr");
 }

+static void bessel_j0_kernel_mps(TensorIteratorBase& iter) {
+  lib.exec_unary_kernel(iter, "bessel_j0_forward");
+}
+
+static void bessel_j1_kernel_mps(TensorIteratorBase& iter) {
+  lib.exec_unary_kernel(iter, "bessel_j1_forward");
+}
+
+static void modified_bessel_i0_kernel_mps(TensorIteratorBase& iter) {
+  lib.exec_unary_kernel(iter, "modified_bessel_i0_forward");
+}
+
+static void modified_bessel_i1_kernel_mps(TensorIteratorBase& iter) {
+  lib.exec_unary_kernel(iter, "modified_bessel_i1_forward");
+}
+
+static void bessel_y0_kernel_mps(TensorIteratorBase& iter) {
+  lib.exec_unary_kernel(iter, "bessel_y0_forward");
+}
+
+static void bessel_y1_kernel_mps(TensorIteratorBase& iter) {
+  lib.exec_unary_kernel(iter, "bessel_y1_forward");
+}
+
 REGISTER_DISPATCH(i0_stub, &i0_kernel_mps)
+REGISTER_DISPATCH(special_i0e_stub, &i0e_kernel_mps)
 REGISTER_DISPATCH(special_i1_stub, &i1_kernel_mps)
+REGISTER_DISPATCH(special_i1e_stub, &i1e_kernel_mps)
+REGISTER_DISPATCH(special_bessel_j0_stub, &bessel_j0_kernel_mps)
+REGISTER_DISPATCH(special_bessel_j1_stub, &bessel_j1_kernel_mps)
+REGISTER_DISPATCH(special_modified_bessel_i0_stub, &modified_bessel_i0_kernel_mps)
+REGISTER_DISPATCH(special_modified_bessel_i1_stub, &modified_bessel_i1_kernel_mps)
+REGISTER_DISPATCH(special_bessel_y0_stub, &bessel_y0_kernel_mps)
+REGISTER_DISPATCH(special_bessel_y1_stub, &bessel_y1_kernel_mps)
 REGISTER_DISPATCH(special_spherical_bessel_j0_stub, &spherical_bessel_j0_kernel_mps)
 REGISTER_DISPATCH(special_entr_stub, &entr_kernel_mps)
 } // namespace at::native
Some files were not shown because too many files have changed in this diff.