mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-03 15:35:04 +08:00
Compare commits
295 Commits
export-D83
...
rms_norm_p
| Author | SHA1 | Date | |
|---|---|---|---|
| 9f269c794c | |||
| aa9c96af04 | |||
| c3b71d5499 | |||
| 1e3600b528 | |||
| fee7624bd6 | |||
| 24e94e021a | |||
| 69be99ee51 | |||
| 034e951b0c | |||
| 160ab53dd5 | |||
| 5bcfdae71d | |||
| 4e8ba37ce3 | |||
| 26534e9809 | |||
| 657f8c3e21 | |||
| b0831930ed | |||
| c01636e1bc | |||
| fd68d409ad | |||
| 0d3a4f7155 | |||
| 108bb224f7 | |||
| fc8ac1216c | |||
| 030de07aff | |||
| 7d67a41db4 | |||
| 85b035ca9c | |||
| 267d0197bf | |||
| 1dec8a67a8 | |||
| 797cd80b26 | |||
| 7d39401fa0 | |||
| e3ae0594d1 | |||
| f1e4c42b6e | |||
| d3e511f07c | |||
| d3be06cbdc | |||
| 1129605415 | |||
| a6b1ef1717 | |||
| 12577064dd | |||
| 24b6eb7727 | |||
| 32066772b3 | |||
| 47f0024310 | |||
| 98d640bb11 | |||
| 5d288bc3f7 | |||
| bfb47ec50e | |||
| 7a0cd8ed09 | |||
| 984e64b2cd | |||
| b9bcb37f40 | |||
| 7e3b9d105e | |||
| 45c3f02d69 | |||
| f5543e3741 | |||
| 5fc2c7a2a1 | |||
| 7692fa09cd | |||
| df71b70727 | |||
| 80ba6e458f | |||
| 0d50e5d8d4 | |||
| 99b05d1b78 | |||
| f911d64750 | |||
| 52db60170d | |||
| 56838bad5f | |||
| ad3a56ab98 | |||
| a7fd0b4001 | |||
| 181ee3bd42 | |||
| 0ec0549823 | |||
| 8221ee6db9 | |||
| b939de26d1 | |||
| 694db5f549 | |||
| 639a0b1239 | |||
| 398775a43e | |||
| fcd5f8c352 | |||
| 4acc66f119 | |||
| 8f40a0c634 | |||
| a5c3c08d10 | |||
| a553ea9ea4 | |||
| ba71e9ca9a | |||
| 694d205143 | |||
| 629293f568 | |||
| c37802a8c4 | |||
| 0a3ac47c0a | |||
| e83be7042e | |||
| fb545fb068 | |||
| 2df2c316e2 | |||
| 08b0a8f11a | |||
| 3f1824742c | |||
| bbb7d2270b | |||
| 6a5a436624 | |||
| ad559072db | |||
| ad02bd13df | |||
| 7563f61cc8 | |||
| fa8e073a4e | |||
| 95b5534773 | |||
| 9ee1afbf66 | |||
| f60751024e | |||
| 2de4cf2102 | |||
| 369f2d6951 | |||
| 32920926f0 | |||
| 39e5cdddf7 | |||
| 2829d48bd1 | |||
| f1af679270 | |||
| d46d8d6f54 | |||
| a5335263d3 | |||
| 79aee77381 | |||
| f5cb9a4c68 | |||
| f20bf77874 | |||
| 75f798e05b | |||
| 476b149a00 | |||
| 845da9c817 | |||
| 0918bf321c | |||
| 90519402c2 | |||
| 791ca80d3a | |||
| 5cbdade914 | |||
| 0187db88d4 | |||
| 311ea0dec0 | |||
| cf7756da38 | |||
| e380028a51 | |||
| b4403bfc62 | |||
| 12c12466b0 | |||
| f4d05feb7a | |||
| 7481622237 | |||
| b2a0f90501 | |||
| 14d4a77495 | |||
| 3d4ca228be | |||
| c3d205d598 | |||
| c54e2c5b41 | |||
| c3047938a0 | |||
| d2eff5d454 | |||
| 972030fe2e | |||
| d401e4e70a | |||
| f1a3440715 | |||
| 82ff07c788 | |||
| e0604d3170 | |||
| 8101fd46d4 | |||
| 3d4a2d8a93 | |||
| 59ddfb69a7 | |||
| bebabd7fce | |||
| 56a809aa07 | |||
| b33762bd2f | |||
| f02708c2be | |||
| a186aa8d6c | |||
| 48c3b71ecc | |||
| 2c9f877fa7 | |||
| fc540cefd4 | |||
| d1a6e006e0 | |||
| fa560e1158 | |||
| a3fe1825aa | |||
| deb776319b | |||
| d7040e6d75 | |||
| 35f3572fa4 | |||
| bc5111cd8d | |||
| 398fdd32bb | |||
| 5fd1d41e62 | |||
| c594950e86 | |||
| 14102fb1f3 | |||
| 5cdbcb5233 | |||
| eae701cad0 | |||
| 8f51556daa | |||
| c0bbda37e8 | |||
| fefb546b91 | |||
| d6d6fa26f5 | |||
| 467c21ad9a | |||
| 4a94591321 | |||
| 5e7272b60a | |||
| 1dd6b76914 | |||
| 284716a691 | |||
| 8b188647cf | |||
| 96b61844a7 | |||
| 1b655a87ef | |||
| cb6966704c | |||
| 17d5aa4767 | |||
| cde81e92b9 | |||
| bfc2050db9 | |||
| c5701d0ab5 | |||
| 23669d02a6 | |||
| e8d887ae3f | |||
| 774abb018e | |||
| 0e19561e23 | |||
| 1fa520ea65 | |||
| c2e3cc7aed | |||
| 5849eea129 | |||
| 924482a6f6 | |||
| 20be077085 | |||
| 94eaeb9cb8 | |||
| 753d9bd806 | |||
| dd1fe7c22f | |||
| 695cb0d342 | |||
| 1764f3a9c8 | |||
| c9eabadc5e | |||
| c201a1cab1 | |||
| e105a47575 | |||
| aab27b051a | |||
| f8b4c00294 | |||
| 877f126e35 | |||
| 4fada51ada | |||
| 76b2c37045 | |||
| adedf26e21 | |||
| bea89d6060 | |||
| 48e672d149 | |||
| afaaaa314c | |||
| 84fe848503 | |||
| 56afad4eb3 | |||
| 2a058bfecf | |||
| 31e42eb732 | |||
| a9b29caeae | |||
| 0d4992c170 | |||
| b060e5c131 | |||
| 6d5e651a50 | |||
| 3cc5949dc2 | |||
| f167fd09fa | |||
| 68b3984b77 | |||
| a1eb6b5538 | |||
| f36f372acc | |||
| d9483d4c8d | |||
| fea819ed08 | |||
| 84a2715d34 | |||
| 572cc12b42 | |||
| 1fdef664a5 | |||
| 08ae55021e | |||
| 551921d484 | |||
| b5189e269e | |||
| 3895ce093f | |||
| 8aa087a29d | |||
| 7379972cc0 | |||
| b903018c26 | |||
| 21b48f8dfa | |||
| 009ea77234 | |||
| 0e46a10aa7 | |||
| a25818cf7e | |||
| e3e93c7107 | |||
| 1abfa5f70b | |||
| 687c15c0b3 | |||
| 895795f07c | |||
| 2dc56456cb | |||
| 8110ce02a2 | |||
| 43c30f607e | |||
| 5ebf74a655 | |||
| acd936cc1a | |||
| a4a0378e6b | |||
| ac841267a1 | |||
| 0eacd934bc | |||
| 5016e7b2eb | |||
| 544b443ea1 | |||
| 3041ede082 | |||
| 34d6ef7022 | |||
| 110efe4df4 | |||
| e137cd0a10 | |||
| be28329710 | |||
| 85a7c745aa | |||
| 32fe4f681e | |||
| ebb2b2e894 | |||
| 13413b3b07 | |||
| 5d0b3e28dc | |||
| 9139368b64 | |||
| 02095cc09d | |||
| 65868156c6 | |||
| f93ea7dab1 | |||
| a77f5d9a00 | |||
| ff46d5a79b | |||
| f452edd782 | |||
| ea698e8bfc | |||
| 7f7a28046b | |||
| d8283a317a | |||
| e0ca3049c0 | |||
| 8417981c96 | |||
| 06e71c8558 | |||
| a76b59cc45 | |||
| 74336f8c77 | |||
| 236ce736a1 | |||
| 17bdb232e1 | |||
| add37bacda | |||
| 1425b40f29 | |||
| 8af9ed0824 | |||
| 7045aab143 | |||
| 7ae8aaf4c0 | |||
| f2450798cd | |||
| 46d17e8871 | |||
| dc011d3203 | |||
| e95920e3e6 | |||
| 5e769ff867 | |||
| 0ae3e30621 | |||
| 47f50cfd45 | |||
| a51f877287 | |||
| b44423bbb4 | |||
| 8e1e4ee8e0 | |||
| 1e836bc769 | |||
| 9a91486e45 | |||
| 92381a5aa7 | |||
| 2a5f87decf | |||
| 840d63c12d | |||
| 2ce894bb1d | |||
| 47ec1e9990 | |||
| 904abfc2ca | |||
| 7d16fcf2df | |||
| 483845a9c4 | |||
| 60bcb4ee88 | |||
| ee7434be82 | |||
| d049ed2cb1 | |||
| 9901d44418 | |||
| 6096c0fc74 | |||
| f6951cb8ea | |||
| 8887a33ede | |||
| 36a48e7e6d |
@ -195,13 +195,16 @@ case "$tag" in
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-xpu-n-py3)
|
||||
pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
XPU_VERSION=2025.2
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
if [[ $tag =~ "benchmarks" ]]; then
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
fi
|
||||
;;
|
||||
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
|
||||
@ -49,12 +49,20 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
|
||||
export SYSROOT_DEP="sysroot_linux-64=2.17"
|
||||
fi
|
||||
|
||||
# Install correct Python version
|
||||
# Also ensure sysroot is using a modern GLIBC to match system compilers
|
||||
if [ "$ANACONDA_PYTHON_VERSION" = "3.14" ]; then
|
||||
as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
|
||||
python="3.14.0" \
|
||||
${SYSROOT_DEP} \
|
||||
-c conda-forge
|
||||
else
|
||||
# Install correct Python version
|
||||
# Also ensure sysroot is using a modern GLIBC to match system compilers
|
||||
as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
|
||||
python="$ANACONDA_PYTHON_VERSION" \
|
||||
${SYSROOT_DEP}
|
||||
|
||||
fi
|
||||
# libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
|
||||
# which is provided in libstdcxx 12 and up.
|
||||
conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge
|
||||
|
||||
@ -10,7 +10,7 @@ else
|
||||
arch_path='sbsa'
|
||||
fi
|
||||
|
||||
NVSHMEM_VERSION=3.3.24
|
||||
NVSHMEM_VERSION=3.4.5
|
||||
|
||||
function install_cuda {
|
||||
version=$1
|
||||
@ -150,7 +150,7 @@ function install_130 {
|
||||
CUDNN_VERSION=9.13.0.50
|
||||
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
|
||||
# install CUDA 13.0 in the same container
|
||||
install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
|
||||
install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux
|
||||
|
||||
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
|
||||
install_cudnn 13 $CUDNN_VERSION
|
||||
|
||||
@ -40,11 +40,7 @@ EOF
|
||||
|
||||
# Default url values
|
||||
rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
|
||||
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
|
||||
|
||||
# Add amdgpu repository
|
||||
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
|
||||
echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
|
||||
|
||||
# Add rocm repository
|
||||
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
|
||||
|
||||
@ -138,10 +138,12 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
|
||||
#test_binary_ufuncs.py
|
||||
numpy==1.22.4; python_version == "3.10"
|
||||
numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
|
||||
numpy==2.1.2; python_version >= "3.13"
|
||||
numpy==2.1.2; python_version >= "3.13" and python_version < "3.14"
|
||||
numpy==2.3.4; python_version >= "3.14"
|
||||
|
||||
pandas==2.0.3; python_version < "3.13"
|
||||
pandas==2.2.3; python_version >= "3.13"
|
||||
pandas==2.2.3; python_version >= "3.13" and python_version < "3.14"
|
||||
pandas==2.3.3; python_version >= "3.14"
|
||||
|
||||
#onnxruntime
|
||||
#Description: scoring engine for Open Neural Network Exchange (ONNX) models
|
||||
@ -153,7 +155,8 @@ opt-einsum==3.3
|
||||
#Pinned versions: 3.3
|
||||
#test that import: test_linalg.py
|
||||
|
||||
optree==0.13.0
|
||||
optree==0.13.0 ; python_version < "3.14"
|
||||
optree==0.17.0 ; python_version >= "3.14"
|
||||
#Description: A library for tree manipulation
|
||||
#Pinned versions: 0.13.0 (python < 3.14), 0.17.0 (python >= 3.14)
|
||||
#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
|
||||
@ -252,7 +255,8 @@ scikit-image==0.22.0
|
||||
#test that import:
|
||||
|
||||
scipy==1.10.1 ; python_version <= "3.11"
|
||||
scipy==1.14.1 ; python_version >= "3.12"
|
||||
scipy==1.14.1 ; python_version > "3.11" and python_version < "3.14"
|
||||
scipy==1.16.2 ; python_version >= "3.14"
|
||||
# Pin SciPy because of failing distribution tests (see #60347)
|
||||
#Description: scientific python
|
||||
#Pinned versions: 1.10.1 (python <= 3.11), 1.14.1 (python 3.12-3.13), 1.16.2 (python >= 3.14)
|
||||
@ -324,7 +328,8 @@ pywavelets==1.7.0 ; python_version >= "3.12"
|
||||
#Pinned versions: 1.4.1
|
||||
#test that import:
|
||||
|
||||
lxml==5.3.0
|
||||
lxml==5.3.0 ; python_version < "3.14"
|
||||
lxml==6.0.2 ; python_version >= "3.14"
|
||||
#Description: This is a requirement of unittest-xml-reporting
|
||||
|
||||
PyGithub==2.3.0
|
||||
@ -334,7 +339,9 @@ sympy==1.13.3
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
|
||||
onnx==1.19.1
|
||||
onnx==1.19.1 ; python_version < "3.14"
|
||||
# Unpin once Python 3.14 is supported. See onnxruntime issue 26309.
|
||||
onnx==1.18.0 ; python_version == "3.14"
|
||||
#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
@ -359,7 +366,7 @@ pwlf==2.2.1
|
||||
#test that import: test_sac_estimator.py
|
||||
|
||||
# To build PyTorch itself
|
||||
pyyaml==6.0.2
|
||||
pyyaml==6.0.3
|
||||
pyzstd
|
||||
setuptools==78.1.1
|
||||
packaging==23.1
|
||||
|
||||
@ -54,12 +54,15 @@ ENV OPENSSL_DIR /opt/openssl
|
||||
RUN rm install_openssl.sh
|
||||
|
||||
ARG INDUCTOR_BENCHMARKS
|
||||
ARG ANACONDA_PYTHON_VERSION
|
||||
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
|
||||
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
|
||||
COPY ./common/common_utils.sh common_utils.sh
|
||||
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
|
||||
COPY ci_commit_pins/timm.txt timm.txt
|
||||
COPY ci_commit_pins/torchbench.txt torchbench.txt
|
||||
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
|
||||
|
||||
# Install XPU Dependencies
|
||||
ARG XPU_VERSION
|
||||
|
||||
@ -100,6 +100,8 @@ COPY ./common/common_utils.sh common_utils.sh
|
||||
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
|
||||
COPY ci_commit_pins/timm.txt timm.txt
|
||||
COPY ci_commit_pins/torchbench.txt torchbench.txt
|
||||
# Only build aoti cpp tests when INDUCTOR_BENCHMARKS is set to True
|
||||
ENV BUILD_AOT_INDUCTOR_TEST ${INDUCTOR_BENCHMARKS}
|
||||
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
|
||||
|
||||
|
||||
@ -6,7 +6,7 @@ dependencies = [
|
||||
"GitPython==3.1.45",
|
||||
"docker==7.1.0",
|
||||
"pytest==7.3.2",
|
||||
"uv==0.9.5"
|
||||
"uv==0.9.6"
|
||||
]
|
||||
|
||||
[tool.setuptools]
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
SHELL=/usr/bin/env bash
|
||||
|
||||
DOCKER_CMD ?= docker
|
||||
DESIRED_ROCM ?= 7.0
|
||||
DESIRED_ROCM ?= 7.1
|
||||
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
|
||||
PACKAGE_NAME = magma-rocm
|
||||
# inherit this from underlying docker image, do not pass this env var to docker
|
||||
@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
|
||||
magma-rocm/build_magma.sh
|
||||
|
||||
.PHONY: all
|
||||
all: magma-rocm71
|
||||
all: magma-rocm70
|
||||
all: magma-rocm64
|
||||
|
||||
@ -24,6 +25,11 @@ clean:
|
||||
$(RM) -r magma-*
|
||||
$(RM) -r output
|
||||
|
||||
.PHONY: magma-rocm71
|
||||
magma-rocm71: DESIRED_ROCM := 7.1
|
||||
magma-rocm71:
|
||||
$(DOCKER_RUN)
|
||||
|
||||
.PHONY: magma-rocm70
|
||||
magma-rocm70: DESIRED_ROCM := 7.0
|
||||
magma-rocm70:
|
||||
|
||||
@ -6,8 +6,8 @@ set -eou pipefail
|
||||
# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
|
||||
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
|
||||
# https://github.com/icl-utk-edu/magma/pull/65
|
||||
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
|
||||
# post merge of https://github.com/icl-utk-edu/magma/pull/65
|
||||
MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
|
||||
|
||||
# Folders for the build
|
||||
PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
|
||||
@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE
|
||||
|
||||
# Fetch magma sources and verify checksum
|
||||
pushd ${PACKAGE_DIR}
|
||||
git clone https://github.com/jeffdaily/magma
|
||||
git clone https://github.com/icl-utk-edu/magma
|
||||
pushd magma
|
||||
git checkout ${MAGMA_VERSION}
|
||||
popd
|
||||
|
||||
@ -426,7 +426,7 @@ fi
|
||||
if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then
|
||||
# export test times so that potential sharded tests that'll branch off this build will use consistent data
|
||||
# don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
|
||||
python tools/stats/export_test_times.py
|
||||
PYTHONPATH=. python tools/stats/export_test_times.py
|
||||
fi
|
||||
# don't do this for bazel or s390x or riscv64 as they don't use sccache
|
||||
if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
|
||||
|
||||
@ -460,28 +460,18 @@ test_inductor_shard() {
|
||||
--verbose
|
||||
}
|
||||
|
||||
test_inductor_aoti() {
|
||||
# docker build uses bdist_wheel which does not work with test_aot_inductor
|
||||
# TODO: need a faster way to build
|
||||
test_inductor_aoti_cpp() {
|
||||
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
|
||||
# We need to hipify before building again
|
||||
python3 tools/amd_build/build_amd.py
|
||||
fi
|
||||
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
|
||||
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
|
||||
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
|
||||
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
|
||||
else
|
||||
BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
|
||||
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
|
||||
fi
|
||||
|
||||
# aoti cmake custom command requires `torch` to be installed
|
||||
# initialize the cmake build cache and install torch
|
||||
/usr/bin/env "${BUILD_COMMAND[@]}"
|
||||
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
|
||||
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
|
||||
|
||||
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
|
||||
}
|
||||
|
||||
@ -582,6 +572,8 @@ fi
|
||||
|
||||
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
|
||||
DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
|
||||
elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
|
||||
DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
|
||||
else
|
||||
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
|
||||
fi
|
||||
@ -675,6 +667,8 @@ test_perf_for_dashboard() {
|
||||
device=cuda_b200
|
||||
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
|
||||
device=rocm
|
||||
elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
|
||||
device=xpu
|
||||
fi
|
||||
|
||||
for mode in "${modes[@]}"; do
|
||||
@ -1767,7 +1761,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
||||
else
|
||||
# Do this after checkout_install_torchbench to ensure we clobber any
|
||||
# nightlies that torchbench may pull in
|
||||
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
|
||||
if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
|
||||
install_torchrec_and_fbgemm
|
||||
fi
|
||||
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
|
||||
@ -1776,7 +1770,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
|
||||
install_torchvision
|
||||
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
|
||||
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
|
||||
test_inductor_aoti
|
||||
test_inductor_aoti_cpp
|
||||
fi
|
||||
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
|
||||
install_torchvision
|
||||
|
||||
@ -7,12 +7,9 @@ if "%DESIRED_PYTHON%" == "3.13t" (
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
|
||||
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
|
||||
set PYTHON_EXEC="python3.13t"
|
||||
) else if "%DESIRED_PYTHON%"=="3.14" (
|
||||
echo Python version is set to 3.14 or 3.14t
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
|
||||
) else if "%DESIRED_PYTHON%"=="3.14t" (
|
||||
echo Python version is set to 3.14 or 3.14t
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe"
|
||||
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
|
||||
set PYTHON_EXEC="python3.14t"
|
||||
) else (
|
||||
|
||||
@ -1,3 +1,8 @@
|
||||
---
|
||||
name: docstring
|
||||
description: Write docstrings for PyTorch functions and methods following PyTorch conventions. Use when writing or updating docstrings in PyTorch code.
|
||||
---
|
||||
|
||||
# PyTorch Docstring Writing Guide
|
||||
|
||||
This skill describes how to write docstrings for functions and methods in the PyTorch project, following the conventions in `torch/_tensor_docs.py` and `torch/nn/functional.py`.
|
||||
385
.claude/skills/skill-writer/SKILL.md
Normal file
385
.claude/skills/skill-writer/SKILL.md
Normal file
@ -0,0 +1,385 @@
|
||||
---
|
||||
name: skill-writer
|
||||
description: Guide users through creating Agent Skills for Claude Code. Use when the user wants to create, write, author, or design a new Skill, or needs help with SKILL.md files, frontmatter, or skill structure.
|
||||
---
|
||||
|
||||
# Skill Writer
|
||||
|
||||
This Skill helps you create well-structured Agent Skills for Claude Code that follow best practices and validation requirements.
|
||||
|
||||
## When to use this Skill
|
||||
|
||||
Use this Skill when:
|
||||
- Creating a new Agent Skill
|
||||
- Writing or updating SKILL.md files
|
||||
- Designing skill structure and frontmatter
|
||||
- Troubleshooting skill discovery issues
|
||||
- Converting existing prompts or workflows into Skills
|
||||
|
||||
## Instructions
|
||||
|
||||
### Step 1: Determine Skill scope
|
||||
|
||||
First, understand what the Skill should do:
|
||||
|
||||
1. **Ask clarifying questions**:
|
||||
- What specific capability should this Skill provide?
|
||||
- When should Claude use this Skill?
|
||||
- What tools or resources does it need?
|
||||
- Is this for personal use or team sharing?
|
||||
|
||||
2. **Keep it focused**: One Skill = one capability
|
||||
- Good: "PDF form filling", "Excel data analysis"
|
||||
- Too broad: "Document processing", "Data tools"
|
||||
|
||||
### Step 2: Choose Skill location
|
||||
|
||||
Determine where to create the Skill:
|
||||
|
||||
**Personal Skills** (`~/.claude/skills/`):
|
||||
- Individual workflows and preferences
|
||||
- Experimental Skills
|
||||
- Personal productivity tools
|
||||
|
||||
**Project Skills** (`.claude/skills/`):
|
||||
- Team workflows and conventions
|
||||
- Project-specific expertise
|
||||
- Shared utilities (committed to git)
|
||||
|
||||
### Step 3: Create Skill structure
|
||||
|
||||
Create the directory and files:
|
||||
|
||||
```bash
|
||||
# Personal
|
||||
mkdir -p ~/.claude/skills/skill-name
|
||||
|
||||
# Project
|
||||
mkdir -p .claude/skills/skill-name
|
||||
```
|
||||
|
||||
For multi-file Skills:
|
||||
```
|
||||
skill-name/
|
||||
├── SKILL.md (required)
|
||||
├── reference.md (optional)
|
||||
├── examples.md (optional)
|
||||
├── scripts/
|
||||
│ └── helper.py (optional)
|
||||
└── templates/
|
||||
└── template.txt (optional)
|
||||
```
|
||||
|
||||
### Step 4: Write SKILL.md frontmatter
|
||||
|
||||
Create YAML frontmatter with required fields:
|
||||
|
||||
```yaml
|
||||
---
|
||||
name: skill-name
|
||||
description: Brief description of what this does and when to use it
|
||||
---
|
||||
```
|
||||
|
||||
**Field requirements**:
|
||||
|
||||
- **name**:
|
||||
- Lowercase letters, numbers, hyphens only
|
||||
- Max 64 characters
|
||||
- Must match directory name
|
||||
- Good: `pdf-processor`, `git-commit-helper`
|
||||
- Bad: `PDF_Processor`, `Git Commits!`
|
||||
|
||||
- **description**:
|
||||
- Max 1024 characters
|
||||
- Include BOTH what it does AND when to use it
|
||||
- Use specific trigger words users would say
|
||||
- Mention file types, operations, and context
|
||||
|
||||
**Optional frontmatter fields**:
|
||||
|
||||
- **allowed-tools**: Restrict tool access (comma-separated list)
|
||||
```yaml
|
||||
allowed-tools: Read, Grep, Glob
|
||||
```
|
||||
Use for:
|
||||
- Read-only Skills
|
||||
- Security-sensitive workflows
|
||||
- Limited-scope operations
|
||||
|
||||
### Step 5: Write effective descriptions
|
||||
|
||||
The description is critical for Claude to discover your Skill.
|
||||
|
||||
**Formula**: `[What it does] + [When to use it] + [Key triggers]`
|
||||
|
||||
**Examples**:
|
||||
|
||||
✅ **Good**:
|
||||
```yaml
|
||||
description: Extract text and tables from PDF files, fill forms, merge documents. Use when working with PDF files or when the user mentions PDFs, forms, or document extraction.
|
||||
```
|
||||
|
||||
✅ **Good**:
|
||||
```yaml
|
||||
description: Analyze Excel spreadsheets, create pivot tables, and generate charts. Use when working with Excel files, spreadsheets, or analyzing tabular data in .xlsx format.
|
||||
```
|
||||
|
||||
❌ **Too vague**:
|
||||
```yaml
|
||||
description: Helps with documents
|
||||
description: For data analysis
|
||||
```
|
||||
|
||||
**Tips**:
|
||||
- Include specific file extensions (.pdf, .xlsx, .json)
|
||||
- Mention common user phrases ("analyze", "extract", "generate")
|
||||
- List concrete operations (not generic verbs)
|
||||
- Add context clues ("Use when...", "For...")
|
||||
|
||||
### Step 6: Structure the Skill content
|
||||
|
||||
Use clear Markdown sections:
|
||||
|
||||
```markdown
|
||||
# Skill Name
|
||||
|
||||
Brief overview of what this Skill does.
|
||||
|
||||
## Quick start
|
||||
|
||||
Provide a simple example to get started immediately.
|
||||
|
||||
## Instructions
|
||||
|
||||
Step-by-step guidance for Claude:
|
||||
1. First step with clear action
|
||||
2. Second step with expected outcome
|
||||
3. Handle edge cases
|
||||
|
||||
## Examples
|
||||
|
||||
Show concrete usage examples with code or commands.
|
||||
|
||||
## Best practices
|
||||
|
||||
- Key conventions to follow
|
||||
- Common pitfalls to avoid
|
||||
- When to use vs. not use
|
||||
|
||||
## Requirements
|
||||
|
||||
List any dependencies or prerequisites:
|
||||
```bash
|
||||
pip install package-name
|
||||
```
|
||||
|
||||
## Advanced usage
|
||||
|
||||
For complex scenarios, see [reference.md](reference.md).
|
||||
```
|
||||
|
||||
### Step 7: Add supporting files (optional)
|
||||
|
||||
Create additional files for progressive disclosure:
|
||||
|
||||
**reference.md**: Detailed API docs, advanced options
|
||||
**examples.md**: Extended examples and use cases
|
||||
**scripts/**: Helper scripts and utilities
|
||||
**templates/**: File templates or boilerplate
|
||||
|
||||
Reference them from SKILL.md:
|
||||
```markdown
|
||||
For advanced usage, see [reference.md](reference.md).
|
||||
|
||||
Run the helper script:
|
||||
\`\`\`bash
|
||||
python scripts/helper.py input.txt
|
||||
\`\`\`
|
||||
```
|
||||
|
||||
### Step 8: Validate the Skill
|
||||
|
||||
Check these requirements:
|
||||
|
||||
✅ **File structure**:
|
||||
- [ ] SKILL.md exists in correct location
|
||||
- [ ] Directory name matches frontmatter `name`
|
||||
|
||||
✅ **YAML frontmatter**:
|
||||
- [ ] Opening `---` on line 1
|
||||
- [ ] Closing `---` before content
|
||||
- [ ] Valid YAML (no tabs, correct indentation)
|
||||
- [ ] `name` follows naming rules
|
||||
- [ ] `description` is specific and at most 1024 chars
|
||||
|
||||
✅ **Content quality**:
|
||||
- [ ] Clear instructions for Claude
|
||||
- [ ] Concrete examples provided
|
||||
- [ ] Edge cases handled
|
||||
- [ ] Dependencies listed (if any)
|
||||
|
||||
✅ **Testing**:
|
||||
- [ ] Description matches user questions
|
||||
- [ ] Skill activates on relevant queries
|
||||
- [ ] Instructions are clear and actionable
|
||||
|
||||
### Step 9: Test the Skill
|
||||
|
||||
1. **Restart Claude Code** (if running) to load the Skill
|
||||
|
||||
2. **Ask relevant questions** that match the description:
|
||||
```
|
||||
Can you help me extract text from this PDF?
|
||||
```
|
||||
|
||||
3. **Verify activation**: Claude should use the Skill automatically
|
||||
|
||||
4. **Check behavior**: Confirm Claude follows the instructions correctly
|
||||
|
||||
### Step 10: Debug if needed
|
||||
|
||||
If Claude doesn't use the Skill:
|
||||
|
||||
1. **Make description more specific**:
|
||||
- Add trigger words
|
||||
- Include file types
|
||||
- Mention common user phrases
|
||||
|
||||
2. **Check file location**:
|
||||
```bash
|
||||
ls ~/.claude/skills/skill-name/SKILL.md
|
||||
ls .claude/skills/skill-name/SKILL.md
|
||||
```
|
||||
|
||||
3. **Validate YAML**:
|
||||
```bash
|
||||
cat SKILL.md | head -n 10
|
||||
```
|
||||
|
||||
4. **Run debug mode**:
|
||||
```bash
|
||||
claude --debug
|
||||
```
|
||||
|
||||
## Common patterns
|
||||
|
||||
### Read-only Skill
|
||||
|
||||
```yaml
|
||||
---
|
||||
name: code-reader
|
||||
description: Read and analyze code without making changes. Use for code review, understanding codebases, or documentation.
|
||||
allowed-tools: Read, Grep, Glob
|
||||
---
|
||||
```
|
||||
|
||||
### Script-based Skill
|
||||
|
||||
```yaml
|
||||
---
|
||||
name: data-processor
|
||||
description: Process CSV and JSON data files with Python scripts. Use when analyzing data files or transforming datasets.
|
||||
---
|
||||
|
||||
# Data Processor
|
||||
|
||||
## Instructions
|
||||
|
||||
1. Use the processing script:
|
||||
\`\`\`bash
|
||||
python scripts/process.py input.csv --output results.json
|
||||
\`\`\`
|
||||
|
||||
2. Validate output with:
|
||||
\`\`\`bash
|
||||
python scripts/validate.py results.json
|
||||
\`\`\`
|
||||
```
|
||||
|
||||
### Multi-file Skill with progressive disclosure
|
||||
|
||||
```yaml
|
||||
---
|
||||
name: api-designer
|
||||
description: Design REST APIs following best practices. Use when creating API endpoints, designing routes, or planning API architecture.
|
||||
---
|
||||
|
||||
# API Designer
|
||||
|
||||
Quick start: See [examples.md](examples.md)
|
||||
|
||||
Detailed reference: See [reference.md](reference.md)
|
||||
|
||||
## Instructions
|
||||
|
||||
1. Gather requirements
|
||||
2. Design endpoints (see examples.md)
|
||||
3. Document with OpenAPI spec
|
||||
4. Review against best practices (see reference.md)
|
||||
```
|
||||
|
||||
## Best practices for Skill authors
|
||||
|
||||
1. **One Skill, one purpose**: Don't create mega-Skills
|
||||
2. **Specific descriptions**: Include trigger words users will say
|
||||
3. **Clear instructions**: Write for Claude, not humans
|
||||
4. **Concrete examples**: Show real code, not pseudocode
|
||||
5. **List dependencies**: Mention required packages in description
|
||||
6. **Test with teammates**: Verify activation and clarity
|
||||
7. **Version your Skills**: Document changes in content
|
||||
8. **Use progressive disclosure**: Put advanced details in separate files
|
||||
|
||||
## Validation checklist
|
||||
|
||||
Before finalizing a Skill, verify:
|
||||
|
||||
- [ ] Name uses only lowercase letters, numbers, and hyphens (max 64 chars)
|
||||
- [ ] Description is specific and at most 1024 chars
|
||||
- [ ] Description includes "what" and "when"
|
||||
- [ ] YAML frontmatter is valid
|
||||
- [ ] Instructions are step-by-step
|
||||
- [ ] Examples are concrete and realistic
|
||||
- [ ] Dependencies are documented
|
||||
- [ ] File paths use forward slashes
|
||||
- [ ] Skill activates on relevant queries
|
||||
- [ ] Claude follows instructions correctly
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
**Skill doesn't activate**:
|
||||
- Make description more specific with trigger words
|
||||
- Include file types and operations in description
|
||||
- Add "Use when..." clause with user phrases
|
||||
|
||||
**Multiple Skills conflict**:
|
||||
- Make descriptions more distinct
|
||||
- Use different trigger words
|
||||
- Narrow the scope of each Skill
|
||||
|
||||
**Skill has errors**:
|
||||
- Check YAML syntax (no tabs, proper indentation)
|
||||
- Verify file paths (use forward slashes)
|
||||
- Ensure scripts have execute permissions
|
||||
- List all dependencies
|
||||
|
||||
## Examples
|
||||
|
||||
See the documentation for complete examples:
|
||||
- Simple single-file Skill (commit-helper)
|
||||
- Skill with tool permissions (code-reviewer)
|
||||
- Multi-file Skill (pdf-processing)
|
||||
|
||||
## Output format
|
||||
|
||||
When creating a Skill, I will:
|
||||
|
||||
1. Ask clarifying questions about scope and requirements
|
||||
2. Suggest a Skill name and location
|
||||
3. Create the SKILL.md file with proper frontmatter
|
||||
4. Include clear instructions and examples
|
||||
5. Add supporting files if needed
|
||||
6. Provide testing instructions
|
||||
7. Validate against all requirements
|
||||
|
||||
The result will be a complete, working Skill that follows all best practices and validation rules.
|
||||
4
.github/actions/diskspace-cleanup/action.yml
vendored
4
.github/actions/diskspace-cleanup/action.yml
vendored
@ -27,7 +27,9 @@ runs:
|
||||
docker system prune -af
|
||||
diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
|
||||
if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
|
||||
echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
|
||||
diskspace_cutoff_int=$((diskspace_cutoff + 0))
|
||||
difference=$((100 - diskspace_cutoff_int))
|
||||
echo "Error: Available diskspace is less than $difference percent. Not enough diskspace."
|
||||
echo "$msg"
|
||||
exit 1
|
||||
else
|
||||
|
||||
2
.github/ci_commit_pins/audio.txt
vendored
2
.github/ci_commit_pins/audio.txt
vendored
@ -1 +1 @@
|
||||
69bbe7363897764f9e758d851cd0340147d27f94
|
||||
3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2
|
||||
|
||||
2
.github/ci_commit_pins/vision.txt
vendored
2
.github/ci_commit_pins/vision.txt
vendored
@ -1 +1 @@
|
||||
1752fe6809b74921644866275ab80244b96e80bc
|
||||
218d2ab791d437309f91e0486eb9fa7f00badc17
|
||||
|
||||
20
.github/merge_rules.yaml
vendored
20
.github/merge_rules.yaml
vendored
@ -540,6 +540,26 @@
|
||||
- Lint
|
||||
- pull
|
||||
|
||||
- name: PrivateUse1
|
||||
patterns:
|
||||
- torch/accelerator/**
|
||||
- torch/utils/backend_registration.py
|
||||
- torch/csrc/acc/**
|
||||
- torch/csrc/DeviceAccelerator.*
|
||||
- torch/csrc/profiler/standalone/privateuse1_observer.*
|
||||
- aten/src/ATen/DeviceAccelerator.*
|
||||
- aten/src/ATen/core/GeneratorForPrivateuseone.*
|
||||
- aten/src/ATen/detail/PrivateUse1HooksInterface.*
|
||||
- docs/source/accelerator/**
|
||||
- test/cpp_extensions/open_registration_extension/torch_openreg/**
|
||||
approved_by:
|
||||
- albanD
|
||||
- fffrog
|
||||
mandatory_checks_name:
|
||||
- EasyCLA
|
||||
- Lint
|
||||
- pull
|
||||
|
||||
- name: superuser
|
||||
patterns:
|
||||
- '*'
|
||||
|
||||
2
.github/pytorch-probot.yml
vendored
2
.github/pytorch-probot.yml
vendored
@ -19,6 +19,7 @@ ciflow_push_tags:
|
||||
- ciflow/inductor-perf-test-nightly-rocm-mi300
|
||||
- ciflow/inductor-perf-test-nightly-rocm-mi355
|
||||
- ciflow/inductor-perf-test-nightly-x86-zen
|
||||
- ciflow/inductor-perf-test-nightly-xpu
|
||||
- ciflow/inductor-periodic
|
||||
- ciflow/inductor-rocm
|
||||
- ciflow/linux-aarch64
|
||||
@ -26,6 +27,7 @@ ciflow_push_tags:
|
||||
- ciflow/nightly
|
||||
- ciflow/op-benchmark
|
||||
- ciflow/periodic
|
||||
- ciflow/periodic-rocm-mi200
|
||||
- ciflow/periodic-rocm-mi300
|
||||
- ciflow/pull
|
||||
- ciflow/quantization-periodic
|
||||
|
||||
117
.github/scripts/generate_binary_build_matrix.py
vendored
117
.github/scripts/generate_binary_build_matrix.py
vendored
@ -11,18 +11,24 @@ architectures:
|
||||
* Latest XPU
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
|
||||
SCRIPT_DIR = Path(__file__).absolute().parent
|
||||
REPO_ROOT = SCRIPT_DIR.parent.parent
|
||||
|
||||
|
||||
CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
|
||||
CUDA_STABLE = "12.8"
|
||||
CUDA_ARCHES_FULL_VERSION = {
|
||||
"12.6": "12.6.3",
|
||||
"12.8": "12.8.1",
|
||||
"12.9": "12.9.1",
|
||||
"13.0": "13.0.0",
|
||||
"13.0": "13.0.2",
|
||||
}
|
||||
CUDA_ARCHES_CUDNN_VERSION = {
|
||||
"12.6": "9",
|
||||
@ -31,7 +37,6 @@ CUDA_ARCHES_CUDNN_VERSION = {
|
||||
"13.0": "9",
|
||||
}
|
||||
|
||||
# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
|
||||
ROCM_ARCHES = ["6.4", "7.0"]
|
||||
|
||||
XPU_ARCHES = ["xpu"]
|
||||
@ -56,7 +61,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
|
||||
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
|
||||
@ -73,7 +78,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
|
||||
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
|
||||
@ -90,27 +95,27 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
|
||||
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
|
||||
),
|
||||
"13.0": (
|
||||
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | "
|
||||
"nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | "
|
||||
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
|
||||
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
|
||||
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
|
||||
"nvidia-cublas==13.1.0.3; platform_system == 'Linux' | "
|
||||
"nvidia-cufft==12.0.0.61; platform_system == 'Linux' | "
|
||||
"nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
|
||||
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
|
||||
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
|
||||
"nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | "
|
||||
"nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
|
||||
"nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
|
||||
"nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx==13.0.85; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | "
|
||||
"nvidia-cufile==1.15.1.6; platform_system == 'Linux'"
|
||||
),
|
||||
"xpu": (
|
||||
"intel-cmplr-lib-rt==2025.2.1 | "
|
||||
@ -137,9 +142,48 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
}
|
||||
|
||||
|
||||
def get_nccl_wheel_version(arch_version: str) -> str:
|
||||
import re
|
||||
# Used by tools/nightly.py
|
||||
PYTORCH_NIGHTLY_PIP_INDEX_URL = "https://download.pytorch.org/whl/nightly"
|
||||
NIGHTLY_SOURCE_MATRIX = {
|
||||
"cpu": dict(
|
||||
name="cpu",
|
||||
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cpu",
|
||||
supported_platforms=["Linux", "macOS", "Windows"],
|
||||
accelerator="cpu",
|
||||
)
|
||||
}
|
||||
CUDA_NIGHTLY_SOURCE_MATRIX = {
|
||||
f"cuda-{major}.{minor}": dict(
|
||||
name=f"cuda-{major}.{minor}",
|
||||
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cu{major}{minor}",
|
||||
supported_platforms=["Linux", "Windows"],
|
||||
accelerator="cuda",
|
||||
)
|
||||
for major, minor in (map(int, version.split(".")) for version in CUDA_ARCHES)
|
||||
}
|
||||
ROCM_NIGHTLY_SOURCE_MATRIX = {
|
||||
f"rocm-{major}.{minor}": dict(
|
||||
name=f"rocm-{major}.{minor}",
|
||||
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/rocm{major}.{minor}",
|
||||
supported_platforms=["Linux"],
|
||||
accelerator="rocm",
|
||||
)
|
||||
for major, minor in (map(int, version.split(".")) for version in ROCM_ARCHES)
|
||||
}
|
||||
XPU_NIGHTLY_SOURCE_MATRIX = {
|
||||
"xpu": dict(
|
||||
name="xpu",
|
||||
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/xpu",
|
||||
supported_platforms=["Linux"],
|
||||
accelerator="xpu",
|
||||
)
|
||||
}
|
||||
NIGHTLY_SOURCE_MATRIX.update(CUDA_NIGHTLY_SOURCE_MATRIX)
|
||||
NIGHTLY_SOURCE_MATRIX.update(ROCM_NIGHTLY_SOURCE_MATRIX)
|
||||
NIGHTLY_SOURCE_MATRIX.update(XPU_NIGHTLY_SOURCE_MATRIX)
|
||||
|
||||
|
||||
def get_nccl_wheel_version(arch_version: str) -> str:
|
||||
requirements = map(
|
||||
str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
|
||||
)
|
||||
@ -147,17 +191,14 @@ def get_nccl_wheel_version(arch_version: str) -> str:
|
||||
|
||||
|
||||
def read_nccl_pin(arch_version: str) -> str:
|
||||
from pathlib import Path
|
||||
|
||||
nccl_pin_path = os.path.join(
|
||||
Path(__file__).absolute().parents[2],
|
||||
".ci",
|
||||
"docker",
|
||||
"ci_commit_pins",
|
||||
f"nccl-cu{arch_version[:2]}.txt",
|
||||
nccl_pin_path = (
|
||||
REPO_ROOT
|
||||
/ ".ci"
|
||||
/ "docker"
|
||||
/ "ci_commit_pins"
|
||||
/ f"nccl-cu{arch_version[:2]}.txt"
|
||||
)
|
||||
with open(nccl_pin_path) as f:
|
||||
return f.read().strip()
|
||||
return nccl_pin_path.read_text().strip()
|
||||
|
||||
|
||||
def validate_nccl_dep_consistency(arch_version: str) -> None:
|
||||
@ -165,7 +206,8 @@ def validate_nccl_dep_consistency(arch_version: str) -> None:
|
||||
wheel_ver = get_nccl_wheel_version(arch_version)
|
||||
if not nccl_release_tag.startswith(f"v{wheel_ver}"):
|
||||
raise RuntimeError(
|
||||
f"{arch_version} NCCL release tag version {nccl_release_tag} does not correspond to wheel version {wheel_ver}"
|
||||
f"{arch_version} NCCL release tag version {nccl_release_tag} "
|
||||
f"does not correspond to wheel version {wheel_ver}"
|
||||
)
|
||||
|
||||
|
||||
@ -412,7 +454,14 @@ def generate_wheels_matrix(
|
||||
return ret
|
||||
|
||||
|
||||
validate_nccl_dep_consistency("13.0")
|
||||
validate_nccl_dep_consistency("12.9")
|
||||
validate_nccl_dep_consistency("12.8")
|
||||
validate_nccl_dep_consistency("12.6")
|
||||
arch_version = ""
|
||||
for arch_version in CUDA_ARCHES:
|
||||
validate_nccl_dep_consistency(arch_version)
|
||||
del arch_version
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Used by tools/nightly.py
|
||||
(SCRIPT_DIR / "nightly_source_matrix.json").write_text(
|
||||
json.dumps(NIGHTLY_SOURCE_MATRIX, indent=4) + "\n"
|
||||
)
|
||||
|
||||
13
.github/workflows/_xpu-test.yml
vendored
13
.github/workflows/_xpu-test.yml
vendored
@ -38,6 +38,10 @@ on:
|
||||
default: ""
|
||||
description: |
|
||||
List of tests to include (empty string implies default list)
|
||||
dashboard-tag:
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
disable-monitor:
|
||||
description: |
|
||||
[Experimental] Disable utilization monitoring for tests.
|
||||
@ -58,6 +62,11 @@ on:
|
||||
required: false
|
||||
type: number
|
||||
default: 1
|
||||
secrets:
|
||||
HUGGING_FACE_HUB_TOKEN:
|
||||
required: false
|
||||
description: |
|
||||
HF Auth token to avoid rate limits when downloading models or datasets from hub
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
@ -196,6 +205,8 @@ jobs:
|
||||
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
|
||||
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
|
||||
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
|
||||
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
|
||||
run: |
|
||||
# Fetch aws credential from IMDs
|
||||
@ -246,6 +257,8 @@ jobs:
|
||||
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
|
||||
-e TESTS_TO_INCLUDE \
|
||||
-e ZE_AFFINITY_MASK \
|
||||
-e HUGGING_FACE_HUB_TOKEN \
|
||||
-e DASHBOARD_TAG \
|
||||
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
|
||||
--ulimit stack=10485760:83886080 \
|
||||
--ulimit core=0 \
|
||||
|
||||
2
.github/workflows/build-almalinux-images.yml
vendored
2
.github/workflows/build-almalinux-images.yml
vendored
@ -36,7 +36,7 @@ jobs:
|
||||
runs-on: linux.9xlarge.ephemeral
|
||||
strategy:
|
||||
matrix:
|
||||
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
|
||||
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "rocm7.1", "cpu"]
|
||||
steps:
|
||||
- name: Build docker image
|
||||
uses: pytorch/pytorch/.github/actions/binary-docker-build@main
|
||||
|
||||
1
.github/workflows/build-libtorch-images.yml
vendored
1
.github/workflows/build-libtorch-images.yml
vendored
@ -54,6 +54,7 @@ jobs:
|
||||
{ tag: "cuda12.6" },
|
||||
{ tag: "rocm6.4" },
|
||||
{ tag: "rocm7.0" },
|
||||
{ tag: "rocm7.1" },
|
||||
{ tag: "cpu" },
|
||||
]
|
||||
steps:
|
||||
|
||||
2
.github/workflows/build-magma-rocm-linux.yml
vendored
2
.github/workflows/build-magma-rocm-linux.yml
vendored
@ -34,7 +34,7 @@ jobs:
|
||||
id-token: write
|
||||
strategy:
|
||||
matrix:
|
||||
rocm_version: ["70", "64"]
|
||||
rocm_version: ["71", "70", "64"]
|
||||
steps:
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
1
.github/workflows/build-manywheel-images.yml
vendored
1
.github/workflows/build-manywheel-images.yml
vendored
@ -56,6 +56,7 @@ jobs:
|
||||
{ name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "rocm7.1", runner: "linux.9xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" },
|
||||
|
||||
2
.github/workflows/docker-builds.yml
vendored
2
.github/workflows/docker-builds.yml
vendored
@ -57,6 +57,7 @@ jobs:
|
||||
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
|
||||
pytorch-linux-jammy-py3.10-clang12,
|
||||
pytorch-linux-jammy-py3.13-clang12,
|
||||
pytorch-linux-jammy-py3.14-clang12,
|
||||
pytorch-linux-jammy-rocm-n-py3,
|
||||
pytorch-linux-noble-rocm-n-py3,
|
||||
pytorch-linux-jammy-rocm-n-py3-benchmarks,
|
||||
@ -66,6 +67,7 @@ jobs:
|
||||
pytorch-linux-jammy-py3.12-halide,
|
||||
pytorch-linux-jammy-xpu-n-1-py3,
|
||||
pytorch-linux-jammy-xpu-n-py3,
|
||||
pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
|
||||
pytorch-linux-jammy-py3-clang18-asan,
|
||||
pytorch-linux-jammy-py3-clang12-onnx,
|
||||
pytorch-linux-jammy-linter,
|
||||
|
||||
56
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
56
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
@ -132,7 +132,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -178,7 +178,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -224,7 +224,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -270,7 +270,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -381,7 +381,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -427,7 +427,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -473,7 +473,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -519,7 +519,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -630,7 +630,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -676,7 +676,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -722,7 +722,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -768,7 +768,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -879,7 +879,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -925,7 +925,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -971,7 +971,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1017,7 +1017,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1128,7 +1128,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1174,7 +1174,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1220,7 +1220,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1266,7 +1266,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1377,7 +1377,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1423,7 +1423,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1469,7 +1469,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1515,7 +1515,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1626,7 +1626,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1672,7 +1672,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1718,7 +1718,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1764,7 +1764,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
56
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
56
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
@ -127,7 +127,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_6-test: # Testing
|
||||
@ -193,7 +193,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_8-test: # Testing
|
||||
@ -259,7 +259,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_9-test: # Testing
|
||||
@ -325,7 +325,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda13_0-test: # Testing
|
||||
@ -793,7 +793,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_6-test: # Testing
|
||||
@ -859,7 +859,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_8-test: # Testing
|
||||
@ -925,7 +925,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_9-test: # Testing
|
||||
@ -991,7 +991,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda13_0-test: # Testing
|
||||
@ -1459,7 +1459,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda12_6-test: # Testing
|
||||
@ -1525,7 +1525,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda12_8-test: # Testing
|
||||
@ -1591,7 +1591,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda12_9-test: # Testing
|
||||
@ -1657,7 +1657,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda13_0-test: # Testing
|
||||
@ -2125,7 +2125,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_6-test: # Testing
|
||||
@ -2191,7 +2191,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_8-test: # Testing
|
||||
@ -2257,7 +2257,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_9-test: # Testing
|
||||
@ -2323,7 +2323,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda13_0-test: # Testing
|
||||
@ -2791,7 +2791,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13t-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda12_6-test: # Testing
|
||||
@ -2857,7 +2857,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13t-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda12_8-test: # Testing
|
||||
@ -2923,7 +2923,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13t-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda12_9-test: # Testing
|
||||
@ -2989,7 +2989,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13t-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda13_0-test: # Testing
|
||||
@ -3457,7 +3457,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda12_6-test: # Testing
|
||||
@ -3523,7 +3523,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda12_8-test: # Testing
|
||||
@ -3589,7 +3589,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda12_9-test: # Testing
|
||||
@ -3655,7 +3655,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda13_0-test: # Testing
|
||||
@ -4123,7 +4123,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14t-cuda12_6
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda12_6-test: # Testing
|
||||
@ -4189,7 +4189,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14t-cuda12_8
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda12_8-test: # Testing
|
||||
@ -4255,7 +4255,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14t-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda12_9-test: # Testing
|
||||
@ -4321,7 +4321,7 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14t-cuda13_0
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda13_0-test: # Testing
|
||||
|
||||
148
.github/workflows/inductor-perf-test-nightly-xpu.yml
vendored
Normal file
148
.github/workflows/inductor-perf-test-nightly-xpu.yml
vendored
Normal file
@ -0,0 +1,148 @@
|
||||
name: inductor-perf-nightly-xpu
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/inductor-perf-test-nightly-xpu/*
|
||||
schedule:
|
||||
- cron: 30 17 * * *
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
training:
|
||||
description: Run training (on by default)?
|
||||
required: false
|
||||
type: boolean
|
||||
default: true
|
||||
inference:
|
||||
description: Run inference (on by default)?
|
||||
required: false
|
||||
type: boolean
|
||||
default: true
|
||||
default:
|
||||
description: Run inductor_default?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
dynamic:
|
||||
description: Run inductor_dynamic_shapes?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
cppwrapper:
|
||||
description: Run inductor_cpp_wrapper?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
cudagraphs:
|
||||
description: Run inductor_cudagraphs?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
freezing_cudagraphs:
|
||||
description: Run inductor_cudagraphs with freezing for inference?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
aotinductor:
|
||||
description: Run aot_inductor for inference?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
maxautotune:
|
||||
description: Run inductor_max_autotune?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
benchmark_configs:
|
||||
description: The list of configs used the benchmark
|
||||
required: false
|
||||
type: string
|
||||
default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
opt_out_experiments: lf
|
||||
|
||||
xpu-n-py3_10-inductor-benchmark-build:
|
||||
name: xpu-n-py3.10-inductor-benchmark
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-xpu-n-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
|
||||
runner: linux.c7i.12xlarge
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "inductor_huggingface_perf_xpu", shard: 1, num_shards: 5, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_huggingface_perf_xpu", shard: 2, num_shards: 5, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_huggingface_perf_xpu", shard: 3, num_shards: 5, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_huggingface_perf_xpu", shard: 4, num_shards: 5, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_huggingface_perf_xpu", shard: 5, num_shards: 5, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_timm_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_timm_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_timm_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_timm_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_timm_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_timm_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_torchbench_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_torchbench_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_torchbench_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_torchbench_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_torchbench_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_torchbench_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
xpu-n-py3_10-inductor-benchmark-test-nightly:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
if: github.event_name != 'workflow_dispatch'
|
||||
name: xpu-n-py3.10-inductor-benchmark
|
||||
uses: ./.github/workflows/_xpu-test.yml
|
||||
needs: xpu-n-py3_10-inductor-benchmark-build
|
||||
with:
|
||||
build-environment: linux-jammy-xpu-n-py3.10
|
||||
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
|
||||
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
|
||||
timeout-minutes: 720
|
||||
# Disable monitor in perf tests for more investigation
|
||||
disable-monitor: true
|
||||
monitor-log-interval: 10
|
||||
monitor-data-collect-interval: 2
|
||||
secrets: inherit
|
||||
|
||||
xpu-n-py3_10-inductor-benchmark-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
if: github.event_name == 'workflow_dispatch'
|
||||
name: xpu-n-py3.10-inductor-test
|
||||
uses: ./.github/workflows/_xpu-test.yml
|
||||
needs: xpu-n-py3_10-inductor-benchmark-build
|
||||
with:
|
||||
build-environment: linux-jammy-xpu-n-py3.10
|
||||
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
|
||||
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
|
||||
timeout-minutes: 720
|
||||
disable-monitor: false
|
||||
monitor-log-interval: 15
|
||||
monitor-data-collect-interval: 4
|
||||
secrets: inherit
|
||||
84
.github/workflows/periodic-rocm-mi200.yml
vendored
Normal file
84
.github/workflows/periodic-rocm-mi200.yml
vendored
Normal file
@ -0,0 +1,84 @@
|
||||
name: periodic-rocm-mi200
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
|
||||
# Also run less frequently on weekends.
|
||||
- cron: 45 0,8,16 * * 1-5
|
||||
- cron: 45 4 * * 0,6
|
||||
- cron: 45 4,12,20 * * 1-5
|
||||
- cron: 45 12 * * 0,6
|
||||
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
|
||||
push:
|
||||
tags:
|
||||
- ciflow/periodic/*
|
||||
- ciflow/periodic-rocm-mi200/*
|
||||
branches:
|
||||
- release/*
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
llm-td:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: before-test
|
||||
uses: ./.github/workflows/llm_td_retrieval.yml
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
target-determination:
|
||||
name: before-test
|
||||
uses: ./.github/workflows/target_determination.yml
|
||||
needs: llm-td
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-jammy-rocm-py3_10-build:
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
31
.github/workflows/periodic.yml
vendored
31
.github/workflows/periodic.yml
vendored
@ -204,37 +204,6 @@ jobs:
|
||||
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-build:
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build:
|
||||
name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
|
||||
1
.github/workflows/upload-test-stats.yml
vendored
1
.github/workflows/upload-test-stats.yml
vendored
@ -6,6 +6,7 @@ on:
|
||||
- pull
|
||||
- trunk
|
||||
- periodic
|
||||
- periodic-rocm-mi200
|
||||
- periodic-rocm-mi300
|
||||
- inductor
|
||||
- unstable
|
||||
|
||||
20
.github/workflows/xpu.yml
vendored
20
.github/workflows/xpu.yml
vendored
@ -59,14 +59,18 @@ jobs:
|
||||
runner: linux.c7i.12xlarge
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 2, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 3, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 4, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 5, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 6, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 7, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 8, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 9, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 10, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 11, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 12, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@ -143,6 +143,7 @@ scripts/release_notes/*.json
|
||||
sccache-stats*.json
|
||||
lint.json
|
||||
merge_record.json
|
||||
.github/scripts/nightly_source_matrix.json
|
||||
|
||||
# These files get copied over on invoking setup.py
|
||||
torchgen/packaged/*
|
||||
|
||||
@ -374,7 +374,7 @@ cmake_dependent_option(
|
||||
"Build the lazy Torchscript backend, not compatible with mobile builds" ON
|
||||
"NOT INTERN_BUILD_MOBILE" OFF)
|
||||
cmake_dependent_option(BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF)
|
||||
cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
|
||||
cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder"
|
||||
OFF "USE_CUDA" OFF)
|
||||
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
|
||||
"CPU_AARCH64" OFF)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||

|
||||

|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
@ -72,7 +72,7 @@ Elaborating Further:
|
||||
|
||||
If you use NumPy, then you have used Tensors (a.k.a. ndarray).
|
||||
|
||||

|
||||

|
||||
|
||||
PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the
|
||||
computation by a huge amount.
|
||||
@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc
|
||||
While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
|
||||
You get the best of speed and flexibility for your crazy research.
|
||||
|
||||

|
||||

|
||||
|
||||
### Python First
|
||||
|
||||
|
||||
@ -260,7 +260,7 @@ IF(USE_FBGEMM_GENAI)
|
||||
if(USE_CUDA)
|
||||
# To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
|
||||
# If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
|
||||
set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*")
|
||||
set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped|f4f4bf16).*")
|
||||
file(GLOB_RECURSE fbgemm_genai_native_cuda_cu
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu"
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
|
||||
@ -291,6 +291,7 @@ IF(USE_FBGEMM_GENAI)
|
||||
|
||||
set(fbgemm_genai_cuh
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/f4f4bf16_grouped/"
|
||||
"${FBGEMM_GENAI_SRCS}/"
|
||||
)
|
||||
|
||||
|
||||
@ -677,8 +677,8 @@ struct CachingHostAllocatorImpl {
|
||||
// size. This allows us to quickly find a free block of the right size.
|
||||
// We use deque to store per size free list and guard the list with its own
|
||||
// mutex.
|
||||
alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ =
|
||||
std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
|
||||
alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>>
|
||||
free_list_{MAX_SIZE_INDEX};
|
||||
|
||||
alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
|
||||
std::deque<std::pair<E, B*>> events_; // event queue paired with block
|
||||
|
||||
@ -354,47 +354,9 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
|
||||
Vectorized frac() const;
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc)
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt)
|
||||
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
Vectorized<c10::BFloat16> neg() const {
|
||||
return -values;
|
||||
}
|
||||
Vectorized<c10::BFloat16> reciprocal() const {
|
||||
return 1.0f / values;
|
||||
}
|
||||
Vectorized<c10::BFloat16> operator==(
|
||||
const Vectorized<c10::BFloat16>& other) const {
|
||||
return values == other.values;
|
||||
}
|
||||
|
||||
Vectorized<c10::BFloat16> operator!=(
|
||||
const Vectorized<c10::BFloat16>& other) const {
|
||||
return values != other.values;
|
||||
}
|
||||
|
||||
Vectorized<c10::BFloat16> operator<(
|
||||
const Vectorized<c10::BFloat16>& other) const {
|
||||
return values < other.values;
|
||||
}
|
||||
|
||||
Vectorized<c10::BFloat16> operator<=(
|
||||
const Vectorized<c10::BFloat16>& other) const {
|
||||
return values <= other.values;
|
||||
}
|
||||
|
||||
Vectorized<c10::BFloat16> operator>(
|
||||
const Vectorized<c10::BFloat16>& other) const {
|
||||
return values > other.values;
|
||||
}
|
||||
|
||||
Vectorized<c10::BFloat16> operator>=(
|
||||
const Vectorized<c10::BFloat16>& other) const {
|
||||
return values >= other.values;
|
||||
}
|
||||
#else
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
|
||||
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==)
|
||||
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=)
|
||||
@ -402,7 +364,6 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<=)
|
||||
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>)
|
||||
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>=)
|
||||
#endif
|
||||
|
||||
#undef DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD
|
||||
#undef DEFINE_BINARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD
|
||||
@ -451,52 +412,28 @@ template <>
|
||||
Vectorized<c10::BFloat16> inline operator+(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
return x + y;
|
||||
#else
|
||||
return binary_operator_via_float(std::plus<Vectorized<float>>(), a, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::BFloat16> inline operator-(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
return x - y;
|
||||
#else
|
||||
return binary_operator_via_float(std::minus<Vectorized<float>>(), a, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::BFloat16> inline operator*(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
return x * y;
|
||||
#else
|
||||
return binary_operator_via_float(std::multiplies<Vectorized<float>>(), a, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::BFloat16> inline operator/(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
return x / y;
|
||||
#else
|
||||
return binary_operator_via_float(std::divides<Vectorized<float>>(), a, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
// frac. Implement this here so we can use subtraction
|
||||
@ -607,19 +544,12 @@ Vectorized<c10::BFloat16> inline fmadd(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
bfloat16x8_t z = c;
|
||||
return x * y + z;
|
||||
#else
|
||||
// NOTE [BF16 FMA]: There isn't an FMA that accumulates into BF16! Also,
|
||||
// vbfmlalbq_f32 and vbfmlaltq_f32 take the even and odd-numbered
|
||||
// elements, not the bottom and top half, so they don't seem
|
||||
// particularly useful here. Ideally we would include dot product in
|
||||
// the Vectorized interface...
|
||||
return a * b + c;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -627,15 +557,8 @@ Vectorized<c10::BFloat16> inline fnmadd(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
bfloat16x8_t z = c;
|
||||
return (-x) * y + z;
|
||||
#else
|
||||
// See NOTE [BF16 FMA] above.
|
||||
return -a * b + c;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -643,15 +566,8 @@ Vectorized<c10::BFloat16> inline fmsub(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
bfloat16x8_t z = c;
|
||||
return x * y - z;
|
||||
#else
|
||||
// See NOTE [BF16 FMA] above.
|
||||
return a * b - c;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -659,15 +575,8 @@ Vectorized<c10::BFloat16> inline fnmsub(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
bfloat16x8_t z = c;
|
||||
return (-x) * y - z;
|
||||
#else
|
||||
// See NOTE [BF16 FMA] above.
|
||||
return -a * b - c;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
@ -21,12 +21,46 @@ inline void convertImpl(
|
||||
}
|
||||
}
|
||||
|
||||
// Converts `n` bool elements from `src` into `to_type` values in `dst`.
//
// Each source element is read as a raw byte and compared against zero, so
// every output element is exactly static_cast<to_type>(1) or
// static_cast<to_type>(0).
//
// src: input buffer holding at least `n` bools. Declared __restrict, so it
//      must not overlap `dst`.
// dst: output buffer with room for at least `n` elements.
// n:   number of elements to convert; a count <= 0 converts nothing.
template <typename to_type>
inline void convertFromBool(
    const bool* __restrict src,
    to_type* __restrict dst,
    int64_t n) {
  // A negative count would wrap to an enormous unsigned trip count in the
  // cast below and run far past both buffers, so treat it as an empty range.
  if (n <= 0) {
    return;
  }
  const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
  const uint64_t len = static_cast<uint64_t>(n);
  for (uint64_t i = 0; i < len; i++) {
    dst[i] = srcPtr[i] != 0 ? static_cast<to_type>(1) : static_cast<to_type>(0);
  }
}
|
||||
|
||||
// Converts `n` elements of `from_type` in `src` into bools in `dst`.
//
// An element becomes true when it compares unequal to
// static_cast<from_type>(0) and false otherwise. The result is stored through
// a uint8_t view of `dst`, so each output byte is written as exactly 1 or 0.
//
// src: input buffer holding at least `n` elements. Declared __restrict, so it
//      must not overlap `dst`.
// dst: output buffer with room for at least `n` bools.
// n:   number of elements to convert; a count <= 0 converts nothing.
template <typename from_type>
inline void convertToBool(
    const from_type* __restrict src,
    bool* __restrict dst,
    int64_t n) {
  // A negative count would wrap to an enormous unsigned trip count in the
  // cast below and run far past both buffers, so treat it as an empty range.
  if (n <= 0) {
    return;
  }
  uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
  const uint64_t len = static_cast<uint64_t>(n);
  for (uint64_t i = 0; i < len; i++) {
    dstPtr[i] = src[i] != static_cast<from_type>(0) ? 1 : 0;
  }
}
|
||||
|
||||
// Stamps out the explicit specialization of convert() for one
// (from_type, to_type) pair. The specialization simply forwards its
// arguments to convertImpl<from_type, to_type>.
#define CONVERT_TEMPLATE(from_type, to_type)                           \
  template <>                                                          \
  inline void convert(const from_type* src, to_type* dst, int64_t n) { \
    convertImpl<from_type, to_type>(src, dst, n);                      \
  }
|
||||
|
||||
// Defines a convert() overload taking a bool source and a `to_type`
// destination. The overload forwards to convertFromBool<to_type>.
#define CONVERT_FROM_BOOL_TEMPLATE(to_type)                       \
  inline void convert(const bool* src, to_type* dst, int64_t n) { \
    convertFromBool<to_type>(src, dst, n);                        \
  }
|
||||
|
||||
// Defines a convert() overload taking a `from_type` source and a bool
// destination. The overload forwards to convertToBool<from_type>.
#define CONVERT_TO_BOOL_TEMPLATE(from_type)                         \
  inline void convert(const from_type* src, bool* dst, int64_t n) { \
    convertToBool<from_type>(src, dst, n);                          \
  }
|
||||
|
||||
CONVERT_TEMPLATE(uint8_t, uint8_t)
|
||||
CONVERT_TEMPLATE(uint8_t, int8_t)
|
||||
CONVERT_TEMPLATE(uint8_t, int16_t)
|
||||
@ -34,6 +68,7 @@ CONVERT_TEMPLATE(uint8_t, int32_t)
|
||||
CONVERT_TEMPLATE(uint8_t, int64_t)
|
||||
CONVERT_TEMPLATE(uint8_t, float)
|
||||
CONVERT_TEMPLATE(uint8_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(uint8_t)
|
||||
CONVERT_TEMPLATE(int8_t, uint8_t)
|
||||
CONVERT_TEMPLATE(int8_t, int8_t)
|
||||
CONVERT_TEMPLATE(int8_t, int16_t)
|
||||
@ -41,6 +76,7 @@ CONVERT_TEMPLATE(int8_t, int32_t)
|
||||
CONVERT_TEMPLATE(int8_t, int64_t)
|
||||
CONVERT_TEMPLATE(int8_t, float)
|
||||
CONVERT_TEMPLATE(int8_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(int8_t)
|
||||
CONVERT_TEMPLATE(int16_t, uint8_t)
|
||||
CONVERT_TEMPLATE(int16_t, int8_t)
|
||||
CONVERT_TEMPLATE(int16_t, int16_t)
|
||||
@ -48,6 +84,7 @@ CONVERT_TEMPLATE(int16_t, int32_t)
|
||||
CONVERT_TEMPLATE(int16_t, int64_t)
|
||||
CONVERT_TEMPLATE(int16_t, float)
|
||||
CONVERT_TEMPLATE(int16_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(int16_t)
|
||||
CONVERT_TEMPLATE(int32_t, uint8_t)
|
||||
CONVERT_TEMPLATE(int32_t, int8_t)
|
||||
CONVERT_TEMPLATE(int32_t, int16_t)
|
||||
@ -55,6 +92,7 @@ CONVERT_TEMPLATE(int32_t, int32_t)
|
||||
CONVERT_TEMPLATE(int32_t, int64_t)
|
||||
CONVERT_TEMPLATE(int32_t, float)
|
||||
CONVERT_TEMPLATE(int32_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(int32_t)
|
||||
CONVERT_TEMPLATE(int64_t, uint8_t)
|
||||
CONVERT_TEMPLATE(int64_t, int8_t)
|
||||
CONVERT_TEMPLATE(int64_t, int16_t)
|
||||
@ -62,6 +100,7 @@ CONVERT_TEMPLATE(int64_t, int32_t)
|
||||
CONVERT_TEMPLATE(int64_t, int64_t)
|
||||
CONVERT_TEMPLATE(int64_t, float)
|
||||
CONVERT_TEMPLATE(int64_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(int64_t)
|
||||
CONVERT_TEMPLATE(float, uint8_t)
|
||||
CONVERT_TEMPLATE(float, int8_t)
|
||||
CONVERT_TEMPLATE(float, int16_t)
|
||||
@ -69,6 +108,7 @@ CONVERT_TEMPLATE(float, int32_t)
|
||||
CONVERT_TEMPLATE(float, int64_t)
|
||||
CONVERT_TEMPLATE(float, float)
|
||||
CONVERT_TEMPLATE(float, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(float)
|
||||
CONVERT_TEMPLATE(double, uint8_t)
|
||||
CONVERT_TEMPLATE(double, int8_t)
|
||||
CONVERT_TEMPLATE(double, int16_t)
|
||||
@ -76,22 +116,80 @@ CONVERT_TEMPLATE(double, int32_t)
|
||||
CONVERT_TEMPLATE(double, int64_t)
|
||||
CONVERT_TEMPLATE(double, float)
|
||||
CONVERT_TEMPLATE(double, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(double)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(uint8_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(int8_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(int16_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(int32_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(int64_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(float)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(double)
|
||||
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||
CONVERT_TEMPLATE(float16_t, uint8_t)
|
||||
CONVERT_TEMPLATE(float16_t, int8_t)
|
||||
CONVERT_TEMPLATE(float16_t, int16_t)
|
||||
CONVERT_TEMPLATE(float16_t, int32_t)
|
||||
CONVERT_TEMPLATE(float16_t, int64_t)
|
||||
CONVERT_TEMPLATE(float16_t, float16_t)
|
||||
CONVERT_TEMPLATE(float16_t, float)
|
||||
CONVERT_TEMPLATE(float16_t, double)
|
||||
CONVERT_TEMPLATE(uint8_t, float16_t)
|
||||
CONVERT_TEMPLATE(int8_t, float16_t)
|
||||
CONVERT_TEMPLATE(int16_t, float16_t)
|
||||
CONVERT_TEMPLATE(int32_t, float16_t)
|
||||
CONVERT_TEMPLATE(int64_t, float16_t)
|
||||
CONVERT_TEMPLATE(float, float16_t)
|
||||
CONVERT_TEMPLATE(double, float16_t)
|
||||
|
||||
// Emits an explicit specialization of convert() from at::Half to `to_type`.
// The at::Half source is reinterpreted as float16_t and handed to
// convertImpl<float16_t, to_type>.
#define CONVERT_FROM_FP16_TEMPLATE(to_type)                           \
  template <>                                                         \
  inline void convert(const at::Half* src, to_type* dst, int64_t n) { \
    convertImpl<float16_t, to_type>(                                  \
        reinterpret_cast<const float16_t*>(src), dst, n);             \
  }
|
||||
|
||||
// Emits an explicit specialization of convert() from `from_type` to at::Half.
// The at::Half destination is reinterpreted as float16_t and handed to
// convertImpl<from_type, float16_t>.
#define CONVERT_TO_FP16_TEMPLATE(from_type)                             \
  template <>                                                           \
  inline void convert(const from_type* src, at::Half* dst, int64_t n) { \
    convertImpl<from_type, float16_t>(                                  \
        src, reinterpret_cast<float16_t*>(dst), n);                     \
  }
|
||||
|
||||
CONVERT_FROM_FP16_TEMPLATE(uint8_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(int8_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(int16_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(int32_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(int64_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(float16_t)
|
||||
CONVERT_FROM_FP16_TEMPLATE(float)
|
||||
CONVERT_FROM_FP16_TEMPLATE(double)
|
||||
CONVERT_TO_FP16_TEMPLATE(uint8_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(int8_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(int16_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(int32_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(int64_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(float)
|
||||
CONVERT_TO_FP16_TEMPLATE(double)
|
||||
|
||||
inline void convertBoolToFp16Impl(
|
||||
const bool* __restrict src,
|
||||
at::Half* __restrict dst,
|
||||
int64_t n) {
|
||||
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
|
||||
float16_t* dstPtr = reinterpret_cast<float16_t*>(dst);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
dstPtr[i] = srcPtr[i] != 0 ? 1.0 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const bool* src, at::Half* dst, int64_t n) {
|
||||
return convertBoolToFp16Impl(src, dst, n);
|
||||
}
|
||||
|
||||
inline void convertFp16ToBoolImpl(
|
||||
const at::Half* __restrict src,
|
||||
bool* __restrict dst,
|
||||
int64_t n) {
|
||||
const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src);
|
||||
uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
dstPtr[i] = srcPtr[i] != 0.0 ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const at::Half* src, bool* dst, int64_t n) {
|
||||
return convertFp16ToBoolImpl(src, dst, n);
|
||||
}
|
||||
|
||||
#endif
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
CONVERT_TEMPLATE(bfloat16_t, uint8_t)
|
||||
@ -109,6 +207,44 @@ CONVERT_TEMPLATE(int32_t, bfloat16_t)
|
||||
CONVERT_TEMPLATE(int64_t, bfloat16_t)
|
||||
CONVERT_TEMPLATE(float, bfloat16_t)
|
||||
CONVERT_TEMPLATE(double, bfloat16_t)
|
||||
|
||||
inline void convertBoolToBfloat16Impl(
|
||||
const bool* __restrict src,
|
||||
c10::BFloat16* __restrict dst,
|
||||
int64_t n) {
|
||||
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
|
||||
uint16_t* dstPtr = reinterpret_cast<uint16_t*>(dst);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
constexpr uint16_t kBf16One = 0x3f80; // 1.0 in bfloat16
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
dstPtr[i] = srcPtr[i] != 0 ? kBf16One : 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const bool* src, c10::BFloat16* dst, int64_t n) {
|
||||
return convertBoolToBfloat16Impl(src, dst, n);
|
||||
}
|
||||
|
||||
inline void convertBfloat16ToBoolImpl(
|
||||
const c10::BFloat16* __restrict src,
|
||||
bool* __restrict dst,
|
||||
int64_t n) {
|
||||
uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
|
||||
const uint16_t* srcPtr = reinterpret_cast<const uint16_t*>(src);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
// Check if all non-sign bits are 0
|
||||
bool isBf16Zero = (srcPtr[i] & 0x7fff) == 0;
|
||||
dstPtr[i] = isBf16Zero ? 0 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const c10::BFloat16* src, bool* dst, int64_t n) {
|
||||
return convertBfloat16ToBoolImpl(src, dst, n);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@ -309,7 +309,7 @@ class Vectorized<float> {
|
||||
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
|
||||
// Implementation copied from Arm Optimized Routine
|
||||
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
|
||||
Vectorized<float> exp_u20() const {
|
||||
inline Vectorized<float> vexpq_f32_u20() const {
|
||||
// bail out to sleef if it's a special case:
|
||||
// i.e. there's an input s.t. |input| > 87.3....
|
||||
const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
|
||||
@ -348,6 +348,9 @@ class Vectorized<float> {
|
||||
|
||||
return vfmaq_f32(scale, poly, scale);
|
||||
}
|
||||
Vectorized<float> exp_u20() const {
|
||||
return vexpq_f32_u20();
|
||||
}
|
||||
Vectorized<float> fexp_u20() const {
|
||||
return exp_u20();
|
||||
}
|
||||
@ -634,7 +637,7 @@ inline Vectorized<float> Vectorized<float>::erf() const {
|
||||
// - exp(- x * x)
|
||||
auto pow_2 = (*this) * (*this);
|
||||
auto neg_pow_2 = pow_2 ^ neg_zero_vec;
|
||||
auto tmp4 = neg_pow_2.exp();
|
||||
auto tmp4 = neg_pow_2.vexpq_f32_u20();
|
||||
auto tmp5 = tmp4 ^ neg_zero_vec;
|
||||
// erf(x) = sign(x) * (1 - r * t * exp(- x * x))
|
||||
auto tmp6 = t * tmp5;
|
||||
|
||||
@ -2,10 +2,10 @@
|
||||
|
||||
#include <ATen/cuda/ATenCUDAGeneral.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <c10/core/impl/GPUTrace.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <ATen/cuda/Exceptions.h>
|
||||
#include <c10/core/impl/GPUTrace.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
#include <cuda_runtime_api.h>
|
||||
@ -246,4 +246,79 @@ private:
|
||||
}
|
||||
};
|
||||
|
||||
// EventPool - Thread-safe pool of CUDA events to avoid expensive cudaEventCreate
|
||||
// calls. cudaEventCreate when concurrently invoked from multiple threads can be
|
||||
// very expensive (especially on certain device/driver combinations).
|
||||
using CUDAEventPtr =
|
||||
std::unique_ptr<CUDAEvent, std::function<void(CUDAEvent*)>>;
|
||||
|
||||
class EventPool {
|
||||
public:
|
||||
EventPool() : pools_(at::cuda::device_count()) {}
|
||||
|
||||
CUDAEventPtr get(const DeviceIndex device) {
|
||||
// If the device is invalid, return a default event and no pooling
|
||||
if (device < 0 || device >= (DeviceIndex)pools_.size()) {
|
||||
auto deleter = [](CUDAEvent* event) {
|
||||
delete event;
|
||||
};
|
||||
return CUDAEventPtr(
|
||||
std::make_unique<CUDAEvent>(cudaEventDisableTiming).release(), deleter);
|
||||
}
|
||||
|
||||
auto& pool = pools_[device];
|
||||
|
||||
// Create a destructor that returns the event to the appropriate device pool
|
||||
auto destructor = [&pool](CUDAEvent* event) noexcept {
|
||||
if (event != nullptr) {
|
||||
std::lock_guard<std::mutex> lock(pool.mutex_);
|
||||
pool.event_pool_.emplace_back(event);
|
||||
}
|
||||
};
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(pool.mutex_);
|
||||
if (!pool.event_pool_.empty()) {
|
||||
auto event = std::move(pool.event_pool_.back());
|
||||
pool.event_pool_.pop_back();
|
||||
return CUDAEventPtr(event.release(), destructor);
|
||||
}
|
||||
}
|
||||
|
||||
return CUDAEventPtr(
|
||||
std::make_unique<CUDAEvent>(cudaEventDisableTiming).release(),
|
||||
destructor);
|
||||
}
|
||||
|
||||
void empty_cache() {
|
||||
for (auto& pool : pools_) {
|
||||
std::lock_guard<std::mutex> lock(pool.mutex_);
|
||||
pool.event_pool_.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void init_num_events(const size_t num_events) {
|
||||
for (DeviceIndex device_idx = 0; device_idx < at::cuda::device_count(); ++device_idx) {
|
||||
CUDAGuard device_guard(device_idx);
|
||||
std::vector<CUDAEventPtr> temp_events;
|
||||
temp_events.reserve(num_events);
|
||||
for (size_t i = 0; i < num_events; ++i) {
|
||||
auto event = get(device_idx);
|
||||
// Record the event to ensure it's properly initialized
|
||||
event->record();
|
||||
temp_events.emplace_back(std::move(event));
|
||||
}
|
||||
// Events will be returned to pool when temp_events is destroyed
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
struct alignas(64) PerDevicePool {
|
||||
alignas(64) std::mutex mutex_;
|
||||
std::vector<std::unique_ptr<CUDAEvent>> event_pool_;
|
||||
};
|
||||
|
||||
std::vector<PerDevicePool> pools_;
|
||||
};
|
||||
|
||||
} // namespace at::cuda
|
||||
|
||||
@ -7,17 +7,6 @@
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
// hipSparse const API added in v2.4.0
|
||||
#if HIPSPARSE_VERSION >= 200400
|
||||
#define AT_USE_HIPSPARSE_GENERIC_API() 1
|
||||
#else
|
||||
#define AT_USE_HIPSPARSE_GENERIC_API() 1
|
||||
#endif
|
||||
#else // USE_ROCM
|
||||
#define AT_USE_HIPSPARSE_GENERIC_API() 0
|
||||
#endif // USE_ROCM
|
||||
|
||||
// cuSparse Generic API spsv function was added in CUDA 11.3.0
|
||||
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11500)
|
||||
#define AT_USE_CUSPARSE_GENERIC_SPSV() 1
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
#include <ATen/cuda/CUDAContextLight.h>
|
||||
#include <ATen/cuda/Sleep.h>
|
||||
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
#include <c10/cuda/CUDAException.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
|
||||
@ -24,8 +25,22 @@ __global__ void spin_kernel(int64_t cycles) {
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
thread_local int *flag = nullptr;
|
||||
|
||||
// Sets *flag to 1 and then spins until an atomic read of *flag no longer
// returns 1. atomicAdd(flag, 0) is used as the atomic read.
__global__ void busy_wait_for_flag_kernel(int *flag) {
  atomicExch(flag, 1);
  for (;;) {
    if (atomicAdd(flag, 0) != 1) {
      break;
    }
    // keep spinning
  }
}
|
||||
|
||||
// Atomically stores 0 into *flag; the previous value is discarded.
__global__ void clear_flag_kernel(int *flag) {
  (void)atomicExch(flag, 0);
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
void sleep(int64_t cycles) {
|
||||
dim3 grid(1);
|
||||
dim3 block(1);
|
||||
@ -33,6 +48,26 @@ void sleep(int64_t cycles) {
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}
|
||||
|
||||
void busy_wait_for_flag() {
|
||||
if (!flag) {
|
||||
flag = (int*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(int));
|
||||
}
|
||||
dim3 grid(1);
|
||||
dim3 block(1);
|
||||
busy_wait_for_flag_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(flag);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}
|
||||
|
||||
void clear_flag() {
|
||||
if (!flag) {
|
||||
flag = (int*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(int));
|
||||
}
|
||||
dim3 grid(1);
|
||||
dim3 block(1);
|
||||
clear_flag_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(flag);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}
|
||||
|
||||
#ifdef USE_ROCM
|
||||
__global__ void flush_icache_kernel()
|
||||
{
|
||||
|
||||
@ -7,6 +7,11 @@ namespace at::cuda {
|
||||
// enqueues a kernel that spins for the specified number of cycles
|
||||
TORCH_CUDA_CU_API void sleep(int64_t cycles);
|
||||
|
||||
// enqueues a kernel that spins until a flag is cleared by a
|
||||
// corresponding call to clear_flag()
|
||||
TORCH_CUDA_CU_API void busy_wait_for_flag();
|
||||
TORCH_CUDA_CU_API void clear_flag();
|
||||
|
||||
// flushes instruction cache for ROCm; no-op for CUDA
|
||||
TORCH_CUDA_CU_API void flush_icache();
|
||||
|
||||
|
||||
@ -580,7 +580,7 @@ std::ofstream& TuningContext::GetUntunedFile(){
|
||||
filename.append(device);
|
||||
}
|
||||
|
||||
untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::trunc);
|
||||
untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::app);
|
||||
}
|
||||
return untuned_file_;
|
||||
}
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/CachingDeviceAllocator.h>
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
@ -151,6 +152,36 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
|
||||
}
|
||||
|
||||
virtual bool isAvailable() const override;
|
||||
|
||||
/* MTIAGraph related APIs */
|
||||
virtual int64_t mtiagraphCreate(bool keep_graph = false) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
virtual void mtiagraphCaptureBegin(int64_t handle, MempoolId_t pool) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
}
|
||||
|
||||
virtual void mtiagraphCaptureEnd(int64_t handle) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
}
|
||||
|
||||
virtual void mtiagraphInstantiate(int64_t handle) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
}
|
||||
|
||||
virtual void mtiagraphReplay(int64_t handle) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
}
|
||||
|
||||
virtual void mtiagraphReset(int64_t handle) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
}
|
||||
|
||||
virtual MempoolId_t mtiagraphPool(int64_t handle) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
}
|
||||
};
|
||||
|
||||
struct TORCH_API MTIAHooksArgs {};
|
||||
|
||||
@ -410,8 +410,8 @@ struct ConvParams {
|
||||
return false;
|
||||
}
|
||||
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
|
||||
// broken on cuDNN 9.8
|
||||
if (cudnn_version >= 90800) {
|
||||
// broken on cuDNN 9.8 - 9.14
|
||||
if (cudnn_version >= 90800 && cudnn_version < 91500) {
|
||||
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
|
||||
(input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
|
||||
weight.dim() == 5) {
|
||||
@ -689,6 +689,10 @@ static void check_shape_forward(const at::Tensor& input,
|
||||
", but got bias of size ", at::symint::sizes<T>(bias), " instead");
|
||||
|
||||
for (const auto i : c10::irange(2, k)) {
|
||||
// T could be int64_t or SymInt; numeric_limits<SymInt> is specialized in c10/core/SymInt.h
|
||||
TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
|
||||
"Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
|
||||
(std::numeric_limits<T>::max() / 2));
|
||||
input_shape.push_back(at::symint::size<T>(input, i) + 2 * padding[i-2]);
|
||||
// log new kernel size considering dilation
|
||||
kernel_shape.push_back(dilation[i-2] * (weight_sizes[i]-1) + 1);
|
||||
@ -715,6 +719,11 @@ static void check_shape_forward(const at::Tensor& input,
|
||||
"Kernel size: (", kernel_ss.str(), "). Kernel size can't be greater than actual input size");
|
||||
}
|
||||
} else { // transposed
|
||||
for (const auto i : c10::irange(2, k)) {
|
||||
TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
|
||||
"Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
|
||||
(std::numeric_limits<T>::max() / 2));
|
||||
}
|
||||
TORCH_CHECK(at::symint::size<T>(input, 1) == weight_sizes[0],
|
||||
"Given transposed=", transposed, ", weight of size ", weight_sizes,
|
||||
", expected input", at::symint::sizes<T>(input), " to have ", weight_sizes[0],
|
||||
|
||||
@ -52,8 +52,7 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in
|
||||
for (const auto k : c10::irange(kw)) {
|
||||
int iShift = std::max(0, static_cast<int>(k - real_pad));
|
||||
int oShift = std::max(0, static_cast<int>(real_pad - k));
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
int t = std::min(ilen + real_pad - k, olen) - oShift;
|
||||
long t = std::min(ilen + real_pad - k, olen) - oShift;
|
||||
// Note: gemm assumes column-major matrices
|
||||
// input is l*m (row-major)
|
||||
// weight is m*r (row-major)
|
||||
|
||||
@ -16,8 +16,7 @@ bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) {
|
||||
auto linearId = elements - 1;
|
||||
|
||||
// NOTE: Assumes all strides are positive, which is true for now
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
for (int i = t.dim() - 1; i >= 0; --i) {
|
||||
for (auto i = t.dim() - 1; i >= 0; --i) {
|
||||
auto curDimIndex = linearId % t.sym_size(i);
|
||||
auto curDimOffset = curDimIndex * t.sym_stride(i);
|
||||
offset += curDimOffset;
|
||||
|
||||
@ -68,7 +68,6 @@ Tensor fbgemm_linear_int8_weight_fp32_activation(
|
||||
const float* input_ptr = input_contig.const_data_ptr<float>();
|
||||
|
||||
TORCH_CHECK(input.dim() >= 2);
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
|
||||
const int64_t K = input.size(input.dim() - 1);
|
||||
TORCH_CHECK(weight.dim() == 2);
|
||||
|
||||
@ -160,10 +160,9 @@ struct Dist {
|
||||
// value of k.
|
||||
parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) {
|
||||
const Vec pvec(p);
|
||||
double n2 = n - .5;
|
||||
double n2 = static_cast<double>(n) - .5;
|
||||
// The -1 accounts for floating point truncation issues
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
|
||||
int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2.0 * static_cast<double>(k) - 1.0)));
|
||||
int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
|
||||
|
||||
const scalar_t * self_i = self_start + i * m;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,11 +1,11 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/Context.h>
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/Dispatch_v2.h>
|
||||
#include <ATen/cuda/CachingHostAllocator.h>
|
||||
#include <ATen/core/Tensor.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
#include <ATen/cuda/CUDAEvent.h>
|
||||
#include <ATen/cuda/CachingHostAllocator.h>
|
||||
#include <ATen/cuda/PeerToPeerAccess.h>
|
||||
#include <ATen/native/Copy.h>
|
||||
#include <ATen/native/TensorIterator.h>
|
||||
@ -27,6 +27,24 @@
|
||||
|
||||
namespace at::native {
|
||||
|
||||
namespace {
|
||||
|
||||
// Initial pool size for CUDA events per device.
|
||||
constexpr size_t kInitialEventPoolSize = 8;
|
||||
|
||||
// Returns an event for `device_idx` from an EventPool shared by all callers
// of this function. The pool is built on the first call and pre-populated
// with kInitialEventPoolSize events per device.
at::cuda::CUDAEventPtr getEventFromPool(const at::DeviceIndex device_idx) {
  // Function-local static, so the initializer runs once. The pool is
  // allocated with `new` and is never deleted by this function.
  static at::cuda::EventPool* const shared_pool = [] {
    auto* created = new at::cuda::EventPool();
    // Pre-populate the pool with events to avoid stalls in creating events
    created->init_num_events(kInitialEventPoolSize);
    return created;
  }();

  return shared_pool->get(device_idx);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void neg_kernel_cuda(TensorIteratorBase &iter);
|
||||
void conj_kernel_cuda(TensorIteratorBase &iter);
|
||||
|
||||
@ -263,12 +281,14 @@ void copy_device_to_device(TensorIterator& iter,
|
||||
// write-after-read dependencies on the destination side are handled, so
|
||||
// that no one is operating on the dst memory when we perform the copy.
|
||||
// src waits on dst barrier (src already waits on src)
|
||||
CUDAEvent dst_ready;
|
||||
|
||||
// Use event pool for better performance instead of creating new events
|
||||
auto dst_ready = getEventFromPool(dst_device.index());
|
||||
device_guard.set_device(dst_device);
|
||||
dst_ready.record(getCurrentCUDAStream(dst_device.index()));
|
||||
dst_ready->record(getCurrentCUDAStream(dst_device.index()));
|
||||
|
||||
device_guard.set_device(src_device);
|
||||
dst_ready.block(copy_stream);
|
||||
dst_ready->block(copy_stream);
|
||||
}
|
||||
|
||||
if (memcpy_eligible) {
|
||||
@ -307,11 +327,11 @@ void copy_device_to_device(TensorIterator& iter,
|
||||
// operate on dst's copy until the copy is complete.
|
||||
|
||||
// Still on src_device, record stream event
|
||||
CUDAEvent src_ready;
|
||||
src_ready.record(copy_stream);
|
||||
auto src_ready = getEventFromPool(src_device.index());
|
||||
src_ready->record(copy_stream);
|
||||
|
||||
device_guard.set_device(dst_device);
|
||||
src_ready.block(getCurrentCUDAStream(dst_device.index()));
|
||||
src_ready->block(getCurrentCUDAStream(dst_device.index()));
|
||||
}
|
||||
|
||||
AT_CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
@ -208,6 +208,62 @@ _f8_f8_bf16_rowwise_grouped_mm(
|
||||
#endif
|
||||
}
|
||||
|
||||
// Grouped FP4 x FP4 matmul backed by fbgemm_gpu::f4f4bf16_grouped_mm; the
// result is written into `out`, which is also returned.
//
// The scaling recipe is chosen by whether global scales are supplied:
//  - NVFP4: both global_scale_a and global_scale_b are given (dtype Float)
//    and the block scales scale_a / scale_b are Float8_e4m3fn. The two global
//    scales are multiplied together and forwarded as a single tensor.
//  - MXFP4: neither global scale is given and the block scales are
//    Float8_e8m0fnu. No global scale is forwarded.
//
// Fails a TORCH_CHECK_VALUE when mat_a / mat_b are not Float4_e2m1fn_x2, when
// only one global scale is given, when a scale has the wrong dtype for the
// selected recipe, or when `offs` is missing. Fails a
// TORCH_CHECK_NOT_IMPLEMENTED when built for ROCm or without
// USE_FBGEMM_GENAI.
//
// NOTE(review): `bias` is accepted but never forwarded to the kernel -
// confirm that callers do not rely on it.
Tensor&
_f4_f4_bf16_grouped_mm_fbgemm(
      const Tensor& mat_a,
      const Tensor& mat_b,
      const Tensor& scale_a,
      const std::optional<Tensor>& global_scale_a,
      const Tensor& scale_b,
      const std::optional<Tensor>& global_scale_b,
      const std::optional<Tensor>& offs,
      const std::optional<Tensor>& bias,
      Tensor& out) {
#if !defined(USE_ROCM) && defined(USE_FBGEMM_GENAI)
  // Typing checks
  TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2,
      "mat_a must be Float4_e2m1fn_x2, got: ", mat_a.scalar_type());
  TORCH_CHECK_VALUE(mat_b.scalar_type() == at::kFloat4_e2m1fn_x2,
      "mat_b must be Float4_e2m1fn_x2, got: ", mat_b.scalar_type());
  // The kernel needs the group offsets; check explicitly rather than letting
  // offs.value() throw std::bad_optional_access below.
  TORCH_CHECK_VALUE(offs.has_value(),
      "offs must be provided for fp4 grouped gemm");

  std::optional<Tensor> combined_global_scale = std::nullopt;
  if (global_scale_a.has_value() || global_scale_b.has_value()) {
    // NVFP4
    TORCH_CHECK_VALUE(global_scale_a.has_value() && global_scale_b.has_value(),
        "For NVFP4 grouped gemm both of global_scale_{a,b} must have values");
    TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e4m3fn,
        "scale_a must be Float8_e4m3fn, got: ", scale_a.scalar_type());
    TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e4m3fn,
        "scale_b must be Float8_e4m3fn, got: ", scale_b.scalar_type());
    TORCH_CHECK_VALUE(global_scale_a.value().scalar_type() == at::kFloat,
        "global_scale_a must be Float, got: ", global_scale_a.value().scalar_type());
    TORCH_CHECK_VALUE(global_scale_b.value().scalar_type() == at::kFloat,
        "global_scale_b must be Float, got: ", global_scale_b.value().scalar_type());
    // A single combined global scale is what gets forwarded to the kernel.
    combined_global_scale = global_scale_a.value().mul(global_scale_b.value());
  } else {
    // MXFP4
    TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e8m0fnu,
        "scale_a must be Float8_e8m0fnu, got: ", scale_a.scalar_type());
    TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e8m0fnu,
        "scale_b must be Float8_e8m0fnu, got: ", scale_b.scalar_type());
  }

  // `out` is passed to the kernel and returned below; the call's own return
  // value is not needed.
  fbgemm_gpu::f4f4bf16_grouped_mm(
      mat_a,
      mat_b,
      scale_a,
      scale_b,
      offs.value(),
      out,
      combined_global_scale
  );
#else
  TORCH_CHECK_NOT_IMPLEMENTED(false, "nvfp4 grouped gemm is not supported without USE_FBGEMM_GENAI, and only for CUDA");
#endif

  return out;
}
|
||||
|
||||
void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
|
||||
// Checks scales for 2d or 3d target tensors (`mat`).
|
||||
if (mat.dim() == 2) {
|
||||
@ -245,7 +301,15 @@ void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int
|
||||
}
|
||||
}
|
||||
|
||||
void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
|
||||
void _check_scales_blocked(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
|
||||
// if {mx,nv}fp4, will need to modify K later
|
||||
bool is_fp4 = (mat.scalar_type() == kFloat4_e2m1fn_x2);
|
||||
int blocksize = 32;
|
||||
// check for nvfp4 vs. mxfp4 to fix blocksize
|
||||
if (is_fp4 && scale.scalar_type() == kFloat8_e4m3fn) {
|
||||
blocksize = 16;
|
||||
}
|
||||
|
||||
// Checks scales for 2d or 3d target tensors (`mat`).
|
||||
if (mat.dim() == 2) {
|
||||
// For MXFP8, 2d tensors have variable size groups represented as subtensors,
|
||||
@ -253,17 +317,19 @@ void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim,
|
||||
// so we can't check the scale sizes without doing a d2h sync to get the group sizes here.
|
||||
TORCH_CHECK(
|
||||
scale.dim() == mat.dim(),
|
||||
"for mxfp8, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(), " and scale.dim() = ", scale.dim(), " for arg ", arg_idx);
|
||||
"for block-scaled, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(),
|
||||
" and scale.dim() = ", scale.dim(), " for arg ", arg_idx
|
||||
);
|
||||
|
||||
// LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/32, 4))
|
||||
// RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/32, 4))
|
||||
// LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/blocksize, 4))
|
||||
// RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/blocksize, 4))
|
||||
// * weight is transposed prior to the call, scale stays non-transposed.
|
||||
bool LHS = arg_idx == 0;
|
||||
int scale_dim_to_check = 0;
|
||||
int mat_dim_to_check = LHS ? 0 : 1;
|
||||
TORCH_CHECK(
|
||||
scale.size(scale_dim_to_check) >= mat.size(mat_dim_to_check),
|
||||
"for mxfp8, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
|
||||
"for block-scaled, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
|
||||
"must have scale.shape[", scale_dim_to_check, "] >= ", mat.size(mat_dim_to_check), " but got scale.shape=(", scale.size(0), ", ", scale.size(1), ")");
|
||||
} else {
|
||||
// For MXFP8, 3d tensors have static group sizes (stack of 2d tensors),
|
||||
@ -273,32 +339,40 @@ void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim,
|
||||
};
|
||||
|
||||
// TODO: this is for 3d tensor in 2d-3d case specifically.
|
||||
// We'll need to support 3d-3d and 3d-2d cases once mxfp8 grouped gemm supports them.
|
||||
// We'll need to support 3d-3d and 3d-2d cases once mxfp8/nvfp4 grouped gemm supports them.
|
||||
int64_t G = mat.size(0);
|
||||
int64_t K = mat.size(1);
|
||||
if (is_fp4) {
|
||||
// FP4 packs 2 values into a single 8b word - the "real" K is 2x the
|
||||
// reported K. Reverse that adjustment.
|
||||
const int fp4_elems_per_byte = 2;
|
||||
K *= fp4_elems_per_byte;
|
||||
}
|
||||
int64_t N = mat.size(2);
|
||||
int64_t blocked_scale_K = round_up(K/32, 4);
|
||||
int64_t blocked_scale_K = round_up(K/blocksize, 4);
|
||||
int64_t blocked_scale_N = round_up(N, 128);
|
||||
|
||||
// fbgemm expects stack of flattened blocked scales for 3d tensor, shape (G, blocked_scale_K * blocked_scale_N).
|
||||
TORCH_CHECK(
|
||||
scale.dim() == mat.dim() - 1,
|
||||
"for mxfp8 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N), but scale is ", scale.dim(), "D for arg ", arg_idx
|
||||
"for block-scaled 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N),",
|
||||
"but scale is ", scale.dim(), "D for arg ", arg_idx
|
||||
);
|
||||
TORCH_CHECK(
|
||||
scale.size(0) == G && scale.size(1) == blocked_scale_K * blocked_scale_N,
|
||||
"for mxfp8, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ") for arg ", arg_idx
|
||||
"for block-scaled grouped GEMM, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ")",
|
||||
" for arg ", arg_idx, ", got: ", scale.size(0), ", ", scale.size(1)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
|
||||
bool using_fp8_rowwise = scale.scalar_type() == kFloat;
|
||||
bool using_mxfp8 = scale.scalar_type() == at::kFloat8_e8m0fnu;
|
||||
bool using_mx = scale.scalar_type() == at::kFloat8_e8m0fnu;
|
||||
if (using_fp8_rowwise) {
|
||||
_check_scales_fp8_rowwise(mat, scale, dim, arg_idx, scale_multiplier);
|
||||
} else if (using_mxfp8) {
|
||||
_check_scales_mxfp8(mat, scale, dim, arg_idx);
|
||||
} else if (using_mx) {
|
||||
_check_scales_blocked(mat, scale, dim, arg_idx);
|
||||
} else {
|
||||
TORCH_CHECK(false, "scale must be float32 or float8_e8m0fnu, but got ", scale.dtype());
|
||||
}
|
||||
@ -411,9 +485,11 @@ namespace {
|
||||
|
||||
using acceptance_fn = std::function<bool(c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&, c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&)>;
|
||||
|
||||
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2> scale_grouped_kernel_dispatch = {{
|
||||
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 4> scale_grouped_kernel_dispatch = {{
|
||||
{ "rowwise_rowwise", scaled_blas::check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE},
|
||||
{ "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}};
|
||||
{ "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8},
|
||||
{ "mxfp4_mxfp4", scaled_blas::check_mxfp4_recipe, ScaledGemmImplementation::MXFP4_MXFP4},
|
||||
{ "nvfp4_nvfp4", scaled_blas::check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4}}};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
@ -525,8 +601,9 @@ _scaled_grouped_mm_cuda_v2(
|
||||
out);
|
||||
}
|
||||
case ScaledGemmImplementation::MXFP8_MXFP8: {
|
||||
_check_scales_mxfp8(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
|
||||
_check_scales_mxfp8(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
|
||||
// scale shape checks
|
||||
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
|
||||
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
|
||||
return _mx8_mx8_bf16_grouped_mm_fbgemm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
@ -537,6 +614,36 @@ _scaled_grouped_mm_cuda_v2(
|
||||
offs.value(),
|
||||
out);
|
||||
}
|
||||
case ScaledGemmImplementation::MXFP4_MXFP4: {
|
||||
// scale shape checks
|
||||
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
|
||||
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
|
||||
return _f4_f4_bf16_grouped_mm_fbgemm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a[0], /* block-scale A */
|
||||
std::nullopt, /* global-scale A */
|
||||
scale_b[0], /* block-scale B */
|
||||
std::nullopt, /* global-scale B */
|
||||
offs.value(),
|
||||
std::nullopt, /* bias */
|
||||
out);
|
||||
}
|
||||
case ScaledGemmImplementation::NVFP4_NVFP4: {
|
||||
// scale shape checks
|
||||
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
|
||||
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
|
||||
return _f4_f4_bf16_grouped_mm_fbgemm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a[0], /* block-scale A */
|
||||
scale_a[1], /* global-scale A */
|
||||
scale_b[0], /* block-scale B */
|
||||
scale_b[1], /* global-scale B */
|
||||
offs.value(),
|
||||
std::nullopt, /* bias */
|
||||
out);
|
||||
}
|
||||
default:
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false,
|
||||
"_scaled_grouped_mm_cuda_v2 is in an inconsistent state - should never reach here");
|
||||
|
||||
@ -13,7 +13,7 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx,
|
||||
if (allow_neg_indices) {
|
||||
ind = (ind < 0) ? ind + ind_dim_size : ind;
|
||||
}
|
||||
CUDA_KERNEL_ASSERT(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds");
|
||||
CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds", "Expected 0 <= index < ind_dim_size(%ld), but got index = %ld", ind_dim_size, ind);
|
||||
int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits
|
||||
if (off >= slice_size) return;
|
||||
auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);
|
||||
|
||||
1388
aten/src/ATen/native/cuda/ScaledBlas.cpp
Normal file
1388
aten/src/ATen/native/cuda/ScaledBlas.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -160,8 +160,8 @@ struct _cuda_scatter_gather_internal_kernel {
|
||||
auto offsets = offset_calc.get(i);
|
||||
|
||||
int64_t idx_dim = *(index_t*)(index_ptr + offsets[2]);
|
||||
CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
|
||||
&& "scatter gather kernel index out of bounds");
|
||||
CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
|
||||
&& "scatter gather kernel index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);
|
||||
|
||||
f(
|
||||
(scalar_t*)(self_ptr + offsets[0]),
|
||||
@ -406,9 +406,8 @@ struct _cuda_scatter_fill_internal_kernel {
|
||||
auto offsets = offset_calc.get(i);
|
||||
|
||||
int64_t idx_dim = *(index_t*)(index_ptr + offsets[1]);
|
||||
CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
|
||||
&& "index out of bounds"
|
||||
);
|
||||
CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
|
||||
&& "index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);
|
||||
|
||||
f(
|
||||
(scalar_t*)(self_ptr + offsets[0]),
|
||||
|
||||
@ -12,14 +12,15 @@
|
||||
|
||||
namespace at::native {
|
||||
|
||||
#if AT_USE_JITERATOR()
|
||||
#if 0 && AT_USE_JITERATOR()
|
||||
constexpr char tan_name[] = "tan_impl";
|
||||
#endif
|
||||
|
||||
void tan_kernel_cuda(TensorIteratorBase& iter) {
|
||||
auto common_dtype = iter.common_dtype();
|
||||
if (at::isComplexType(common_dtype)) {
|
||||
#if AT_USE_JITERATOR()
|
||||
// Disabled due to accuracy issues
|
||||
#if 0 && AT_USE_JITERATOR()
|
||||
static const auto tan_string = jiterator_stringify(
|
||||
template <typename T> T tan_impl(T a) { return std::tan(a); });
|
||||
AT_DISPATCH_COMPLEX_TYPES_AND(
|
||||
|
||||
@ -12,14 +12,15 @@
|
||||
|
||||
namespace at::native {
|
||||
|
||||
#if AT_USE_JITERATOR()
|
||||
#if 0 && AT_USE_JITERATOR()
|
||||
constexpr char tanh_name[] = "tanh_impl";
|
||||
#endif
|
||||
|
||||
void tanh_kernel_cuda(TensorIteratorBase& iter) {
|
||||
auto common_dtype = iter.common_dtype();
|
||||
if (at::isComplexType(common_dtype)) {
|
||||
#if AT_USE_JITERATOR()
|
||||
// Disabled due to accuracy issues
|
||||
#if 0 && AT_USE_JITERATOR()
|
||||
static const auto tanh_string = jiterator_stringify(
|
||||
template <typename T> T tanh_impl(T a) { return std::tanh(a); });
|
||||
AT_DISPATCH_COMPLEX_TYPES_AND(
|
||||
|
||||
171
aten/src/ATen/native/cuda/cuBlasCommonArgs.h
Normal file
171
aten/src/ATen/native/cuda/cuBlasCommonArgs.h
Normal file
@ -0,0 +1,171 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/core/Tensor.h>
|
||||
|
||||
namespace at::native {
|
||||
|
||||
using at::blas::ScalingType;
|
||||
using at::blas::SwizzleType;
|
||||
|
||||
namespace {
|
||||
|
||||
// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492
|
||||
// Hands back `tensor` either as a borrowed reference or, when `resolve_conj`
// is requested and the tensor reports `is_conj()`, as an owned tensor holding
// the result of `tensor.resolve_conj()`.
c10::MaybeOwned<Tensor> inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) {
  const bool needs_resolution = resolve_conj && tensor.is_conj();
  if (!needs_resolution) {
    // Nothing to resolve: avoid any copy and just borrow the caller's tensor.
    return c10::MaybeOwned<Tensor>::borrowed(tensor);
  }
  return c10::MaybeOwned<Tensor>::owned(tensor.resolve_conj());
}
|
||||
|
||||
// Picks a cuBLAS-usable view of a 2-D `tensor` and reports through
// `transpose_tensor` whether it has to be passed as transposed.
//
// `transpose_result` tells whether the output matrix itself is being treated
// as transposed; it decides whether a pending conjugation must be resolved
// here (see the calls to resolve_conj_if_indicated below).
//
// When neither dimension has unit stride with a large enough leading stride,
// the tensor is cloned into contiguous memory and reported as transposed.
c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) {
  if (tensor.is_non_overlapping_and_dense()) { // common case
    transpose_tensor = tensor.is_contiguous();
    // Resolve conj exactly when the tensor's transpose flag agrees with the
    // result's transpose flag.
    return resolve_conj_if_indicated(tensor, transpose_tensor == transpose_result);
  }

  const IntArrayRef strides = tensor.strides();
  const IntArrayRef sizes = tensor.sizes();

  const bool unit_stride_dim0 =
      strides[0] == 1 && strides[1] >= std::max<int64_t>(1, sizes[0]);
  if (unit_stride_dim0) {
    transpose_tensor = false;
    return resolve_conj_if_indicated(tensor, !transpose_result);
  }

  const bool unit_stride_dim1 =
      strides[1] == 1 && strides[0] >= std::max<int64_t>(1, sizes[1]);
  if (unit_stride_dim1) {
    transpose_tensor = true;
    return resolve_conj_if_indicated(tensor, transpose_result);
  }

  // No usable layout: fall back to a contiguous copy.
  transpose_tensor = true;
  return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
}
|
||||
|
||||
// Two-argument overload: same layout selection as the three-argument version,
// but a pending conjugation is always resolved (resolve_conj == true) whenever
// the tensor can be used without cloning.
//
// `transpose_tensor` is set to true when the tensor is contiguous / has unit
// stride in dim 1 (or had to be cloned), and to false when dim 0 has unit
// stride.
c10::MaybeOwned<Tensor> inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) {
  if (tensor.is_non_overlapping_and_dense()) { // common case
    transpose_tensor = tensor.is_contiguous();
  } else {
    const IntArrayRef strides = tensor.strides();
    const IntArrayRef sizes = tensor.sizes();

    const bool unit_stride_dim0 =
        strides[0] == 1 && strides[1] >= std::max<int64_t>(1, sizes[0]);
    const bool unit_stride_dim1 = !unit_stride_dim0 &&
        strides[1] == 1 && strides[0] >= std::max<int64_t>(1, sizes[1]);

    if (!unit_stride_dim0 && !unit_stride_dim1) {
      // Neither layout is usable as-is: materialize a contiguous copy.
      transpose_tensor = true;
      return c10::MaybeOwned<Tensor>::owned(tensor.clone(at::MemoryFormat::Contiguous));
    }
    transpose_tensor = unit_stride_dim1;
  }
  return resolve_conj_if_indicated(tensor, true);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
|
||||
* @brief Prepares matrices for CUBLAS operation
|
||||
*
|
||||
* This constructor prepares tensors for CUBLAS
|
||||
* The main difference is that PyTorch uses row-major as the default and
|
||||
* CUBLAS expects column-major.
|
||||
*
|
||||
* @details
|
||||
* To enable row-major output while using CUBLAS,
|
||||
* we use the mathematical identity that (A × B)^T = B^T × A^T.
|
||||
*
|
||||
* Transpose in this context refers to Cublas's(Fortran) definition of transpose (row-major)
|
||||
* T = row-major, N = col-major
|
||||
*
|
||||
* Example:
|
||||
* For matrices A (M×K)(row-major) and B (K×N)(row-major):
|
||||
* - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major)
|
||||
* - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N)
|
||||
* - However, since the output from cublas is column-major this is
|
||||
* - equivalent to an output of size MxN row-major as expected
|
||||
*
|
||||
* The transpose flags are derived from the layouts of the passed in tensors
|
||||
*
|
||||
* If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted
|
||||
* to their unpacked values to match what cuBLAS expects.
|
||||
*
|
||||
* @param mat1 First input matrix
|
||||
* @param mat2 Second input matrix
|
||||
* @param c Output matrix (result)
|
||||
* @param scale_a Optional scaling factor for first matrix
|
||||
* @param scale_b Optional scaling factor for second matrix
|
||||
* @param scale_result Optional scaling factor for result
|
||||
*/
|
||||
struct cublasCommonArgs {
|
||||
cublasCommonArgs(
|
||||
const Tensor& mat1,
|
||||
const Tensor& mat2,
|
||||
Tensor& c,
|
||||
const std::optional<Tensor>& scale_a = std::nullopt,
|
||||
const std::optional<Tensor>& scale_b = std::nullopt,
|
||||
const std::optional<Tensor>& scale_result = std::nullopt,
|
||||
const std::optional<ScalingType>& scaling_choice_a = std::nullopt,
|
||||
const std::optional<ScalingType>& scaling_choice_b = std::nullopt) {
|
||||
bool transpose_result = false, transpose_a = false, transpose_b = false;
|
||||
result = prepare_matrix_for_cublas(c, transpose_result);
|
||||
mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result);
|
||||
matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result);
|
||||
|
||||
// Handle scale tensors if provided
|
||||
if (scale_a && scale_b) {
|
||||
// By default since we return in row-major we run the gemm
|
||||
// as B.T @ A.T, check transpose_result to determine if we flip the scales
|
||||
scale_mata_ptr = transpose_result ? scale_b->data_ptr() : scale_a->data_ptr();
|
||||
scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type();
|
||||
scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a;
|
||||
scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr();
|
||||
scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type();
|
||||
scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b;
|
||||
}
|
||||
|
||||
if (scale_result) {
|
||||
scale_result_ptr = scale_result->data_ptr();
|
||||
scale_result_dtype = scale_result->scalar_type();
|
||||
}
|
||||
|
||||
// Update transpose flags
|
||||
if (transpose_result) {
|
||||
transpose_a = !transpose_a;
|
||||
transpose_b = !transpose_b;
|
||||
}
|
||||
|
||||
auto sizes_a = mata->sizes();
|
||||
auto sizes_b = matb->sizes();
|
||||
|
||||
m = sizes_a[transpose_result ? 1 : 0];
|
||||
k = sizes_a[transpose_result ? 0 : 1];
|
||||
n = sizes_b[transpose_result ? 0 : 1];
|
||||
lda = mata->stride((transpose_a == transpose_result) ? 1 : 0);
|
||||
ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0);
|
||||
result_ld = result->stride(transpose_result ? 0 : 1);
|
||||
transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n';
|
||||
transb = transpose_b ? matb->is_conj() ? 'c' : 't' : 'n';
|
||||
|
||||
// cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing
|
||||
// if the gemm operands are in packed float4
|
||||
if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) {
|
||||
k = k * 2;
|
||||
lda = lda * 2;
|
||||
ldb = ldb * 2;
|
||||
}
|
||||
}
|
||||
|
||||
// Matrix members
|
||||
char transa, transb;
|
||||
int64_t m, n, k;
|
||||
int64_t lda, ldb, result_ld;
|
||||
c10::MaybeOwned<Tensor> mata, matb, result;
|
||||
|
||||
// Scale members
|
||||
void* scale_mata_ptr = nullptr;
|
||||
void* scale_matb_ptr = nullptr;
|
||||
void* scale_result_ptr = nullptr;
|
||||
std::optional<c10::ScalarType> scale_mata_dtype;
|
||||
std::optional<ScalingType> scaling_mata_type;
|
||||
std::optional<c10::ScalarType> scale_matb_dtype;
|
||||
std::optional<ScalingType> scaling_matb_type;
|
||||
std::optional<c10::ScalarType> scale_result_dtype;
|
||||
};
|
||||
|
||||
} // namespace at::native
|
||||
@ -141,7 +141,8 @@ WelfordDataLN cuWelfordOnlineSum(
|
||||
if constexpr (!rms_norm){
|
||||
U delta = val - curr_sum.mean;
|
||||
U new_count = curr_sum.count + 1.f;
|
||||
#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
|
||||
//Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
|
||||
#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
|
||||
U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
|
||||
#else
|
||||
U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
|
||||
@ -163,7 +164,8 @@ WelfordDataLN cuWelfordCombine(
|
||||
U count = dataA.count + dataB.count;
|
||||
U mean, sigma2;
|
||||
if (count > decltype(dataB.count){0}) {
|
||||
#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
|
||||
//Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
|
||||
#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
|
||||
auto coef = __builtin_amdgcn_rcpf(count);
|
||||
#else
|
||||
auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division
|
||||
|
||||
@ -40,14 +40,37 @@ bool check_head_dim_size_xpu(sdp::sdp_params const& params, bool debug) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool check_no_grad(sdp::sdp_params const& params, bool debug) {
|
||||
const bool any_inputs_require_grad = params.query.requires_grad() ||
|
||||
params.key.requires_grad() || params.value.requires_grad();
|
||||
const bool gradmode_enabled = at::GradMode::is_enabled();
|
||||
if (debug && any_inputs_require_grad && gradmode_enabled) {
|
||||
TORCH_WARN("Backward or grad to be supported.");
|
||||
bool input_require_grad(
|
||||
const at::Tensor& query,
|
||||
const at::Tensor& key,
|
||||
const at::Tensor& value,
|
||||
const std::optional<at::Tensor>& attn_mask) {
|
||||
return at::GradMode::is_enabled() &&
|
||||
(query.requires_grad() || key.requires_grad() || value.requires_grad() ||
|
||||
(attn_mask.has_value() && attn_mask.value().requires_grad()));
|
||||
}
|
||||
|
||||
bool check_grad(sdp::sdp_params const& params, bool debug) {
|
||||
if (!input_require_grad(
|
||||
params.query, params.key, params.value, params.attn_mask))
|
||||
return true;
|
||||
|
||||
auto q_num_heads = params.query.sym_size(-3);
|
||||
auto k_num_heads = params.key.sym_size(-3);
|
||||
auto v_num_heads = params.value.sym_size(-3);
|
||||
bool is_gqa = q_num_heads != k_num_heads || q_num_heads != v_num_heads;
|
||||
if (debug && is_gqa)
|
||||
TORCH_WARN(
|
||||
"scale_dot_product_attention with gqa is not supported for gradient computation on xpu.");
|
||||
|
||||
bool attn_mask_needs_grad =
|
||||
params.attn_mask.has_value() && params.attn_mask.value().requires_grad();
|
||||
if (debug && attn_mask_needs_grad) {
|
||||
TORCH_WARN(
|
||||
"scale_dot_product_attention on xpu is not supported when attn_mask.requires_grad() == True.");
|
||||
}
|
||||
return !any_inputs_require_grad || !gradmode_enabled;
|
||||
|
||||
return !is_gqa && !attn_mask_needs_grad;
|
||||
}
|
||||
|
||||
bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) {
|
||||
@ -65,7 +88,7 @@ bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) {
|
||||
sdp::check_nonzero_sequence_lengths_dense,
|
||||
sdp::check_last_dim_stride_equals_1_dense<false /*ignore_singleton_dim*/>,
|
||||
check_head_dim_size_xpu,
|
||||
check_no_grad);
|
||||
check_grad);
|
||||
for (auto& constraint : constraints) {
|
||||
if (!constraint(params, debug)) {
|
||||
return false;
|
||||
@ -225,10 +248,11 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
|
||||
double dropout_p,
|
||||
bool is_causal,
|
||||
bool return_debug_mask,
|
||||
std::optional<double> scale) {
|
||||
std::optional<double> scale,
|
||||
bool compute_logsumexp) {
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
query.dim() == 4 && key.dim() == 4 && value.dim() == 4,
|
||||
"scaled_dot_product_fused_attention_overrideable_xpu: Accept only 4 dims inputs shape of {(B), H, T, K}");
|
||||
"scaled_dot_product_fused_attention_overrideable_xpu: Accept only 4 dims inputs shape of {B, H, T, K}");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
(key.size(0) == value.size(0)) && (key.size(1) == value.size(1)) &&
|
||||
(key.size(2) == value.size(2)),
|
||||
@ -245,6 +269,9 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
!(attn_bias.has_value() && is_causal),
|
||||
"scaled_dot_product_fused_attention_overrideable_xpu: attn_bias cannot present with is_causal");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
!(attn_bias.has_value() && attn_bias.value().requires_grad()),
|
||||
"scaled_dot_product_fused_attention_overrideable_xpu: attn_bias cannot have requires_grad=True");
|
||||
|
||||
const int64_t batch_size = query.size(0);
|
||||
const int64_t num_head_q = query.size(1);
|
||||
@ -254,11 +281,14 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
|
||||
const int64_t seq_len_q = query.size(2);
|
||||
const int64_t seq_len_kv = key.size(2);
|
||||
|
||||
at::Tensor output;
|
||||
std::vector<int64_t> output_shape = {
|
||||
at::Tensor attention;
|
||||
std::vector<int64_t> attention_shape = {
|
||||
batch_size, num_head_q, seq_len_q, head_dim_v};
|
||||
alloc_with_matching_layout(query, output, output_shape);
|
||||
at::Tensor logsumexp, debug_attn_mask; // not supported
|
||||
alloc_with_matching_layout(query, attention, attention_shape);
|
||||
|
||||
auto opts = query.options();
|
||||
at::Tensor logsumexp =
|
||||
at::empty({batch_size, num_head_q, seq_len_q}, opts.dtype(at::kFloat));
|
||||
|
||||
at::native::onednn::sdpa(
|
||||
batch_size,
|
||||
@ -274,15 +304,15 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
|
||||
attn_bias,
|
||||
is_causal,
|
||||
scale.has_value() ? scale.value() : (1.0 / std::sqrt(head_dim_qk)),
|
||||
output,
|
||||
false,
|
||||
attention,
|
||||
compute_logsumexp,
|
||||
logsumexp);
|
||||
|
||||
// rng not used
|
||||
auto philox_seed = at::empty({}, at::dtype(at::kLong));
|
||||
auto philox_offset = at::empty({}, at::dtype(at::kLong));
|
||||
return std::make_tuple(
|
||||
output,
|
||||
attention,
|
||||
logsumexp,
|
||||
/* cum_seq_q */ at::Tensor(),
|
||||
/* cum_seq_k */ at::Tensor(),
|
||||
@ -290,7 +320,106 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
|
||||
seq_len_kv,
|
||||
philox_seed,
|
||||
philox_offset,
|
||||
debug_attn_mask);
|
||||
/*debug_attn_mask */ at::Tensor());
|
||||
}
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
|
||||
_scaled_dot_product_fused_attention_overrideable_backward_xpu(
|
||||
const at::Tensor& grad_out,
|
||||
const at::Tensor& query,
|
||||
const at::Tensor& key,
|
||||
const at::Tensor& value,
|
||||
const at::Tensor& attn_bias,
|
||||
std::array<bool, 4> grad_input_mask,
|
||||
const at::Tensor& out,
|
||||
const at::Tensor& logsumexp,
|
||||
const at::Tensor& cum_seq_q,
|
||||
const at::Tensor& cum_seq_k,
|
||||
int64_t max_q,
|
||||
int64_t max_k,
|
||||
double dropout_p,
|
||||
bool is_causal,
|
||||
const at::Tensor& philox_seed,
|
||||
const at::Tensor& philox_offset,
|
||||
std::optional<double> scale) {
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
grad_out.dim() == 4 && out.dim() == 4 &&
|
||||
grad_out.size(0) == out.size(0) && grad_out.size(1) == out.size(1) &&
|
||||
grad_out.size(2) == out.size(2) && grad_out.size(3) == out.size(3),
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: grad_out and out should have the same shape of {B, H, T, K}");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
query.dim() == 4 && key.dim() == 4 && value.dim() == 4,
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: Accept only 4 dims inputs shape of {B, H, T, K}");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
(key.size(0) == value.size(0)) && (key.size(1) == value.size(1)) &&
|
||||
(key.size(2) == value.size(2)),
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: K/V should have the same batch / seq / num_head");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
query.size(0) == grad_out.size(0) && query.size(1) == grad_out.size(1) &&
|
||||
query.size(2) == grad_out.size(2),
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: Q should have the same batch / num_head / seq_len as grad_out");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
query.size(3) == key.size(3),
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: Q/K should have the same head_dim");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
value.size(3) == grad_out.size(3),
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: V should have the same head_dim as grad_out");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
query.size(1) == key.size(1),
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: number of heads in K/V must equal to number of heads in Q");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
dropout_p == 0.0,
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: Currently do not support dropout > 0");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
logsumexp.dim() == 3 && logsumexp.size(0) == query.size(0) &&
|
||||
logsumexp.size(1) == query.size(1) &&
|
||||
logsumexp.size(2) == query.size(2) &&
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: logsumexp should have the shape of {B, H, T}");
|
||||
|
||||
std::optional<Tensor> attn_bias_opt;
|
||||
if (attn_bias.defined()) {
|
||||
attn_bias_opt = attn_bias;
|
||||
}
|
||||
|
||||
const int64_t batch_size = query.size(0);
|
||||
const int64_t num_head_q = query.size(1);
|
||||
const int64_t num_head_kv = key.size(1);
|
||||
const int64_t seq_len_q = query.size(2);
|
||||
const int64_t seq_len_kv = key.size(2);
|
||||
const int64_t head_dim_qk = query.size(3);
|
||||
const int64_t head_dim_v = value.size(3);
|
||||
|
||||
auto grad_q = at::empty_like(query);
|
||||
auto grad_k = at::empty_like(key);
|
||||
auto grad_v = at::empty_like(value);
|
||||
auto grad_attn_bias = attn_bias_opt.has_value()
|
||||
? at::empty_like(attn_bias_opt.value())
|
||||
: at::Tensor();
|
||||
at::native::onednn::sdpa_backward(
|
||||
batch_size,
|
||||
num_head_q,
|
||||
num_head_kv,
|
||||
seq_len_q,
|
||||
seq_len_kv,
|
||||
head_dim_qk,
|
||||
head_dim_v,
|
||||
grad_out,
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
out,
|
||||
logsumexp,
|
||||
attn_bias_opt,
|
||||
is_causal,
|
||||
scale.has_value() ? scale.value() : (1.0 / std::sqrt(query.size(3))),
|
||||
grad_q,
|
||||
grad_k,
|
||||
grad_v);
|
||||
return std::make_tuple(
|
||||
std::move(grad_q),
|
||||
std::move(grad_k),
|
||||
std::move(grad_v),
|
||||
std::move(grad_attn_bias));
|
||||
}
|
||||
|
||||
REGISTER_XPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_xpu);
|
||||
|
||||
@ -86,6 +86,28 @@ struct zeta_functor {
|
||||
}
|
||||
};
|
||||
|
||||
// Binary functor forwarding to c10::metal::logaddexp.
// Floating-point operands are passed through unchanged and the result keeps
// their type; integral operands are converted to float first and the result
// is float.
struct logaddexp_functor {
  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
  inline T operator()(const T a, const T b) {
    const T result = c10::metal::logaddexp(a, b);
    return result;
  }
  template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
  inline float operator()(const T a, const T b) {
    const float lhs = static_cast<float>(a);
    const float rhs = static_cast<float>(b);
    return c10::metal::logaddexp(lhs, rhs);
  }
};
|
||||
|
||||
// Binary functor forwarding to c10::metal::logaddexp2.
// Floating-point operands are passed through unchanged and the result keeps
// their type; integral operands are converted to float first and the result
// is float.
struct logaddexp2_functor {
  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
  inline T operator()(const T a, const T b) {
    const T result = c10::metal::logaddexp2(a, b);
    return result;
  }
  template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
  inline float operator()(const T a, const T b) {
    const float lhs = static_cast<float>(a);
    const float rhs = static_cast<float>(b);
    return c10::metal::logaddexp2(lhs, rhs);
  }
};
|
||||
|
||||
struct xlog1py_functor {
|
||||
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
|
||||
inline T operator()(const T a, const T b) {
|
||||
@ -377,6 +399,10 @@ REGISTER_FLOAT_BINARY_OP(fmin);
|
||||
REGISTER_FLOAT_BINARY_OP(nextafter);
|
||||
REGISTER_FLOAT_BINARY_OP(zeta);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(zeta);
|
||||
REGISTER_FLOAT_BINARY_OP(logaddexp);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(logaddexp);
|
||||
REGISTER_FLOAT_BINARY_OP(logaddexp2);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(logaddexp2);
|
||||
REGISTER_FLOAT_BINARY_OP(xlog1py);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(xlog1py);
|
||||
REGISTER_FLOAT_BINARY_OP(chebyshev_polynomial_t);
|
||||
@ -463,6 +489,8 @@ REGISTER_BINARY_OP(add, float2, float2);
|
||||
REGISTER_BINARY_OP(add, half2, half2);
|
||||
REGISTER_BINARY_OP(sub, float2, float2);
|
||||
REGISTER_BINARY_OP(sub, half2, half2);
|
||||
REGISTER_BINARY_OP(logaddexp, float2, float2);
|
||||
REGISTER_BINARY_OP(logaddexp, half2, half2);
|
||||
REGISTER_BINARY_ALPHA_OP(add_alpha, float2, float2, float2);
|
||||
REGISTER_BINARY_ALPHA_OP(add_alpha, half2, half2, half2);
|
||||
REGISTER_BINARY_ALPHA_OP(sub_alpha, float2, float2, float2);
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
#pragma onces
|
||||
#pragma once
|
||||
#include <c10/metal/common.h>
|
||||
|
||||
template <unsigned N = c10::metal::max_ndim>
|
||||
|
||||
@ -89,6 +89,14 @@ static void zeta_mps_kernel(TensorIteratorBase& iter) {
|
||||
lib.exec_binary_kernel(iter, "zeta");
|
||||
}
|
||||
|
||||
// Dispatch-stub target for logaddexp: runs the binary kernel registered
// under the name "logaddexp" over the given iterator.
static void logaddexp_mps_kernel(TensorIteratorBase& iter) {
  constexpr auto kernel_name = "logaddexp";
  lib.exec_binary_kernel(iter, kernel_name);
}
|
||||
|
||||
// Dispatch-stub target for logaddexp2: runs the binary kernel registered
// under the name "logaddexp2" over the given iterator.
static void logaddexp2_mps_kernel(TensorIteratorBase& iter) {
  constexpr auto kernel_name = "logaddexp2";
  lib.exec_binary_kernel(iter, kernel_name);
}
|
||||
|
||||
static void xlog1py_mps_kernel(TensorIteratorBase& iter) {
|
||||
TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "xlog1py_mps not implemented for non-floating types");
|
||||
lib.exec_binary_kernel(iter, "xlog1py");
|
||||
@ -211,6 +219,8 @@ REGISTER_DISPATCH(fmin_stub, &fmin_mps_kernel)
|
||||
// Address-of `copysign_mps_kernel`; the leading "&copy" must stay literal
// (it is not the HTML copyright entity).
REGISTER_DISPATCH(copysign_stub, &copysign_mps_kernel)
|
||||
REGISTER_DISPATCH(nextafter_stub, &nextafter_mps_kernel)
|
||||
REGISTER_DISPATCH(zeta_stub, &zeta_mps_kernel)
|
||||
REGISTER_DISPATCH(logaddexp_stub, &logaddexp_mps_kernel);
|
||||
REGISTER_DISPATCH(logaddexp2_stub, &logaddexp2_mps_kernel);
|
||||
REGISTER_DISPATCH(xlog1py_stub, &xlog1py_mps_kernel)
|
||||
REGISTER_DISPATCH(chebyshev_polynomial_t_stub, &chebyshev_polynomial_t_mps_kernel)
|
||||
REGISTER_DISPATCH(chebyshev_polynomial_u_stub, &chebyshev_polynomial_u_mps_kernel)
|
||||
|
||||
@ -17,8 +17,6 @@
|
||||
#include <ATen/ops/ge_native.h>
|
||||
#include <ATen/ops/gt_native.h>
|
||||
#include <ATen/ops/le_native.h>
|
||||
#include <ATen/ops/logaddexp2_native.h>
|
||||
#include <ATen/ops/logaddexp_native.h>
|
||||
#include <ATen/ops/logical_and_native.h>
|
||||
#include <ATen/ops/logical_or_native.h>
|
||||
#include <ATen/ops/logical_xor_native.h>
|
||||
@ -277,30 +275,6 @@ TORCH_IMPL_FUNC(pow_Scalar_out_mps)(const Scalar& base, const Tensor& exp, const
|
||||
}
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(logaddexp_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
|
||||
mps::BinaryOpBlock logaddexp_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
|
||||
MPSGraph* mpsGraph = cachedGraph->graph();
|
||||
MPSGraphTensor* sumTensor =
|
||||
[mpsGraph additionWithPrimaryTensor:[mpsGraph exponentWithTensor:primaryCastTensor name:nil]
|
||||
secondaryTensor:[mpsGraph exponentWithTensor:secondaryCastTensor name:nil]
|
||||
name:nil];
|
||||
return [mpsGraph logarithmWithTensor:sumTensor name:nil];
|
||||
};
|
||||
mps::binaryOpTensor(self, other, output, "logaddexp_out_mps", logaddexp_op_block);
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(logaddexp2_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
|
||||
mps::BinaryOpBlock logaddexp2_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
|
||||
MPSGraph* mpsGraph = cachedGraph->graph();
|
||||
MPSGraphTensor* sumTensor =
|
||||
[mpsGraph additionWithPrimaryTensor:[mpsGraph exponentBase2WithTensor:primaryCastTensor name:nil]
|
||||
secondaryTensor:[mpsGraph exponentBase2WithTensor:secondaryCastTensor name:nil]
|
||||
name:nil];
|
||||
return [mpsGraph logarithmBase2WithTensor:sumTensor name:nil];
|
||||
};
|
||||
mps::binaryOpTensor(self, other, output, "logaddexp2_out_mps", logaddexp2_op_block);
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(xlogy_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
|
||||
mps::BinaryOpBlock xlogy_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
|
||||
MPSGraph* mpsGraph = cachedGraph->graph();
|
||||
|
||||
@ -57,6 +57,7 @@ Tensor& random_mps_impl(Tensor& self,
|
||||
if (self.numel() == 0) {
|
||||
return self;
|
||||
}
|
||||
at::assert_no_internal_overlap(self);
|
||||
// MPS random is broken for 5D+ tensors, see https://github.com/pytorch/pytorch/issues/147624
|
||||
const auto need_reshape = self.ndimension() > 4;
|
||||
auto mps_gen = get_generator_or_default<MPSGeneratorImpl>(gen, at::mps::detail::getDefaultMPSGenerator());
|
||||
@ -153,8 +154,16 @@ Tensor& random_mps_impl(Tensor& self,
|
||||
feeds[meanPlaceholder.getMPSGraphTensor()] = meanPlaceholder.getMPSGraphTensorData();
|
||||
}
|
||||
|
||||
Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self);
|
||||
// Handle non-contiguous output tensors by creating a contiguous temporary
|
||||
const auto needs_gather = needsGather(self);
|
||||
Tensor self_ = needs_gather ? at::empty_like(self, MemoryFormat::Contiguous) : self;
|
||||
Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self_);
|
||||
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
|
||||
|
||||
// Copy results back to original non-contiguous output
|
||||
if (needs_gather) {
|
||||
self.copy_(self_);
|
||||
}
|
||||
}
|
||||
|
||||
return self;
|
||||
|
||||
@ -1,3 +1,5 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/native/Resize.h>
|
||||
#include <ATen/native/SpectralOpsUtils.h>
|
||||
#include <ATen/native/mps/OperationUtils.h>
|
||||
|
||||
@ -37,25 +39,12 @@ NSArray<NSNumber*>* IntArrayToNSArray(IntArrayRef arr) {
|
||||
} // anonymous namespace
|
||||
|
||||
Tensor _fft_c2r_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
|
||||
TORCH_CHECK(self.is_complex());
|
||||
auto in_sizes = self.sizes();
|
||||
DimVector out_sizes(in_sizes.begin(), in_sizes.end());
|
||||
out_sizes[dim.back()] = last_dim_size;
|
||||
auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type())));
|
||||
auto out = at::empty({}, self.options().dtype(c10::toRealValueType(self.scalar_type())));
|
||||
return _fft_c2r_mps_out(self, dim, normalization, last_dim_size, out);
|
||||
}
|
||||
|
||||
Tensor _fft_r2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) {
|
||||
TORCH_CHECK(self.is_floating_point());
|
||||
auto input_sizes = self.sizes();
|
||||
DimVector out_sizes(input_sizes.begin(), input_sizes.end());
|
||||
auto last_dim = dim.back();
|
||||
auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1;
|
||||
if (onesided) {
|
||||
out_sizes[last_dim] = last_dim_halfsize;
|
||||
}
|
||||
|
||||
auto out = at::empty(out_sizes, self.options().dtype(c10::toComplexType(self.scalar_type())));
|
||||
auto out = at::empty({}, self.options().dtype(c10::toComplexType(self.scalar_type())));
|
||||
return _fft_r2c_mps_out(self, dim, normalization, onesided, out);
|
||||
}
|
||||
|
||||
@ -72,6 +61,17 @@ using namespace mps;
|
||||
|
||||
// TODO: Investigate numerical discrepancies see https://github.com/pytorch/pytorch/issues/120237
|
||||
Tensor& _fft_r2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided, Tensor& out) {
|
||||
TORCH_CHECK(self.scalar_type() == kFloat || self.scalar_type() == kHalf, "Only float and half dtypes are supported");
|
||||
TORCH_CHECK(out.scalar_type() == c10::toComplexType(self.scalar_type()));
|
||||
const auto input_sizes = self.sym_sizes();
|
||||
SymDimVector out_sizes(input_sizes.begin(), input_sizes.end());
|
||||
auto last_dim = dim.back();
|
||||
auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1;
|
||||
if (onesided) {
|
||||
out_sizes[last_dim] = last_dim_halfsize;
|
||||
}
|
||||
at::native::resize_output_symint(out, out_sizes);
|
||||
|
||||
auto key = __func__ + getTensorsStringKey({self, out}) + ":" + getArrayRefString(dim) + ":" +
|
||||
std::to_string(normalization) + ":" + std::to_string(onesided);
|
||||
@autoreleasepool {
|
||||
@ -112,6 +112,12 @@ Tensor& _fft_c2r_mps_out(const Tensor& self,
|
||||
int64_t normalization,
|
||||
int64_t last_dim_size,
|
||||
Tensor& out) {
|
||||
TORCH_CHECK(self.is_complex(), "Input must be complex");
|
||||
TORCH_CHECK(out.scalar_type() == c10::toRealValueType(self.scalar_type()), "Unexpected output type");
|
||||
const auto in_sizes = self.sym_sizes();
|
||||
SymDimVector out_sizes(in_sizes.begin(), in_sizes.end());
|
||||
out_sizes[dim.back()] = last_dim_size;
|
||||
at::native::resize_output_symint(out, out_sizes);
|
||||
auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" +
|
||||
std::to_string(normalization) + ":" + std::to_string(last_dim_size);
|
||||
@autoreleasepool {
|
||||
|
||||
@ -617,6 +617,9 @@ Tensor& index_select_out_mps(const Tensor& self, int64_t dim, const Tensor& inde
|
||||
TORCH_CHECK(self.scalar_type() == output.scalar_type(),
|
||||
"index_select(): self and output must have the same scalar type");
|
||||
TORCH_CHECK(dim == 0 || dim < self.dim(), "index_select(): Indexing dim ", dim, " is out of bounds of tensor");
|
||||
at::assert_no_internal_overlap(output);
|
||||
at::assert_no_overlap(output, self);
|
||||
at::assert_no_overlap(output, index);
|
||||
auto output_size = self.sizes().vec();
|
||||
if (self.dim() > 0) {
|
||||
output_size[dim] = num_indices;
|
||||
|
||||
@ -1028,15 +1028,18 @@ TORCH_IMPL_FUNC(prod_out_mps)
|
||||
}
|
||||
|
||||
// amax on MPS: complex inputs are rejected up front; everything else is
// forwarded to reduction_out_mps with the AMAX reduction type.
TORCH_IMPL_FUNC(amax_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
  const auto input_dtype = input_t.scalar_type();
  TORCH_CHECK(!c10::isComplexType(input_dtype), "amax is not defined for complex types");
  reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMAX, "amax_out_mps");
}
|
||||
|
||||
// amin on MPS: complex inputs are rejected up front; everything else is
// forwarded to reduction_out_mps with the AMIN reduction type.
TORCH_IMPL_FUNC(amin_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
  const auto input_dtype = input_t.scalar_type();
  TORCH_CHECK(!c10::isComplexType(input_dtype), "amin is not defined for complex types");
  reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMIN, "amin_out_mps");
}
|
||||
|
||||
TORCH_IMPL_FUNC(aminmax_out_mps)
|
||||
(const Tensor& input_t, std::optional<int64_t> dim_opt, bool keepdim, const Tensor& min_t, const Tensor& max_t) {
|
||||
TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "aminmax is not defined for complex types");
|
||||
reduction_out_mps(input_t,
|
||||
dim_opt.has_value() ? OptionalIntArrayRef({*dim_opt}) : std::nullopt,
|
||||
keepdim,
|
||||
|
||||
@ -31,6 +31,7 @@ void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& v
|
||||
indices.copy_(values.toType(at::ScalarType::Long));
|
||||
return;
|
||||
}
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()), "kthvalue is not implemented for complex types");
|
||||
// issue #154890, raising error to prevent crash within MPSGraph until
|
||||
// workaround is implemented.
|
||||
TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890");
|
||||
|
||||
@ -3622,8 +3622,7 @@
|
||||
structured: True
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA: logaddexp_out
|
||||
MPS: logaddexp_out_mps
|
||||
CPU, CUDA, MPS: logaddexp_out
|
||||
tags: pointwise
|
||||
|
||||
- func: logaddexp(Tensor self, Tensor other) -> Tensor
|
||||
@ -3635,8 +3634,7 @@
|
||||
structured: True
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA: logaddexp2_out
|
||||
MPS: logaddexp2_out_mps
|
||||
CPU, CUDA, MPS: logaddexp2_out
|
||||
tags: pointwise
|
||||
|
||||
- func: logaddexp2(Tensor self, Tensor other) -> Tensor
|
||||
@ -15097,7 +15095,7 @@
|
||||
CPU: _scaled_dot_product_flash_attention_cpu
|
||||
tags: nondeterministic_seeded
|
||||
|
||||
- func: _scaled_dot_product_fused_attention_overrideable(Tensor query, Tensor key, Tensor value, Tensor? attn_bias=None, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
|
||||
- func: _scaled_dot_product_fused_attention_overrideable(Tensor query, Tensor key, Tensor value, Tensor? attn_bias=None, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None, bool compute_log_sumexp=True) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: _scaled_dot_product_fused_attention_overrideable
|
||||
XPU: _scaled_dot_product_fused_attention_overrideable_xpu
|
||||
@ -15121,6 +15119,7 @@
|
||||
variants: function
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: _scaled_dot_product_fused_attention_overrideable_backward
|
||||
XPU: _scaled_dot_product_fused_attention_overrideable_backward_xpu
|
||||
|
||||
- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
|
||||
dispatch:
|
||||
|
||||
@ -73,8 +73,7 @@ void upsample_bilinear2d_out_frame(
|
||||
const auto rwidth = area_pixel_compute_scale<float>(
|
||||
input_width, output_width, align_corners, scales_w);
|
||||
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
|
||||
float output_scale = output.q_scale() / input.q_scale();
|
||||
float output_scale = static_cast<float>(output.q_scale() / input.q_scale());
|
||||
|
||||
const int64_t input_q_zero_point = input.q_zero_point();
|
||||
const int64_t output_q_zero_point = output.q_zero_point();
|
||||
|
||||
@ -148,7 +148,7 @@ Tensor qcat_nhwc_kernel(
|
||||
// Vectorized loop
|
||||
if (c + VLEN <= curr_C) {
|
||||
auto curr_scale_vec = Vectorized<float>(curr_scale);
|
||||
auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
|
||||
auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
|
||||
auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
|
||||
for (; c + VLEN <= curr_C; c += VLEN) {
|
||||
auto inp_vec = Vec::loadu(iptr + c);
|
||||
@ -174,7 +174,7 @@ Tensor qcat_nhwc_kernel(
|
||||
int64_t elem_size = curr_C - c;
|
||||
if ((VLEN == 4 * kVLEN) && elem_size >= kVLEN) {
|
||||
auto curr_scale_vec = Vectorized<float>(curr_scale);
|
||||
auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
|
||||
auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
|
||||
auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
|
||||
int64_t vec_num = elem_size / kVLEN;
|
||||
std::array<typename scalar_t::underlying, VLEN> buf_in{};
|
||||
@ -611,12 +611,10 @@ void qrelu_kernel(const Tensor& qx, Tensor& qy) {
|
||||
void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
|
||||
const Scalar& negval_) {
|
||||
int64_t i_zp = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float i_scale = qx.q_scale();
|
||||
float i_scale = static_cast<float>(qx.q_scale());
|
||||
|
||||
int64_t o_zp = out.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float o_scale = out.q_scale();
|
||||
float o_scale = static_cast<float>(out.q_scale());
|
||||
float o_inv_scale = 1.0f / o_scale;
|
||||
|
||||
float negval = negval_.to<float>();
|
||||
@ -627,8 +625,8 @@ void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
|
||||
Vec zero_vec = Vec(0.0f);
|
||||
Vec one_vec = Vec(1.0f);
|
||||
|
||||
Vec i_scale_vec = Vec((float)i_scale);
|
||||
Vec i_zp_vec = Vec((float)i_zp);
|
||||
Vec i_scale_vec = Vec(i_scale);
|
||||
Vec i_zp_vec = Vec(i_zp);
|
||||
Vec i_scale_zp_neg_premul_vec = i_scale_vec * i_zp_vec.neg();
|
||||
|
||||
Vec negval_vec = Vec(negval);
|
||||
@ -738,10 +736,9 @@ void qprelu_out_kernel(Tensor& out,
|
||||
|
||||
void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
|
||||
int64_t zero_point = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = qx.q_scale();
|
||||
float scale = static_cast<float>(qx.q_scale());
|
||||
auto scale_vec = Vectorized<float>(scale);
|
||||
auto zero_point_vec = Vectorized<float>((float)zero_point);
|
||||
auto zero_point_vec = Vectorized<float>(zero_point);
|
||||
auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
|
||||
int64_t output_zero_point = zero_point;
|
||||
float output_scale = scale;
|
||||
@ -828,10 +825,9 @@ void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
|
||||
void qsigmoid_kernel(
|
||||
const Tensor& qx, Tensor& qy, double output_scale, int64_t output_zero_point ) {
|
||||
int64_t zero_point = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = qx.q_scale();
|
||||
float scale = static_cast<float>(qx.q_scale());
|
||||
auto scale_vec = Vectorized<float>(scale);
|
||||
auto zero_point_vec = Vectorized<float>((float)zero_point);
|
||||
auto zero_point_vec = Vectorized<float>(zero_point);
|
||||
|
||||
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qsigmoid", [&]() {
|
||||
float inv_output_scale = 1.0 / output_scale;
|
||||
@ -870,10 +866,9 @@ void qsigmoid_kernel(
|
||||
|
||||
void qhardsigmoid_kernel(const Tensor& qx, Tensor& qy) {
|
||||
int64_t zero_point = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = qx.q_scale();
|
||||
float scale = static_cast<float>(qx.q_scale());
|
||||
auto scale_vec = Vectorized<float>(scale);
|
||||
auto zero_point_vec = Vectorized<float>((float)zero_point);
|
||||
auto zero_point_vec = Vectorized<float>(zero_point);
|
||||
auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
|
||||
|
||||
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qhardsigmoid", [&]() {
|
||||
@ -1029,13 +1024,10 @@ void qthreshold_kernel(
|
||||
|
||||
// defines input and output scales and zero_points
|
||||
int64_t input_zero_point = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float input_scale = qx.q_scale();
|
||||
float input_scale = static_cast<float>(qx.q_scale());
|
||||
int64_t output_zero_point = qy.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float output_scale = qy.q_scale();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float inv_output_scale = 1.0 / output_scale;
|
||||
float output_scale = static_cast<float>(qy.q_scale());
|
||||
float inv_output_scale = static_cast<float>(1.0 / output_scale);
|
||||
|
||||
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qthreshold", [&]() {
|
||||
qy = at::_empty_affine_quantized(
|
||||
@ -1096,8 +1088,7 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {
|
||||
|
||||
const auto o_scale = qy.q_scale();
|
||||
const auto o_zero_point = qy.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
const float o_inv_scale = 1.0 / o_scale;
|
||||
const float o_inv_scale = static_cast<float>(1.0 / o_scale);
|
||||
|
||||
using fVec = Vectorized<float>;
|
||||
fVec i_scale_vec(i_scale);
|
||||
@ -1135,10 +1126,9 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {
|
||||
|
||||
void qtanh_kernel(const Tensor& qx, Tensor& qy) {
|
||||
int64_t zero_point = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = qx.q_scale();
|
||||
float scale = static_cast<float>(qx.q_scale());
|
||||
auto scale_vec = Vectorized<float>(scale);
|
||||
auto zero_point_vec = Vectorized<float>((float)zero_point);
|
||||
auto zero_point_vec = Vectorized<float>(zero_point);
|
||||
auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
|
||||
|
||||
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qtanh", [&]() {
|
||||
@ -1198,16 +1188,13 @@ void qelu_kernel(
|
||||
// they are NOT related to the quantization scale term
|
||||
|
||||
int64_t i_zp = qx.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float i_scale = qx.q_scale();
|
||||
float i_scale = static_cast<float>(qx.q_scale());
|
||||
|
||||
// In a future PR, we can improve on output scale and zero_point
|
||||
// selection.
|
||||
int64_t o_zp = qy.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float o_scale = qy.q_scale();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float inv_o_scale = 1.0 / o_scale;
|
||||
float o_scale = static_cast<float>(qy.q_scale());
|
||||
float inv_o_scale = static_cast<float>(1.0 / o_scale);
|
||||
|
||||
float alpha_float = alpha.to<float>();
|
||||
float scale_coef = scale.to<float>();
|
||||
@ -1227,7 +1214,7 @@ void qelu_kernel(
|
||||
Vec scale_coef_vec = Vec(scale_coef);
|
||||
Vec input_scale_coef_vec = Vec(input_scale_coef);
|
||||
Vec i_scale_vec = Vec(i_scale);
|
||||
Vec i_zero_point_vec = Vec((float)i_zp);
|
||||
Vec i_zero_point_vec = Vec(i_zp);
|
||||
Vec i_scale_neg_zp_premul_vec = i_scale_vec * i_zero_point_vec.neg();
|
||||
|
||||
cpu_kernel_vec(
|
||||
@ -1326,23 +1313,20 @@ void qadd_scalar_kernel(Tensor& out, const Tensor& self, const Scalar& other) {
|
||||
template <bool ReLUFused = false>
|
||||
void qadd_kernel(Tensor& out, const Tensor& self, const Tensor& other) {
|
||||
int64_t zero_point = out.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = out.q_scale();
|
||||
float scale = static_cast<float>(out.q_scale());
|
||||
float inv_scale = 1.0f / scale;
|
||||
int64_t self_zero_point = self.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float self_scale = self.q_scale();
|
||||
float self_scale = static_cast<float>(self.q_scale());
|
||||
int64_t other_zero_point = other.q_zero_point();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float other_scale = other.q_scale();
|
||||
float other_scale = static_cast<float>(other.q_scale());
|
||||
|
||||
// Broadcast out the parameters here to amortize out that cost across
|
||||
// loop iterations.
|
||||
// TODO: we can optimize dequantization by doing a premultiplication
|
||||
// of the zero point by scale and doing FMA on scale*x_q - (scale*zero_point)
|
||||
auto self_zero_point_vec = Vectorized<float>((float)self_zero_point);
|
||||
auto self_zero_point_vec = Vectorized<float>(self_zero_point);
|
||||
auto self_scale_vec = Vectorized<float>(self_scale);
|
||||
auto other_zero_point_vec = Vectorized<float>((float)other_zero_point);
|
||||
auto other_zero_point_vec = Vectorized<float>(other_zero_point);
|
||||
auto other_scale_vec = Vectorized<float>(other_scale);
|
||||
|
||||
auto self_scale_neg_zp_premul_vec = self_scale_vec * self_zero_point_vec.neg();
|
||||
@ -2965,7 +2949,7 @@ void quantized_normalize_kernel(
|
||||
const bool beta_null = beta_data == nullptr;
|
||||
int64_t x_zp = X.q_zero_point();
|
||||
float x_scale = X.q_scale();
|
||||
fVec x_zp_vec((float)x_zp);
|
||||
fVec x_zp_vec(x_zp);
|
||||
fVec one_vec(1.0f);
|
||||
fVec zero_vec(0.0f);
|
||||
float x_fake_scale = 1.0f;
|
||||
@ -3253,7 +3237,7 @@ void quantized_groupnorm_nhwc_kernel(
|
||||
const bool beta_null = beta_data == nullptr;
|
||||
int64_t x_zp = X.q_zero_point();
|
||||
float x_scale = X.q_scale();
|
||||
fVec x_zp_vec((float)x_zp);
|
||||
fVec x_zp_vec(x_zp);
|
||||
fVec one_vec(1.0f);
|
||||
fVec zero_vec(0.0f);
|
||||
float x_fake_scale = 1.0f;
|
||||
|
||||
@ -414,7 +414,6 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_impl(
|
||||
TORCH_CHECK(input.size(input.dim() - 1) == packed_weight_fp16.numRows())
|
||||
TORCH_CHECK(input.dim() >= 2);
|
||||
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
|
||||
const int64_t N = packed_weight_fp16.numCols();
|
||||
std::vector<int64_t> output_sizes = input.sizes().vec();
|
||||
|
||||
@ -467,6 +467,28 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, IntArrayRe
|
||||
!options.has_layout() || options.layout() == kSparse,
|
||||
"expected sparse layout, but got layout ",
|
||||
options.layout());
|
||||
|
||||
if (indices.numel() > 0) {
|
||||
Tensor min_indices =
|
||||
std::get</* values */ 0>(indices.min(/* dim */ 1, /* keepdim */ false));
|
||||
Tensor cpu_min_indices;
|
||||
if (!indices.is_cpu()) {
|
||||
cpu_min_indices = min_indices.to(at::DeviceType::CPU);
|
||||
} else {
|
||||
cpu_min_indices = min_indices;
|
||||
}
|
||||
auto cpu_min_indices_accessor = cpu_min_indices.accessor<int64_t, 1>();
|
||||
for (const auto d : c10::irange(indices.size(0))) {
|
||||
int64_t min_index_in_dim = cpu_min_indices_accessor[d];
|
||||
TORCH_CHECK(
|
||||
min_index_in_dim >= 0,
|
||||
"found negative index ",
|
||||
min_index_in_dim,
|
||||
" for dim ",
|
||||
d);
|
||||
}
|
||||
}
|
||||
|
||||
return at::native::_sparse_coo_tensor_unsafe(
|
||||
indices,
|
||||
values,
|
||||
|
||||
@ -768,8 +768,11 @@ Tensor scaled_dot_product_attention(
|
||||
return std::get<0>(out_and_lse);
|
||||
}
|
||||
case SDPBackend::overrideable: {
|
||||
bool compute_logsumexp = should_compute_logsumexp(query_, key, value);
|
||||
compute_logsumexp = compute_logsumexp ||
|
||||
(at::GradMode::is_enabled() && attn_mask.has_value() && attn_mask.value().requires_grad());
|
||||
auto out_lse_softmax = at::_scaled_dot_product_fused_attention_overrideable(
|
||||
query_, key, value, attn_mask, dropout_p, is_causal, false /*return_debug_mask*/, scale);
|
||||
query_, key, value, attn_mask, dropout_p, is_causal, false /*return_debug_mask*/, scale, compute_logsumexp);
|
||||
return std::get<0>(out_lse_softmax);
|
||||
}
|
||||
case SDPBackend::math: {
|
||||
@ -1015,7 +1018,8 @@ _scaled_dot_product_fused_attention_overrideable(
|
||||
double dropout_p,
|
||||
bool is_causal,
|
||||
bool return_debug_mask,
|
||||
std::optional<double> scale) {
|
||||
std::optional<double> scale,
|
||||
bool compute_logsumexp) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "_scaled_dot_product_fused_attention_overrideable not implemented. This is an operator for privateuse1 backends, please use TORCH_LIBRARY_IMPL to override this function ");
|
||||
}
|
||||
|
||||
|
||||
@ -22,6 +22,7 @@
|
||||
#else
|
||||
#include <ATen/ops/empty.h>
|
||||
#include <ATen/ops/empty_like.h>
|
||||
#include <ATen/ops/zeros_like.h>
|
||||
#include <ATen/ops/reshape.h>
|
||||
#include <ATen/ops/scalar_tensor.h>
|
||||
#include <ATen/ops/sum.h>
|
||||
@ -42,7 +43,6 @@ C10_DIAGNOSTIC_POP()
|
||||
#include <static_switch.h>
|
||||
#include <ATen/native/transformers/cuda/flash_attn/flash_api.h>
|
||||
|
||||
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
namespace FLASH_NAMESPACE {
|
||||
@ -417,6 +417,26 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
|
||||
const int head_size_og = sizes[3];
|
||||
const int seqlen_k = k.size(1);
|
||||
const int num_heads_k = k.size(2);
|
||||
|
||||
if (batch_size == 0) {
|
||||
auto opts = q.options();
|
||||
at::Tensor out = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
|
||||
at::Tensor q_padded = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
|
||||
at::Tensor k_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
|
||||
at::Tensor v_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
|
||||
at::Tensor softmax_lse = at::empty({0, num_heads, seqlen_q}, opts.dtype(at::kFloat));
|
||||
at::Tensor rng_state = at::empty({2}, at::dtype(c10::kUInt64).device(at::kCUDA));
|
||||
at::Tensor _unused = at::empty({}, at::dtype(c10::kUInt64).device(at::kCUDA));
|
||||
at::Tensor p = at::empty({0}, opts);
|
||||
if (return_softmax) {
|
||||
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
|
||||
const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
|
||||
const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
|
||||
p = at::empty({0, num_heads, seqlen_q_rounded, seqlen_k_rounded}, opts);
|
||||
}
|
||||
return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), _unused, std::move(p)};
|
||||
}
|
||||
|
||||
TORCH_CHECK(batch_size > 0, "batch size must be positive");
|
||||
TORCH_CHECK(head_size_og % 8 == 0, "head_size must be a multiple of 8, this is ensured by padding!");
|
||||
TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
|
||||
@ -547,7 +567,7 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
|
||||
q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og});
|
||||
softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
|
||||
}
|
||||
return {out, q_padded, k_padded, v_padded, softmax_lse, rng_state, _unused, p};
|
||||
return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), std::move(_unused), std::move(p)};
|
||||
}
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
|
||||
@ -852,7 +872,6 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
|
||||
TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
|
||||
TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
|
||||
TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension");
|
||||
TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
|
||||
|
||||
const auto sizes = q.sizes();
|
||||
|
||||
@ -863,6 +882,20 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
|
||||
const int head_size = sizes[3];
|
||||
const int seqlen_k = k.size(1);
|
||||
const int num_heads_k = k.size(2);
|
||||
|
||||
if (batch_size == 0) {
|
||||
auto opts = q.options();
|
||||
at::Tensor dq = at::empty_like(q);
|
||||
at::Tensor dk = at::empty_like(k);
|
||||
at::Tensor dv = at::empty_like(v);
|
||||
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
|
||||
const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
|
||||
at::Tensor softmax_d = at::empty({0, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat));
|
||||
return {dq, dk, dv, softmax_d};
|
||||
}
|
||||
|
||||
TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
|
||||
|
||||
TORCH_CHECK(batch_size > 0, "batch size must be positive");
|
||||
TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8");
|
||||
TORCH_CHECK(head_size_og % 8 == 0, "head_size_og should be a multiple of 8, this is ensured by padding!");
|
||||
|
||||
@ -202,6 +202,7 @@ supported:
|
||||
- select_backward
|
||||
- _trilinear
|
||||
- linalg_pinv.atol_rtol_tensor
|
||||
- svd
|
||||
- logsumexp.out
|
||||
symint:
|
||||
- empty.memory_format
|
||||
|
||||
@ -1837,6 +1837,10 @@ class BenchmarkRunner:
|
||||
def skip_models_for_cuda(self):
|
||||
return set()
|
||||
|
||||
@property
|
||||
def skip_models_for_xpu(self):
|
||||
return set()
|
||||
|
||||
@property
|
||||
def skip_models_for_cpu(self):
|
||||
return set()
|
||||
@ -3927,6 +3931,8 @@ def run(runner, args, original_dir=None):
|
||||
runner.skip_models.update(runner.skip_models_for_cpu_aarch64)
|
||||
elif args.devices == ["cuda"]:
|
||||
runner.skip_models.update(runner.skip_models_for_cuda)
|
||||
elif args.devices == ["xpu"]:
|
||||
runner.skip_models.update(runner.skip_models_for_xpu)
|
||||
|
||||
if not args.multiprocess:
|
||||
runner.skip_models.update(runner.skip_multiprocess_models)
|
||||
|
||||
@ -124,6 +124,10 @@ class TorchBenchmarkRunner(BenchmarkRunner):
|
||||
def skip_models_for_cuda(self):
|
||||
return self._skip["device"]["cuda"]
|
||||
|
||||
@property
|
||||
def skip_models_for_xpu(self):
|
||||
return self._skip["device"]["xpu"]
|
||||
|
||||
@property
|
||||
def skip_models_for_freezing_cuda(self):
|
||||
return self._skip["freezing"]["cuda"]
|
||||
|
||||
@ -217,6 +217,9 @@ skip:
|
||||
|
||||
cuda: []
|
||||
|
||||
xpu:
|
||||
- *DETECTRON2_MODELS
|
||||
|
||||
test:
|
||||
training:
|
||||
- *DETECTRON2_MODELS
|
||||
|
||||
157
benchmarks/transformer/config_utils.py
Normal file
157
benchmarks/transformer/config_utils.py
Normal file
@ -0,0 +1,157 @@
|
||||
"""Configuration utilities for parsing JSON and YAML config files."""
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
def heads_input_type(s: str) -> tuple[int, int]:
    """Parse a ``"Hq,Hkv"`` string into an ``(Hq, Hkv)`` pair of integers.

    Raises:
        ValueError: If ``s`` cannot be split into exactly two integers.
    """
    try:
        query_heads, kv_heads = (int(part) for part in s.split(","))
    except Exception as err:
        # Any failure (wrong arity, non-numeric text, non-string input) is
        # reported uniformly so argparse-style callers get one clear message.
        raise ValueError("Heads must be Hq,Hkv") from err
    return query_heads, kv_heads
|
||||
|
||||
|
||||
# Baseline settings; printed as a template by print_default_config().
default_config = {
    # Core parameters
    "dynamic": False,
    "calculate_bwd": False,
    "dtype": "bfloat16",
    # Shape sweep
    "b": [2, 8, 16],  # batch sizes
    "nh": ["16,16", "16,2"],  # "query_heads,key_value_heads"
    "s": [512, 1024, 4096],  # sequence lengths
    "d": [64, 128],  # head dimensions
    # Attention variants and backends
    "mods": ["noop", "causal", "alibi", "sliding_window"],
    "backend": ["efficient"],
    "max_autotune": False,
    # Decoding and cache settings
    "decoding": False,
    "kv_size": None,
    # Metrics and output
    "throughput": True,
    "save_path": None,
    "output_json_for_dashboard": None,
    "benchmark_name": "PyTorch operator microbenchmark",
}
|
||||
|
||||
|
||||
def load_config_file(config_path: str) -> dict:
    """Load configuration from a JSON or YAML file.

    The content is parsed as JSON first; if that fails it is handed to the
    module's minimal YAML reader. String entries of a list-valued ``nh``
    field are converted to ``(Hq, Hkv)`` tuples.

    Args:
        config_path: Path to the configuration file

    Returns:
        Dictionary containing the configuration

    Raises:
        FileNotFoundError: If config file doesn't exist
        ValueError: If the file does not describe a key/value mapping, or an
            ``nh`` entry is not of the form ``"Hq,Hkv"``
    """
    with open(config_path) as f:
        config_str = f.read()

    # Try to load as JSON first
    try:
        config = json.loads(config_str)
    except json.JSONDecodeError:
        # Fall back to YAML parsing
        config = _parse_simple_yaml(config_str)

    # Valid JSON is not necessarily a JSON object (e.g. "[1, 2]" or "42").
    # Reject it here instead of returning a non-dict to callers.
    if not isinstance(config, dict):
        raise ValueError(
            f"Config file {config_path!r} must contain a key/value mapping "
            f"at the top level, got {type(config).__name__}"
        )

    # Apply automatic conversions for 'nh' field
    if "nh" in config and isinstance(config["nh"], list):
        config["nh"] = [
            heads_input_type(h) if isinstance(h, str) else h for h in config["nh"]
        ]

    return config
|
||||
|
||||
|
||||
def _parse_simple_yaml(yaml_str: str) -> dict:
|
||||
"""Simple YAML parser for basic configs (without external dependencies).
|
||||
|
||||
Supports:
|
||||
- key: value pairs
|
||||
- booleans (true/false)
|
||||
- null values
|
||||
- integers and floats
|
||||
- strings (quoted and unquoted)
|
||||
- lists in JSON format [item1, item2, ...]
|
||||
- comments (lines starting with # or after #)
|
||||
|
||||
Args:
|
||||
yaml_str: YAML content as string
|
||||
|
||||
Returns:
|
||||
Dictionary containing parsed YAML content
|
||||
"""
|
||||
config = {}
|
||||
|
||||
for line in yaml_str.split("\n"):
|
||||
# Remove comments
|
||||
line = line.split("#")[0].strip()
|
||||
|
||||
if not line or ":" not in line:
|
||||
continue
|
||||
|
||||
key, value = line.split(":", 1)
|
||||
key = key.strip()
|
||||
value = value.strip()
|
||||
|
||||
# Parse value based on type
|
||||
if value.lower() == "true":
|
||||
config[key] = True
|
||||
elif value.lower() == "false":
|
||||
config[key] = False
|
||||
elif value.lower() in ("null", "none", ""):
|
||||
config[key] = None
|
||||
elif value.startswith("[") and value.endswith("]"):
|
||||
# Parse list - handle quoted strings properly
|
||||
pattern = r'"([^"]+)"|\'([^\']+)\'|([^,\[\]\s]+)'
|
||||
matches = re.findall(pattern, value[1:-1]) # Remove [ ]
|
||||
parsed_items = []
|
||||
for match in matches:
|
||||
# match is a tuple of (double_quoted, single_quoted, unquoted)
|
||||
item = match[0] or match[1] or match[2]
|
||||
item = item.strip()
|
||||
if item:
|
||||
try:
|
||||
parsed_items.append(int(item))
|
||||
except ValueError:
|
||||
parsed_items.append(item)
|
||||
config[key] = parsed_items
|
||||
elif value.startswith(('"', "'")):
|
||||
config[key] = value.strip("\"'")
|
||||
else:
|
||||
# Try to parse as number
|
||||
try:
|
||||
config[key] = int(value)
|
||||
except ValueError:
|
||||
try:
|
||||
config[key] = float(value)
|
||||
except ValueError:
|
||||
config[key] = value
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def print_default_config(output_format: str) -> None:
    """Write the default configuration template to stdout.

    Args:
        output_format: "json" prints the template as indented JSON; any
            other value (normally "yaml") prints one ``key: value`` line
            per entry.
    """
    if output_format == "json":
        print(json.dumps(default_config, indent=2))
        return

    # YAML-style output, one entry per line.
    for name, entry in default_config.items():
        if entry is None:
            rendered = "null"
        elif isinstance(entry, bool):
            # bool is tested before the generic fallback so True/False are
            # emitted in lowercase.
            rendered = str(entry).lower()
        elif isinstance(entry, str):
            rendered = f'"{entry}"'
        elif isinstance(entry, list):
            # Lists use JSON syntax.
            rendered = json.dumps(entry)
        else:
            rendered = entry
        print(f"{name}: {rendered}")
|
||||
29
benchmarks/transformer/configs/config_basic.yaml
Normal file
29
benchmarks/transformer/configs/config_basic.yaml
Normal file
@ -0,0 +1,29 @@
|
||||
# Basic benchmark configuration for PyTorch transformer benchmarks
|
||||
# Usage: python score_mod.py --config config_basic.yaml
|
||||
|
||||
# Core parameters
|
||||
dynamic: false
|
||||
calculate_bwd: true
|
||||
dtype: "bfloat16"
|
||||
|
||||
# Shape parameters - larger sweep
|
||||
b: [1, 2, 4, 8, 16] # batch sizes
|
||||
nh: ["16,16", "16,2", "32,32", "32,4"] # [query_heads,key_value_heads]
|
||||
s: [512, 1024, 2048, 4096, 8192] # sequence lengths
|
||||
d: [64, 128] # head dimensions (limited to 128 for Flash Attention/cuDNN compatibility)
|
||||
|
||||
# All attention types
|
||||
mods: ["noop", "causal", "rel", "head_bias", "alibi", "sliding_window", "prefix_lm", "softcap"]
|
||||
|
||||
# Multiple backends for comparison (SDPA + Flash Attention) - flex is always included internally
|
||||
backend: ["efficient", "math", "cudnn", "fav2"]
|
||||
max_autotune: true # Enable torch.compile with max-autotune for optimal performance
|
||||
|
||||
# Decoding and cache settings
|
||||
decoding: false
|
||||
kv_size: null
|
||||
|
||||
# Metrics and output
|
||||
throughput: true # Calculate memory bandwidth & TFLOPS
|
||||
save_path: "comprehensive_results.csv" # Save to CSV
|
||||
output_json_for_dashboard: "attn_bench_basic.json"
|
||||
@ -1,15 +1,19 @@
|
||||
import argparse
|
||||
import csv
|
||||
import gc
|
||||
import itertools
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable
|
||||
from contextlib import nullcontext
|
||||
from dataclasses import asdict, dataclass
|
||||
from functools import partial
|
||||
from typing import Optional, Union
|
||||
from functools import partial, wraps
|
||||
from typing import Literal, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
from config_utils import heads_input_type, load_config_file, print_default_config
|
||||
from tabulate import tabulate
|
||||
from tqdm import tqdm
|
||||
|
||||
@ -33,6 +37,96 @@ torch._dynamo.config.recompile_limit = 1000
|
||||
from torch._inductor.runtime.benchmarking import benchmarker
|
||||
|
||||
|
||||
def cleanup_memory():
    """Aggressively free GPU memory.

    The Python garbage collector runs first so that objects kept alive only
    by reference cycles are released before the CUDA cache is emptied;
    emptying the cache first would leave the blocks freed by the collection
    cached. CUDA calls are made only when CUDA is available.
    """
    gc.collect()
    if torch.cuda.is_available():
        # Wait for outstanding work before emptying the cache.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
|
||||
|
||||
|
||||
def safe_backend(backend_name=None, return_dict=False):
    """Build a decorator that turns backend failures into NaN results.

    The wrapped function is called as ``func(config, *args, **kwargs)``. If
    it raises, a ``[SKIP]`` line is printed and a placeholder result is
    returned instead of propagating the exception.

    Args:
        backend_name: Label used in the skip messages; falls back to the
            wrapped function's ``__name__`` when not given.
        return_dict: If True, the placeholder is a dict with one NaN
            ExperimentResults per entry of ``config.backends`` plus a
            "flex" entry (for run_single_experiment). If False, it is a
            single NaN ExperimentResults (for individual backend functions).
    """

    def decorator(func):
        @wraps(func)
        def wrapper(config, *args, **kwargs):
            def report(kind, exc=None):
                # One place for the "[SKIP] ..." message layout.
                who = backend_name or func.__name__
                message = f"[SKIP] {kind} for {who} with shape {config.shape}"
                if exc is not None:
                    message += f": {str(exc)[:100]}"
                print(message)

            def placeholder(**extra):
                # bwd_time is NaN only when a backward time was requested.
                return ExperimentResults(
                    fwd_time=float("nan"),
                    bwd_time=float("nan") if config.calculate_bwd_time else None,
                    **extra,
                )

            try:
                return func(config, *args, **kwargs)
            except torch.OutOfMemoryError:
                report("OOM")
                cleanup_memory()
            except RuntimeError as err:
                text = str(err)
                if "out of resource" in text or "OutOfMemoryError" in text:
                    report("Triton OOM")
                    cleanup_memory()
                elif "No valid triton configs" in text:
                    report("No valid Triton config")
                else:
                    report("Runtime error", err)
            except Exception as err:
                report("Error", err)

            # Only reached when the call above raised.
            if not return_dict:
                return placeholder()

            # Every configured backend maps to the same placeholder object;
            # "flex" gets its own with an explicit sparsity of None.
            failed = dict.fromkeys(config.backends, placeholder())
            failed["flex"] = placeholder(sparsity=None)
            return failed

        return wrapper

    return decorator
|
||||
|
||||
|
||||
# Type definitions
|
||||
Backend = Literal["math", "efficient", "cudnn", "fav2", "fav3", "fakv", "og-eager"]
|
||||
AttentionType = Literal[
|
||||
"noop",
|
||||
"causal",
|
||||
"rel",
|
||||
"head_bias",
|
||||
"alibi",
|
||||
"sliding_window",
|
||||
"document_mask",
|
||||
"prefix_lm",
|
||||
"softcap",
|
||||
]
|
||||
DtypeString = Literal["bfloat16", "float16", "float32"]
|
||||
SpeedupType = Literal["fwd", "bwd"]
|
||||
|
||||
|
||||
def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> float:
|
||||
# warmup
|
||||
for _ in range(5):
|
||||
@ -48,6 +142,7 @@ class ExperimentConfig:
|
||||
calculate_bwd_time: bool
|
||||
cal_bandwidth: bool
|
||||
backends: list[str]
|
||||
max_autotune: bool
|
||||
|
||||
def __post_init__(self):
|
||||
assert len(self.shape) == 6, (
|
||||
@ -62,6 +157,7 @@ class ExperimentConfig:
|
||||
d.pop("cal_bandwidth", None)
|
||||
d["shape(B,Hq,M,Hkv,N,D)"] = d.pop("shape")
|
||||
d.pop("backends", None)
|
||||
d.pop("max_autotune", False)
|
||||
return d
|
||||
|
||||
|
||||
@ -209,6 +305,7 @@ def query_key_value_clones(
|
||||
return query_ref, key_ref, value_ref
|
||||
|
||||
|
||||
@safe_backend("SDPA")
|
||||
def run_single_backend_sdpa(
|
||||
config: ExperimentConfig,
|
||||
query: torch.Tensor,
|
||||
@ -223,6 +320,7 @@ def run_single_backend_sdpa(
|
||||
backend_context = get_backend_context(backend)
|
||||
with backend_context:
|
||||
_device = torch.device("cuda")
|
||||
|
||||
eager_sdpa = generate_eager_sdpa(
|
||||
config.attn_type, config.shape, config.dtype, block_mask, score_mod
|
||||
)
|
||||
@ -290,6 +388,7 @@ def run_single_backend_sdpa(
|
||||
)
|
||||
|
||||
|
||||
@safe_backend("FlashAttention")
|
||||
def run_single_backend_FA(
|
||||
config: ExperimentConfig,
|
||||
query: torch.Tensor,
|
||||
@ -301,9 +400,9 @@ def run_single_backend_FA(
|
||||
mask_kwargs,
|
||||
backend: str,
|
||||
) -> ExperimentResults:
|
||||
assert backend in ["fav2", "fav3", "fakv"]
|
||||
assert backend in ["fav3", "fakv"]
|
||||
# Generate callable for specific backend.
|
||||
if backend in ["fav2", "fav3"]:
|
||||
if backend in ["fav3"]:
|
||||
FA = generate_FA_callable(
|
||||
config.attn_type, config.shape, config.dtype, backend, **mask_kwargs
|
||||
)
|
||||
@ -354,10 +453,10 @@ def run_single_backend_FA(
|
||||
)
|
||||
|
||||
|
||||
@safe_backend("flex_attention", return_dict=True)
|
||||
def run_single_experiment(
|
||||
config: ExperimentConfig,
|
||||
dynamic=False,
|
||||
max_autotune=False,
|
||||
) -> dict[str, ExperimentResults]:
|
||||
device = torch.device("cuda")
|
||||
batch_size, q_heads, q_seq_len, kv_heads, kv_seq_len, head_dim = config.shape
|
||||
@ -377,7 +476,7 @@ def run_single_experiment(
|
||||
block_mask, mask_kwargs = generate_block_mask(config.attn_type, config.shape)
|
||||
kernel_options = get_kernel_options(config.attn_type, config.shape)
|
||||
|
||||
if max_autotune:
|
||||
if config.max_autotune:
|
||||
compiled_sdpa = torch.compile(
|
||||
flex_attention, dynamic=dynamic, mode="max-autotune-no-cudagraphs"
|
||||
)
|
||||
@ -407,7 +506,7 @@ def run_single_experiment(
|
||||
|
||||
results = {}
|
||||
for backend in config.backends:
|
||||
if backend in ["fav2", "fav3", "fakv"]:
|
||||
if backend in ["fav3", "fakv"]:
|
||||
results[backend] = run_single_backend_FA(
|
||||
config,
|
||||
query,
|
||||
@ -419,7 +518,7 @@ def run_single_experiment(
|
||||
mask_kwargs,
|
||||
backend,
|
||||
)
|
||||
else: # sdpa
|
||||
else: # sdpa (also supports fav2)
|
||||
results[backend] = run_single_backend_sdpa(
|
||||
config,
|
||||
query,
|
||||
@ -440,7 +539,7 @@ def run_single_experiment(
|
||||
sparsity = block_mask.sparsity() / 100.0 if block_mask is not None else 0.0
|
||||
sparsity = sparsity if config.attn_type != "document_mask" else 0.5
|
||||
|
||||
results["compiled"] = ExperimentResults(
|
||||
results["flex"] = ExperimentResults(
|
||||
fwd_time=forward_compiled_time,
|
||||
bwd_time=backward_compile_time if config.calculate_bwd_time else None,
|
||||
sparsity=sparsity,
|
||||
@ -501,15 +600,15 @@ def calculate_tflops(config: ExperimentConfig, results: ExperimentResults) -> fl
|
||||
softmax_flops = M * N * 2 # Not counting online softmax overhead
|
||||
o_flops = M * D * N * 2
|
||||
# Not counting split k overhead
|
||||
total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - results.sparsity)
|
||||
sparsity = results.sparsity if results.sparsity is not None else 0.0
|
||||
total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - sparsity)
|
||||
return total_flops / results.fwd_time / 1e6 # in TFLOPs/
|
||||
|
||||
|
||||
def get_average_speedups(results: list[Experiment], type: str, backend: str):
|
||||
# Calculate speedups
|
||||
speedups = [
|
||||
calculate_speedup(r.results["compiled"], r.results[backend], type)
|
||||
for r in results
|
||||
calculate_speedup(r.results["flex"], r.results[backend], type) for r in results
|
||||
]
|
||||
|
||||
# Find indices of max and min speedups
|
||||
@ -537,7 +636,7 @@ def get_average_speedups(results: list[Experiment], type: str, backend: str):
|
||||
def print_results(results: list[Experiment], save_path: Optional[str] = None):
|
||||
table_data = defaultdict(list)
|
||||
for experiment in results:
|
||||
backends = experiment.config.backends + ["compiled"]
|
||||
backends = experiment.config.backends + ["flex"]
|
||||
for key, value in experiment.asdict().items():
|
||||
if key in backends:
|
||||
if value.fwd_time:
|
||||
@ -550,45 +649,43 @@ def print_results(results: list[Experiment], save_path: Optional[str] = None):
|
||||
# Calculate speedups
|
||||
for backend in results[0].config.backends:
|
||||
fwd_speedups = [
|
||||
calculate_speedup(r.results["compiled"], r.results[backend], type="fwd")
|
||||
calculate_speedup(r.results["flex"], r.results[backend], type="fwd")
|
||||
for r in results
|
||||
]
|
||||
table_data[f"fwd_{backend}_speedup"] = fwd_speedups
|
||||
table_data[f"fwd_speedup_flex_over_{backend}"] = fwd_speedups
|
||||
|
||||
if results[0].config.calculate_bwd_time:
|
||||
for backend in results[0].config.backends:
|
||||
bwd_speedups = [
|
||||
calculate_speedup(r.results["compiled"], r.results[backend], type="bwd")
|
||||
calculate_speedup(r.results["flex"], r.results[backend], type="bwd")
|
||||
for r in results
|
||||
]
|
||||
table_data[f"bwd_{backend}_speedup"] = bwd_speedups
|
||||
table_data[f"bwd_speedup_flex_over_{backend}"] = bwd_speedups
|
||||
|
||||
# Calculate mem + computational throughput
|
||||
if results[0].config.cal_bandwidth:
|
||||
fwd_bandwidth = [
|
||||
calculate_bandwidth(r.config, r.results["compiled"], type="fwd")
|
||||
calculate_bandwidth(r.config, r.results["flex"], type="fwd")
|
||||
for r in results
|
||||
]
|
||||
table_data["fwd_mem_bw (TB/s)"] = fwd_bandwidth
|
||||
fwd_tflops = [
|
||||
calculate_tflops(r.config, r.results["compiled"]) for r in results
|
||||
]
|
||||
fwd_tflops = [calculate_tflops(r.config, r.results["flex"]) for r in results]
|
||||
table_data["TFlops/s"] = fwd_tflops
|
||||
|
||||
print(tabulate(table_data, headers="keys", tablefmt="github", floatfmt=".3f"))
|
||||
|
||||
for backend in results[0].config.backends:
|
||||
if np.isnan(table_data[f"fwd_{backend}_speedup"]).all():
|
||||
if np.isnan(table_data[f"fwd_speedup_flex_over_{backend}"]).all():
|
||||
continue
|
||||
print("\n")
|
||||
print(f"FWD Speedups vs. {backend}".center(125, "="))
|
||||
print(f"FWD Speedup of Flex over {backend}".center(125, "="))
|
||||
print("\n")
|
||||
average_data = get_average_speedups(results, type="fwd", backend=backend)
|
||||
print(tabulate(average_data, headers="keys", tablefmt="github", floatfmt=".3f"))
|
||||
|
||||
if results[0].config.calculate_bwd_time:
|
||||
print("\n")
|
||||
print(f"BWD Speedups vs. {backend}".center(125, "="))
|
||||
print(f"BWD Speedup of Flex over {backend}".center(125, "="))
|
||||
print("\n")
|
||||
average_data = get_average_speedups(results, type="bwd", backend=backend)
|
||||
print(
|
||||
@ -791,14 +888,14 @@ def get_backend_context(backend: str):
|
||||
Returns a context manager for the specified backend.
|
||||
Args:
|
||||
backend (str): The name of the backend to use.
|
||||
Valid options are 'fav2', 'cudnn', 'math', 'efficient', 'fav3', 'fakv', 'og-eager'.
|
||||
Valid options are 'math', 'efficient', 'cudnn', 'fav2', 'fav3', 'fakv', 'og-eager'.
|
||||
Returns:
|
||||
A context manager for the specified backend.
|
||||
Raises:
|
||||
ValueError: If an invalid backend is specified.
|
||||
"""
|
||||
backends = {
|
||||
"fav2": nullcontext(),
|
||||
"fav2": sdpa_kernel(SDPBackend.FLASH_ATTENTION),
|
||||
"cudnn": sdpa_kernel(SDPBackend.CUDNN_ATTENTION),
|
||||
"math": sdpa_kernel(SDPBackend.MATH),
|
||||
"efficient": sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION),
|
||||
@ -820,15 +917,7 @@ def generate_FA_callable(
|
||||
) -> Callable | None:
|
||||
if dtype not in [torch.float16, torch.bfloat16]:
|
||||
return None
|
||||
if backend == "fav2":
|
||||
try:
|
||||
from flash_attn import flash_attn_func, flash_attn_varlen_func
|
||||
except ImportError:
|
||||
print(
|
||||
"Flash attention 2 is not installed. Please install it to run fav2 backend. "
|
||||
)
|
||||
raise
|
||||
elif backend == "fav3":
|
||||
if backend == "fav3":
|
||||
try:
|
||||
from flash_attn.flash_attn_interface import (
|
||||
flash_attn_func,
|
||||
@ -1034,6 +1123,7 @@ def generate_experiment_configs(
|
||||
kv_cache_size: list[int],
|
||||
cal_bandwidth: bool,
|
||||
backends: list[str],
|
||||
max_autotune: bool,
|
||||
) -> list[ExperimentConfig]:
|
||||
assert not (calculate_bwd and decoding), "Decoding does not support backward"
|
||||
|
||||
@ -1077,52 +1167,333 @@ def generate_experiment_configs(
|
||||
calculate_bwd_time=calculate_bwd,
|
||||
cal_bandwidth=cal_bandwidth,
|
||||
backends=backends,
|
||||
max_autotune=max_autotune,
|
||||
)
|
||||
)
|
||||
|
||||
return all_configs
|
||||
|
||||
|
||||
def main(args):
|
||||
def _output_json_for_dashboard(
|
||||
experiments,
|
||||
output_file,
|
||||
benchmark_name="PyTorch operator microbenchmark",
|
||||
):
|
||||
"""
|
||||
Write the result into JSON format for PyTorch OSS dashboard.
|
||||
The JSON format is defined at
|
||||
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
|
||||
|
||||
Args:
|
||||
experiments: List of experiment results
|
||||
output_file: Path to output JSON file
|
||||
benchmark_name: Name of the benchmark
|
||||
"""
|
||||
if not experiments:
|
||||
return
|
||||
|
||||
import math
|
||||
import platform
|
||||
from dataclasses import asdict, dataclass
|
||||
from typing import Any, Optional
|
||||
|
||||
# Prepare headers and records for JSON output
|
||||
records = []
|
||||
for experiment in experiments:
|
||||
config = experiment.config
|
||||
results_dict = (
|
||||
experiment.results
|
||||
) # This is a dict: backend -> ExperimentResults
|
||||
|
||||
# Process each backend result
|
||||
for backend, results in results_dict.items():
|
||||
# Skip backends that were not run (NaN results)
|
||||
if math.isnan(results.fwd_time):
|
||||
continue
|
||||
|
||||
# Extract data from experiment
|
||||
test_name = f"{backend}_{config.attn_type}_"
|
||||
input_config = f"shape: {config.shape}, dtype: {config.dtype}"
|
||||
|
||||
# Determine mode based on backward pass
|
||||
mode = "training" if config.calculate_bwd_time else "inference"
|
||||
|
||||
# Extract dtype
|
||||
dtype = (
|
||||
str(config.dtype).split(".")[1]
|
||||
if "." in str(config.dtype)
|
||||
else str(config.dtype)
|
||||
)
|
||||
|
||||
# Determine device
|
||||
device = "cuda"
|
||||
|
||||
# Get device architecture
|
||||
device_arch = (
|
||||
torch.cuda.get_device_name(0)
|
||||
if device == "cuda"
|
||||
else platform.processor()
|
||||
if device == "cpu"
|
||||
else "unknown"
|
||||
)
|
||||
|
||||
# Create dataclasses for JSON structure
|
||||
@dataclass
|
||||
class BenchmarkInfo:
|
||||
name: str
|
||||
mode: Optional[str]
|
||||
dtype: str
|
||||
extra_info: dict[str, Any]
|
||||
|
||||
@dataclass
|
||||
class ModelInfo:
|
||||
name: str
|
||||
type: str
|
||||
origins: list[str]
|
||||
extra_info: dict[str, Any]
|
||||
|
||||
@dataclass
|
||||
class MetricInfo:
|
||||
name: str
|
||||
unit: str
|
||||
benchmark_values: list[float]
|
||||
target_value: Optional[float]
|
||||
|
||||
@dataclass
|
||||
class BenchmarkRecord:
|
||||
benchmark: BenchmarkInfo
|
||||
model: ModelInfo
|
||||
metric: MetricInfo
|
||||
|
||||
# Benchmark extra info
|
||||
benchmark_extra_info = {
|
||||
"input_config": input_config,
|
||||
"device": device,
|
||||
"arch": device_arch,
|
||||
"operator_name": backend,
|
||||
"attn_type": config.attn_type,
|
||||
"shape": str(config.shape),
|
||||
"max_autotune": config.max_autotune,
|
||||
}
|
||||
# Add record for forward latency
|
||||
record_fwd_latency = BenchmarkRecord(
|
||||
benchmark=BenchmarkInfo(
|
||||
name=benchmark_name,
|
||||
mode=mode,
|
||||
dtype=dtype,
|
||||
extra_info=benchmark_extra_info,
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name + str(config.shape),
|
||||
type="attention-benchmark",
|
||||
origins=["pytorch"],
|
||||
extra_info={
|
||||
"operator_name": backend,
|
||||
"attn_type": config.attn_type,
|
||||
},
|
||||
),
|
||||
metric=MetricInfo(
|
||||
name="forward latency",
|
||||
unit="us",
|
||||
benchmark_values=[results.fwd_time],
|
||||
target_value=None,
|
||||
),
|
||||
)
|
||||
records.append(asdict(record_fwd_latency))
|
||||
|
||||
# Add record for forward memory bandwidth (if available)
|
||||
if config.cal_bandwidth:
|
||||
record_fwd_bandwidth = BenchmarkRecord(
|
||||
benchmark=BenchmarkInfo(
|
||||
name=benchmark_name,
|
||||
mode=mode,
|
||||
dtype=dtype,
|
||||
extra_info=benchmark_extra_info,
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name + str(config.shape),
|
||||
type="attention-benchmark",
|
||||
origins=["pytorch"],
|
||||
extra_info={
|
||||
"operator_name": backend,
|
||||
},
|
||||
),
|
||||
metric=MetricInfo(
|
||||
name="memory bandwidth",
|
||||
unit="TB/s",
|
||||
benchmark_values=[calculate_bandwidth(config, results, "fwd")],
|
||||
target_value=None,
|
||||
),
|
||||
)
|
||||
records.append(asdict(record_fwd_bandwidth))
|
||||
|
||||
# Add record for forward TFLOPS (if available)
|
||||
if config.cal_bandwidth:
|
||||
record_fwd_tflops = BenchmarkRecord(
|
||||
benchmark=BenchmarkInfo(
|
||||
name=benchmark_name,
|
||||
mode=mode,
|
||||
dtype=dtype,
|
||||
extra_info=benchmark_extra_info,
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name + str(config.shape),
|
||||
type="attention-benchmark",
|
||||
origins=["pytorch"],
|
||||
extra_info={
|
||||
"operator_name": backend,
|
||||
},
|
||||
),
|
||||
metric=MetricInfo(
|
||||
name="tflops",
|
||||
unit="TFLOPS/s",
|
||||
benchmark_values=[calculate_tflops(config, results)],
|
||||
target_value=None,
|
||||
),
|
||||
)
|
||||
records.append(asdict(record_fwd_tflops))
|
||||
|
||||
# Add record for backward latency (if available and not NaN)
|
||||
if (
|
||||
config.calculate_bwd_time
|
||||
and results.bwd_time is not None
|
||||
and not math.isnan(results.bwd_time)
|
||||
):
|
||||
record_bwd_latency = BenchmarkRecord(
|
||||
benchmark=BenchmarkInfo(
|
||||
name=benchmark_name,
|
||||
mode=mode,
|
||||
dtype=dtype,
|
||||
extra_info=benchmark_extra_info,
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name + str(config.shape),
|
||||
type="attention-benchmark",
|
||||
origins=["pytorch"],
|
||||
extra_info={
|
||||
"operator_name": backend,
|
||||
},
|
||||
),
|
||||
metric=MetricInfo(
|
||||
name="backward latency",
|
||||
unit="us",
|
||||
benchmark_values=[results.bwd_time],
|
||||
target_value=None,
|
||||
),
|
||||
)
|
||||
records.append(asdict(record_bwd_latency))
|
||||
|
||||
# Write all records to the output file
|
||||
with open(output_file, "w", encoding="utf-8") as f:
|
||||
json.dump(records, f, indent=2)
|
||||
|
||||
|
||||
def main(
|
||||
dynamic: bool = False,
|
||||
calculate_bwd: bool = False,
|
||||
dtype: DtypeString = "bfloat16",
|
||||
b: list[int] | None = None,
|
||||
nh: list[str] | None = None,
|
||||
s: list[int] | None = None,
|
||||
d: list[int] | None = None,
|
||||
mods: list[AttentionType] | None = None,
|
||||
backend: list[Backend] | None = None,
|
||||
max_autotune: bool = False,
|
||||
decoding: bool = False,
|
||||
kv_size: Optional[list[int]] = None,
|
||||
throughput: bool = True,
|
||||
save_path: Optional[str] = None,
|
||||
output_json_for_dashboard: Optional[str] = None,
|
||||
benchmark_name: str = "PyTorch operator microbenchmark",
|
||||
) -> None:
|
||||
"""Run sweep over sizes and score mods for flex attention.
|
||||
|
||||
Usage Examples:
|
||||
# Use a yml config file
|
||||
python score_mod.py --config config_basic.yaml
|
||||
|
||||
# Use a json config file
|
||||
python score_mod.py --config my_config.json
|
||||
|
||||
# Generate a config template
|
||||
python score_mod.py --print-config json > my_config.json # For a json config
|
||||
python score_mod.py --print-config yaml > my_config.yaml # For a yaml config
|
||||
|
||||
# Override config with CLI args
|
||||
python score_mod.py --config my_config.json -dtype float16 --max-autotune
|
||||
|
||||
# Pure CLI usage
|
||||
python score_mod.py -b 4 8 -s 1024 2048 -mods causal alibi --backend efficient
|
||||
|
||||
Args:
|
||||
dynamic: Runs a dynamic shapes version of compiled flex attention
|
||||
calculate_bwd: Calculate backward pass times
|
||||
dtype: Data type for tensors (bfloat16, float16, float32)
|
||||
b: Batch sizes to benchmark
|
||||
nh: Number of query and key/value heads in format "Hq,Hkv"
|
||||
s: Sequence lengths to benchmark
|
||||
d: Head dimensions to benchmark
|
||||
mods: Score modifications: noop, causal, rel, head_bias, alibi, sliding_window, document_mask, prefix_lm, softcap
|
||||
backend: Backends for attention computation: math, efficient, cudnn, fav2, fav3, fakv, og-eager
|
||||
max_autotune: Turn on max-autotune optimization
|
||||
decoding: Benchmark decoding mode (query sequence length = 1)
|
||||
kv_size: Key/value cache size in MiB (ignores batch size if specified)
|
||||
throughput: Calculate kernel memory bandwidth & computational throughput (always True)
|
||||
save_path: Path to save the results CSV file
|
||||
output_json_for_dashboard: Path to save results in JSON format for PyTorch OSS dashboard
|
||||
benchmark_name: Name of the benchmark for dashboard output
|
||||
"""
|
||||
# Convert dtype string to torch dtype (if not already converted)
|
||||
import torch
|
||||
|
||||
if isinstance(dtype, str):
|
||||
dtype = getattr(torch, dtype)
|
||||
|
||||
# Always calculate throughput
|
||||
throughput = True
|
||||
print("Backend: ", backend)
|
||||
seed = 123
|
||||
np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
results = []
|
||||
for config in tqdm(
|
||||
generate_experiment_configs(
|
||||
args.calculate_bwd,
|
||||
args.dtype,
|
||||
args.b,
|
||||
args.nh,
|
||||
args.s,
|
||||
args.d,
|
||||
args.mods,
|
||||
args.decoding,
|
||||
args.kv_size,
|
||||
args.throughput,
|
||||
args.backend,
|
||||
)
|
||||
for experiment_count, config in enumerate(
|
||||
tqdm(
|
||||
generate_experiment_configs(
|
||||
calculate_bwd,
|
||||
dtype,
|
||||
b,
|
||||
nh,
|
||||
s,
|
||||
d,
|
||||
mods,
|
||||
decoding,
|
||||
kv_size,
|
||||
throughput,
|
||||
backend,
|
||||
max_autotune,
|
||||
)
|
||||
),
|
||||
start=1,
|
||||
):
|
||||
results.append(
|
||||
Experiment(
|
||||
config,
|
||||
run_single_experiment(
|
||||
config,
|
||||
dynamic=args.dynamic,
|
||||
max_autotune=args.max_autotune,
|
||||
dynamic=dynamic,
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
print_results(results, args.save_path)
|
||||
# Periodic memory cleanup every 50 experiments
|
||||
if experiment_count % 50 == 0:
|
||||
cleanup_memory()
|
||||
|
||||
print_results(results, save_path)
|
||||
|
||||
def heads_input_type(s):
|
||||
try:
|
||||
hq, hkv = map(int, s.split(","))
|
||||
return hq, hkv
|
||||
except Exception as e:
|
||||
raise argparse.ArgumentTypeError("Heads must be Hq,Hkv") from e
|
||||
# Output JSON for dashboard if requested
|
||||
if output_json_for_dashboard:
|
||||
_output_json_for_dashboard(results, output_json_for_dashboard, benchmark_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -1130,6 +1501,12 @@ if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run sweep over sizes and score mods for flex attention"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
help="Path to JSON config file. CLI args override config file values.",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dynamic",
|
||||
action="store_true",
|
||||
@ -1199,8 +1576,49 @@ Ignores -b batch size and calculate batch size from kv size instead when specifi
|
||||
default=["efficient"],
|
||||
help="Backend to use for attention computation",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-json-for-dashboard",
|
||||
type=str,
|
||||
help="Path to save results in JSON format for PyTorch OSS dashboard",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--benchmark-name",
|
||||
type=str,
|
||||
help="Name of the benchmark for dashboard output",
|
||||
default="PyTorch operator microbenchmark",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--print-config",
|
||||
type=str,
|
||||
choices=["json", "yaml"],
|
||||
help="Print a default config template in JSON or YAML format and exit",
|
||||
default=None,
|
||||
)
|
||||
# Parse arguments
|
||||
args = parser.parse_args()
|
||||
args.dtype = getattr(torch, args.dtype)
|
||||
|
||||
main(args)
|
||||
# Handle --print-config
|
||||
if args.print_config:
|
||||
print_default_config(args.print_config)
|
||||
sys.exit(0)
|
||||
|
||||
# Load and merge config if provided
|
||||
if args.config:
|
||||
config = load_config_file(args.config)
|
||||
|
||||
# Merge config with CLI args (CLI args take precedence)
|
||||
json_args = argparse.Namespace()
|
||||
json_args.__dict__ = config
|
||||
args = parser.parse_args(namespace=json_args)
|
||||
|
||||
# Convert dtype string to torch dtype (only if it's still a string)
|
||||
if isinstance(args.dtype, str):
|
||||
args.dtype = getattr(torch, args.dtype)
|
||||
|
||||
# Remove config and print_config from args before passing to main
|
||||
args_dict = vars(args)
|
||||
args_dict.pop("config", None)
|
||||
args_dict.pop("print_config", None)
|
||||
|
||||
main(**args_dict)
|
||||
|
||||
@ -482,6 +482,7 @@ inductor_core_resources = [
|
||||
"torch/csrc/inductor/aoti_torch/oss_proxy_executor.cpp",
|
||||
"torch/csrc/inductor/inductor_ops.cpp",
|
||||
"torch/csrc/jit/serialization/pickle.cpp",
|
||||
"torch/csrc/shim_common.cpp",
|
||||
]
|
||||
|
||||
libtorch_core_sources = sorted(
|
||||
@ -916,6 +917,7 @@ libtorch_python_core_sources = [
|
||||
"torch/csrc/autograd/python_torch_functions_manual.cpp",
|
||||
"torch/csrc/autograd/python_variable.cpp",
|
||||
"torch/csrc/autograd/python_variable_indexing.cpp",
|
||||
"torch/csrc/distributed/python_placement.cpp",
|
||||
"torch/csrc/dynamo/python_compiled_autograd.cpp",
|
||||
"torch/csrc/dynamo/cache_entry.cpp",
|
||||
"torch/csrc/dynamo/cpp_shim.cpp",
|
||||
|
||||
@ -52,9 +52,7 @@ constexpr DispatchKeySet math_dispatch_keyset = backend_dispatch_keyset |
|
||||
// where we would like to support composite implicit kernels but not
|
||||
// explicit kernels therefore we manually add the key to the
|
||||
// math_dispatch_keyset
|
||||
DispatchKeySet{DispatchKey::NestedTensor} |
|
||||
// Functionalize should always reuse CompositeImplicit decomps.
|
||||
DispatchKeySet{DispatchKey::Functionalize};
|
||||
DispatchKeySet(DispatchKey::NestedTensor);
|
||||
|
||||
constexpr DispatchKeySet nested_dispatch_keyset =
|
||||
DispatchKeySet(
|
||||
|
||||
@ -556,3 +556,26 @@ inline SymBool sym_ge(const SymInt& a, const SymInt& b) {
|
||||
}
|
||||
|
||||
} // namespace c10
|
||||
|
||||
#include <limits>
|
||||
|
||||
namespace std {
|
||||
|
||||
template <>
|
||||
class numeric_limits<c10::SymInt> {
|
||||
public:
|
||||
static constexpr bool is_specialized = true;
|
||||
|
||||
static constexpr int64_t max() noexcept {
|
||||
return std::numeric_limits<int64_t>::max();
|
||||
}
|
||||
|
||||
static constexpr int64_t min() noexcept {
|
||||
return std::numeric_limits<int64_t>::min();
|
||||
}
|
||||
|
||||
static constexpr bool is_signed = true;
|
||||
static constexpr bool is_integer = true;
|
||||
};
|
||||
|
||||
} // namespace std
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Implementation of specal math functions for Metal
|
||||
// Implementation of special math functions for Metal
|
||||
#pragma once
|
||||
#include <c10/metal/expm1f.h>
|
||||
#include <c10/metal/igamma.h>
|
||||
@ -624,6 +624,64 @@ inline T spherical_bessel_j0(T x) {
|
||||
return static_cast<T>(::metal::sin(x) / x);
|
||||
}
|
||||
|
||||
// log(exp(a) + exp(b)) for scalar floating-point types, evaluated in float
// as max(a, b) + log1p(exp(-|a - b|)).
template <typename T>
inline ::metal::enable_if_t<is_scalar_floating_point_v<T>, T> logaddexp(
    T a,
    T b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  // Two equal infinities are returned as-is; the general formula would
  // have to evaluate inf - inf.
  if (::metal::isinf(lhs) && lhs == rhs) {
    return static_cast<T>(lhs);
  }
  const float larger = ::metal::max(lhs, rhs);
  return static_cast<T>(
      larger + ::c10::metal::log1p(::metal::exp(-::metal::abs(lhs - rhs))));
}
|
||||
|
||||
// The function is ported from mlx
// log(exp(a) + exp(b)) for complex types (.x is the real part, .y the
// imaginary part). The operand with the larger real part is factored out:
//   result = maxval + log1p(exp(minval - maxval))
template <typename T>
inline ::metal::enable_if_t<is_complex_v<T>, T> logaddexp(T a, T b) {
  if (::metal::isnan(a.x) || ::metal::isnan(a.y) || ::metal::isnan(b.x) ||
      ::metal::isnan(b.y)) {
    return T(NAN, NAN);
  }

  // minval must always be the operand that was NOT chosen as maxval.
  // Selecting it with an independent `a.x < b.x` test picks `b` for both
  // when the real parts are equal, which silently drops `a`.
  const bool a_is_max = a.x > b.x;
  T maxval = a_is_max ? a : b;
  T minval = a_is_max ? b : a;
  constexpr auto inf = ::metal::numeric_limits<T>::infinity().x;

  // An infinite real part on either side decides the result outright.
  if (minval.x == -inf || maxval.x == inf) {
    return maxval;
  }

  float2 maxval_ = static_cast<float2>(maxval);
  float2 minval_ = static_cast<float2>(minval);
  // exp(minval - maxval) in polar form: magnitude m, angle = imaginary
  // difference.
  float m = ::metal::exp(minval_.x - maxval_.x);
  float2 dexp{
      m * ::metal::cos(minval_.y - maxval_.y),
      m * ::metal::sin(minval_.y - maxval_.y),
  };
  return static_cast<T>(maxval_ + ::c10::metal::log1p(dexp));
}
|
||||
|
||||
// log2(2^a + 2^b), evaluated in float as
//   max(a, b) + log1p(2^(-|a - b|)) / ln(2)
template <typename T>
inline T logaddexp2(T a, T b) {
  constexpr auto log_2 = float(0.693147180559945309417232121458176);
  constexpr auto inv_log_2 = float(1) / log_2;
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);
  // Two equal infinities are returned as-is; the general formula would
  // have to evaluate inf - inf.
  if (::metal::isinf(lhs) && lhs == rhs) {
    return static_cast<T>(lhs);
  }
  const float larger = ::metal::max(lhs, rhs);
  return static_cast<T>(
      larger +
      ::c10::metal::log1p(::metal::pow(float(2), -::metal::abs(lhs - rhs))) *
          inv_log_2);
}
|
||||
|
||||
template <typename T>
|
||||
inline float xlog1py(T x, T y) {
|
||||
if (::metal::isnan(y)) {
|
||||
|
||||
@ -322,6 +322,24 @@ inline float log1p(float x) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
// The function is ported from mlx
|
||||
inline float2 log1p(float2 in) {
|
||||
float x = in.x;
|
||||
float y = in.y;
|
||||
float zabs = ::metal::precise::sqrt(x * x + y * y);
|
||||
float theta = ::metal::atan2(y, x + 1);
|
||||
if (zabs < 0.5f) {
|
||||
float r = x * (2 + x) + y * y;
|
||||
if (r == 0) { // handle underflow
|
||||
return {x, theta};
|
||||
}
|
||||
return {0.5f * log1p(r), theta};
|
||||
} else {
|
||||
auto z0 = ::metal::sqrt((x + 1) * (x + 1) + y * y);
|
||||
return {::metal::log(z0), theta};
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2 = T1>
|
||||
struct pair {
|
||||
T1 first;
|
||||
@ -329,17 +347,17 @@ struct pair {
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
static T conj(T a) {
|
||||
inline T conj(T a) {
|
||||
return a;
|
||||
}
|
||||
|
||||
template <>
|
||||
half2 conj(half2 a) {
|
||||
inline half2 conj(half2 a) {
|
||||
return half2(a.x, -a.y);
|
||||
}
|
||||
|
||||
template <>
|
||||
float2 conj(float2 a) {
|
||||
inline float2 conj(float2 a) {
|
||||
return float2(a.x, -a.y);
|
||||
}
|
||||
|
||||
|
||||
@ -34,7 +34,7 @@ struct MemEvent {
|
||||
bool overlaps(const MemBlock& a, const MemBlock& b) {
|
||||
// two blocks dont overlap if
|
||||
// |---a--------|--------------b--------|
|
||||
// strat_a end_a <= start_b end_b
|
||||
// start_a end_a <= start_b end_b
|
||||
return !(
|
||||
(a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
|
||||
}
|
||||
|
||||
@ -33,7 +33,7 @@ struct bitset final {
|
||||
constexpr bitset() noexcept = default;
|
||||
constexpr bitset(const bitset&) noexcept = default;
|
||||
constexpr bitset(bitset&&) noexcept = default;
|
||||
// there is an issure for gcc 5.3.0 when define default function as constexpr
|
||||
// there is an issue for gcc 5.3.0 when define default function as constexpr
|
||||
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68754.
|
||||
bitset& operator=(const bitset&) noexcept = default;
|
||||
bitset& operator=(bitset&&) noexcept = default;
|
||||
|
||||
@ -123,6 +123,8 @@ class DeviceCachingAllocator {
|
||||
ska::flat_hash_map<xpu::XPUStream, std::deque<std::pair<sycl::event, Block*>>>
|
||||
xpu_events;
|
||||
DeviceIndex device_index;
|
||||
size_t allowed_memory_maximum = 0;
|
||||
bool set_fraction = false;
|
||||
|
||||
size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) {
|
||||
if (!src || src->allocated || src->event_count > 0 ||
|
||||
@ -245,6 +247,12 @@ class DeviceCachingAllocator {
|
||||
if (isRetry) {
|
||||
stats.num_alloc_retries += 1;
|
||||
}
|
||||
if (set_fraction &&
|
||||
stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current +
|
||||
size >
|
||||
allowed_memory_maximum) {
|
||||
return false;
|
||||
}
|
||||
void* ptr = sycl::aligned_alloc_device(
|
||||
kDeviceAlignment,
|
||||
size,
|
||||
@ -435,6 +443,11 @@ class DeviceCachingAllocator {
|
||||
device_free =
|
||||
raw_device.get_info<sycl::ext::intel::info::device::free_memory>();
|
||||
}
|
||||
std::string allowed_info;
|
||||
if (set_fraction) {
|
||||
allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
|
||||
}
|
||||
|
||||
auto allocated_bytes =
|
||||
stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
|
||||
.current;
|
||||
@ -459,7 +472,9 @@ class DeviceCachingAllocator {
|
||||
format_size(device_total),
|
||||
" of which ",
|
||||
format_size(device_free),
|
||||
" is free. Of the allocated memory ",
|
||||
" is free. ",
|
||||
allowed_info,
|
||||
"Of the allocated memory ",
|
||||
format_size(allocated_bytes),
|
||||
" is allocated by PyTorch, and ",
|
||||
format_size(reserved_bytes - allocated_bytes),
|
||||
@ -538,6 +553,25 @@ class DeviceCachingAllocator {
|
||||
stats.requested_bytes[statType].reset_peak();
|
||||
}
|
||||
}
|
||||
|
||||
double getMemoryFraction() {
|
||||
if (!set_fraction) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
c10::xpu::DeviceProp device_prop;
|
||||
c10::xpu::get_device_properties(&device_prop, device_index);
|
||||
return static_cast<double>(allowed_memory_maximum) /
|
||||
static_cast<double>(device_prop.global_mem_size);
|
||||
}
|
||||
|
||||
void setMemoryFraction(double fraction) {
|
||||
c10::xpu::DeviceProp device_prop;
|
||||
c10::xpu::get_device_properties(&device_prop, device_index);
|
||||
auto device_total = device_prop.global_mem_size;
|
||||
allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
|
||||
set_fraction = true;
|
||||
}
|
||||
};
|
||||
|
||||
static void local_raw_delete(void* ptr);
|
||||
@ -700,6 +734,21 @@ class XPUAllocator : public DeviceAllocator {
|
||||
assertValidDevice(device);
|
||||
device_allocators[device]->resetAccumulatedStats();
|
||||
}
|
||||
|
||||
// Validates `device`, then forwards to that device's allocator.
double getMemoryFraction(DeviceIndex device) {
  assertValidDevice(device);
  const double fraction = device_allocators[device]->getMemoryFraction();
  return fraction;
}
|
||||
|
||||
// Validates `device` and checks that `fraction` lies in (0, 1] before
// forwarding it to that device's allocator.
void setMemoryFraction(double fraction, DeviceIndex device) {
  assertValidDevice(device);
  const bool in_range = 0 < fraction && fraction <= 1;
  TORCH_CHECK_VALUE(
      in_range,
      "invalid fraction:",
      fraction,
      ". Please set within (0, 1].");
  device_allocators[device]->setMemoryFraction(fraction);
}
|
||||
};
|
||||
|
||||
static XPUAllocator allocator;
|
||||
@ -744,6 +793,14 @@ void recordStream(const DataPtr& dataPtr, XPUStream stream) {
|
||||
return allocator.recordStream(dataPtr, stream);
|
||||
}
|
||||
|
||||
// Namespace-level entry point: forwards to the file-level `allocator`.
double getMemoryFraction(DeviceIndex device) {
  const double fraction = allocator.getMemoryFraction(device);
  return fraction;
}
|
||||
|
||||
void setMemoryFraction(double fraction, DeviceIndex device) {
|
||||
return allocator.setMemoryFraction(fraction, device);
|
||||
}
|
||||
|
||||
REGISTER_ALLOCATOR(kXPU, &allocator)
|
||||
|
||||
} // namespace c10::xpu::XPUCachingAllocator
|
||||
|
||||
@ -25,4 +25,8 @@ C10_XPU_API void raw_delete(void* ptr);
|
||||
|
||||
C10_XPU_API void recordStream(const DataPtr& dataPtr, XPUStream stream);
|
||||
|
||||
C10_XPU_API double getMemoryFraction(DeviceIndex device);
|
||||
|
||||
C10_XPU_API void setMemoryFraction(double fraction, DeviceIndex device);
|
||||
|
||||
} // namespace c10::xpu::XPUCachingAllocator
|
||||
|
||||
@ -1358,9 +1358,15 @@ if(BUILD_TEST)
|
||||
)
|
||||
else()
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/lazy ${CMAKE_BINARY_DIR}/test_lazy)
|
||||
# NativeRT is disabled
|
||||
# add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
|
||||
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_abi_check ${CMAKE_BINARY_DIR}/test_aoti_abi_check)
|
||||
if(BUILD_AOT_INDUCTOR_TEST)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_inference ${CMAKE_BINARY_DIR}/test_aoti_inference)
|
||||
endif()
|
||||
|
||||
if(USE_DISTRIBUTED)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
|
||||
if(NOT WIN32)
|
||||
@ -1378,16 +1384,6 @@ if(BUILD_TEST)
|
||||
${CMAKE_BINARY_DIR}/test_mobile_nnc
|
||||
)
|
||||
endif()
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/lazy
|
||||
${CMAKE_BINARY_DIR}/test_lazy)
|
||||
endif()
|
||||
if(BUILD_AOT_INDUCTOR_TEST)
|
||||
add_subdirectory(
|
||||
${TORCH_ROOT}/test/cpp/aoti_abi_check
|
||||
${CMAKE_BINARY_DIR}/test_aoti_abi_check)
|
||||
add_subdirectory(
|
||||
${TORCH_ROOT}/test/cpp/aoti_inference
|
||||
${CMAKE_BINARY_DIR}/test_aoti_inference)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
@ -38,7 +38,7 @@ uint32_t crc32_combine (uint32_t crcA, uint32_t crcB, size_t lengthB);
|
||||
|
||||
/// compute CRC32 (bitwise algorithm)
|
||||
uint32_t crc32_bitwise (const void* data, size_t length, uint32_t previousCrc32 = 0);
|
||||
/// compute CRC32 (half-byte algoritm)
|
||||
/// compute CRC32 (half-byte algorithm)
|
||||
uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32 = 0);
|
||||
|
||||
#ifdef CRC32_USE_LOOKUP_TABLE_BYTE
|
||||
@ -96,7 +96,7 @@ uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previo
|
||||
#define __BIG_ENDIAN 4321
|
||||
#endif
|
||||
|
||||
// define endianess and some integer data types
|
||||
// define endianness and some integer data types
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
// Windows always little endian
|
||||
#define __BYTE_ORDER __LITTLE_ENDIAN
|
||||
@ -168,7 +168,7 @@ namespace
|
||||
/// zlib's CRC32 polynomial
|
||||
const uint32_t Polynomial = 0xEDB88320;
|
||||
|
||||
/// swap endianess
|
||||
/// swap endianness
|
||||
static inline uint32_t swap(uint32_t x)
|
||||
{
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
@ -229,7 +229,7 @@ uint32_t crc32_bitwise(const void* data, size_t length, uint32_t previousCrc32)
|
||||
}
|
||||
|
||||
|
||||
/// compute CRC32 (half-byte algoritm)
|
||||
/// compute CRC32 (half-byte algorithm)
|
||||
uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32)
|
||||
{
|
||||
uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF
|
||||
@ -662,7 +662,7 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB)
|
||||
// - if you append length(B) zeros to A and call it A' (think of it as AAAA000)
|
||||
// and prepend length(A) zeros to B and call it B' (think of it as 0000BBB)
|
||||
// then exists a C' = A' ^ B'
|
||||
// - remember: if you XOR someting with zero, it remains unchanged: X ^ 0 = X
|
||||
// - remember: if you XOR something with zero, it remains unchanged: X ^ 0 = X
|
||||
// - that means C' = A concat B so that crc(A concat B) = crc(C') = crc(A') ^ crc(B')
|
||||
// - the trick is to compute crc(A') based on crc(A)
|
||||
// and crc(B') based on crc(B)
|
||||
|
||||
@ -76,7 +76,7 @@ typedef struct mz_zip_archive mz_zip_archive;
|
||||
// 2) Writing with 1-pass sequential access
|
||||
// -> We must take care not to require updating values that have already
|
||||
// been written. We place the variable-length index at the end and do
|
||||
// not put any indicies into the header to fulfill this constraint.
|
||||
// not put any index into the header to fulfill this constraint.
|
||||
|
||||
// The model.json, which contains all the metadata information,
|
||||
// should be written as the last file. One reason is that the size of tensor
|
||||
|
||||
@ -519,7 +519,7 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoadWithAllocator) {
|
||||
std::tie(data_ptr, size) = reader.getRecord("key1", &overrideAllocator);
|
||||
EXPECT_EQ(overrideAllocator.getAllocatedBytes(), kBytes1);
|
||||
EXPECT_EQ(baseAllocator.getAllocatedBytes(), allocBytes);
|
||||
// allcoate with base allocator
|
||||
// allocate with base allocator
|
||||
std::tie(data_ptr, size) = reader.getRecord("key1");
|
||||
EXPECT_EQ(overrideAllocator.getAllocatedBytes(), kBytes1);
|
||||
EXPECT_EQ(baseAllocator.getAllocatedBytes(), allocBytes + kBytes1);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user