mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-01 22:14:53 +08:00
Compare commits
1 Commits
ciflow/b20
...
revert-cpp
| Author | SHA1 | Date | |
|---|---|---|---|
| 2eacbe792a |
@ -195,16 +195,13 @@ case "$tag" in
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
|
||||
pytorch-linux-jammy-xpu-n-py3)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
XPU_VERSION=2025.2
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
if [[ $tag =~ "benchmarks" ]]; then
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
fi
|
||||
;;
|
||||
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
set -eux
|
||||
|
||||
ACL_VERSION=${ACL_VERSION:-"v52.6.0"}
|
||||
ACL_VERSION=${ACL_VERSION:-"v25.02"}
|
||||
ACL_INSTALL_DIR="/acl"
|
||||
|
||||
# Clone ACL
|
||||
|
||||
@ -49,20 +49,12 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
|
||||
export SYSROOT_DEP="sysroot_linux-64=2.17"
|
||||
fi
|
||||
|
||||
# Install correct Python version
|
||||
# Also ensure sysroot is using a modern GLIBC to match system compilers
|
||||
if [ "$ANACONDA_PYTHON_VERSION" = "3.14" ]; then
|
||||
as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
|
||||
python="3.14.0" \
|
||||
${SYSROOT_DEP} \
|
||||
-c conda-forge
|
||||
else
|
||||
# Install correct Python version
|
||||
# Also ensure sysroot is using a modern GLIBC to match system compilers
|
||||
as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
|
||||
python="$ANACONDA_PYTHON_VERSION" \
|
||||
${SYSROOT_DEP}
|
||||
fi
|
||||
|
||||
# libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
|
||||
# which is provided in libstdcxx 12 and up.
|
||||
conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge
|
||||
|
||||
@ -10,7 +10,7 @@ else
|
||||
arch_path='sbsa'
|
||||
fi
|
||||
|
||||
NVSHMEM_VERSION=3.4.5
|
||||
NVSHMEM_VERSION=3.3.24
|
||||
|
||||
function install_cuda {
|
||||
version=$1
|
||||
|
||||
@ -40,7 +40,11 @@ EOF
|
||||
|
||||
# Default url values
|
||||
rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
|
||||
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
|
||||
|
||||
# Add amdgpu repository
|
||||
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
|
||||
echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
|
||||
|
||||
# Add rocm repository
|
||||
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
|
||||
|
||||
@ -12,8 +12,8 @@ function do_install() {
|
||||
|
||||
rocm_version_nodot=${rocm_version//./}
|
||||
|
||||
# post merge of https://github.com/icl-utk-edu/magma/pull/65
|
||||
MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
|
||||
# https://github.com/icl-utk-edu/magma/pull/65
|
||||
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
|
||||
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
|
||||
|
||||
rocm_dir="/opt/rocm"
|
||||
|
||||
@ -138,12 +138,10 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
|
||||
#test_binary_ufuncs.py
|
||||
numpy==1.22.4; python_version == "3.10"
|
||||
numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
|
||||
numpy==2.1.2; python_version >= "3.13" and python_version < "3.14"
|
||||
numpy==2.3.4; python_version >= "3.14"
|
||||
numpy==2.1.2; python_version >= "3.13"
|
||||
|
||||
pandas==2.0.3; python_version < "3.13"
|
||||
pandas==2.2.3; python_version >= "3.13" and python_version < "3.14"
|
||||
pandas==2.3.3; python_version >= "3.14"
|
||||
pandas==2.2.3; python_version >= "3.13"
|
||||
|
||||
#onnxruntime
|
||||
#Description: scoring engine for Open Neural Network Exchange (ONNX) models
|
||||
@ -155,8 +153,7 @@ opt-einsum==3.3
|
||||
#Pinned versions: 3.3
|
||||
#test that import: test_linalg.py
|
||||
|
||||
optree==0.13.0 ; python_version < "3.14"
|
||||
optree==0.17.0 ; python_version >= "3.14"
|
||||
optree==0.13.0
|
||||
#Description: A library for tree manipulation
|
||||
#Pinned versions: 0.13.0
|
||||
#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
|
||||
@ -255,8 +252,7 @@ scikit-image==0.22.0
|
||||
#test that import:
|
||||
|
||||
scipy==1.10.1 ; python_version <= "3.11"
|
||||
scipy==1.14.1 ; python_version > "3.11" and python_version < "3.14"
|
||||
scipy==1.16.2 ; python_version >= "3.14"
|
||||
scipy==1.14.1 ; python_version >= "3.12"
|
||||
# Pin SciPy because of failing distribution tests (see #60347)
|
||||
#Description: scientific python
|
||||
#Pinned versions: 1.10.1
|
||||
@ -328,8 +324,7 @@ pywavelets==1.7.0 ; python_version >= "3.12"
|
||||
#Pinned versions: 1.4.1
|
||||
#test that import:
|
||||
|
||||
lxml==5.3.0 ; python_version < "3.14"
|
||||
lxml==6.0.2 ; python_version >= "3.14"
|
||||
lxml==5.3.0
|
||||
#Description: This is a requirement of unittest-xml-reporting
|
||||
|
||||
PyGithub==2.3.0
|
||||
@ -339,9 +334,7 @@ sympy==1.13.3
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
|
||||
onnx==1.19.1 ; python_version < "3.14"
|
||||
# Unpin once Python 3.14 is supported. See onnxruntime issue 26309.
|
||||
onnx==1.18.0 ; python_version == "3.14"
|
||||
onnx==1.19.1
|
||||
#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
@ -366,7 +359,7 @@ pwlf==2.2.1
|
||||
#test that import: test_sac_estimator.py
|
||||
|
||||
# To build PyTorch itself
|
||||
pyyaml==6.0.3
|
||||
pyyaml==6.0.2
|
||||
pyzstd
|
||||
setuptools==78.1.1
|
||||
packaging==23.1
|
||||
|
||||
@ -54,15 +54,12 @@ ENV OPENSSL_DIR /opt/openssl
|
||||
RUN rm install_openssl.sh
|
||||
|
||||
ARG INDUCTOR_BENCHMARKS
|
||||
ARG ANACONDA_PYTHON_VERSION
|
||||
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
|
||||
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
|
||||
COPY ./common/common_utils.sh common_utils.sh
|
||||
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
|
||||
COPY ci_commit_pins/timm.txt timm.txt
|
||||
COPY ci_commit_pins/torchbench.txt torchbench.txt
|
||||
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
|
||||
|
||||
# Install XPU Dependencies
|
||||
ARG XPU_VERSION
|
||||
|
||||
@ -100,8 +100,6 @@ COPY ./common/common_utils.sh common_utils.sh
|
||||
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
|
||||
COPY ci_commit_pins/timm.txt timm.txt
|
||||
COPY ci_commit_pins/torchbench.txt torchbench.txt
|
||||
# Only build aoti cpp tests when INDUCTOR_BENCHMARKS is set to True
|
||||
ENV BUILD_AOT_INDUCTOR_TEST ${INDUCTOR_BENCHMARKS}
|
||||
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
|
||||
|
||||
|
||||
@ -6,7 +6,7 @@ dependencies = [
|
||||
"GitPython==3.1.45",
|
||||
"docker==7.1.0",
|
||||
"pytest==7.3.2",
|
||||
"uv==0.9.6"
|
||||
"uv==0.9.5"
|
||||
]
|
||||
|
||||
[tool.setuptools]
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
SHELL=/usr/bin/env bash
|
||||
|
||||
DOCKER_CMD ?= docker
|
||||
DESIRED_ROCM ?= 7.1
|
||||
DESIRED_ROCM ?= 7.0
|
||||
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
|
||||
PACKAGE_NAME = magma-rocm
|
||||
# inherit this from underlying docker image, do not pass this env var to docker
|
||||
@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
|
||||
magma-rocm/build_magma.sh
|
||||
|
||||
.PHONY: all
|
||||
all: magma-rocm71
|
||||
all: magma-rocm70
|
||||
all: magma-rocm64
|
||||
|
||||
@ -25,11 +24,6 @@ clean:
|
||||
$(RM) -r magma-*
|
||||
$(RM) -r output
|
||||
|
||||
.PHONY: magma-rocm71
|
||||
magma-rocm71: DESIRED_ROCM := 7.1
|
||||
magma-rocm71:
|
||||
$(DOCKER_RUN)
|
||||
|
||||
.PHONY: magma-rocm70
|
||||
magma-rocm70: DESIRED_ROCM := 7.0
|
||||
magma-rocm70:
|
||||
|
||||
@ -6,8 +6,8 @@ set -eou pipefail
|
||||
# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
|
||||
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
||||
|
||||
# post merge of https://github.com/icl-utk-edu/magma/pull/65
|
||||
MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
|
||||
# https://github.com/icl-utk-edu/magma/pull/65
|
||||
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
|
||||
|
||||
# Folders for the build
|
||||
PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
|
||||
@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE
|
||||
|
||||
# Fetch magma sources and verify checksum
|
||||
pushd ${PACKAGE_DIR}
|
||||
git clone https://github.com/icl-utk-edu/magma
|
||||
git clone https://github.com/jeffdaily/magma
|
||||
pushd magma
|
||||
git checkout ${MAGMA_VERSION}
|
||||
popd
|
||||
|
||||
@ -426,7 +426,7 @@ fi
|
||||
if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then
|
||||
# export test times so that potential sharded tests that'll branch off this build will use consistent data
|
||||
# don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
|
||||
PYTHONPATH=. python tools/stats/export_test_times.py
|
||||
python tools/stats/export_test_times.py
|
||||
fi
|
||||
# don't do this for bazel or s390x or riscv64 as they don't use sccache
|
||||
if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
|
||||
|
||||
@ -460,18 +460,28 @@ test_inductor_shard() {
|
||||
--verbose
|
||||
}
|
||||
|
||||
test_inductor_aoti_cpp() {
|
||||
test_inductor_aoti() {
|
||||
# docker build uses bdist_wheel which does not work with test_aot_inductor
|
||||
# TODO: need a faster way to build
|
||||
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
|
||||
# We need to hipify before building again
|
||||
python3 tools/amd_build/build_amd.py
|
||||
fi
|
||||
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
|
||||
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
|
||||
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
|
||||
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
|
||||
else
|
||||
BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
|
||||
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
|
||||
fi
|
||||
|
||||
# aoti cmake custom command requires `torch` to be installed
|
||||
# initialize the cmake build cache and install torch
|
||||
/usr/bin/env "${BUILD_COMMAND[@]}"
|
||||
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
|
||||
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
|
||||
|
||||
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
|
||||
}
|
||||
|
||||
@ -572,8 +582,6 @@ fi
|
||||
|
||||
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
|
||||
DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
|
||||
elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
|
||||
DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
|
||||
else
|
||||
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
|
||||
fi
|
||||
@ -667,8 +675,6 @@ test_perf_for_dashboard() {
|
||||
device=cuda_b200
|
||||
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
|
||||
device=rocm
|
||||
elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
|
||||
device=xpu
|
||||
fi
|
||||
|
||||
for mode in "${modes[@]}"; do
|
||||
@ -1761,7 +1767,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
||||
else
|
||||
# Do this after checkout_install_torchbench to ensure we clobber any
|
||||
# nightlies that torchbench may pull in
|
||||
if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
|
||||
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
|
||||
install_torchrec_and_fbgemm
|
||||
fi
|
||||
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
|
||||
@ -1770,7 +1776,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
|
||||
install_torchvision
|
||||
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
|
||||
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
|
||||
test_inductor_aoti_cpp
|
||||
test_inductor_aoti
|
||||
fi
|
||||
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
|
||||
install_torchvision
|
||||
|
||||
@ -7,9 +7,12 @@ if "%DESIRED_PYTHON%" == "3.13t" (
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
|
||||
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
|
||||
set PYTHON_EXEC="python3.13t"
|
||||
) else if "%DESIRED_PYTHON%"=="3.14" (
|
||||
echo Python version is set to 3.14 or 3.14t
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
|
||||
) else if "%DESIRED_PYTHON%"=="3.14t" (
|
||||
echo Python version is set to 3.14 or 3.14t
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe"
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
|
||||
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
|
||||
set PYTHON_EXEC="python3.14t"
|
||||
) else (
|
||||
|
||||
4
.github/actions/diskspace-cleanup/action.yml
vendored
4
.github/actions/diskspace-cleanup/action.yml
vendored
@ -27,9 +27,7 @@ runs:
|
||||
docker system prune -af
|
||||
diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
|
||||
if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
|
||||
diskspace_cutoff_int=$((diskspace_cutoff + 0))
|
||||
difference=$((100 - diskspace_cutoff_int))
|
||||
echo "Error: Available diskspace is less than $difference percent. Not enough diskspace."
|
||||
echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
|
||||
echo "$msg"
|
||||
exit 1
|
||||
else
|
||||
|
||||
2
.github/ci_commit_pins/audio.txt
vendored
2
.github/ci_commit_pins/audio.txt
vendored
@ -1 +1 @@
|
||||
3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2
|
||||
69bbe7363897764f9e758d851cd0340147d27f94
|
||||
|
||||
2
.github/ci_commit_pins/vision.txt
vendored
2
.github/ci_commit_pins/vision.txt
vendored
@ -1 +1 @@
|
||||
218d2ab791d437309f91e0486eb9fa7f00badc17
|
||||
1752fe6809b74921644866275ab80244b96e80bc
|
||||
|
||||
20
.github/merge_rules.yaml
vendored
20
.github/merge_rules.yaml
vendored
@ -540,26 +540,6 @@
|
||||
- Lint
|
||||
- pull
|
||||
|
||||
- name: PrivateUse1
|
||||
patterns:
|
||||
- torch/accelerator/**
|
||||
- torch/utils/backend_registration.py
|
||||
- torch/csrc/acc/**
|
||||
- torch/csrc/DeviceAccelerator.*
|
||||
- torch/csrc/profiler/standalone/privateuse1_observer.*
|
||||
- aten/src/ATen/DeviceAccelerator.*
|
||||
- aten/src/ATen/core/GeneratorForPrivateuseone.*
|
||||
- aten/src/ATen/detail/PrivateUse1HooksInterface.*
|
||||
- docs/source/accelerator/**
|
||||
- test/cpp_extensions/open_registration_extension/torch_openreg/**
|
||||
approved_by:
|
||||
- albanD
|
||||
- fffrog
|
||||
mandatory_checks_name:
|
||||
- EasyCLA
|
||||
- Lint
|
||||
- pull
|
||||
|
||||
- name: superuser
|
||||
patterns:
|
||||
- '*'
|
||||
|
||||
2
.github/pytorch-probot.yml
vendored
2
.github/pytorch-probot.yml
vendored
@ -19,7 +19,6 @@ ciflow_push_tags:
|
||||
- ciflow/inductor-perf-test-nightly-rocm-mi300
|
||||
- ciflow/inductor-perf-test-nightly-rocm-mi355
|
||||
- ciflow/inductor-perf-test-nightly-x86-zen
|
||||
- ciflow/inductor-perf-test-nightly-xpu
|
||||
- ciflow/inductor-periodic
|
||||
- ciflow/inductor-rocm
|
||||
- ciflow/linux-aarch64
|
||||
@ -27,7 +26,6 @@ ciflow_push_tags:
|
||||
- ciflow/nightly
|
||||
- ciflow/op-benchmark
|
||||
- ciflow/periodic
|
||||
- ciflow/periodic-rocm-mi200
|
||||
- ciflow/periodic-rocm-mi300
|
||||
- ciflow/pull
|
||||
- ciflow/quantization-periodic
|
||||
|
||||
97
.github/scripts/generate_binary_build_matrix.py
vendored
97
.github/scripts/generate_binary_build_matrix.py
vendored
@ -11,17 +11,11 @@ architectures:
|
||||
* Latest XPU
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
SCRIPT_DIR = Path(__file__).absolute().parent
|
||||
REPO_ROOT = SCRIPT_DIR.parent.parent
|
||||
|
||||
|
||||
# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
|
||||
CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
|
||||
CUDA_STABLE = "12.8"
|
||||
CUDA_ARCHES_FULL_VERSION = {
|
||||
@ -37,7 +31,8 @@ CUDA_ARCHES_CUDNN_VERSION = {
|
||||
"13.0": "9",
|
||||
}
|
||||
|
||||
ROCM_ARCHES = ["7.0", "7.1"]
|
||||
# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
|
||||
ROCM_ARCHES = ["6.4", "7.0"]
|
||||
|
||||
XPU_ARCHES = ["xpu"]
|
||||
|
||||
@ -61,7 +56,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
|
||||
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
|
||||
@ -78,7 +73,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
|
||||
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
|
||||
@ -95,7 +90,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
|
||||
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
|
||||
@ -112,7 +107,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | "
|
||||
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
|
||||
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | "
|
||||
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
|
||||
"nvidia-nvtx==13.0.85; platform_system == 'Linux' | "
|
||||
"nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | "
|
||||
"nvidia-cufile==1.15.1.6; platform_system == 'Linux'"
|
||||
@ -142,48 +137,9 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
}
|
||||
|
||||
|
||||
# Used by tools/nightly.py
|
||||
PYTORCH_NIGHTLY_PIP_INDEX_URL = "https://download.pytorch.org/whl/nightly"
|
||||
NIGHTLY_SOURCE_MATRIX = {
|
||||
"cpu": dict(
|
||||
name="cpu",
|
||||
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cpu",
|
||||
supported_platforms=["Linux", "macOS", "Windows"],
|
||||
accelerator="cpu",
|
||||
)
|
||||
}
|
||||
CUDA_NIGHTLY_SOURCE_MATRIX = {
|
||||
f"cuda-{major}.{minor}": dict(
|
||||
name=f"cuda-{major}.{minor}",
|
||||
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cu{major}{minor}",
|
||||
supported_platforms=["Linux", "Windows"],
|
||||
accelerator="cuda",
|
||||
)
|
||||
for major, minor in (map(int, version.split(".")) for version in CUDA_ARCHES)
|
||||
}
|
||||
ROCM_NIGHTLY_SOURCE_MATRIX = {
|
||||
f"rocm-{major}.{minor}": dict(
|
||||
name=f"rocm-{major}.{minor}",
|
||||
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/rocm{major}.{minor}",
|
||||
supported_platforms=["Linux"],
|
||||
accelerator="rocm",
|
||||
)
|
||||
for major, minor in (map(int, version.split(".")) for version in ROCM_ARCHES)
|
||||
}
|
||||
XPU_NIGHTLY_SOURCE_MATRIX = {
|
||||
"xpu": dict(
|
||||
name="xpu",
|
||||
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/xpu",
|
||||
supported_platforms=["Linux"],
|
||||
accelerator="xpu",
|
||||
)
|
||||
}
|
||||
NIGHTLY_SOURCE_MATRIX.update(CUDA_NIGHTLY_SOURCE_MATRIX)
|
||||
NIGHTLY_SOURCE_MATRIX.update(ROCM_NIGHTLY_SOURCE_MATRIX)
|
||||
NIGHTLY_SOURCE_MATRIX.update(XPU_NIGHTLY_SOURCE_MATRIX)
|
||||
|
||||
|
||||
def get_nccl_wheel_version(arch_version: str) -> str:
|
||||
import re
|
||||
|
||||
requirements = map(
|
||||
str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
|
||||
)
|
||||
@ -191,14 +147,17 @@ def get_nccl_wheel_version(arch_version: str) -> str:
|
||||
|
||||
|
||||
def read_nccl_pin(arch_version: str) -> str:
|
||||
nccl_pin_path = (
|
||||
REPO_ROOT
|
||||
/ ".ci"
|
||||
/ "docker"
|
||||
/ "ci_commit_pins"
|
||||
/ f"nccl-cu{arch_version[:2]}.txt"
|
||||
from pathlib import Path
|
||||
|
||||
nccl_pin_path = os.path.join(
|
||||
Path(__file__).absolute().parents[2],
|
||||
".ci",
|
||||
"docker",
|
||||
"ci_commit_pins",
|
||||
f"nccl-cu{arch_version[:2]}.txt",
|
||||
)
|
||||
return nccl_pin_path.read_text().strip()
|
||||
with open(nccl_pin_path) as f:
|
||||
return f.read().strip()
|
||||
|
||||
|
||||
def validate_nccl_dep_consistency(arch_version: str) -> None:
|
||||
@ -206,8 +165,7 @@ def validate_nccl_dep_consistency(arch_version: str) -> None:
|
||||
wheel_ver = get_nccl_wheel_version(arch_version)
|
||||
if not nccl_release_tag.startswith(f"v{wheel_ver}"):
|
||||
raise RuntimeError(
|
||||
f"{arch_version} NCCL release tag version {nccl_release_tag} "
|
||||
f"does not correspond to wheel version {wheel_ver}"
|
||||
f"{arch_version} NCCL release tag version {nccl_release_tag} does not correspond to wheel version {wheel_ver}"
|
||||
)
|
||||
|
||||
|
||||
@ -454,14 +412,7 @@ def generate_wheels_matrix(
|
||||
return ret
|
||||
|
||||
|
||||
arch_version = ""
|
||||
for arch_version in CUDA_ARCHES:
|
||||
validate_nccl_dep_consistency(arch_version)
|
||||
del arch_version
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Used by tools/nightly.py
|
||||
(SCRIPT_DIR / "nightly_source_matrix.json").write_text(
|
||||
json.dumps(NIGHTLY_SOURCE_MATRIX, indent=4) + "\n"
|
||||
)
|
||||
validate_nccl_dep_consistency("13.0")
|
||||
validate_nccl_dep_consistency("12.9")
|
||||
validate_nccl_dep_consistency("12.8")
|
||||
validate_nccl_dep_consistency("12.6")
|
||||
|
||||
13
.github/workflows/_xpu-test.yml
vendored
13
.github/workflows/_xpu-test.yml
vendored
@ -38,10 +38,6 @@ on:
|
||||
default: ""
|
||||
description: |
|
||||
List of tests to include (empty string implies default list)
|
||||
dashboard-tag:
|
||||
required: false
|
||||
type: string
|
||||
default: ""
|
||||
disable-monitor:
|
||||
description: |
|
||||
[Experimental] Disable utilization monitoring for tests.
|
||||
@ -62,11 +58,6 @@ on:
|
||||
required: false
|
||||
type: number
|
||||
default: 1
|
||||
secrets:
|
||||
HUGGING_FACE_HUB_TOKEN:
|
||||
required: false
|
||||
description: |
|
||||
HF Auth token to avoid rate limits when downloading models or datasets from hub
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
@ -205,8 +196,6 @@ jobs:
|
||||
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
|
||||
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
|
||||
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
|
||||
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
|
||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
|
||||
run: |
|
||||
# Fetch aws credential from IMDs
|
||||
@ -257,8 +246,6 @@ jobs:
|
||||
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
|
||||
-e TESTS_TO_INCLUDE \
|
||||
-e ZE_AFFINITY_MASK \
|
||||
-e HUGGING_FACE_HUB_TOKEN \
|
||||
-e DASHBOARD_TAG \
|
||||
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
|
||||
--ulimit stack=10485760:83886080 \
|
||||
--ulimit core=0 \
|
||||
|
||||
2
.github/workflows/build-almalinux-images.yml
vendored
2
.github/workflows/build-almalinux-images.yml
vendored
@ -36,7 +36,7 @@ jobs:
|
||||
runs-on: linux.9xlarge.ephemeral
|
||||
strategy:
|
||||
matrix:
|
||||
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm7.0", "rocm7.1", "cpu"]
|
||||
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
|
||||
steps:
|
||||
- name: Build docker image
|
||||
uses: pytorch/pytorch/.github/actions/binary-docker-build@main
|
||||
|
||||
2
.github/workflows/build-libtorch-images.yml
vendored
2
.github/workflows/build-libtorch-images.yml
vendored
@ -52,8 +52,8 @@ jobs:
|
||||
{ tag: "cuda12.9" },
|
||||
{ tag: "cuda12.8" },
|
||||
{ tag: "cuda12.6" },
|
||||
{ tag: "rocm6.4" },
|
||||
{ tag: "rocm7.0" },
|
||||
{ tag: "rocm7.1" },
|
||||
{ tag: "cpu" },
|
||||
]
|
||||
steps:
|
||||
|
||||
2
.github/workflows/build-magma-rocm-linux.yml
vendored
2
.github/workflows/build-magma-rocm-linux.yml
vendored
@ -34,7 +34,7 @@ jobs:
|
||||
id-token: write
|
||||
strategy:
|
||||
matrix:
|
||||
rocm_version: ["71", "70"]
|
||||
rocm_version: ["70", "64"]
|
||||
steps:
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||
|
||||
2
.github/workflows/build-manywheel-images.yml
vendored
2
.github/workflows/build-manywheel-images.yml
vendored
@ -54,8 +54,8 @@ jobs:
|
||||
{ name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" },
|
||||
{ name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
|
||||
{ name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "rocm7.1", runner: "linux.9xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
|
||||
{ name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" },
|
||||
|
||||
2
.github/workflows/build-triton-wheel.yml
vendored
2
.github/workflows/build-triton-wheel.yml
vendored
@ -55,7 +55,7 @@ jobs:
|
||||
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
|
||||
include:
|
||||
- device: "rocm"
|
||||
rocm_version: "7.1"
|
||||
rocm_version: "7.0"
|
||||
runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
|
||||
- device: "cuda"
|
||||
rocm_version: ""
|
||||
|
||||
2
.github/workflows/docker-builds.yml
vendored
2
.github/workflows/docker-builds.yml
vendored
@ -57,7 +57,6 @@ jobs:
|
||||
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
|
||||
pytorch-linux-jammy-py3.10-clang12,
|
||||
pytorch-linux-jammy-py3.13-clang12,
|
||||
pytorch-linux-jammy-py3.14-clang12,
|
||||
pytorch-linux-jammy-rocm-n-py3,
|
||||
pytorch-linux-noble-rocm-n-py3,
|
||||
pytorch-linux-jammy-rocm-n-py3-benchmarks,
|
||||
@ -67,7 +66,6 @@ jobs:
|
||||
pytorch-linux-jammy-py3.12-halide,
|
||||
pytorch-linux-jammy-xpu-n-1-py3,
|
||||
pytorch-linux-jammy-xpu-n-py3,
|
||||
pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
|
||||
pytorch-linux-jammy-py3-clang18-asan,
|
||||
pytorch-linux-jammy-py3-clang12-onnx,
|
||||
pytorch-linux-jammy-linter,
|
||||
|
||||
56
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
56
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
generated
vendored
@ -132,7 +132,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -178,7 +178,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -224,7 +224,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -270,7 +270,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -381,7 +381,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -427,7 +427,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -473,7 +473,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -519,7 +519,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -630,7 +630,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -676,7 +676,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -722,7 +722,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -768,7 +768,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -879,7 +879,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -925,7 +925,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -971,7 +971,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1017,7 +1017,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1128,7 +1128,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1174,7 +1174,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1220,7 +1220,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1266,7 +1266,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1377,7 +1377,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1423,7 +1423,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1469,7 +1469,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1515,7 +1515,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1626,7 +1626,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_6
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1672,7 +1672,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_8
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1718,7 +1718,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
@ -1764,7 +1764,7 @@ jobs:
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-13_0
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
236
.github/workflows/generated-linux-binary-libtorch-nightly.yml
generated
vendored
236
.github/workflows/generated-linux-binary-libtorch-nightly.yml
generated
vendored
@ -384,6 +384,124 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
libtorch-rocm6_4-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.4
|
||||
GPU_ARCH_VERSION: "6.4"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
timeout-minutes: 300
|
||||
build_name: libtorch-rocm6_4-shared-with-deps-release
|
||||
build_environment: linux-binary-libtorch
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
libtorch-rocm6_4-shared-with-deps-release-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- libtorch-rocm6_4-shared-with-deps-release-build
|
||||
- get-label-type
|
||||
runs-on: linux.rocm.gpu.mi250
|
||||
timeout-minutes: 240
|
||||
env:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.4
|
||||
GPU_ARCH_VERSION: "6.4"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
SKIP_ALL_TESTS: 1
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
steps:
|
||||
- name: Setup ROCm
|
||||
uses: ./.github/actions/setup-rocm
|
||||
- uses: actions/download-artifact@v4.1.7
|
||||
name: Download Build Artifacts
|
||||
with:
|
||||
name: libtorch-rocm6_4-shared-with-deps-release
|
||||
path: "${{ runner.temp }}/artifacts/"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
- name: ROCm set GPU_FLAG
|
||||
run: |
|
||||
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
|
||||
- name: configure aws credentials
|
||||
id: aws_creds
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
aws-region: us-east-1
|
||||
role-duration-seconds: 18000
|
||||
- name: Calculate docker image
|
||||
id: calculate-docker-image
|
||||
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||
with:
|
||||
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
|
||||
docker-image-name: libtorch-cxx11-builder
|
||||
custom-tag-prefix: rocm6.4
|
||||
docker-build-dir: .ci/docker
|
||||
working-directory: pytorch
|
||||
- name: Pull Docker image
|
||||
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
||||
with:
|
||||
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Test Pytorch binary
|
||||
uses: ./pytorch/.github/actions/test-pytorch-binary
|
||||
env:
|
||||
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Teardown ROCm
|
||||
uses: ./.github/actions/teardown-rocm
|
||||
libtorch-rocm6_4-shared-with-deps-release-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: libtorch-rocm6_4-shared-with-deps-release-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm6.4
|
||||
GPU_ARCH_VERSION: "6.4"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
build_name: libtorch-rocm6_4-shared-with-deps-release
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
libtorch-rocm7_0-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -501,121 +619,3 @@ jobs:
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
libtorch-rocm7_1-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm7.1
|
||||
GPU_ARCH_VERSION: "7.1"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm7.1
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
timeout-minutes: 300
|
||||
build_name: libtorch-rocm7_1-shared-with-deps-release
|
||||
build_environment: linux-binary-libtorch
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
libtorch-rocm7_1-shared-with-deps-release-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- libtorch-rocm7_1-shared-with-deps-release-build
|
||||
- get-label-type
|
||||
runs-on: linux.rocm.gpu.mi250
|
||||
timeout-minutes: 240
|
||||
env:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm7.1
|
||||
GPU_ARCH_VERSION: "7.1"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
SKIP_ALL_TESTS: 1
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm7.1
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
steps:
|
||||
- name: Setup ROCm
|
||||
uses: ./.github/actions/setup-rocm
|
||||
- uses: actions/download-artifact@v4.1.7
|
||||
name: Download Build Artifacts
|
||||
with:
|
||||
name: libtorch-rocm7_1-shared-with-deps-release
|
||||
path: "${{ runner.temp }}/artifacts/"
|
||||
- name: Checkout PyTorch
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
|
||||
submodules: recursive
|
||||
path: pytorch
|
||||
show-progress: false
|
||||
- name: Clean PyTorch checkout
|
||||
run: |
|
||||
# Remove any artifacts from the previous checkouts
|
||||
git clean -fxd
|
||||
working-directory: pytorch
|
||||
- name: ROCm set GPU_FLAG
|
||||
run: |
|
||||
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
|
||||
- name: configure aws credentials
|
||||
id: aws_creds
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
aws-region: us-east-1
|
||||
role-duration-seconds: 18000
|
||||
- name: Calculate docker image
|
||||
id: calculate-docker-image
|
||||
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||
with:
|
||||
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
|
||||
docker-image-name: libtorch-cxx11-builder
|
||||
custom-tag-prefix: rocm7.1
|
||||
docker-build-dir: .ci/docker
|
||||
working-directory: pytorch
|
||||
- name: Pull Docker image
|
||||
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
||||
with:
|
||||
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Test Pytorch binary
|
||||
uses: ./pytorch/.github/actions/test-pytorch-binary
|
||||
env:
|
||||
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||
- name: Teardown ROCm
|
||||
uses: ./.github/actions/teardown-rocm
|
||||
libtorch-rocm7_1-shared-with-deps-release-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: libtorch-rocm7_1-shared-with-deps-release-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: rocm7.1
|
||||
GPU_ARCH_VERSION: "7.1"
|
||||
GPU_ARCH_TYPE: rocm
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: rocm7.1
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
build_name: libtorch-rocm7_1-shared-with-deps-release
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
1666
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
1666
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
File diff suppressed because it is too large
Load Diff
148
.github/workflows/inductor-perf-test-nightly-xpu.yml
vendored
148
.github/workflows/inductor-perf-test-nightly-xpu.yml
vendored
@ -1,148 +0,0 @@
|
||||
name: inductor-perf-nightly-xpu
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/inductor-perf-test-nightly-xpu/*
|
||||
schedule:
|
||||
- cron: 30 17 * * *
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
training:
|
||||
description: Run training (on by default)?
|
||||
required: false
|
||||
type: boolean
|
||||
default: true
|
||||
inference:
|
||||
description: Run inference (on by default)?
|
||||
required: false
|
||||
type: boolean
|
||||
default: true
|
||||
default:
|
||||
description: Run inductor_default?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
dynamic:
|
||||
description: Run inductor_dynamic_shapes?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
cppwrapper:
|
||||
description: Run inductor_cpp_wrapper?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
cudagraphs:
|
||||
description: Run inductor_cudagraphs?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
freezing_cudagraphs:
|
||||
description: Run inductor_cudagraphs with freezing for inference?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
aotinductor:
|
||||
description: Run aot_inductor for inference?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
maxautotune:
|
||||
description: Run inductor_max_autotune?
|
||||
required: false
|
||||
type: boolean
|
||||
default: false
|
||||
benchmark_configs:
|
||||
description: The list of configs used the benchmark
|
||||
required: false
|
||||
type: string
|
||||
default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
opt_out_experiments: lf
|
||||
|
||||
xpu-n-py3_10-inductor-benchmark-build:
|
||||
name: xpu-n-py3.10-inductor-benchmark
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-xpu-n-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
|
||||
runner: linux.c7i.12xlarge
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "inductor_huggingface_perf_xpu", shard: 1, num_shards: 5, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_huggingface_perf_xpu", shard: 2, num_shards: 5, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_huggingface_perf_xpu", shard: 3, num_shards: 5, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_huggingface_perf_xpu", shard: 4, num_shards: 5, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_huggingface_perf_xpu", shard: 5, num_shards: 5, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_timm_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_timm_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_timm_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_timm_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_timm_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_timm_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_torchbench_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_torchbench_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_torchbench_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_torchbench_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_torchbench_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
{ config: "inductor_torchbench_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
xpu-n-py3_10-inductor-benchmark-test-nightly:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
if: github.event_name != 'workflow_dispatch'
|
||||
name: xpu-n-py3.10-inductor-benchmark
|
||||
uses: ./.github/workflows/_xpu-test.yml
|
||||
needs: xpu-n-py3_10-inductor-benchmark-build
|
||||
with:
|
||||
build-environment: linux-jammy-xpu-n-py3.10
|
||||
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
|
||||
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
|
||||
timeout-minutes: 720
|
||||
# Disable monitor in perf tests for more investigation
|
||||
disable-monitor: true
|
||||
monitor-log-interval: 10
|
||||
monitor-data-collect-interval: 2
|
||||
secrets: inherit
|
||||
|
||||
xpu-n-py3_10-inductor-benchmark-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
if: github.event_name == 'workflow_dispatch'
|
||||
name: xpu-n-py3.10-inductor-test
|
||||
uses: ./.github/workflows/_xpu-test.yml
|
||||
needs: xpu-n-py3_10-inductor-benchmark-build
|
||||
with:
|
||||
build-environment: linux-jammy-xpu-n-py3.10
|
||||
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
|
||||
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
|
||||
timeout-minutes: 720
|
||||
disable-monitor: false
|
||||
monitor-log-interval: 15
|
||||
monitor-data-collect-interval: 4
|
||||
secrets: inherit
|
||||
84
.github/workflows/periodic-rocm-mi200.yml
vendored
84
.github/workflows/periodic-rocm-mi200.yml
vendored
@ -1,84 +0,0 @@
|
||||
name: periodic-rocm-mi200
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
|
||||
# Also run less frequently on weekends.
|
||||
- cron: 45 0,8,16 * * 1-5
|
||||
- cron: 45 4 * * 0,6
|
||||
- cron: 45 4,12,20 * * 1-5
|
||||
- cron: 45 12 * * 0,6
|
||||
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
|
||||
push:
|
||||
tags:
|
||||
- ciflow/periodic/*
|
||||
- ciflow/periodic-rocm-mi200/*
|
||||
branches:
|
||||
- release/*
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
llm-td:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: before-test
|
||||
uses: ./.github/workflows/llm_td_retrieval.yml
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
target-determination:
|
||||
name: before-test
|
||||
uses: ./.github/workflows/target_determination.yml
|
||||
needs: llm-td
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
get-label-type:
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-jammy-rocm-py3_10-build:
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
31
.github/workflows/periodic.yml
vendored
31
.github/workflows/periodic.yml
vendored
@ -204,6 +204,37 @@ jobs:
|
||||
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-build:
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build:
|
||||
name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
|
||||
1
.github/workflows/upload-test-stats.yml
vendored
1
.github/workflows/upload-test-stats.yml
vendored
@ -6,7 +6,6 @@ on:
|
||||
- pull
|
||||
- trunk
|
||||
- periodic
|
||||
- periodic-rocm-mi200
|
||||
- periodic-rocm-mi300
|
||||
- inductor
|
||||
- unstable
|
||||
|
||||
20
.github/workflows/xpu.yml
vendored
20
.github/workflows/xpu.yml
vendored
@ -59,18 +59,14 @@ jobs:
|
||||
runner: linux.c7i.12xlarge
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 2, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 3, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 4, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 5, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 6, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 7, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 8, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 9, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 10, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 11, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 12, num_shards: 12, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
{ config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@ -143,7 +143,6 @@ scripts/release_notes/*.json
|
||||
sccache-stats*.json
|
||||
lint.json
|
||||
merge_record.json
|
||||
.github/scripts/nightly_source_matrix.json
|
||||
|
||||
# These files get copied over on invoking setup.py
|
||||
torchgen/packaged/*
|
||||
|
||||
@ -374,7 +374,7 @@ cmake_dependent_option(
|
||||
"Build the lazy Torchscript backend, not compatible with mobile builds" ON
|
||||
"NOT INTERN_BUILD_MOBILE" OFF)
|
||||
cmake_dependent_option(BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF)
|
||||
cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder"
|
||||
cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
|
||||
OFF "USE_CUDA" OFF)
|
||||
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
|
||||
"CPU_AARCH64" OFF)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||

|
||||

|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
@ -72,7 +72,7 @@ Elaborating Further:
|
||||
|
||||
If you use NumPy, then you have used Tensors (a.k.a. ndarray).
|
||||
|
||||

|
||||

|
||||
|
||||
PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the
|
||||
computation by a huge amount.
|
||||
@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc
|
||||
While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
|
||||
You get the best of speed and flexibility for your crazy research.
|
||||
|
||||

|
||||

|
||||
|
||||
### Python First
|
||||
|
||||
|
||||
@ -260,7 +260,7 @@ IF(USE_FBGEMM_GENAI)
|
||||
if(USE_CUDA)
|
||||
# To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
|
||||
# If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
|
||||
set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped|f4f4bf16).*")
|
||||
set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*")
|
||||
file(GLOB_RECURSE fbgemm_genai_native_cuda_cu
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu"
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
|
||||
@ -291,7 +291,6 @@ IF(USE_FBGEMM_GENAI)
|
||||
|
||||
set(fbgemm_genai_cuh
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
|
||||
"${FBGEMM_GENAI_SRCS}/cutlass_extensions/f4f4bf16_grouped/"
|
||||
"${FBGEMM_GENAI_SRCS}/"
|
||||
)
|
||||
|
||||
|
||||
@ -677,8 +677,8 @@ struct CachingHostAllocatorImpl {
|
||||
// size. This allows us to quickly find a free block of the right size.
|
||||
// We use deque to store per size free list and guard the list with its own
|
||||
// mutex.
|
||||
alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>>
|
||||
free_list_{MAX_SIZE_INDEX};
|
||||
alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ =
|
||||
std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
|
||||
|
||||
alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
|
||||
std::deque<std::pair<E, B*>> events_; // event queue paired with block
|
||||
|
||||
@ -19,13 +19,6 @@ inline namespace CPU_CAPABILITY {
|
||||
#error "Big endian is not supported."
|
||||
#endif
|
||||
|
||||
// GCC does not properly optimize bf16 operators
|
||||
#if defined(__ARM_FEATURE_BF16) && (__clang_major__ >= 19)
|
||||
#define BF16_ARITHMETIC_SUPPORTED() 1
|
||||
#else
|
||||
#define BF16_ARITHMETIC_SUPPORTED() 0
|
||||
#endif
|
||||
|
||||
// Unlike the float16_t family of types, bfloat16_t is not available
|
||||
// when we're not targeting bfloat16 hardware support on some
|
||||
// platforms (but not Mac, so we have to be careful not to shadow the
|
||||
@ -359,35 +352,18 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
other, &Vectorized<float>::name); \
|
||||
}
|
||||
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
|
||||
Vectorized frac() const;
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc)
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt)
|
||||
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
// Flip sign bit
|
||||
Vectorized<c10::BFloat16> neg() const {
|
||||
return vreinterpretq_bf16_s16(vreinterpretq_s16_bf16(values) ^ (-32768));
|
||||
return -values;
|
||||
}
|
||||
// Fast reciprocal is fine because we are truncating results
|
||||
Vectorized<c10::BFloat16> reciprocal() const {
|
||||
auto x = vcvtq_low_f32_bf16(values);
|
||||
auto y = vcvtq_high_f32_bf16(values);
|
||||
x = vrecpeq_f32(x);
|
||||
y = vrecpeq_f32(y);
|
||||
return vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(x), y);
|
||||
return 1.0f / values;
|
||||
}
|
||||
// Clearing the sign bit
|
||||
Vectorized<c10::BFloat16> abs() const {
|
||||
return vreinterpretq_bf16_u16(vreinterpretq_u16_bf16(values) & 0x7FFF);
|
||||
}
|
||||
#else
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
|
||||
#endif
|
||||
|
||||
// These functions are optimized on clang-21+
|
||||
#if BF16_ARITHMETIC_SUPPORTED() && (__clang_major__ >= 21)
|
||||
Vectorized<c10::BFloat16> operator==(
|
||||
const Vectorized<c10::BFloat16>& other) const {
|
||||
return values == other.values;
|
||||
@ -418,6 +394,8 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
return values >= other.values;
|
||||
}
|
||||
#else
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
|
||||
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==)
|
||||
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=)
|
||||
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<)
|
||||
@ -473,7 +451,7 @@ template <>
|
||||
Vectorized<c10::BFloat16> inline operator+(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b) {
|
||||
#if BF16_ARITHMETIC_SUPPORTED()
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
return x + y;
|
||||
@ -486,7 +464,7 @@ template <>
|
||||
Vectorized<c10::BFloat16> inline operator-(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b) {
|
||||
#if BF16_ARITHMETIC_SUPPORTED()
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
return x - y;
|
||||
@ -499,7 +477,7 @@ template <>
|
||||
Vectorized<c10::BFloat16> inline operator*(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b) {
|
||||
#if BF16_ARITHMETIC_SUPPORTED()
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
return x * y;
|
||||
@ -512,7 +490,7 @@ template <>
|
||||
Vectorized<c10::BFloat16> inline operator/(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b) {
|
||||
#if BF16_ARITHMETIC_SUPPORTED()
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
return x / y;
|
||||
@ -629,7 +607,7 @@ Vectorized<c10::BFloat16> inline fmadd(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
#if BF16_ARITHMETIC_SUPPORTED()
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
bfloat16x8_t z = c;
|
||||
@ -649,7 +627,7 @@ Vectorized<c10::BFloat16> inline fnmadd(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
#if BF16_ARITHMETIC_SUPPORTED()
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
bfloat16x8_t z = c;
|
||||
@ -665,7 +643,7 @@ Vectorized<c10::BFloat16> inline fmsub(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
#if BF16_ARITHMETIC_SUPPORTED()
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
bfloat16x8_t z = c;
|
||||
@ -681,7 +659,7 @@ Vectorized<c10::BFloat16> inline fnmsub(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
#if BF16_ARITHMETIC_SUPPORTED()
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
bfloat16x8_t z = c;
|
||||
|
||||
@ -6,9 +6,9 @@ namespace at::vec {
|
||||
inline namespace CPU_CAPABILITY {
|
||||
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
|
||||
|
||||
// Enable auto-vectorization for clang-17+
|
||||
// Enable auto-vectorization for GCC-13+ and clang-17+
|
||||
// GCC-12 has a bug: gcc.gnu.org/bugzilla/show_bug.cgi?id=117001
|
||||
#if defined(__clang__) && (__clang_major__ >= 17)
|
||||
#if __GNUC__ > 12 || (defined(__clang__) && (__clang_major__ >= 17))
|
||||
|
||||
template <typename from_type, typename to_type>
|
||||
inline void convertImpl(
|
||||
@ -21,46 +21,12 @@ inline void convertImpl(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename to_type>
|
||||
inline void convertFromBool(
|
||||
const bool* __restrict src,
|
||||
to_type* __restrict dst,
|
||||
int64_t n) {
|
||||
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
dst[i] = srcPtr[i] != 0 ? static_cast<to_type>(1) : static_cast<to_type>(0);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename from_type>
|
||||
inline void convertToBool(
|
||||
const from_type* __restrict src,
|
||||
bool* __restrict dst,
|
||||
int64_t n) {
|
||||
uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
dstPtr[i] = src[i] != static_cast<from_type>(0) ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
#define CONVERT_TEMPLATE(from_type, to_type) \
|
||||
template <> \
|
||||
inline void convert(const from_type* src, to_type* dst, int64_t n) { \
|
||||
return convertImpl<from_type, to_type>(src, dst, n); \
|
||||
}
|
||||
|
||||
#define CONVERT_FROM_BOOL_TEMPLATE(to_type) \
|
||||
inline void convert(const bool* src, to_type* dst, int64_t n) { \
|
||||
return convertFromBool<to_type>(src, dst, n); \
|
||||
}
|
||||
|
||||
#define CONVERT_TO_BOOL_TEMPLATE(from_type) \
|
||||
inline void convert(const from_type* src, bool* dst, int64_t n) { \
|
||||
return convertToBool<from_type>(src, dst, n); \
|
||||
}
|
||||
|
||||
CONVERT_TEMPLATE(uint8_t, uint8_t)
|
||||
CONVERT_TEMPLATE(uint8_t, int8_t)
|
||||
CONVERT_TEMPLATE(uint8_t, int16_t)
|
||||
@ -68,7 +34,6 @@ CONVERT_TEMPLATE(uint8_t, int32_t)
|
||||
CONVERT_TEMPLATE(uint8_t, int64_t)
|
||||
CONVERT_TEMPLATE(uint8_t, float)
|
||||
CONVERT_TEMPLATE(uint8_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(uint8_t)
|
||||
CONVERT_TEMPLATE(int8_t, uint8_t)
|
||||
CONVERT_TEMPLATE(int8_t, int8_t)
|
||||
CONVERT_TEMPLATE(int8_t, int16_t)
|
||||
@ -76,7 +41,6 @@ CONVERT_TEMPLATE(int8_t, int32_t)
|
||||
CONVERT_TEMPLATE(int8_t, int64_t)
|
||||
CONVERT_TEMPLATE(int8_t, float)
|
||||
CONVERT_TEMPLATE(int8_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(int8_t)
|
||||
CONVERT_TEMPLATE(int16_t, uint8_t)
|
||||
CONVERT_TEMPLATE(int16_t, int8_t)
|
||||
CONVERT_TEMPLATE(int16_t, int16_t)
|
||||
@ -84,7 +48,6 @@ CONVERT_TEMPLATE(int16_t, int32_t)
|
||||
CONVERT_TEMPLATE(int16_t, int64_t)
|
||||
CONVERT_TEMPLATE(int16_t, float)
|
||||
CONVERT_TEMPLATE(int16_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(int16_t)
|
||||
CONVERT_TEMPLATE(int32_t, uint8_t)
|
||||
CONVERT_TEMPLATE(int32_t, int8_t)
|
||||
CONVERT_TEMPLATE(int32_t, int16_t)
|
||||
@ -92,7 +55,6 @@ CONVERT_TEMPLATE(int32_t, int32_t)
|
||||
CONVERT_TEMPLATE(int32_t, int64_t)
|
||||
CONVERT_TEMPLATE(int32_t, float)
|
||||
CONVERT_TEMPLATE(int32_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(int32_t)
|
||||
CONVERT_TEMPLATE(int64_t, uint8_t)
|
||||
CONVERT_TEMPLATE(int64_t, int8_t)
|
||||
CONVERT_TEMPLATE(int64_t, int16_t)
|
||||
@ -100,7 +62,6 @@ CONVERT_TEMPLATE(int64_t, int32_t)
|
||||
CONVERT_TEMPLATE(int64_t, int64_t)
|
||||
CONVERT_TEMPLATE(int64_t, float)
|
||||
CONVERT_TEMPLATE(int64_t, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(int64_t)
|
||||
CONVERT_TEMPLATE(float, uint8_t)
|
||||
CONVERT_TEMPLATE(float, int8_t)
|
||||
CONVERT_TEMPLATE(float, int16_t)
|
||||
@ -108,7 +69,6 @@ CONVERT_TEMPLATE(float, int32_t)
|
||||
CONVERT_TEMPLATE(float, int64_t)
|
||||
CONVERT_TEMPLATE(float, float)
|
||||
CONVERT_TEMPLATE(float, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(float)
|
||||
CONVERT_TEMPLATE(double, uint8_t)
|
||||
CONVERT_TEMPLATE(double, int8_t)
|
||||
CONVERT_TEMPLATE(double, int16_t)
|
||||
@ -116,14 +76,6 @@ CONVERT_TEMPLATE(double, int32_t)
|
||||
CONVERT_TEMPLATE(double, int64_t)
|
||||
CONVERT_TEMPLATE(double, float)
|
||||
CONVERT_TEMPLATE(double, double)
|
||||
CONVERT_TO_BOOL_TEMPLATE(double)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(uint8_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(int8_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(int16_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(int32_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(int64_t)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(float)
|
||||
CONVERT_FROM_BOOL_TEMPLATE(double)
|
||||
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
|
||||
|
||||
#define CONVERT_FROM_FP16_TEMPLATE(to_type) \
|
||||
@ -155,41 +107,6 @@ CONVERT_TO_FP16_TEMPLATE(int32_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(int64_t)
|
||||
CONVERT_TO_FP16_TEMPLATE(float)
|
||||
CONVERT_TO_FP16_TEMPLATE(double)
|
||||
|
||||
inline void convertBoolToFp16Impl(
|
||||
const bool* __restrict src,
|
||||
at::Half* __restrict dst,
|
||||
int64_t n) {
|
||||
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
|
||||
float16_t* dstPtr = reinterpret_cast<float16_t*>(dst);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
dstPtr[i] = srcPtr[i] != 0 ? 1.0 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const bool* src, at::Half* dst, int64_t n) {
|
||||
return convertBoolToFp16Impl(src, dst, n);
|
||||
}
|
||||
|
||||
inline void convertFp16ToBoolImpl(
|
||||
const at::Half* __restrict src,
|
||||
bool* __restrict dst,
|
||||
int64_t n) {
|
||||
const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src);
|
||||
uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
dstPtr[i] = srcPtr[i] != 0.0 ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const at::Half* src, bool* dst, int64_t n) {
|
||||
return convertFp16ToBoolImpl(src, dst, n);
|
||||
}
|
||||
|
||||
#endif
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
CONVERT_TEMPLATE(bfloat16_t, uint8_t)
|
||||
@ -207,44 +124,6 @@ CONVERT_TEMPLATE(int32_t, bfloat16_t)
|
||||
CONVERT_TEMPLATE(int64_t, bfloat16_t)
|
||||
CONVERT_TEMPLATE(float, bfloat16_t)
|
||||
CONVERT_TEMPLATE(double, bfloat16_t)
|
||||
|
||||
inline void convertBoolToBfloat16Impl(
|
||||
const bool* __restrict src,
|
||||
c10::BFloat16* __restrict dst,
|
||||
int64_t n) {
|
||||
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
|
||||
uint16_t* dstPtr = reinterpret_cast<uint16_t*>(dst);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
constexpr uint16_t kBf16One = 0x3f80; // 1.0 in bfloat16
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
dstPtr[i] = srcPtr[i] != 0 ? kBf16One : 0;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const bool* src, c10::BFloat16* dst, int64_t n) {
|
||||
return convertBoolToBfloat16Impl(src, dst, n);
|
||||
}
|
||||
|
||||
inline void convertBfloat16ToBoolImpl(
|
||||
const c10::BFloat16* __restrict src,
|
||||
bool* __restrict dst,
|
||||
int64_t n) {
|
||||
uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
|
||||
const uint16_t* srcPtr = reinterpret_cast<const uint16_t*>(src);
|
||||
uint64_t len = static_cast<uint64_t>(n);
|
||||
for (uint64_t i = 0; i < len; i++) {
|
||||
// Check if all non-sign bits are 0
|
||||
bool isBf16Zero = (srcPtr[i] & 0x7fff) == 0;
|
||||
dstPtr[i] = isBf16Zero ? 0 : 1;
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void convert(const c10::BFloat16* src, bool* dst, int64_t n) {
|
||||
return convertBfloat16ToBoolImpl(src, dst, n);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@ -309,7 +309,7 @@ class Vectorized<float> {
|
||||
DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
|
||||
// Implementation copied from Arm Optimized Routine
|
||||
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
|
||||
inline Vectorized<float> vexpq_f32_u20() const {
|
||||
Vectorized<float> exp_u20() const {
|
||||
// bail out to sleef if it's a special case:
|
||||
// i.e. there's an input s.t. |input| > 87.3....
|
||||
const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
|
||||
@ -348,9 +348,6 @@ class Vectorized<float> {
|
||||
|
||||
return vfmaq_f32(scale, poly, scale);
|
||||
}
|
||||
Vectorized<float> exp_u20() const {
|
||||
return vexpq_f32_u20();
|
||||
}
|
||||
Vectorized<float> fexp_u20() const {
|
||||
return exp_u20();
|
||||
}
|
||||
@ -637,7 +634,7 @@ inline Vectorized<float> Vectorized<float>::erf() const {
|
||||
// - exp(- x * x)
|
||||
auto pow_2 = (*this) * (*this);
|
||||
auto neg_pow_2 = pow_2 ^ neg_zero_vec;
|
||||
auto tmp4 = neg_pow_2.vexpq_f32_u20();
|
||||
auto tmp4 = neg_pow_2.exp();
|
||||
auto tmp5 = tmp4 ^ neg_zero_vec;
|
||||
// erf(x) = sign(x) * (1 - r * t * exp(- x * x))
|
||||
auto tmp6 = t * tmp5;
|
||||
|
||||
@ -1,90 +1,78 @@
|
||||
#include <ATen/cuda/CUDAGreenContext.h>
|
||||
|
||||
#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
|
||||
#include <c10/cuda/driver_api.h>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#define HAS_CUDA_GREEN_CONTEXT() 1
|
||||
#else
|
||||
#define HAS_CUDA_GREEN_CONTEXT() 0
|
||||
// Suppress unsued private field warnings as this class is not supposed to be called
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-private-field")
|
||||
#endif
|
||||
|
||||
namespace at::cuda {
|
||||
GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
int driver_version;
|
||||
C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
|
||||
TORCH_CHECK(
|
||||
driver_version >= 12080, "cuda driver too old to use green context!");
|
||||
CUcontext pctx = nullptr;
|
||||
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
|
||||
if (C10_UNLIKELY(!pctx)) {
|
||||
TORCH_WARN(
|
||||
"Attempted to create a green context but"
|
||||
" there was no primary context! Creating a primary context...");
|
||||
|
||||
GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
int driver_version;
|
||||
C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
|
||||
TORCH_CHECK(
|
||||
driver_version >= 12080, "cuda driver too old to use green context!");
|
||||
CUcontext pctx = nullptr;
|
||||
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
|
||||
if (C10_UNLIKELY(!pctx)) {
|
||||
TORCH_WARN(
|
||||
"Attempted to create a green context but"
|
||||
" there was no primary context! Creating a primary context...");
|
||||
cudaFree(0);
|
||||
}
|
||||
|
||||
cudaFree(0);
|
||||
}
|
||||
CUdevice device;
|
||||
device_id_ = device_id;
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
|
||||
|
||||
CUdevice device;
|
||||
device_id_ = device_id;
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
|
||||
// Get device resources
|
||||
CUdevResource device_resource;
|
||||
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
|
||||
device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
|
||||
|
||||
// Get device resources
|
||||
CUdevResource device_resource;
|
||||
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
|
||||
device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
|
||||
// Split resources
|
||||
std::vector<CUdevResource> result(1);
|
||||
auto result_data = result.data();
|
||||
unsigned int nb_groups = 1;
|
||||
CUdevResource remaining;
|
||||
|
||||
// Split resources
|
||||
std::vector<CUdevResource> result(1);
|
||||
auto result_data = result.data();
|
||||
unsigned int nb_groups = 1;
|
||||
CUdevResource remaining;
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
|
||||
result_data,
|
||||
&nb_groups,
|
||||
&device_resource,
|
||||
&remaining,
|
||||
0, // default flags
|
||||
num_sms));
|
||||
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
|
||||
result_data,
|
||||
&nb_groups,
|
||||
&device_resource,
|
||||
&remaining,
|
||||
0, // default flags
|
||||
num_sms));
|
||||
TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
|
||||
|
||||
TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
|
||||
// Generate resource descriptor
|
||||
CUdevResourceDesc desc;
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
|
||||
&desc, result_data, 1));
|
||||
|
||||
// Generate resource descriptor
|
||||
CUdevResourceDesc desc;
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
|
||||
&desc, result_data, 1));
|
||||
// Create green context
|
||||
// CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
|
||||
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
|
||||
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
|
||||
&green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
|
||||
|
||||
// Create green context
|
||||
// CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
|
||||
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
|
||||
C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
|
||||
&green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
|
||||
|
||||
// Convert to regular context
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
|
||||
TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
|
||||
// Convert to regular context
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
|
||||
TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
|
||||
#else
|
||||
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
|
||||
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
|
||||
#endif
|
||||
}
|
||||
|
||||
std::unique_ptr<GreenContext> GreenContext::create(
|
||||
uint32_t num_sms,
|
||||
std::optional<uint32_t> device_id) {
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
if (!device_id.has_value()) {
|
||||
device_id = at::cuda::current_device();
|
||||
}
|
||||
return std::unique_ptr<GreenContext>(new GreenContext(device_id.value(), num_sms));
|
||||
return std::make_unique<GreenContext>(device_id.value(), num_sms);
|
||||
#else
|
||||
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
|
||||
#endif
|
||||
@ -92,7 +80,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
|
||||
|
||||
// Implement move operations
|
||||
GreenContext::GreenContext(GreenContext&& other) noexcept{
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
device_id_ = std::exchange(other.device_id_, -1);
|
||||
green_ctx_ = std::exchange(other.green_ctx_, nullptr);
|
||||
context_ = std::exchange(other.context_, nullptr);
|
||||
@ -103,7 +91,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
|
||||
}
|
||||
|
||||
GreenContext& GreenContext::operator=(GreenContext&& other) noexcept{
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
if (this != &other) {
|
||||
// Clean up current resources
|
||||
if (green_ctx_) {
|
||||
@ -132,7 +120,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
|
||||
}
|
||||
|
||||
GreenContext::~GreenContext() noexcept{
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
C10_CUDA_DRIVER_CHECK(
|
||||
c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_));
|
||||
#else
|
||||
@ -140,9 +128,25 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
|
||||
#endif
|
||||
}
|
||||
|
||||
// Get the underlying CUDA context
|
||||
CUcontext GreenContext::getContext() const {
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
return context_;
|
||||
#else
|
||||
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
|
||||
#endif
|
||||
}
|
||||
|
||||
// Get the underlying green context
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
CUgreenCtx GreenContext::getGreenContext() const {
|
||||
return green_ctx_;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Make this context current
|
||||
void GreenContext::setContext() {
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
auto current_stream = c10::cuda::getCurrentCUDAStream();
|
||||
parent_stream_ = current_stream.stream();
|
||||
|
||||
@ -171,7 +175,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
|
||||
}
|
||||
|
||||
void GreenContext::popContext() {
|
||||
#if HAS_CUDA_GREEN_CONTEXT()
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
// see above note about stream being hardcoded to the default stream
|
||||
at::cuda::CUDAEvent ev;
|
||||
ev.record(c10::cuda::getCurrentCUDAStream());
|
||||
|
||||
@ -1,38 +1,53 @@
|
||||
#pragma once
|
||||
#include <ATen/cuda/CUDAEvent.h>
|
||||
#include <cuda.h>
|
||||
|
||||
// Forward declare green context as opaque ptr
|
||||
typedef struct CUgreenCtx_st* CUgreenCtx;
|
||||
#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
|
||||
#include <c10/cuda/driver_api.h>
|
||||
#include <cuda.h>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#define CUDA_HAS_GREEN_CONTEXT 1
|
||||
#else
|
||||
#define CUDA_HAS_GREEN_CONTEXT 0
|
||||
#endif
|
||||
|
||||
namespace at::cuda {
|
||||
|
||||
class TORCH_CUDA_CPP_API GreenContext {
|
||||
public:
|
||||
// Green context creation
|
||||
static std::unique_ptr<GreenContext> create(
|
||||
uint32_t num_sms,
|
||||
std::optional<uint32_t> device_id);
|
||||
~GreenContext() noexcept;
|
||||
GreenContext(uint32_t device_id, uint32_t num_sms);
|
||||
|
||||
static std::unique_ptr<GreenContext> create(uint32_t num_sms, std::optional<uint32_t> device_id);
|
||||
|
||||
// Delete copy constructor and assignment
|
||||
GreenContext(const GreenContext&) = delete;
|
||||
GreenContext& operator=(const GreenContext&) = delete;
|
||||
|
||||
// Implement move operations
|
||||
GreenContext(GreenContext&& other) noexcept;
|
||||
GreenContext& operator=(GreenContext&& other) noexcept;
|
||||
~GreenContext() noexcept;
|
||||
|
||||
// Get the underlying CUDA context
|
||||
CUcontext getContext() const;
|
||||
|
||||
// Get the underlying green context
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
CUgreenCtx getGreenContext() const;
|
||||
#endif
|
||||
|
||||
// Make this context current
|
||||
void setContext();
|
||||
|
||||
void popContext();
|
||||
|
||||
private:
|
||||
GreenContext(uint32_t device_id, uint32_t num_sms);
|
||||
// Implement move operations
|
||||
GreenContext(GreenContext&& other) noexcept;
|
||||
GreenContext& operator=(GreenContext&& other) noexcept;
|
||||
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
int32_t device_id_ = -1;
|
||||
CUgreenCtx green_ctx_ = nullptr;
|
||||
CUcontext context_ = nullptr;
|
||||
cudaStream_t parent_stream_ = nullptr;
|
||||
#endif
|
||||
};
|
||||
} // namespace at::cuda
|
||||
|
||||
@ -7,6 +7,17 @@
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(USE_ROCM)
|
||||
// hipSparse const API added in v2.4.0
|
||||
#if HIPSPARSE_VERSION >= 200400
|
||||
#define AT_USE_HIPSPARSE_GENERIC_API() 1
|
||||
#else
|
||||
#define AT_USE_HIPSPARSE_GENERIC_API() 1
|
||||
#endif
|
||||
#else // USE_ROCM
|
||||
#define AT_USE_HIPSPARSE_GENERIC_API() 0
|
||||
#endif // USE_ROCM
|
||||
|
||||
// cuSparse Generic API spsv function was added in CUDA 11.3.0
|
||||
#if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11500)
|
||||
#define AT_USE_CUSPARSE_GENERIC_SPSV() 1
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
#include <ATen/cuda/CUDAContextLight.h>
|
||||
#include <ATen/cuda/Sleep.h>
|
||||
|
||||
#include <c10/cuda/CUDACachingAllocator.h>
|
||||
#include <c10/cuda/CUDAException.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
|
||||
@ -25,22 +24,8 @@ __global__ void spin_kernel(int64_t cycles) {
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
thread_local int *flag = nullptr;
|
||||
|
||||
__global__ void busy_wait_for_flag_kernel(int *flag) {
|
||||
atomicExch(flag, 1);
|
||||
while (atomicAdd(flag, 0) == 1) {
|
||||
// do nothing
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void clear_flag_kernel(int *flag) {
|
||||
atomicExch(flag, 0);
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
void sleep(int64_t cycles) {
|
||||
dim3 grid(1);
|
||||
dim3 block(1);
|
||||
@ -48,26 +33,6 @@ void sleep(int64_t cycles) {
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}
|
||||
|
||||
void busy_wait_for_flag() {
|
||||
if (!flag) {
|
||||
flag = (int*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(int));
|
||||
}
|
||||
dim3 grid(1);
|
||||
dim3 block(1);
|
||||
busy_wait_for_flag_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(flag);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}
|
||||
|
||||
void clear_flag() {
|
||||
if (!flag) {
|
||||
flag = (int*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(int));
|
||||
}
|
||||
dim3 grid(1);
|
||||
dim3 block(1);
|
||||
clear_flag_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(flag);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
}
|
||||
|
||||
#ifdef USE_ROCM
|
||||
__global__ void flush_icache_kernel()
|
||||
{
|
||||
|
||||
@ -7,11 +7,6 @@ namespace at::cuda {
|
||||
// enqueues a kernel that spins for the specified number of cycles
|
||||
TORCH_CUDA_CU_API void sleep(int64_t cycles);
|
||||
|
||||
// enqueues a kernel that spins until a flag is cleared by a
|
||||
// corresponding call to clear_flag()
|
||||
TORCH_CUDA_CU_API void busy_wait_for_flag();
|
||||
TORCH_CUDA_CU_API void clear_flag();
|
||||
|
||||
// flushes instruction cache for ROCm; no-op for CUDA
|
||||
TORCH_CUDA_CU_API void flush_icache();
|
||||
|
||||
|
||||
@ -580,7 +580,7 @@ std::ofstream& TuningContext::GetUntunedFile(){
|
||||
filename.append(device);
|
||||
}
|
||||
|
||||
untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::app);
|
||||
untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::trunc);
|
||||
}
|
||||
return untuned_file_;
|
||||
}
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/CachingDeviceAllocator.h>
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
@ -152,36 +151,6 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
|
||||
}
|
||||
|
||||
virtual bool isAvailable() const override;
|
||||
|
||||
/* MTIAGraph related APIs */
|
||||
virtual int64_t mtiagraphCreate(bool keep_graph = false) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
return -1;
|
||||
}
|
||||
|
||||
virtual void mtiagraphCaptureBegin(int64_t handle, MempoolId_t pool) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
}
|
||||
|
||||
virtual void mtiagraphCaptureEnd(int64_t handle) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
}
|
||||
|
||||
virtual void mtiagraphInstantiate(int64_t handle) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
}
|
||||
|
||||
virtual void mtiagraphReplay(int64_t handle) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
}
|
||||
|
||||
virtual void mtiagraphReset(int64_t handle) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
}
|
||||
|
||||
virtual MempoolId_t mtiagraphPool(int64_t handle) const {
|
||||
FAIL_MTIAHOOKS_FUNC(__func__);
|
||||
}
|
||||
};
|
||||
|
||||
struct TORCH_API MTIAHooksArgs {};
|
||||
|
||||
@ -410,8 +410,8 @@ struct ConvParams {
|
||||
return false;
|
||||
}
|
||||
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
|
||||
// broken on cuDNN 9.8 - 9.14
|
||||
if (cudnn_version >= 90800 && cudnn_version < 91500) {
|
||||
// broken on cuDNN 9.8
|
||||
if (cudnn_version >= 90800) {
|
||||
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
|
||||
(input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
|
||||
weight.dim() == 5) {
|
||||
@ -689,10 +689,6 @@ static void check_shape_forward(const at::Tensor& input,
|
||||
", but got bias of size ", at::symint::sizes<T>(bias), " instead");
|
||||
|
||||
for (const auto i : c10::irange(2, k)) {
|
||||
// T could be int64_t or SymInt, Specialized numeric_limts<SymInt> in c10/core/SymInt.h
|
||||
TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
|
||||
"Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
|
||||
(std::numeric_limits<T>::max() / 2));
|
||||
input_shape.push_back(at::symint::size<T>(input, i) + 2 * padding[i-2]);
|
||||
// log new kernel size considering dilation
|
||||
kernel_shape.push_back(dilation[i-2] * (weight_sizes[i]-1) + 1);
|
||||
@ -719,11 +715,6 @@ static void check_shape_forward(const at::Tensor& input,
|
||||
"Kernel size: (", kernel_ss.str(), "). Kernel size can't be greater than actual input size");
|
||||
}
|
||||
} else { // transposed
|
||||
for (const auto i : c10::irange(2, k)) {
|
||||
TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
|
||||
"Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
|
||||
(std::numeric_limits<T>::max() / 2));
|
||||
}
|
||||
TORCH_CHECK(at::symint::size<T>(input, 1) == weight_sizes[0],
|
||||
"Given transposed=", transposed, ", weight of size ", weight_sizes,
|
||||
", expected input", at::symint::sizes<T>(input), " to have ", weight_sizes[0],
|
||||
|
||||
@ -52,7 +52,8 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in
|
||||
for (const auto k : c10::irange(kw)) {
|
||||
int iShift = std::max(0, static_cast<int>(k - real_pad));
|
||||
int oShift = std::max(0, static_cast<int>(real_pad - k));
|
||||
long t = std::min(ilen + real_pad - k, olen) - oShift;
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
int t = std::min(ilen + real_pad - k, olen) - oShift;
|
||||
// Note: gemm assumes column-major matrices
|
||||
// input is l*m (row-major)
|
||||
// weight is m*r (row-major)
|
||||
|
||||
@ -16,7 +16,8 @@ bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) {
|
||||
auto linearId = elements - 1;
|
||||
|
||||
// NOTE: Assumes all strides are positive, which is true for now
|
||||
for (auto i = t.dim() - 1; i >= 0; --i) {
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
for (int i = t.dim() - 1; i >= 0; --i) {
|
||||
auto curDimIndex = linearId % t.sym_size(i);
|
||||
auto curDimOffset = curDimIndex * t.sym_stride(i);
|
||||
offset += curDimOffset;
|
||||
|
||||
@ -68,6 +68,7 @@ Tensor fbgemm_linear_int8_weight_fp32_activation(
|
||||
const float* input_ptr = input_contig.const_data_ptr<float>();
|
||||
|
||||
TORCH_CHECK(input.dim() >= 2);
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
|
||||
const int64_t K = input.size(input.dim() - 1);
|
||||
TORCH_CHECK(weight.dim() == 2);
|
||||
|
||||
@ -160,9 +160,10 @@ struct Dist {
|
||||
// value of k.
|
||||
parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) {
|
||||
const Vec pvec(p);
|
||||
double n2 = static_cast<double>(n) - .5;
|
||||
double n2 = n - .5;
|
||||
// The -1 accounts for floating point truncation issues
|
||||
int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2.0 * static_cast<double>(k) - 1.0)));
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
|
||||
int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;
|
||||
|
||||
const scalar_t * self_i = self_start + i * m;
|
||||
|
||||
@ -170,14 +170,10 @@ static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const
|
||||
#if defined(CUDA_VERSION) || defined(USE_ROCM)
|
||||
const auto scalar_type = mat1.scalar_type();
|
||||
return (beta.toComplexDouble() == 1.0
|
||||
// self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
|
||||
// is to use lt interface only when self is bias.
|
||||
&& self.dim() == 1 && self.sizes()[0] == mat2_sizes[1] && self.is_contiguous()
|
||||
&& result.dim() == 2 && result.is_contiguous()
|
||||
// Conditions for bias to be fusable
|
||||
&& (
|
||||
self.is_contiguous() &&
|
||||
// NOTE: fine to have 1-len dims to the left from the right-most one
|
||||
(self.dim() == 1 || self.squeeze().dim() == 1) &&
|
||||
self.sizes().back() == mat2_sizes[1]
|
||||
)
|
||||
&& ( // some dtype restrictions
|
||||
#ifndef USE_ROCM
|
||||
scalar_type == at::ScalarType::Double ||
|
||||
|
||||
@ -208,62 +208,6 @@ _f8_f8_bf16_rowwise_grouped_mm(
|
||||
#endif
|
||||
}
|
||||
|
||||
Tensor&
|
||||
_f4_f4_bf16_grouped_mm_fbgemm(
|
||||
const Tensor& mat_a,
|
||||
const Tensor& mat_b,
|
||||
const Tensor& scale_a,
|
||||
const std::optional<Tensor>& global_scale_a,
|
||||
const Tensor& scale_b,
|
||||
const std::optional<Tensor>& global_scale_b,
|
||||
const std::optional<Tensor>& offs,
|
||||
const std::optional<Tensor>& bias,
|
||||
Tensor& out) {
|
||||
#if !defined(USE_ROCM) && defined(USE_FBGEMM_GENAI)
|
||||
// Typing checks
|
||||
TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2,
|
||||
"mat_a must be Float4_e2n1fn_2, got: ", mat_a.scalar_type());
|
||||
TORCH_CHECK_VALUE(mat_b.scalar_type() == at::kFloat4_e2m1fn_x2,
|
||||
"mat_b must be Float4_e2n1fn_2, got: ", mat_b.scalar_type());
|
||||
|
||||
std::optional<Tensor> combined_global_scale = std::nullopt;
|
||||
if (global_scale_a.has_value() || global_scale_b.has_value()) {
|
||||
// NVFP4
|
||||
TORCH_CHECK_VALUE(global_scale_a.has_value() && global_scale_b.has_value(),
|
||||
"For NVFP4 grouped gemm both of global_scale_{a,b} must have values")
|
||||
TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e4m3fn,
|
||||
"scale_a must be Float8_e4m3fn, got: ", scale_a.scalar_type());
|
||||
TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e4m3fn,
|
||||
"scale_b must be Float8_e4m3fn, got: ", scale_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(global_scale_a.value().scalar_type() == at::kFloat,
|
||||
"global_scale_a must be Float, got: ", global_scale_a.value().scalar_type());
|
||||
TORCH_CHECK_VALUE(global_scale_b.value().scalar_type() == at::kFloat,
|
||||
"global_scale_b must be Float, got: ", global_scale_b.value().scalar_type());
|
||||
combined_global_scale = global_scale_a.value().mul(global_scale_b.value());
|
||||
} else {
|
||||
// MXFP4
|
||||
TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e8m0fnu,
|
||||
"scale_a must be Float8_e8m0fnu, got: ", scale_a.scalar_type());
|
||||
TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e8m0fnu,
|
||||
"scale_b must be Float8_e8m0fnu, got: ", scale_b.scalar_type());
|
||||
}
|
||||
|
||||
auto o = fbgemm_gpu::f4f4bf16_grouped_mm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a,
|
||||
scale_b,
|
||||
offs.value(),
|
||||
out,
|
||||
combined_global_scale
|
||||
);
|
||||
#else
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "nvfp4 grouped gemm is not supported without USE_FBGEMM_GENAI, and only for CUDA")
|
||||
#endif
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
|
||||
// Checks scales for 2d or 3d target tensors (`mat`).
|
||||
if (mat.dim() == 2) {
|
||||
@ -301,15 +245,7 @@ void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int
|
||||
}
|
||||
}
|
||||
|
||||
void _check_scales_blocked(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
|
||||
// if {mx,nv}fp4, will need to modify K later
|
||||
bool is_fp4 = (mat.scalar_type() == kFloat4_e2m1fn_x2);
|
||||
int blocksize = 32;
|
||||
// check for nvfp4 vs. mxfp4 to fix blocksize
|
||||
if (is_fp4 && scale.scalar_type() == kFloat8_e4m3fn) {
|
||||
blocksize = 16;
|
||||
}
|
||||
|
||||
void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
|
||||
// Checks scales for 2d or 3d target tensors (`mat`).
|
||||
if (mat.dim() == 2) {
|
||||
// For MXFP8, 2d tensors have variable size groups represented as subtensors,
|
||||
@ -317,19 +253,17 @@ void _check_scales_blocked(const Tensor& mat, const Tensor& scale, const int dim
|
||||
// so we can't check the scale sizes without doing a d2h sync to get the group sizes here.
|
||||
TORCH_CHECK(
|
||||
scale.dim() == mat.dim(),
|
||||
"for block-scaled, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(),
|
||||
" and scale.dim() = ", scale.dim(), " for arg ", arg_idx
|
||||
);
|
||||
"for mxfp8, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(), " and scale.dim() = ", scale.dim(), " for arg ", arg_idx);
|
||||
|
||||
// LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/blocksize, 4))
|
||||
// RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/blocksize, 4))
|
||||
// LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/32, 4))
|
||||
// RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/32, 4))
|
||||
// * weight is transposed prior to the call, scale stays non-transposed.
|
||||
bool LHS = arg_idx == 0;
|
||||
int scale_dim_to_check = 0;
|
||||
int mat_dim_to_check = LHS ? 0 : 1;
|
||||
TORCH_CHECK(
|
||||
scale.size(scale_dim_to_check) >= mat.size(mat_dim_to_check),
|
||||
"for block-scaled, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
|
||||
"for mxfp8, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
|
||||
"must have scale.shape[", scale_dim_to_check, "] >= ", mat.size(mat_dim_to_check), " but got scale.shape=(", scale.size(0), ", ", scale.size(1), ")");
|
||||
} else {
|
||||
// For MXFP8, 3d tensors have static group sizes (stack of 2d tensors),
|
||||
@ -339,40 +273,32 @@ void _check_scales_blocked(const Tensor& mat, const Tensor& scale, const int dim
|
||||
};
|
||||
|
||||
// TODO: this is for 3d tensor in 2d-3d case specifically.
|
||||
// We'll need to support 3d-3d and 3d-2d cases once mxfp8/nvfp4 grouped gemm supports them.
|
||||
// We'll need to support 3d-3d and 3d-2d cases once mxfp8 grouped gemm supports them.
|
||||
int64_t G = mat.size(0);
|
||||
int64_t K = mat.size(1);
|
||||
if (is_fp4) {
|
||||
// FP4 packs 2 values into a single 8b word - the "real" K is 2x the
|
||||
// reported K. Reverse that adjustment.
|
||||
const int fp4_elems_per_byte = 2;
|
||||
K *= fp4_elems_per_byte;
|
||||
}
|
||||
int64_t N = mat.size(2);
|
||||
int64_t blocked_scale_K = round_up(K/blocksize, 4);
|
||||
int64_t blocked_scale_K = round_up(K/32, 4);
|
||||
int64_t blocked_scale_N = round_up(N, 128);
|
||||
|
||||
// fbgemm expects stack of flattened blocked scales for 3d tensor, shape (G, blocked_scale_K * blocked_scale_N).
|
||||
TORCH_CHECK(
|
||||
scale.dim() == mat.dim() - 1,
|
||||
"for block-scaled 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N),",
|
||||
"but scale is ", scale.dim(), "D for arg ", arg_idx
|
||||
"for mxfp8 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N), but scale is ", scale.dim(), "D for arg ", arg_idx
|
||||
);
|
||||
TORCH_CHECK(
|
||||
scale.size(0) == G && scale.size(1) == blocked_scale_K * blocked_scale_N,
|
||||
"for block-scaled grouped GEMM, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ")",
|
||||
" for arg ", arg_idx, ", got: ", scale.size(0), ", ", scale.size(1)
|
||||
"for mxfp8, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ") for arg ", arg_idx
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
|
||||
bool using_fp8_rowwise = scale.scalar_type() == kFloat;
|
||||
bool using_mx = scale.scalar_type() == at::kFloat8_e8m0fnu;
|
||||
bool using_mxfp8 = scale.scalar_type() == at::kFloat8_e8m0fnu;
|
||||
if (using_fp8_rowwise) {
|
||||
_check_scales_fp8_rowwise(mat, scale, dim, arg_idx, scale_multiplier);
|
||||
} else if (using_mx) {
|
||||
_check_scales_blocked(mat, scale, dim, arg_idx);
|
||||
} else if (using_mxfp8) {
|
||||
_check_scales_mxfp8(mat, scale, dim, arg_idx);
|
||||
} else {
|
||||
TORCH_CHECK(false, "scale must be float32 or float8_e8m0fnu, but got ", scale.dtype());
|
||||
}
|
||||
@ -485,11 +411,9 @@ namespace {
|
||||
|
||||
using acceptance_fn = std::function<bool(c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&, c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&)>;
|
||||
|
||||
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 4> scale_grouped_kernel_dispatch = {{
|
||||
std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2> scale_grouped_kernel_dispatch = {{
|
||||
{ "rowwise_rowwise", scaled_blas::check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE},
|
||||
{ "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8},
|
||||
{ "mxfp4_mxfp4", scaled_blas::check_mxfp4_recipe, ScaledGemmImplementation::MXFP4_MXFP4},
|
||||
{ "nvfp4_nvfp4", scaled_blas::check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4}}};
|
||||
{ "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}};
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
@ -601,9 +525,8 @@ _scaled_grouped_mm_cuda_v2(
|
||||
out);
|
||||
}
|
||||
case ScaledGemmImplementation::MXFP8_MXFP8: {
|
||||
// scale shape checks
|
||||
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
|
||||
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
|
||||
_check_scales_mxfp8(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
|
||||
_check_scales_mxfp8(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
|
||||
return _mx8_mx8_bf16_grouped_mm_fbgemm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
@ -614,36 +537,6 @@ _scaled_grouped_mm_cuda_v2(
|
||||
offs.value(),
|
||||
out);
|
||||
}
|
||||
case ScaledGemmImplementation::MXFP4_MXFP4: {
|
||||
// scale shape checks
|
||||
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
|
||||
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
|
||||
return _f4_f4_bf16_grouped_mm_fbgemm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a[0], /* block-scale A */
|
||||
std::nullopt, /* global-scale A */
|
||||
scale_b[0], /* block-scale B */
|
||||
std::nullopt, /* global-scale B */
|
||||
offs.value(),
|
||||
std::nullopt, /* bias */
|
||||
out);
|
||||
}
|
||||
case ScaledGemmImplementation::NVFP4_NVFP4: {
|
||||
// scale shape checks
|
||||
_check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
|
||||
_check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
|
||||
return _f4_f4_bf16_grouped_mm_fbgemm(
|
||||
mat_a,
|
||||
mat_b,
|
||||
scale_a[0], /* block-scale A */
|
||||
scale_a[1], /* global-scale A */
|
||||
scale_b[0], /* block-scale B */
|
||||
scale_b[1], /* global-scale B */
|
||||
offs.value(),
|
||||
std::nullopt, /* bias */
|
||||
out);
|
||||
}
|
||||
default:
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false,
|
||||
"_scaled_grouped_mm_cuda_v2 is in an inconsistent state - should never reach here");
|
||||
|
||||
@ -13,7 +13,7 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx,
|
||||
if (allow_neg_indices) {
|
||||
ind = (ind < 0) ? ind + ind_dim_size : ind;
|
||||
}
|
||||
CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds", "Expected 0 <= index < ind_dim_size(%ld), but got index = %ld", ind_dim_size, ind);
|
||||
CUDA_KERNEL_ASSERT(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds");
|
||||
int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits
|
||||
if (off >= slice_size) return;
|
||||
auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);
|
||||
|
||||
@ -54,6 +54,7 @@ namespace {
|
||||
using DtypeScale = float;
|
||||
using DtypeAccum = float;
|
||||
using DtypeEpilogue = float;
|
||||
using DtypeOutput = cutlass::bfloat16_t;
|
||||
|
||||
using Multiply = cutlass::epilogue::fusion::Sm90Compute<
|
||||
cutlass::multiplies,
|
||||
@ -67,6 +68,12 @@ using Add = cutlass::epilogue::fusion::Sm90Compute<
|
||||
DtypeEpilogue,
|
||||
cutlass::FloatRoundStyle::round_to_nearest>;
|
||||
|
||||
using Cast = cutlass::epilogue::fusion::Sm90Compute<
|
||||
cutlass::epilogue::thread::Identity,
|
||||
DtypeOutput,
|
||||
DtypeEpilogue,
|
||||
cutlass::FloatRoundStyle::round_to_nearest>;
|
||||
|
||||
template <bool LargeTile, bool FastAccum>
|
||||
struct Schedule;
|
||||
|
||||
@ -113,8 +120,7 @@ template <
|
||||
typename FastAccum,
|
||||
typename DtypeA,
|
||||
typename DtypeB,
|
||||
typename DtypeBias,
|
||||
typename DtypeOutput>
|
||||
typename DtypeBias>
|
||||
void f8f8bf16_rowwise_impl(
|
||||
at::Tensor XQ, // FP8
|
||||
at::Tensor WQ, // FP8
|
||||
@ -175,11 +181,6 @@ void f8f8bf16_rowwise_impl(
|
||||
WScale,
|
||||
cutlass::epilogue::fusion::Sm90EVT<Multiply, XScale, Accum>>;
|
||||
|
||||
using Cast = cutlass::epilogue::fusion::Sm90Compute<
|
||||
cutlass::epilogue::thread::Identity,
|
||||
DtypeOutput,
|
||||
DtypeEpilogue,
|
||||
cutlass::FloatRoundStyle::round_to_nearest>;
|
||||
using EpilogueEVT = cutlass::epilogue::fusion::Sm90EVT<
|
||||
Cast,
|
||||
cutlass::epilogue::fusion::Sm90EVT<
|
||||
@ -312,8 +313,7 @@ template <
|
||||
typename FastAccum,
|
||||
typename DtypeA,
|
||||
typename DtypeB,
|
||||
typename DtypeBias,
|
||||
typename DtypeOutput>
|
||||
typename DtypeBias>
|
||||
void f8f8bf16_rowwise_impl_sm100_sm120(
|
||||
at::Tensor XQ, // FP8
|
||||
at::Tensor WQ, // FP8
|
||||
@ -372,11 +372,6 @@ void f8f8bf16_rowwise_impl_sm100_sm120(
|
||||
WScale,
|
||||
cutlass::epilogue::fusion::Sm90EVT<Multiply, XScale, Accum>>;
|
||||
|
||||
using Cast = cutlass::epilogue::fusion::Sm90Compute<
|
||||
cutlass::epilogue::thread::Identity,
|
||||
DtypeOutput,
|
||||
DtypeEpilogue,
|
||||
cutlass::FloatRoundStyle::round_to_nearest>;
|
||||
using EpilogueEVT = cutlass::epilogue::fusion::Sm90EVT<
|
||||
Cast,
|
||||
cutlass::epilogue::fusion::Sm90EVT<
|
||||
@ -503,8 +498,7 @@ template <
|
||||
typename FastAccum,
|
||||
typename DtypeA,
|
||||
typename DtypeB,
|
||||
typename DtypeBias,
|
||||
typename DtypeOutput>
|
||||
typename DtypeBias>
|
||||
void f8f8bf16_rowwise_impl_sm89(
|
||||
at::Tensor XQ, // FP8
|
||||
at::Tensor WQ, // FP8
|
||||
@ -771,8 +765,7 @@ template <
|
||||
typename FastAccum,
|
||||
typename DtypeA,
|
||||
typename DtypeB,
|
||||
typename DtypeBias,
|
||||
typename DtypeOutput>
|
||||
typename DtypeBias>
|
||||
void handle_transposition(
|
||||
at::Tensor XQ,
|
||||
at::Tensor WQ,
|
||||
@ -789,8 +782,7 @@ void handle_transposition(
|
||||
FastAccum,
|
||||
DtypeA,
|
||||
DtypeB,
|
||||
DtypeBias,
|
||||
DtypeOutput>(XQ, WQ, x_scale, w_scale, bias, out, swizzle);
|
||||
DtypeBias>(XQ, WQ, x_scale, w_scale, bias, out, swizzle);
|
||||
} else {
|
||||
dispatch_fp8_rowwise_kernel_on_tile_size<
|
||||
ClusterShape,
|
||||
@ -799,8 +791,7 @@ void handle_transposition(
|
||||
FastAccum,
|
||||
DtypeB,
|
||||
DtypeA,
|
||||
DtypeBias,
|
||||
DtypeOutput>(WQ.t(), XQ.t(), w_scale.t(), x_scale.t(), bias, out.t(), swizzle);
|
||||
DtypeBias>(WQ.t(), XQ.t(), w_scale.t(), x_scale.t(), bias, out.t(), swizzle);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1036,19 +1027,11 @@ void dispatch_fp8_rowwise_kernel_on_bias_dtype(
|
||||
at::Tensor out) {
|
||||
if (bias.has_value() && bias->dtype() == at::kBFloat16) {
|
||||
dispatch_fp8_rowwise_kernel_on_input_dtypes<
|
||||
cutlass::bfloat16_t,
|
||||
cutlass::bfloat16_t>
|
||||
(XQ, WQ, x_scale, w_scale, bias, use_fast_accum, out);
|
||||
} else if (bias.has_value() && bias->dtype() == at::kHalf){
|
||||
TORCH_CHECK(out.dtype() == at::kHalf, "Output should be Float16 when bias is Float16");
|
||||
dispatch_fp8_rowwise_kernel_on_input_dtypes<
|
||||
cutlass::half_t,
|
||||
cutlass::half_t>
|
||||
(XQ, WQ, x_scale, w_scale, bias, use_fast_accum, out);
|
||||
} else {
|
||||
dispatch_fp8_rowwise_kernel_on_input_dtypes<
|
||||
float,
|
||||
cutlass::bfloat16_t>
|
||||
float>
|
||||
//Types...>
|
||||
(XQ, WQ, x_scale, w_scale, bias, use_fast_accum, out);
|
||||
}
|
||||
@ -1090,14 +1073,14 @@ void check_inputs(
|
||||
|
||||
if (bias.has_value()) {
|
||||
TORCH_CHECK(bias->device() == b.device());
|
||||
TORCH_CHECK(bias->dtype() == at::kFloat || bias->dtype() == at::kBFloat16 || bias->dtype() == at::kHalf);
|
||||
TORCH_CHECK(bias->dtype() == at::kFloat || bias->dtype() == at::kBFloat16);
|
||||
TORCH_CHECK(bias->dim() == 1);
|
||||
TORCH_CHECK(bias->size(0) == b.size(1));
|
||||
TORCH_CHECK(bias->stride(0) == 1);
|
||||
}
|
||||
|
||||
TORCH_CHECK(out.device() == a.device());
|
||||
TORCH_CHECK(out.dtype() == at::kBFloat16 || out.dtype() == at::kHalf);
|
||||
TORCH_CHECK(out.dtype() == at::kBFloat16);
|
||||
TORCH_CHECK(out.dim() == 2);
|
||||
TORCH_CHECK(out.size(0) == a.size(0));
|
||||
TORCH_CHECK(out.size(1) == b.size(1));
|
||||
|
||||
@ -59,22 +59,6 @@
|
||||
// forward declare
|
||||
class cublasCommonArgs;
|
||||
|
||||
namespace fbgemm_gpu {
|
||||
|
||||
// NOTE(slayton58): FBGemm_GPU kernels come from <fbgemm_gpu/torch_ops.h> within the FBGemm repo.
|
||||
// To update supported ops means a submodule bump, which is.. painful. Instead, we
|
||||
// can simply forward-declare the methods we want to use.. Works at least as a short-term
|
||||
// thing, but should still be fixed somewhere/somehow.
|
||||
at::Tensor f4f4bf16(
|
||||
at::Tensor,
|
||||
at::Tensor,
|
||||
at::Tensor,
|
||||
at::Tensor,
|
||||
std::optional<at::Tensor>,
|
||||
bool use_mx);
|
||||
|
||||
} // namespace fbgemm_gpu
|
||||
|
||||
using at::blas::ScalingType;
|
||||
using at::blas::SwizzleType;
|
||||
|
||||
@ -607,7 +591,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
|
||||
if ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
|
||||
// cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
|
||||
|| (dprops->major >= 10 && (!scale_a.sizes().empty() || !scale_b.sizes().empty()))) {
|
||||
TORCH_CHECK_VALUE(out.dtype() == kBFloat16 || out.dtype() == kHalf, "Only bf16 and fp16 high precision output types are supported for row-wise scaling.");
|
||||
TORCH_CHECK_VALUE(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
|
||||
return _scaled_rowwise_rowwise(
|
||||
mat1,
|
||||
mat2,
|
||||
@ -752,7 +736,7 @@ _scaled_rowwise_rowwise(
|
||||
if (((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
|
||||
// cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
|
||||
|| (dprops->major == 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
|
||||
TORCH_CHECK_VALUE(out.dtype() == kBFloat16 || out.dtype() == kHalf, "Only bf16 and fp16 high precision output types are supported for row-wise scaling.");
|
||||
TORCH_CHECK_VALUE(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
|
||||
at::cuda::detail::f8f8bf16_rowwise(
|
||||
mat_a,
|
||||
mat_b,
|
||||
@ -810,24 +794,6 @@ void _check_deepseek_scale_stride(const Tensor& scale, const Tensor& t, const Sc
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
_check_deepseek_support() {
|
||||
#ifndef USE_ROCM
|
||||
auto dprops = at::cuda::getCurrentDeviceProperties();
|
||||
if (dprops->major != 9) {
|
||||
// Only on Hopper GPUs
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
dprops->major == 9,
|
||||
"DeepSeek style (1x128, 128x128) scaling only supported in CUDA for SM90")
|
||||
}
|
||||
// Only in cublasLt >= 12.9
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900,
|
||||
"DeepSeek style (1x128, 128x128) scaling requires cublasLt >= 12.9"
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
Tensor&
|
||||
_scaled_block1x128_block1x128(
|
||||
const Tensor& mat_a, const Tensor& mat_b,
|
||||
@ -836,12 +802,8 @@ _scaled_block1x128_block1x128(
|
||||
const c10::ScalarType out_dtype,
|
||||
const bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
#ifndef USE_ROCM
|
||||
// Restrictions:
|
||||
// A, B are FP8, scales are fp32, shape K//128
|
||||
// CUDA: Only Hopper GPUs
|
||||
_check_deepseek_support();
|
||||
|
||||
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
|
||||
mat_a.scalar_type(), mat_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
|
||||
@ -859,12 +821,6 @@ _scaled_block1x128_block1x128(
|
||||
_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
|
||||
|
||||
return out;
|
||||
#else
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false,
|
||||
"1x128 and 128x128 scaling not available with ROCm"
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
Tensor&
|
||||
@ -875,12 +831,10 @@ _scaled_block128x128_block1x128(
|
||||
const c10::ScalarType out_dtype,
|
||||
const bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
#ifndef USE_ROCM
|
||||
// Restrictions:
|
||||
// A, B are FP8, scales are fp32, shape K//128
|
||||
// CUDA: Only Hopper GPUs
|
||||
_check_deepseek_support();
|
||||
|
||||
std::cout << "mat_b: " << mat_b.dim() << ", " << mat_b.sizes() << ", " << mat_b.strides() << std::endl;
|
||||
std::cout << "scale_b: " << scale_b.dim() << ", " << scale_b.sizes() << ", " << scale_b.strides() << std::endl;
|
||||
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
|
||||
mat_a.scalar_type(), mat_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(scale_a.sizes()[0] == ceil_div<int64_t>(mat_a.sizes()[0], 128) && scale_a.sizes()[1] == ceil_div<int64_t>(mat_a.sizes()[1], 128) && scale_a.scalar_type() == kFloat,
|
||||
@ -898,12 +852,6 @@ _scaled_block128x128_block1x128(
|
||||
_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
|
||||
|
||||
return out;
|
||||
#else
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false,
|
||||
"1x128 and 128x128 scaling not available with ROCm"
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
Tensor&
|
||||
@ -914,12 +862,8 @@ _scaled_block1x128_block128x128(
|
||||
const c10::ScalarType out_dtype,
|
||||
const bool use_fast_accum,
|
||||
Tensor& out) {
|
||||
#ifndef USE_ROCM
|
||||
// Restrictions:
|
||||
// A, B are FP8, scales are fp32, A: shape K//128, B: K//128, N//128
|
||||
// CUDA: Only Hopper GPUs
|
||||
_check_deepseek_support();
|
||||
|
||||
TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
|
||||
mat_a.scalar_type(), mat_b.scalar_type());
|
||||
TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
|
||||
@ -937,12 +881,6 @@ _scaled_block1x128_block128x128(
|
||||
_scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);
|
||||
|
||||
return out;
|
||||
#else
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(
|
||||
false,
|
||||
"1x128 and 128x128 scaling not available with ROCm"
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
Tensor&
|
||||
@ -1013,47 +951,26 @@ _scaled_mxfp4_mxfp4(
|
||||
const std::optional<Tensor>& bias,
|
||||
const c10::ScalarType out_dtype,
|
||||
Tensor& out) {
|
||||
#if !defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI)
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only");
|
||||
#ifndef USE_ROCM
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM only");
|
||||
#endif
|
||||
// Restrictions:
|
||||
// A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
|
||||
TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",
|
||||
mat_a.scalar_type(), mat_b.scalar_type());
|
||||
|
||||
// Packed FP4 format means actual-K = 2 * reported-K -- adjust
|
||||
auto K_multiplier = 2;
|
||||
#ifdef USE_ROCM
|
||||
// AMD
|
||||
auto scale_a_elems = ceil_div<int64_t>(K_multiplier * mat_a.size(0), 32) * mat_a.size(1);
|
||||
auto scale_b_elems = ceil_div<int64_t>(K_multiplier * mat_b.size(1), 32) * mat_b.size(0);
|
||||
#else
|
||||
// NVIDIA
|
||||
auto scale_a_elems = round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(K_multiplier * mat_a.size(1), 32), 4);
|
||||
auto scale_b_elems = round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(K_multiplier * mat_b.size(0), 32), 4);
|
||||
#endif
|
||||
auto scale_a_elems = ceil_div<int64_t>(2 * mat_a.size(0), 32) * mat_a.size(1);
|
||||
auto scale_b_elems = ceil_div<int64_t>(2 * mat_b.size(1), 32) * mat_b.size(0);
|
||||
TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(),
|
||||
"For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel());
|
||||
TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(),
|
||||
"For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel());
|
||||
|
||||
#ifdef USE_ROCM
|
||||
// AMD
|
||||
TORCH_CHECK_VALUE(swizzle_a == SwizzleType::NO_SWIZZLE, "scale_a must not be swizzled (NO_SWIZZLE format)");
|
||||
TORCH_CHECK_VALUE(swizzle_b == SwizzleType::NO_SWIZZLE, "scale_b must not be swizzled (NO_SWIZZLE format)");
|
||||
#else
|
||||
// NVIDIA
|
||||
TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format");
|
||||
TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format");
|
||||
#endif
|
||||
|
||||
TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(),
|
||||
"For Blockwise scaling both scales should be contiguous");
|
||||
|
||||
TORCH_CHECK_VALUE(out.scalar_type() == out_dtype, "expected out.scalar_type() to be ", out_dtype, ", but got ", out_dtype);
|
||||
|
||||
#ifdef USE_ROCM
|
||||
// AMD
|
||||
auto scaling_choice_a = ScalingType::BlockWise1x32;
|
||||
auto scaling_choice_b = ScalingType::BlockWise1x32;
|
||||
|
||||
@ -1068,29 +985,11 @@ _scaled_mxfp4_mxfp4(
|
||||
TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 ||
|
||||
out.scalar_type() == ScalarType::Half,
|
||||
"Block-wise scaling only supports BFloat16 or Half output types");
|
||||
#else
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
|
||||
#endif
|
||||
|
||||
return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
|
||||
#else
|
||||
// NVIDIA
|
||||
// NOTE(slayton58): fbgemm_gpu::f4f4bf16 does *not* allow passing an output tensor,
|
||||
// but we have one we need to use. Two clear options are to copy into
|
||||
// our output (slow), or use a move-assignment-operator (faster).
|
||||
// However, the compiler can complain about the explicit move preventing
|
||||
// copy elision because the return from f4f4bf16 is a temporary object.
|
||||
// So we don't explicitly move, and trust the compiler here...
|
||||
// In the longer term this should be fixed on the FBGemm side.
|
||||
out = fbgemm_gpu::f4f4bf16(
|
||||
mat_a,
|
||||
mat_b.transpose(-2, -1),
|
||||
scale_a,
|
||||
scale_b,
|
||||
std::nullopt, /* global_scale */
|
||||
true /* use_mx */
|
||||
);
|
||||
|
||||
return out;
|
||||
#endif
|
||||
}
|
||||
|
||||
Tensor&
|
||||
@ -1215,20 +1114,17 @@ _scaled_mm_cuda_v2_out(
|
||||
mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ")");
|
||||
}
|
||||
|
||||
// Handle fp4 packed-K dimension
|
||||
int K_multiplier = (mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2) ? 2 : 1;
|
||||
|
||||
TORCH_CHECK_VALUE(!bias || bias->numel() == mat_b.sizes()[1], "Bias must be size ", mat_b.sizes()[1],
|
||||
" but got ", bias->numel());
|
||||
TORCH_CHECK_VALUE(
|
||||
K_multiplier * mat_a.sizes()[1] % 16 == 0,
|
||||
mat_a.sizes()[1] % 16 == 0,
|
||||
"Expected trailing dimension of mat1 to be divisible by 16 ",
|
||||
"but got mat1 shape: (",
|
||||
mat_a.sizes()[0],
|
||||
"x",
|
||||
K_multiplier * mat_a.sizes()[1],
|
||||
mat_a.sizes()[1],
|
||||
").");
|
||||
TORCH_CHECK_VALUE(K_multiplier * mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x",
|
||||
TORCH_CHECK_VALUE(mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x",
|
||||
mat_b.sizes()[1], ") must be divisible by 16");
|
||||
|
||||
// TODO(slayton): Existing checks, not sure if they should really be here.
|
||||
|
||||
@ -160,8 +160,8 @@ struct _cuda_scatter_gather_internal_kernel {
|
||||
auto offsets = offset_calc.get(i);
|
||||
|
||||
int64_t idx_dim = *(index_t*)(index_ptr + offsets[2]);
|
||||
CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
|
||||
&& "scatter gather kernel index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);
|
||||
CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
|
||||
&& "scatter gather kernel index out of bounds");
|
||||
|
||||
f(
|
||||
(scalar_t*)(self_ptr + offsets[0]),
|
||||
@ -406,8 +406,9 @@ struct _cuda_scatter_fill_internal_kernel {
|
||||
auto offsets = offset_calc.get(i);
|
||||
|
||||
int64_t idx_dim = *(index_t*)(index_ptr + offsets[1]);
|
||||
CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
|
||||
&& "index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);
|
||||
CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
|
||||
&& "index out of bounds"
|
||||
);
|
||||
|
||||
f(
|
||||
(scalar_t*)(self_ptr + offsets[0]),
|
||||
|
||||
@ -12,15 +12,14 @@
|
||||
|
||||
namespace at::native {
|
||||
|
||||
#if 0 && AT_USE_JITERATOR()
|
||||
#if AT_USE_JITERATOR()
|
||||
constexpr char tan_name[] = "tan_impl";
|
||||
#endif
|
||||
|
||||
void tan_kernel_cuda(TensorIteratorBase& iter) {
|
||||
auto common_dtype = iter.common_dtype();
|
||||
if (at::isComplexType(common_dtype)) {
|
||||
// Disabled due to accuracy issues
|
||||
#if 0 && AT_USE_JITERATOR()
|
||||
#if AT_USE_JITERATOR()
|
||||
static const auto tan_string = jiterator_stringify(
|
||||
template <typename T> T tan_impl(T a) { return std::tan(a); });
|
||||
AT_DISPATCH_COMPLEX_TYPES_AND(
|
||||
|
||||
@ -12,15 +12,14 @@
|
||||
|
||||
namespace at::native {
|
||||
|
||||
#if 0 && AT_USE_JITERATOR()
|
||||
#if AT_USE_JITERATOR()
|
||||
constexpr char tanh_name[] = "tanh_impl";
|
||||
#endif
|
||||
|
||||
void tanh_kernel_cuda(TensorIteratorBase& iter) {
|
||||
auto common_dtype = iter.common_dtype();
|
||||
if (at::isComplexType(common_dtype)) {
|
||||
// Disabled due to accuracy issues
|
||||
#if 0 && AT_USE_JITERATOR()
|
||||
#if AT_USE_JITERATOR()
|
||||
static const auto tanh_string = jiterator_stringify(
|
||||
template <typename T> T tanh_impl(T a) { return std::tanh(a); });
|
||||
AT_DISPATCH_COMPLEX_TYPES_AND(
|
||||
|
||||
@ -141,8 +141,7 @@ WelfordDataLN cuWelfordOnlineSum(
|
||||
if constexpr (!rms_norm){
|
||||
U delta = val - curr_sum.mean;
|
||||
U new_count = curr_sum.count + 1.f;
|
||||
//Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
|
||||
#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
|
||||
#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
|
||||
U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
|
||||
#else
|
||||
U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
|
||||
@ -164,8 +163,7 @@ WelfordDataLN cuWelfordCombine(
|
||||
U count = dataA.count + dataB.count;
|
||||
U mean, sigma2;
|
||||
if (count > decltype(dataB.count){0}) {
|
||||
//Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
|
||||
#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
|
||||
#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
|
||||
auto coef = __builtin_amdgcn_rcpf(count);
|
||||
#else
|
||||
auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division
|
||||
|
||||
@ -40,37 +40,14 @@ bool check_head_dim_size_xpu(sdp::sdp_params const& params, bool debug) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool input_require_grad(
|
||||
const at::Tensor& query,
|
||||
const at::Tensor& key,
|
||||
const at::Tensor& value,
|
||||
const std::optional<at::Tensor>& attn_mask) {
|
||||
return at::GradMode::is_enabled() &&
|
||||
(query.requires_grad() || key.requires_grad() || value.requires_grad() ||
|
||||
(attn_mask.has_value() && attn_mask.value().requires_grad()));
|
||||
}
|
||||
|
||||
bool check_grad(sdp::sdp_params const& params, bool debug) {
|
||||
if (!input_require_grad(
|
||||
params.query, params.key, params.value, params.attn_mask))
|
||||
return true;
|
||||
|
||||
auto q_num_heads = params.query.sym_size(-3);
|
||||
auto k_num_heads = params.key.sym_size(-3);
|
||||
auto v_num_heads = params.value.sym_size(-3);
|
||||
bool is_gqa = q_num_heads != k_num_heads || q_num_heads != v_num_heads;
|
||||
if (debug && is_gqa)
|
||||
TORCH_WARN(
|
||||
"scale_dot_product_attention with gqa is not supported for gradient computation on xpu.");
|
||||
|
||||
bool attn_mask_needs_grad =
|
||||
params.attn_mask.has_value() && params.attn_mask.value().requires_grad();
|
||||
if (debug && attn_mask_needs_grad) {
|
||||
TORCH_WARN(
|
||||
"scale_dot_product_attention on xpu is not supported when attn_mask.requires_grad() == True.");
|
||||
bool check_no_grad(sdp::sdp_params const& params, bool debug) {
|
||||
const bool any_inputs_require_grad = params.query.requires_grad() ||
|
||||
params.key.requires_grad() || params.value.requires_grad();
|
||||
const bool gradmode_enabled = at::GradMode::is_enabled();
|
||||
if (debug && any_inputs_require_grad && gradmode_enabled) {
|
||||
TORCH_WARN("Backward or grad to be supported.");
|
||||
}
|
||||
|
||||
return !is_gqa && !attn_mask_needs_grad;
|
||||
return !any_inputs_require_grad || !gradmode_enabled;
|
||||
}
|
||||
|
||||
bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) {
|
||||
@ -88,7 +65,7 @@ bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) {
|
||||
sdp::check_nonzero_sequence_lengths_dense,
|
||||
sdp::check_last_dim_stride_equals_1_dense<false /*ignore_singleton_dim*/>,
|
||||
check_head_dim_size_xpu,
|
||||
check_grad);
|
||||
check_no_grad);
|
||||
for (auto& constraint : constraints) {
|
||||
if (!constraint(params, debug)) {
|
||||
return false;
|
||||
@ -248,11 +225,10 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
|
||||
double dropout_p,
|
||||
bool is_causal,
|
||||
bool return_debug_mask,
|
||||
std::optional<double> scale,
|
||||
bool compute_logsumexp) {
|
||||
std::optional<double> scale) {
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
query.dim() == 4 && key.dim() == 4 && value.dim() == 4,
|
||||
"scaled_dot_product_fused_attention_overrideable_xpu: Accept only 4 dims inputs shape of {B, H, T, K}");
|
||||
"scaled_dot_product_fused_attention_overrideable_xpu: Accept only 4 dims inputs shape of {(B), H, T, K}");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
(key.size(0) == value.size(0)) && (key.size(1) == value.size(1)) &&
|
||||
(key.size(2) == value.size(2)),
|
||||
@ -269,9 +245,6 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
!(attn_bias.has_value() && is_causal),
|
||||
"scaled_dot_product_fused_attention_overrideable_xpu: attn_bias cannot present with is_causal");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
!(attn_bias.has_value() && attn_bias.value().requires_grad()),
|
||||
"scaled_dot_product_fused_attention_overrideable_xpu: attn_bias cannot have requires_grad=True");
|
||||
|
||||
const int64_t batch_size = query.size(0);
|
||||
const int64_t num_head_q = query.size(1);
|
||||
@ -281,14 +254,11 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
|
||||
const int64_t seq_len_q = query.size(2);
|
||||
const int64_t seq_len_kv = key.size(2);
|
||||
|
||||
at::Tensor attention;
|
||||
std::vector<int64_t> attention_shape = {
|
||||
at::Tensor output;
|
||||
std::vector<int64_t> output_shape = {
|
||||
batch_size, num_head_q, seq_len_q, head_dim_v};
|
||||
alloc_with_matching_layout(query, attention, attention_shape);
|
||||
|
||||
auto opts = query.options();
|
||||
at::Tensor logsumexp =
|
||||
at::empty({batch_size, num_head_q, seq_len_q}, opts.dtype(at::kFloat));
|
||||
alloc_with_matching_layout(query, output, output_shape);
|
||||
at::Tensor logsumexp, debug_attn_mask; // not supported
|
||||
|
||||
at::native::onednn::sdpa(
|
||||
batch_size,
|
||||
@ -304,15 +274,15 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
|
||||
attn_bias,
|
||||
is_causal,
|
||||
scale.has_value() ? scale.value() : (1.0 / std::sqrt(head_dim_qk)),
|
||||
attention,
|
||||
compute_logsumexp,
|
||||
output,
|
||||
false,
|
||||
logsumexp);
|
||||
|
||||
// rng not used
|
||||
auto philox_seed = at::empty({}, at::dtype(at::kLong));
|
||||
auto philox_offset = at::empty({}, at::dtype(at::kLong));
|
||||
return std::make_tuple(
|
||||
attention,
|
||||
output,
|
||||
logsumexp,
|
||||
/* cum_seq_q */ at::Tensor(),
|
||||
/* cum_seq_k */ at::Tensor(),
|
||||
@ -320,106 +290,7 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
|
||||
seq_len_kv,
|
||||
philox_seed,
|
||||
philox_offset,
|
||||
/*debug_attn_mask */ at::Tensor());
|
||||
}
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
|
||||
_scaled_dot_product_fused_attention_overrideable_backward_xpu(
|
||||
const at::Tensor& grad_out,
|
||||
const at::Tensor& query,
|
||||
const at::Tensor& key,
|
||||
const at::Tensor& value,
|
||||
const at::Tensor& attn_bias,
|
||||
std::array<bool, 4> grad_input_mask,
|
||||
const at::Tensor& out,
|
||||
const at::Tensor& logsumexp,
|
||||
const at::Tensor& cum_seq_q,
|
||||
const at::Tensor& cum_seq_k,
|
||||
int64_t max_q,
|
||||
int64_t max_k,
|
||||
double dropout_p,
|
||||
bool is_causal,
|
||||
const at::Tensor& philox_seed,
|
||||
const at::Tensor& philox_offset,
|
||||
std::optional<double> scale) {
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
grad_out.dim() == 4 && out.dim() == 4 &&
|
||||
grad_out.size(0) == out.size(0) && grad_out.size(1) == out.size(1) &&
|
||||
grad_out.size(2) == out.size(2) && grad_out.size(3) == out.size(3),
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: grad_out and out should have the same shape of {B, H, T, K}");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
query.dim() == 4 && key.dim() == 4 && value.dim() == 4,
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: Accept only 4 dims inputs shape of {B, H, T, K}");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
(key.size(0) == value.size(0)) && (key.size(1) == value.size(1)) &&
|
||||
(key.size(2) == value.size(2)),
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: K/V should have the same batch / seq / num_head");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
query.size(0) == grad_out.size(0) && query.size(1) == grad_out.size(1) &&
|
||||
query.size(2) == grad_out.size(2),
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: Q should have the same batch / num_head / seq_len as grad_out");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
query.size(3) == key.size(3),
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: Q/K should have the same head_dim");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
value.size(3) == grad_out.size(3),
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: V should have the same head_dim as grad_out");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
query.size(1) == key.size(1),
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: number of heads in K/V must equal to number of heads in Q");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
dropout_p == 0.0,
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: Currently do not support dropout > 0");
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
logsumexp.dim() == 3 && logsumexp.size(0) == query.size(0) &&
|
||||
logsumexp.size(1) == query.size(1) &&
|
||||
logsumexp.size(2) == query.size(2) &&
|
||||
"scaled_dot_product_fused_attention_overrideable_backward_xpu: logsumexp should have the shape of {B, H, T}");
|
||||
|
||||
std::optional<Tensor> attn_bias_opt;
|
||||
if (attn_bias.defined()) {
|
||||
attn_bias_opt = attn_bias;
|
||||
}
|
||||
|
||||
const int64_t batch_size = query.size(0);
|
||||
const int64_t num_head_q = query.size(1);
|
||||
const int64_t num_head_kv = key.size(1);
|
||||
const int64_t seq_len_q = query.size(2);
|
||||
const int64_t seq_len_kv = key.size(2);
|
||||
const int64_t head_dim_qk = query.size(3);
|
||||
const int64_t head_dim_v = value.size(3);
|
||||
|
||||
auto grad_q = at::empty_like(query);
|
||||
auto grad_k = at::empty_like(key);
|
||||
auto grad_v = at::empty_like(value);
|
||||
auto grad_attn_bias = attn_bias_opt.has_value()
|
||||
? at::empty_like(attn_bias_opt.value())
|
||||
: at::Tensor();
|
||||
at::native::onednn::sdpa_backward(
|
||||
batch_size,
|
||||
num_head_q,
|
||||
num_head_kv,
|
||||
seq_len_q,
|
||||
seq_len_kv,
|
||||
head_dim_qk,
|
||||
head_dim_v,
|
||||
grad_out,
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
out,
|
||||
logsumexp,
|
||||
attn_bias_opt,
|
||||
is_causal,
|
||||
scale.has_value() ? scale.value() : (1.0 / std::sqrt(query.size(3))),
|
||||
grad_q,
|
||||
grad_k,
|
||||
grad_v);
|
||||
return std::make_tuple(
|
||||
std::move(grad_q),
|
||||
std::move(grad_k),
|
||||
std::move(grad_v),
|
||||
std::move(grad_attn_bias));
|
||||
debug_attn_mask);
|
||||
}
|
||||
|
||||
REGISTER_XPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_xpu);
|
||||
|
||||
@ -86,28 +86,6 @@ struct zeta_functor {
|
||||
}
|
||||
};
|
||||
|
||||
struct logaddexp_functor {
|
||||
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
|
||||
inline T operator()(const T a, const T b) {
|
||||
return c10::metal::logaddexp(a, b);
|
||||
}
|
||||
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
|
||||
inline float operator()(const T a, const T b) {
|
||||
return c10::metal::logaddexp(float(a), float(b));
|
||||
}
|
||||
};
|
||||
|
||||
struct logaddexp2_functor {
|
||||
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
|
||||
inline T operator()(const T a, const T b) {
|
||||
return c10::metal::logaddexp2(a, b);
|
||||
}
|
||||
template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
|
||||
inline float operator()(const T a, const T b) {
|
||||
return c10::metal::logaddexp2(float(a), float(b));
|
||||
}
|
||||
};
|
||||
|
||||
struct xlog1py_functor {
|
||||
template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
|
||||
inline T operator()(const T a, const T b) {
|
||||
@ -399,10 +377,6 @@ REGISTER_FLOAT_BINARY_OP(fmin);
|
||||
REGISTER_FLOAT_BINARY_OP(nextafter);
|
||||
REGISTER_FLOAT_BINARY_OP(zeta);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(zeta);
|
||||
REGISTER_FLOAT_BINARY_OP(logaddexp);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(logaddexp);
|
||||
REGISTER_FLOAT_BINARY_OP(logaddexp2);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(logaddexp2);
|
||||
REGISTER_FLOAT_BINARY_OP(xlog1py);
|
||||
REGISTER_INT2FLOAT_BINARY_OP(xlog1py);
|
||||
REGISTER_FLOAT_BINARY_OP(chebyshev_polynomial_t);
|
||||
@ -489,8 +463,6 @@ REGISTER_BINARY_OP(add, float2, float2);
|
||||
REGISTER_BINARY_OP(add, half2, half2);
|
||||
REGISTER_BINARY_OP(sub, float2, float2);
|
||||
REGISTER_BINARY_OP(sub, half2, half2);
|
||||
REGISTER_BINARY_OP(logaddexp, float2, float2);
|
||||
REGISTER_BINARY_OP(logaddexp, half2, half2);
|
||||
REGISTER_BINARY_ALPHA_OP(add_alpha, float2, float2, float2);
|
||||
REGISTER_BINARY_ALPHA_OP(add_alpha, half2, half2, half2);
|
||||
REGISTER_BINARY_ALPHA_OP(sub_alpha, float2, float2, float2);
|
||||
|
||||
@ -89,14 +89,6 @@ static void zeta_mps_kernel(TensorIteratorBase& iter) {
|
||||
lib.exec_binary_kernel(iter, "zeta");
|
||||
}
|
||||
|
||||
static void logaddexp_mps_kernel(TensorIteratorBase& iter) {
|
||||
lib.exec_binary_kernel(iter, "logaddexp");
|
||||
}
|
||||
|
||||
static void logaddexp2_mps_kernel(TensorIteratorBase& iter) {
|
||||
lib.exec_binary_kernel(iter, "logaddexp2");
|
||||
}
|
||||
|
||||
static void xlog1py_mps_kernel(TensorIteratorBase& iter) {
|
||||
TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "xlog1py_mps not implemented for non-floating types");
|
||||
lib.exec_binary_kernel(iter, "xlog1py");
|
||||
@ -219,8 +211,6 @@ REGISTER_DISPATCH(fmin_stub, &fmin_mps_kernel)
|
||||
REGISTER_DISPATCH(copysign_stub, ©sign_mps_kernel)
|
||||
REGISTER_DISPATCH(nextafter_stub, &nextafter_mps_kernel)
|
||||
REGISTER_DISPATCH(zeta_stub, &zeta_mps_kernel)
|
||||
REGISTER_DISPATCH(logaddexp_stub, &logaddexp_mps_kernel);
|
||||
REGISTER_DISPATCH(logaddexp2_stub, &logaddexp2_mps_kernel);
|
||||
REGISTER_DISPATCH(xlog1py_stub, &xlog1py_mps_kernel)
|
||||
REGISTER_DISPATCH(chebyshev_polynomial_t_stub, &chebyshev_polynomial_t_mps_kernel)
|
||||
REGISTER_DISPATCH(chebyshev_polynomial_u_stub, &chebyshev_polynomial_u_mps_kernel)
|
||||
|
||||
@ -17,6 +17,8 @@
|
||||
#include <ATen/ops/ge_native.h>
|
||||
#include <ATen/ops/gt_native.h>
|
||||
#include <ATen/ops/le_native.h>
|
||||
#include <ATen/ops/logaddexp2_native.h>
|
||||
#include <ATen/ops/logaddexp_native.h>
|
||||
#include <ATen/ops/logical_and_native.h>
|
||||
#include <ATen/ops/logical_or_native.h>
|
||||
#include <ATen/ops/logical_xor_native.h>
|
||||
@ -275,6 +277,30 @@ TORCH_IMPL_FUNC(pow_Scalar_out_mps)(const Scalar& base, const Tensor& exp, const
|
||||
}
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(logaddexp_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
|
||||
mps::BinaryOpBlock logaddexp_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
|
||||
MPSGraph* mpsGraph = cachedGraph->graph();
|
||||
MPSGraphTensor* sumTensor =
|
||||
[mpsGraph additionWithPrimaryTensor:[mpsGraph exponentWithTensor:primaryCastTensor name:nil]
|
||||
secondaryTensor:[mpsGraph exponentWithTensor:secondaryCastTensor name:nil]
|
||||
name:nil];
|
||||
return [mpsGraph logarithmWithTensor:sumTensor name:nil];
|
||||
};
|
||||
mps::binaryOpTensor(self, other, output, "logaddexp_out_mps", logaddexp_op_block);
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(logaddexp2_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
|
||||
mps::BinaryOpBlock logaddexp2_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
|
||||
MPSGraph* mpsGraph = cachedGraph->graph();
|
||||
MPSGraphTensor* sumTensor =
|
||||
[mpsGraph additionWithPrimaryTensor:[mpsGraph exponentBase2WithTensor:primaryCastTensor name:nil]
|
||||
secondaryTensor:[mpsGraph exponentBase2WithTensor:secondaryCastTensor name:nil]
|
||||
name:nil];
|
||||
return [mpsGraph logarithmBase2WithTensor:sumTensor name:nil];
|
||||
};
|
||||
mps::binaryOpTensor(self, other, output, "logaddexp2_out_mps", logaddexp2_op_block);
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(xlogy_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
|
||||
mps::BinaryOpBlock xlogy_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
|
||||
MPSGraph* mpsGraph = cachedGraph->graph();
|
||||
|
||||
@ -57,7 +57,6 @@ Tensor& random_mps_impl(Tensor& self,
|
||||
if (self.numel() == 0) {
|
||||
return self;
|
||||
}
|
||||
at::assert_no_internal_overlap(self);
|
||||
// MPS random is broken for 5D+ tensors, see https://github.com/pytorch/pytorch/issues/147624
|
||||
const auto need_reshape = self.ndimension() > 4;
|
||||
auto mps_gen = get_generator_or_default<MPSGeneratorImpl>(gen, at::mps::detail::getDefaultMPSGenerator());
|
||||
@ -154,16 +153,8 @@ Tensor& random_mps_impl(Tensor& self,
|
||||
feeds[meanPlaceholder.getMPSGraphTensor()] = meanPlaceholder.getMPSGraphTensorData();
|
||||
}
|
||||
|
||||
// Handle non-contiguous output tensors by creating a contiguous temporary
|
||||
const auto needs_gather = needsGather(self);
|
||||
Tensor self_ = needs_gather ? at::empty_like(self, MemoryFormat::Contiguous) : self;
|
||||
Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self_);
|
||||
Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self);
|
||||
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
|
||||
|
||||
// Copy results back to original non-contiguous output
|
||||
if (needs_gather) {
|
||||
self.copy_(self_);
|
||||
}
|
||||
}
|
||||
|
||||
return self;
|
||||
|
||||
@ -617,9 +617,6 @@ Tensor& index_select_out_mps(const Tensor& self, int64_t dim, const Tensor& inde
|
||||
TORCH_CHECK(self.scalar_type() == output.scalar_type(),
|
||||
"index_select(): self and output must have the same scalar type");
|
||||
TORCH_CHECK(dim == 0 || dim < self.dim(), "index_select(): Indexing dim ", dim, " is out of bounds of tensor");
|
||||
at::assert_no_internal_overlap(output);
|
||||
at::assert_no_overlap(output, self);
|
||||
at::assert_no_overlap(output, index);
|
||||
auto output_size = self.sizes().vec();
|
||||
if (self.dim() > 0) {
|
||||
output_size[dim] = num_indices;
|
||||
|
||||
@ -1028,18 +1028,15 @@ TORCH_IMPL_FUNC(prod_out_mps)
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(amax_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
|
||||
TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amax is not defined for complex types");
|
||||
reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMAX, "amax_out_mps");
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(amin_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
|
||||
TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amin is not defined for complex types");
|
||||
reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMIN, "amin_out_mps");
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(aminmax_out_mps)
|
||||
(const Tensor& input_t, std::optional<int64_t> dim_opt, bool keepdim, const Tensor& min_t, const Tensor& max_t) {
|
||||
TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "aminmax is not defined for complex types");
|
||||
reduction_out_mps(input_t,
|
||||
dim_opt.has_value() ? OptionalIntArrayRef({*dim_opt}) : std::nullopt,
|
||||
keepdim,
|
||||
|
||||
@ -31,7 +31,6 @@ void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& v
|
||||
indices.copy_(values.toType(at::ScalarType::Long));
|
||||
return;
|
||||
}
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()), "kthvalue is not implemented for complex types");
|
||||
// issue #154890, raising error to prevent crash within MPSGraph until
|
||||
// workaround is implemented.
|
||||
TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890");
|
||||
|
||||
@ -3622,7 +3622,8 @@
|
||||
structured: True
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: logaddexp_out
|
||||
CPU, CUDA: logaddexp_out
|
||||
MPS: logaddexp_out_mps
|
||||
tags: pointwise
|
||||
|
||||
- func: logaddexp(Tensor self, Tensor other) -> Tensor
|
||||
@ -3634,7 +3635,8 @@
|
||||
structured: True
|
||||
structured_inherits: TensorIteratorBase
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: logaddexp2_out
|
||||
CPU, CUDA: logaddexp2_out
|
||||
MPS: logaddexp2_out_mps
|
||||
tags: pointwise
|
||||
|
||||
- func: logaddexp2(Tensor self, Tensor other) -> Tensor
|
||||
@ -15095,7 +15097,7 @@
|
||||
CPU: _scaled_dot_product_flash_attention_cpu
|
||||
tags: nondeterministic_seeded
|
||||
|
||||
- func: _scaled_dot_product_fused_attention_overrideable(Tensor query, Tensor key, Tensor value, Tensor? attn_bias=None, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None, bool compute_log_sumexp=True) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
|
||||
- func: _scaled_dot_product_fused_attention_overrideable(Tensor query, Tensor key, Tensor value, Tensor? attn_bias=None, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: _scaled_dot_product_fused_attention_overrideable
|
||||
XPU: _scaled_dot_product_fused_attention_overrideable_xpu
|
||||
@ -15119,7 +15121,6 @@
|
||||
variants: function
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: _scaled_dot_product_fused_attention_overrideable_backward
|
||||
XPU: _scaled_dot_product_fused_attention_overrideable_backward_xpu
|
||||
|
||||
- func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
|
||||
dispatch:
|
||||
|
||||
@ -73,7 +73,8 @@ void upsample_bilinear2d_out_frame(
|
||||
const auto rwidth = area_pixel_compute_scale<float>(
|
||||
input_width, output_width, align_corners, scales_w);
|
||||
|
||||
float output_scale = static_cast<float>(output.q_scale() / input.q_scale());
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
|
||||
float output_scale = output.q_scale() / input.q_scale();
|
||||
|
||||
const int64_t input_q_zero_point = input.q_zero_point();
|
||||
const int64_t output_q_zero_point = output.q_zero_point();
|
||||
|
||||
@ -148,7 +148,7 @@ Tensor qcat_nhwc_kernel(
|
||||
// Vectorized loop
|
||||
if (c + VLEN <= curr_C) {
|
||||
auto curr_scale_vec = Vectorized<float>(curr_scale);
|
||||
auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
|
||||
auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
|
||||
auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
|
||||
for (; c + VLEN <= curr_C; c += VLEN) {
|
||||
auto inp_vec = Vec::loadu(iptr + c);
|
||||
@ -174,7 +174,7 @@ Tensor qcat_nhwc_kernel(
|
||||
int64_t elem_size = curr_C - c;
|
||||
if ((VLEN == 4 * kVLEN) && elem_size >= kVLEN) {
|
||||
auto curr_scale_vec = Vectorized<float>(curr_scale);
|
||||
auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
|
||||
auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
|
||||
auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
|
||||
int64_t vec_num = elem_size / kVLEN;
|
||||
std::array<typename scalar_t::underlying, VLEN> buf_in{};
|
||||
@ -611,10 +611,12 @@ void qrelu_kernel(const Tensor& qx, Tensor& qy) {
|
||||
void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
|
||||
const Scalar& negval_) {
|
||||
int64_t i_zp = qx.q_zero_point();
|
||||
float i_scale = static_cast<float>(qx.q_scale());
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float i_scale = qx.q_scale();
|
||||
|
||||
int64_t o_zp = out.q_zero_point();
|
||||
float o_scale = static_cast<float>(out.q_scale());
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float o_scale = out.q_scale();
|
||||
float o_inv_scale = 1.0f / o_scale;
|
||||
|
||||
float negval = negval_.to<float>();
|
||||
@ -625,8 +627,8 @@ void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
|
||||
Vec zero_vec = Vec(0.0f);
|
||||
Vec one_vec = Vec(1.0f);
|
||||
|
||||
Vec i_scale_vec = Vec(i_scale);
|
||||
Vec i_zp_vec = Vec(i_zp);
|
||||
Vec i_scale_vec = Vec((float)i_scale);
|
||||
Vec i_zp_vec = Vec((float)i_zp);
|
||||
Vec i_scale_zp_neg_premul_vec = i_scale_vec * i_zp_vec.neg();
|
||||
|
||||
Vec negval_vec = Vec(negval);
|
||||
@ -736,9 +738,10 @@ void qprelu_out_kernel(Tensor& out,
|
||||
|
||||
void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
|
||||
int64_t zero_point = qx.q_zero_point();
|
||||
float scale = static_cast<float>(qx.q_scale());
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = qx.q_scale();
|
||||
auto scale_vec = Vectorized<float>(scale);
|
||||
auto zero_point_vec = Vectorized<float>(zero_point);
|
||||
auto zero_point_vec = Vectorized<float>((float)zero_point);
|
||||
auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
|
||||
int64_t output_zero_point = zero_point;
|
||||
float output_scale = scale;
|
||||
@ -825,9 +828,10 @@ void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
|
||||
void qsigmoid_kernel(
|
||||
const Tensor& qx, Tensor& qy, double output_scale, int64_t output_zero_point ) {
|
||||
int64_t zero_point = qx.q_zero_point();
|
||||
float scale = static_cast<float>(qx.q_scale());
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = qx.q_scale();
|
||||
auto scale_vec = Vectorized<float>(scale);
|
||||
auto zero_point_vec = Vectorized<float>(zero_point);
|
||||
auto zero_point_vec = Vectorized<float>((float)zero_point);
|
||||
|
||||
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qsigmoid", [&]() {
|
||||
float inv_output_scale = 1.0 / output_scale;
|
||||
@ -866,9 +870,10 @@ void qsigmoid_kernel(
|
||||
|
||||
void qhardsigmoid_kernel(const Tensor& qx, Tensor& qy) {
|
||||
int64_t zero_point = qx.q_zero_point();
|
||||
float scale = static_cast<float>(qx.q_scale());
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = qx.q_scale();
|
||||
auto scale_vec = Vectorized<float>(scale);
|
||||
auto zero_point_vec = Vectorized<float>(zero_point);
|
||||
auto zero_point_vec = Vectorized<float>((float)zero_point);
|
||||
auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
|
||||
|
||||
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qhardsigmoid", [&]() {
|
||||
@ -1024,10 +1029,13 @@ void qthreshold_kernel(
|
||||
|
||||
// defines input and output scales and zero_points
|
||||
int64_t input_zero_point = qx.q_zero_point();
|
||||
float input_scale = static_cast<float>(qx.q_scale());
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float input_scale = qx.q_scale();
|
||||
int64_t output_zero_point = qy.q_zero_point();
|
||||
float output_scale = static_cast<float>(qy.q_scale());
|
||||
float inv_output_scale = static_cast<float>(1.0 / output_scale);
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float output_scale = qy.q_scale();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float inv_output_scale = 1.0 / output_scale;
|
||||
|
||||
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qthreshold", [&]() {
|
||||
qy = at::_empty_affine_quantized(
|
||||
@ -1088,7 +1096,8 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {
|
||||
|
||||
const auto o_scale = qy.q_scale();
|
||||
const auto o_zero_point = qy.q_zero_point();
|
||||
const float o_inv_scale = static_cast<float>(1.0 / o_scale);
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
const float o_inv_scale = 1.0 / o_scale;
|
||||
|
||||
using fVec = Vectorized<float>;
|
||||
fVec i_scale_vec(i_scale);
|
||||
@ -1126,9 +1135,10 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {
|
||||
|
||||
void qtanh_kernel(const Tensor& qx, Tensor& qy) {
|
||||
int64_t zero_point = qx.q_zero_point();
|
||||
float scale = static_cast<float>(qx.q_scale());
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = qx.q_scale();
|
||||
auto scale_vec = Vectorized<float>(scale);
|
||||
auto zero_point_vec = Vectorized<float>(zero_point);
|
||||
auto zero_point_vec = Vectorized<float>((float)zero_point);
|
||||
auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
|
||||
|
||||
AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qtanh", [&]() {
|
||||
@ -1188,13 +1198,16 @@ void qelu_kernel(
|
||||
// they are NOT related to the quantization scale term
|
||||
|
||||
int64_t i_zp = qx.q_zero_point();
|
||||
float i_scale = static_cast<float>(qx.q_scale());
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float i_scale = qx.q_scale();
|
||||
|
||||
// In a future PR, we can improve on output scale and zero_point
|
||||
// selection.
|
||||
int64_t o_zp = qy.q_zero_point();
|
||||
float o_scale = static_cast<float>(qy.q_scale());
|
||||
float inv_o_scale = static_cast<float>(1.0 / o_scale);
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float o_scale = qy.q_scale();
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float inv_o_scale = 1.0 / o_scale;
|
||||
|
||||
float alpha_float = alpha.to<float>();
|
||||
float scale_coef = scale.to<float>();
|
||||
@ -1214,7 +1227,7 @@ void qelu_kernel(
|
||||
Vec scale_coef_vec = Vec(scale_coef);
|
||||
Vec input_scale_coef_vec = Vec(input_scale_coef);
|
||||
Vec i_scale_vec = Vec(i_scale);
|
||||
Vec i_zero_point_vec = Vec(i_zp);
|
||||
Vec i_zero_point_vec = Vec((float)i_zp);
|
||||
Vec i_scale_neg_zp_premul_vec = i_scale_vec * i_zero_point_vec.neg();
|
||||
|
||||
cpu_kernel_vec(
|
||||
@ -1313,20 +1326,23 @@ void qadd_scalar_kernel(Tensor& out, const Tensor& self, const Scalar& other) {
|
||||
template <bool ReLUFused = false>
|
||||
void qadd_kernel(Tensor& out, const Tensor& self, const Tensor& other) {
|
||||
int64_t zero_point = out.q_zero_point();
|
||||
float scale = static_cast<float>(out.q_scale());
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float scale = out.q_scale();
|
||||
float inv_scale = 1.0f / scale;
|
||||
int64_t self_zero_point = self.q_zero_point();
|
||||
float self_scale = static_cast<float>(self.q_scale());
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float self_scale = self.q_scale();
|
||||
int64_t other_zero_point = other.q_zero_point();
|
||||
float other_scale = static_cast<float>(other.q_scale());
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
float other_scale = other.q_scale();
|
||||
|
||||
// Broadcast out the parameters here to amortize out that cost across
|
||||
// loop iterations.
|
||||
// TODO: we can optimize dequantization by doing a premultiplication
|
||||
// of the zero point by scale and doing FMA on scale*x_q - (scale*zero_point)
|
||||
auto self_zero_point_vec = Vectorized<float>(self_zero_point);
|
||||
auto self_zero_point_vec = Vectorized<float>((float)self_zero_point);
|
||||
auto self_scale_vec = Vectorized<float>(self_scale);
|
||||
auto other_zero_point_vec = Vectorized<float>(other_zero_point);
|
||||
auto other_zero_point_vec = Vectorized<float>((float)other_zero_point);
|
||||
auto other_scale_vec = Vectorized<float>(other_scale);
|
||||
|
||||
auto self_scale_neg_zp_premul_vec = self_scale_vec * self_zero_point_vec.neg();
|
||||
@ -2949,7 +2965,7 @@ void quantized_normalize_kernel(
|
||||
const bool beta_null = beta_data == nullptr;
|
||||
int64_t x_zp = X.q_zero_point();
|
||||
float x_scale = X.q_scale();
|
||||
fVec x_zp_vec(x_zp);
|
||||
fVec x_zp_vec((float)x_zp);
|
||||
fVec one_vec(1.0f);
|
||||
fVec zero_vec(0.0f);
|
||||
float x_fake_scale = 1.0f;
|
||||
@ -3237,7 +3253,7 @@ void quantized_groupnorm_nhwc_kernel(
|
||||
const bool beta_null = beta_data == nullptr;
|
||||
int64_t x_zp = X.q_zero_point();
|
||||
float x_scale = X.q_scale();
|
||||
fVec x_zp_vec(x_zp);
|
||||
fVec x_zp_vec((float)x_zp);
|
||||
fVec one_vec(1.0f);
|
||||
fVec zero_vec(0.0f);
|
||||
float x_fake_scale = 1.0f;
|
||||
|
||||
@ -414,6 +414,7 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_impl(
|
||||
TORCH_CHECK(input.size(input.dim() - 1) == packed_weight_fp16.numRows())
|
||||
TORCH_CHECK(input.dim() >= 2);
|
||||
|
||||
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
|
||||
const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
|
||||
const int64_t N = packed_weight_fp16.numCols();
|
||||
std::vector<int64_t> output_sizes = input.sizes().vec();
|
||||
|
||||
@ -467,28 +467,6 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, IntArrayRe
|
||||
!options.has_layout() || options.layout() == kSparse,
|
||||
"expected sparse layout, but got layout ",
|
||||
options.layout());
|
||||
|
||||
if (indices.numel() > 0) {
|
||||
Tensor min_indices =
|
||||
std::get</* values */ 0>(indices.min(/* dim */ 1, /* keepdim */ false));
|
||||
Tensor cpu_min_indices;
|
||||
if (!indices.is_cpu()) {
|
||||
cpu_min_indices = min_indices.to(at::DeviceType::CPU);
|
||||
} else {
|
||||
cpu_min_indices = min_indices;
|
||||
}
|
||||
auto cpu_min_indices_accessor = cpu_min_indices.accessor<int64_t, 1>();
|
||||
for (const auto d : c10::irange(indices.size(0))) {
|
||||
int64_t min_index_in_dim = cpu_min_indices_accessor[d];
|
||||
TORCH_CHECK(
|
||||
min_index_in_dim >= 0,
|
||||
"found negative index ",
|
||||
min_index_in_dim,
|
||||
" for dim ",
|
||||
d);
|
||||
}
|
||||
}
|
||||
|
||||
return at::native::_sparse_coo_tensor_unsafe(
|
||||
indices,
|
||||
values,
|
||||
|
||||
@ -768,11 +768,8 @@ Tensor scaled_dot_product_attention(
|
||||
return std::get<0>(out_and_lse);
|
||||
}
|
||||
case SDPBackend::overrideable: {
|
||||
bool compute_logsumexp = should_compute_logsumexp(query_, key, value);
|
||||
compute_logsumexp = compute_logsumexp ||
|
||||
(at::GradMode::is_enabled() && attn_mask.has_value() && attn_mask.value().requires_grad());
|
||||
auto out_lse_softmax = at::_scaled_dot_product_fused_attention_overrideable(
|
||||
query_, key, value, attn_mask, dropout_p, is_causal, false /*return_debug_mask*/, scale, compute_logsumexp);
|
||||
query_, key, value, attn_mask, dropout_p, is_causal, false /*return_debug_mask*/, scale);
|
||||
return std::get<0>(out_lse_softmax);
|
||||
}
|
||||
case SDPBackend::math: {
|
||||
@ -1018,8 +1015,7 @@ _scaled_dot_product_fused_attention_overrideable(
|
||||
double dropout_p,
|
||||
bool is_causal,
|
||||
bool return_debug_mask,
|
||||
std::optional<double> scale,
|
||||
bool compute_logsumexp) {
|
||||
std::optional<double> scale) {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "_scaled_dot_product_fused_attention_overrideable not implemented. This is an operator for privateuse1 backends, please use TORCH_LIBRARY_IMPL to override this function ");
|
||||
}
|
||||
|
||||
|
||||
@ -22,7 +22,6 @@
|
||||
#else
|
||||
#include <ATen/ops/empty.h>
|
||||
#include <ATen/ops/empty_like.h>
|
||||
#include <ATen/ops/zeros_like.h>
|
||||
#include <ATen/ops/reshape.h>
|
||||
#include <ATen/ops/scalar_tensor.h>
|
||||
#include <ATen/ops/sum.h>
|
||||
@ -43,6 +42,7 @@ C10_DIAGNOSTIC_POP()
|
||||
#include <static_switch.h>
|
||||
#include <ATen/native/transformers/cuda/flash_attn/flash_api.h>
|
||||
|
||||
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
namespace FLASH_NAMESPACE {
|
||||
@ -417,26 +417,6 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
|
||||
const int head_size_og = sizes[3];
|
||||
const int seqlen_k = k.size(1);
|
||||
const int num_heads_k = k.size(2);
|
||||
|
||||
if (batch_size == 0) {
|
||||
auto opts = q.options();
|
||||
at::Tensor out = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
|
||||
at::Tensor q_padded = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
|
||||
at::Tensor k_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
|
||||
at::Tensor v_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
|
||||
at::Tensor softmax_lse = at::empty({0, num_heads, seqlen_q}, opts.dtype(at::kFloat));
|
||||
at::Tensor rng_state = at::empty({2}, at::dtype(c10::kUInt64).device(at::kCUDA));
|
||||
at::Tensor _unused = at::empty({}, at::dtype(c10::kUInt64).device(at::kCUDA));
|
||||
at::Tensor p = at::empty({0}, opts);
|
||||
if (return_softmax) {
|
||||
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
|
||||
const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
|
||||
const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
|
||||
p = at::empty({0, num_heads, seqlen_q_rounded, seqlen_k_rounded}, opts);
|
||||
}
|
||||
return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), _unused, std::move(p)};
|
||||
}
|
||||
|
||||
TORCH_CHECK(batch_size > 0, "batch size must be positive");
|
||||
TORCH_CHECK(head_size_og % 8 == 0, "head_size must be a multiple of 8, this is ensured by padding!");
|
||||
TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
|
||||
@ -567,7 +547,7 @@ mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head
|
||||
q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og});
|
||||
softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
|
||||
}
|
||||
return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), std::move(_unused), std::move(p)};
|
||||
return {out, q_padded, k_padded, v_padded, softmax_lse, rng_state, _unused, p};
|
||||
}
|
||||
|
||||
std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
|
||||
@ -872,6 +852,7 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
|
||||
TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
|
||||
TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
|
||||
TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension");
|
||||
TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
|
||||
|
||||
const auto sizes = q.sizes();
|
||||
|
||||
@ -882,20 +863,6 @@ mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads, x head_si
|
||||
const int head_size = sizes[3];
|
||||
const int seqlen_k = k.size(1);
|
||||
const int num_heads_k = k.size(2);
|
||||
|
||||
if (batch_size == 0) {
|
||||
auto opts = q.options();
|
||||
at::Tensor dq = at::empty_like(q);
|
||||
at::Tensor dk = at::empty_like(k);
|
||||
at::Tensor dv = at::empty_like(v);
|
||||
auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
|
||||
const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
|
||||
at::Tensor softmax_d = at::empty({0, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat));
|
||||
return {dq, dk, dv, softmax_d};
|
||||
}
|
||||
|
||||
TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
|
||||
|
||||
TORCH_CHECK(batch_size > 0, "batch size must be positive");
|
||||
TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8");
|
||||
TORCH_CHECK(head_size_og % 8 == 0, "head_size_og should be a multiple of 8, this is ensured by padding!");
|
||||
|
||||
@ -1837,10 +1837,6 @@ class BenchmarkRunner:
|
||||
def skip_models_for_cuda(self):
|
||||
return set()
|
||||
|
||||
@property
|
||||
def skip_models_for_xpu(self):
|
||||
return set()
|
||||
|
||||
@property
|
||||
def skip_models_for_cpu(self):
|
||||
return set()
|
||||
@ -3931,8 +3927,6 @@ def run(runner, args, original_dir=None):
|
||||
runner.skip_models.update(runner.skip_models_for_cpu_aarch64)
|
||||
elif args.devices == ["cuda"]:
|
||||
runner.skip_models.update(runner.skip_models_for_cuda)
|
||||
elif args.devices == ["xpu"]:
|
||||
runner.skip_models.update(runner.skip_models_for_xpu)
|
||||
|
||||
if not args.multiprocess:
|
||||
runner.skip_models.update(runner.skip_multiprocess_models)
|
||||
|
||||
@ -124,10 +124,6 @@ class TorchBenchmarkRunner(BenchmarkRunner):
|
||||
def skip_models_for_cuda(self):
|
||||
return self._skip["device"]["cuda"]
|
||||
|
||||
@property
|
||||
def skip_models_for_xpu(self):
|
||||
return self._skip["device"]["xpu"]
|
||||
|
||||
@property
|
||||
def skip_models_for_freezing_cuda(self):
|
||||
return self._skip["freezing"]["cuda"]
|
||||
|
||||
@ -217,9 +217,6 @@ skip:
|
||||
|
||||
cuda: []
|
||||
|
||||
xpu:
|
||||
- *DETECTRON2_MODELS
|
||||
|
||||
test:
|
||||
training:
|
||||
- *DETECTRON2_MODELS
|
||||
|
||||
@ -1,157 +0,0 @@
|
||||
"""Configuration utilities for parsing JSON and YAML config files."""
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
def heads_input_type(s: str) -> tuple[int, int]:
|
||||
"""Convert string format 'Hq,Hkv' to tuple (Hq, Hkv)."""
|
||||
try:
|
||||
hq, hkv = map(int, s.split(","))
|
||||
return hq, hkv
|
||||
except Exception as e:
|
||||
raise ValueError("Heads must be Hq,Hkv") from e
|
||||
|
||||
|
||||
default_config = {
|
||||
"dynamic": False,
|
||||
"calculate_bwd": False,
|
||||
"dtype": "bfloat16",
|
||||
"b": [2, 8, 16],
|
||||
"nh": ["16,16", "16,2"],
|
||||
"s": [512, 1024, 4096],
|
||||
"d": [64, 128],
|
||||
"mods": ["noop", "causal", "alibi", "sliding_window"],
|
||||
"backend": ["efficient"],
|
||||
"max_autotune": False,
|
||||
"decoding": False,
|
||||
"kv_size": None,
|
||||
"throughput": True,
|
||||
"save_path": None,
|
||||
"output_json_for_dashboard": None,
|
||||
"benchmark_name": "PyTorch operator microbenchmark",
|
||||
}
|
||||
|
||||
|
||||
def load_config_file(config_path: str) -> dict:
|
||||
"""Load configuration from JSON or YAML file.
|
||||
|
||||
Automatically converts 'nh' field from strings to tuples.
|
||||
|
||||
Args:
|
||||
config_path: Path to the configuration file
|
||||
|
||||
Returns:
|
||||
Dictionary containing the configuration
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If config file doesn't exist
|
||||
ValueError: If config file format is invalid
|
||||
"""
|
||||
with open(config_path) as f:
|
||||
config_str = f.read()
|
||||
|
||||
# Try to load as JSON first
|
||||
try:
|
||||
config = json.loads(config_str)
|
||||
except json.JSONDecodeError:
|
||||
# Fall back to YAML parsing
|
||||
config = _parse_simple_yaml(config_str)
|
||||
|
||||
# Apply automatic conversions for 'nh' field
|
||||
if "nh" in config and isinstance(config["nh"], list):
|
||||
config["nh"] = [
|
||||
heads_input_type(h) if isinstance(h, str) else h for h in config["nh"]
|
||||
]
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _parse_simple_yaml(yaml_str: str) -> dict:
|
||||
"""Simple YAML parser for basic configs (without external dependencies).
|
||||
|
||||
Supports:
|
||||
- key: value pairs
|
||||
- booleans (true/false)
|
||||
- null values
|
||||
- integers and floats
|
||||
- strings (quoted and unquoted)
|
||||
- lists in JSON format [item1, item2, ...]
|
||||
- comments (lines starting with # or after #)
|
||||
|
||||
Args:
|
||||
yaml_str: YAML content as string
|
||||
|
||||
Returns:
|
||||
Dictionary containing parsed YAML content
|
||||
"""
|
||||
config = {}
|
||||
|
||||
for line in yaml_str.split("\n"):
|
||||
# Remove comments
|
||||
line = line.split("#")[0].strip()
|
||||
|
||||
if not line or ":" not in line:
|
||||
continue
|
||||
|
||||
key, value = line.split(":", 1)
|
||||
key = key.strip()
|
||||
value = value.strip()
|
||||
|
||||
# Parse value based on type
|
||||
if value.lower() == "true":
|
||||
config[key] = True
|
||||
elif value.lower() == "false":
|
||||
config[key] = False
|
||||
elif value.lower() in ("null", "none", ""):
|
||||
config[key] = None
|
||||
elif value.startswith("[") and value.endswith("]"):
|
||||
# Parse list - handle quoted strings properly
|
||||
pattern = r'"([^"]+)"|\'([^\']+)\'|([^,\[\]\s]+)'
|
||||
matches = re.findall(pattern, value[1:-1]) # Remove [ ]
|
||||
parsed_items = []
|
||||
for match in matches:
|
||||
# match is a tuple of (double_quoted, single_quoted, unquoted)
|
||||
item = match[0] or match[1] or match[2]
|
||||
item = item.strip()
|
||||
if item:
|
||||
try:
|
||||
parsed_items.append(int(item))
|
||||
except ValueError:
|
||||
parsed_items.append(item)
|
||||
config[key] = parsed_items
|
||||
elif value.startswith(('"', "'")):
|
||||
config[key] = value.strip("\"'")
|
||||
else:
|
||||
# Try to parse as number
|
||||
try:
|
||||
config[key] = int(value)
|
||||
except ValueError:
|
||||
try:
|
||||
config[key] = float(value)
|
||||
except ValueError:
|
||||
config[key] = value
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def print_default_config(output_format: str) -> None:
    """Print the ``default_config`` template to stdout.

    Args:
        output_format: "json" prints indented JSON; any other value prints
            one ``key: value`` line per entry in the simple YAML form.
    """
    if output_format == "json":
        print(json.dumps(default_config, indent=2))
        return

    def render(value):
        # bool must be tested before the generic fallback so that
        # True/False become lowercase YAML booleans.
        if value is None:
            return "null"
        if isinstance(value, bool):
            return str(value).lower()
        if isinstance(value, str):
            return f'"{value}"'
        if isinstance(value, list):
            return json.dumps(value)
        return f"{value}"

    for key, value in default_config.items():
        print(f"{key}: {render(value)}")
|
||||
@ -1,29 +0,0 @@
|
||||
# Basic benchmark configuration for PyTorch transformer benchmarks
|
||||
# Usage: python score_mod.py --config config_basic.yaml
|
||||
|
||||
# Core parameters
|
||||
dynamic: false
|
||||
calculate_bwd: true
|
||||
dtype: "bfloat16"
|
||||
|
||||
# Shape parameters - larger sweep
|
||||
b: [1, 2, 4, 8, 16] # batch sizes
|
||||
nh: ["16,16", "16,2", "32,32", "32,4"] # [query_heads,key_value_heads]
|
||||
s: [512, 1024, 2048, 4096, 8192] # sequence lengths
|
||||
d: [64, 128] # head dimensions (limited to 128 for Flash Attention/cuDNN compatibility)
|
||||
|
||||
# Attention types to sweep (all supported types except document_mask)
|
||||
mods: ["noop", "causal", "rel", "head_bias", "alibi", "sliding_window", "prefix_lm", "softcap"]
|
||||
|
||||
# Multiple backends for comparison (SDPA + Flash Attention) - flex is always included internally
|
||||
backend: ["efficient", "math", "cudnn", "fav2"]
|
||||
max_autotune: true # Enable torch.compile with max-autotune for optimal performance
|
||||
|
||||
# Decoding and cache settings
|
||||
decoding: false
|
||||
kv_size: null
|
||||
|
||||
# Metrics and output
|
||||
throughput: true # Calculate memory bandwidth & TFLOPS
|
||||
save_path: "comprehensive_results.csv" # Save to CSV
|
||||
output_json_for_dashboard: "attn_bench_basic.json"
|
||||
@ -1,19 +1,15 @@
|
||||
import argparse
|
||||
import csv
|
||||
import gc
|
||||
import itertools
|
||||
import json
|
||||
import random
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from collections.abc import Callable
|
||||
from contextlib import nullcontext
|
||||
from dataclasses import asdict, dataclass
|
||||
from functools import partial, wraps
|
||||
from typing import Literal, Optional, Union
|
||||
from functools import partial
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
from config_utils import heads_input_type, load_config_file, print_default_config
|
||||
from tabulate import tabulate
|
||||
from tqdm import tqdm
|
||||
|
||||
@ -37,96 +33,6 @@ torch._dynamo.config.recompile_limit = 1000
|
||||
from torch._inductor.runtime.benchmarking import benchmarker
|
||||
|
||||
|
||||
def cleanup_memory():
    """Aggressively free GPU memory.

    Runs the Python garbage collector first so that tensors kept alive only
    by reference cycles are released, and only then asks the CUDA caching
    allocator to hand its unused blocks back. Doing it the other way round
    leaves the memory freed by the collector sitting in the allocator cache.
    """
    gc.collect()
    if torch.cuda.is_available():
        # Let in-flight kernels finish so their buffers can be released too.
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
|
||||
|
||||
|
||||
def safe_backend(backend_name=None, return_dict=False):
    """Decorator factory that turns backend failures into NaN results.

    The wrapped function is called as ``func(config, *args, **kwargs)``. If it
    raises, a ``[SKIP]`` line is printed and a placeholder result with NaN
    timings is returned instead of propagating the exception.

    Args:
        backend_name: Name used in the skip messages; defaults to the wrapped
            function's ``__name__``.
        return_dict: If True, the fallback is a dict with one NaN entry per
            ``config.backends`` plus a "flex" entry (for run_single_experiment).
            If False, the fallback is a single ExperimentResults.
    """

    def decorator(func):
        def make_nan(config, **extra):
            # bwd_time is only reported when the backward pass was requested.
            return ExperimentResults(
                fwd_time=float("nan"),
                bwd_time=float("nan") if config.calculate_bwd_time else None,
                **extra,
            )

        @wraps(func)
        def wrapper(config, *args, **kwargs):
            try:
                return func(config, *args, **kwargs)
            except torch.OutOfMemoryError:
                # Must precede the RuntimeError handler so OOM gets its own message.
                print(
                    f"[SKIP] OOM for {backend_name or func.__name__} with shape {config.shape}"
                )
                cleanup_memory()
            except RuntimeError as e:
                error_msg = str(e)
                triton_oom = (
                    "out of resource" in error_msg or "OutOfMemoryError" in error_msg
                )
                if triton_oom:
                    print(
                        f"[SKIP] Triton OOM for {backend_name or func.__name__} with shape {config.shape}"
                    )
                    cleanup_memory()
                elif "No valid triton configs" in error_msg:
                    print(
                        f"[SKIP] No valid Triton config for {backend_name or func.__name__} with shape {config.shape}"
                    )
                else:
                    print(
                        f"[SKIP] Runtime error for {backend_name or func.__name__} with shape {config.shape}: {str(e)[:100]}"
                    )
            except Exception as e:
                print(
                    f"[SKIP] Error for {backend_name or func.__name__} with shape {config.shape}: {str(e)[:100]}"
                )

            # Only reached after a handled failure.
            if not return_dict:
                return make_nan(config)

            # All requested backends share one placeholder object; "flex" gets
            # its own entry with an explicit sparsity of None.
            shared_nan = make_nan(config)
            fallback = dict.fromkeys(config.backends, shared_nan)
            fallback["flex"] = make_nan(config, sparsity=None)
            return fallback

        return wrapper

    return decorator
|
||||
|
||||
|
||||
# Type definitions
|
||||
# Names accepted for the comparison backends.
Backend = Literal[
    "math",
    "efficient",
    "cudnn",
    "fav2",
    "fav3",
    "fakv",
    "og-eager",
]
# Names accepted for the attention variant (score mod / mask).
AttentionType = Literal[
    "noop",
    "causal",
    "rel",
    "head_bias",
    "alibi",
    "sliding_window",
    "document_mask",
    "prefix_lm",
    "softcap",
]
# dtype names; these match attribute names on the torch module.
DtypeString = Literal["bfloat16", "float16", "float32"]
# Which pass a speedup figure refers to.
SpeedupType = Literal["fwd", "bwd"]
|
||||
|
||||
|
||||
def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> float:
|
||||
# warmup
|
||||
for _ in range(5):
|
||||
@ -142,7 +48,6 @@ class ExperimentConfig:
|
||||
calculate_bwd_time: bool
|
||||
cal_bandwidth: bool
|
||||
backends: list[str]
|
||||
max_autotune: bool
|
||||
|
||||
def __post_init__(self):
|
||||
assert len(self.shape) == 6, (
|
||||
@ -157,7 +62,6 @@ class ExperimentConfig:
|
||||
d.pop("cal_bandwidth", None)
|
||||
d["shape(B,Hq,M,Hkv,N,D)"] = d.pop("shape")
|
||||
d.pop("backends", None)
|
||||
d.pop("max_autotune", False)
|
||||
return d
|
||||
|
||||
|
||||
@ -305,7 +209,6 @@ def query_key_value_clones(
|
||||
return query_ref, key_ref, value_ref
|
||||
|
||||
|
||||
@safe_backend("SDPA")
|
||||
def run_single_backend_sdpa(
|
||||
config: ExperimentConfig,
|
||||
query: torch.Tensor,
|
||||
@ -320,7 +223,6 @@ def run_single_backend_sdpa(
|
||||
backend_context = get_backend_context(backend)
|
||||
with backend_context:
|
||||
_device = torch.device("cuda")
|
||||
|
||||
eager_sdpa = generate_eager_sdpa(
|
||||
config.attn_type, config.shape, config.dtype, block_mask, score_mod
|
||||
)
|
||||
@ -388,7 +290,6 @@ def run_single_backend_sdpa(
|
||||
)
|
||||
|
||||
|
||||
@safe_backend("FlashAttention")
|
||||
def run_single_backend_FA(
|
||||
config: ExperimentConfig,
|
||||
query: torch.Tensor,
|
||||
@ -400,9 +301,9 @@ def run_single_backend_FA(
|
||||
mask_kwargs,
|
||||
backend: str,
|
||||
) -> ExperimentResults:
|
||||
assert backend in ["fav3", "fakv"]
|
||||
assert backend in ["fav2", "fav3", "fakv"]
|
||||
# Generate callable for specific backend.
|
||||
if backend in ["fav3"]:
|
||||
if backend in ["fav2", "fav3"]:
|
||||
FA = generate_FA_callable(
|
||||
config.attn_type, config.shape, config.dtype, backend, **mask_kwargs
|
||||
)
|
||||
@ -453,10 +354,10 @@ def run_single_backend_FA(
|
||||
)
|
||||
|
||||
|
||||
@safe_backend("flex_attention", return_dict=True)
|
||||
def run_single_experiment(
|
||||
config: ExperimentConfig,
|
||||
dynamic=False,
|
||||
max_autotune=False,
|
||||
) -> dict[str, ExperimentResults]:
|
||||
device = torch.device("cuda")
|
||||
batch_size, q_heads, q_seq_len, kv_heads, kv_seq_len, head_dim = config.shape
|
||||
@ -476,7 +377,7 @@ def run_single_experiment(
|
||||
block_mask, mask_kwargs = generate_block_mask(config.attn_type, config.shape)
|
||||
kernel_options = get_kernel_options(config.attn_type, config.shape)
|
||||
|
||||
if config.max_autotune:
|
||||
if max_autotune:
|
||||
compiled_sdpa = torch.compile(
|
||||
flex_attention, dynamic=dynamic, mode="max-autotune-no-cudagraphs"
|
||||
)
|
||||
@ -506,7 +407,7 @@ def run_single_experiment(
|
||||
|
||||
results = {}
|
||||
for backend in config.backends:
|
||||
if backend in ["fav3", "fakv"]:
|
||||
if backend in ["fav2", "fav3", "fakv"]:
|
||||
results[backend] = run_single_backend_FA(
|
||||
config,
|
||||
query,
|
||||
@ -518,7 +419,7 @@ def run_single_experiment(
|
||||
mask_kwargs,
|
||||
backend,
|
||||
)
|
||||
else: # sdpa (also supports fav2)
|
||||
else: # sdpa
|
||||
results[backend] = run_single_backend_sdpa(
|
||||
config,
|
||||
query,
|
||||
@ -539,7 +440,7 @@ def run_single_experiment(
|
||||
sparsity = block_mask.sparsity() / 100.0 if block_mask is not None else 0.0
|
||||
sparsity = sparsity if config.attn_type != "document_mask" else 0.5
|
||||
|
||||
results["flex"] = ExperimentResults(
|
||||
results["compiled"] = ExperimentResults(
|
||||
fwd_time=forward_compiled_time,
|
||||
bwd_time=backward_compile_time if config.calculate_bwd_time else None,
|
||||
sparsity=sparsity,
|
||||
@ -600,15 +501,15 @@ def calculate_tflops(config: ExperimentConfig, results: ExperimentResults) -> fl
|
||||
softmax_flops = M * N * 2 # Not counting online softmax overhead
|
||||
o_flops = M * D * N * 2
|
||||
# Not counting split k overhead
|
||||
sparsity = results.sparsity if results.sparsity is not None else 0.0
|
||||
total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - sparsity)
|
||||
total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - results.sparsity)
|
||||
return total_flops / results.fwd_time / 1e6 # in TFLOPs/
|
||||
|
||||
|
||||
def get_average_speedups(results: list[Experiment], type: str, backend: str):
|
||||
# Calculate speedups
|
||||
speedups = [
|
||||
calculate_speedup(r.results["flex"], r.results[backend], type) for r in results
|
||||
calculate_speedup(r.results["compiled"], r.results[backend], type)
|
||||
for r in results
|
||||
]
|
||||
|
||||
# Find indices of max and min speedups
|
||||
@ -636,7 +537,7 @@ def get_average_speedups(results: list[Experiment], type: str, backend: str):
|
||||
def print_results(results: list[Experiment], save_path: Optional[str] = None):
|
||||
table_data = defaultdict(list)
|
||||
for experiment in results:
|
||||
backends = experiment.config.backends + ["flex"]
|
||||
backends = experiment.config.backends + ["compiled"]
|
||||
for key, value in experiment.asdict().items():
|
||||
if key in backends:
|
||||
if value.fwd_time:
|
||||
@ -649,43 +550,45 @@ def print_results(results: list[Experiment], save_path: Optional[str] = None):
|
||||
# Calculate speedups
|
||||
for backend in results[0].config.backends:
|
||||
fwd_speedups = [
|
||||
calculate_speedup(r.results["flex"], r.results[backend], type="fwd")
|
||||
calculate_speedup(r.results["compiled"], r.results[backend], type="fwd")
|
||||
for r in results
|
||||
]
|
||||
table_data[f"fwd_speedup_flex_over_{backend}"] = fwd_speedups
|
||||
table_data[f"fwd_{backend}_speedup"] = fwd_speedups
|
||||
|
||||
if results[0].config.calculate_bwd_time:
|
||||
for backend in results[0].config.backends:
|
||||
bwd_speedups = [
|
||||
calculate_speedup(r.results["flex"], r.results[backend], type="bwd")
|
||||
calculate_speedup(r.results["compiled"], r.results[backend], type="bwd")
|
||||
for r in results
|
||||
]
|
||||
table_data[f"bwd_speedup_flex_over_{backend}"] = bwd_speedups
|
||||
table_data[f"bwd_{backend}_speedup"] = bwd_speedups
|
||||
|
||||
# Calculate mem + computational throughput
|
||||
if results[0].config.cal_bandwidth:
|
||||
fwd_bandwidth = [
|
||||
calculate_bandwidth(r.config, r.results["flex"], type="fwd")
|
||||
calculate_bandwidth(r.config, r.results["compiled"], type="fwd")
|
||||
for r in results
|
||||
]
|
||||
table_data["fwd_mem_bw (TB/s)"] = fwd_bandwidth
|
||||
fwd_tflops = [calculate_tflops(r.config, r.results["flex"]) for r in results]
|
||||
fwd_tflops = [
|
||||
calculate_tflops(r.config, r.results["compiled"]) for r in results
|
||||
]
|
||||
table_data["TFlops/s"] = fwd_tflops
|
||||
|
||||
print(tabulate(table_data, headers="keys", tablefmt="github", floatfmt=".3f"))
|
||||
|
||||
for backend in results[0].config.backends:
|
||||
if np.isnan(table_data[f"fwd_speedup_flex_over_{backend}"]).all():
|
||||
if np.isnan(table_data[f"fwd_{backend}_speedup"]).all():
|
||||
continue
|
||||
print("\n")
|
||||
print(f"FWD Speedup of Flex over {backend}".center(125, "="))
|
||||
print(f"FWD Speedups vs. {backend}".center(125, "="))
|
||||
print("\n")
|
||||
average_data = get_average_speedups(results, type="fwd", backend=backend)
|
||||
print(tabulate(average_data, headers="keys", tablefmt="github", floatfmt=".3f"))
|
||||
|
||||
if results[0].config.calculate_bwd_time:
|
||||
print("\n")
|
||||
print(f"BWD Speedup of Flex over {backend}".center(125, "="))
|
||||
print(f"BWD Speedups vs. {backend}".center(125, "="))
|
||||
print("\n")
|
||||
average_data = get_average_speedups(results, type="bwd", backend=backend)
|
||||
print(
|
||||
@ -888,14 +791,14 @@ def get_backend_context(backend: str):
|
||||
Returns a context manager for the specified backend.
|
||||
Args:
|
||||
backend (str): The name of the backend to use.
|
||||
Valid options are 'math', 'efficient', 'cudnn', 'fav2', 'fav3', 'fakv', 'og-eager'.
|
||||
Valid options are 'fav2', 'cudnn', 'math', 'efficient', 'fav3', 'fakv', 'og-eager'.
|
||||
Returns:
|
||||
A context manager for the specified backend.
|
||||
Raises:
|
||||
ValueError: If an invalid backend is specified.
|
||||
"""
|
||||
backends = {
|
||||
"fav2": sdpa_kernel(SDPBackend.FLASH_ATTENTION),
|
||||
"fav2": nullcontext(),
|
||||
"cudnn": sdpa_kernel(SDPBackend.CUDNN_ATTENTION),
|
||||
"math": sdpa_kernel(SDPBackend.MATH),
|
||||
"efficient": sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION),
|
||||
@ -917,7 +820,15 @@ def generate_FA_callable(
|
||||
) -> Callable | None:
|
||||
if dtype not in [torch.float16, torch.bfloat16]:
|
||||
return None
|
||||
if backend == "fav3":
|
||||
if backend == "fav2":
|
||||
try:
|
||||
from flash_attn import flash_attn_func, flash_attn_varlen_func
|
||||
except ImportError:
|
||||
print(
|
||||
"Flash attention 2 is not installed. Please install it to run fav2 backend. "
|
||||
)
|
||||
raise
|
||||
elif backend == "fav3":
|
||||
try:
|
||||
from flash_attn.flash_attn_interface import (
|
||||
flash_attn_func,
|
||||
@ -1123,7 +1034,6 @@ def generate_experiment_configs(
|
||||
kv_cache_size: list[int],
|
||||
cal_bandwidth: bool,
|
||||
backends: list[str],
|
||||
max_autotune: bool,
|
||||
) -> list[ExperimentConfig]:
|
||||
assert not (calculate_bwd and decoding), "Decoding does not support backward"
|
||||
|
||||
@ -1167,333 +1077,52 @@ def generate_experiment_configs(
|
||||
calculate_bwd_time=calculate_bwd,
|
||||
cal_bandwidth=cal_bandwidth,
|
||||
backends=backends,
|
||||
max_autotune=max_autotune,
|
||||
)
|
||||
)
|
||||
|
||||
return all_configs
|
||||
|
||||
|
||||
def _output_json_for_dashboard(
|
||||
experiments,
|
||||
output_file,
|
||||
benchmark_name="PyTorch operator microbenchmark",
|
||||
):
|
||||
"""
|
||||
Write the result into JSON format for PyTorch OSS dashboard.
|
||||
The JSON format is defined at
|
||||
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
|
||||
|
||||
Args:
|
||||
experiments: List of experiment results
|
||||
output_file: Path to output JSON file
|
||||
benchmark_name: Name of the benchmark
|
||||
"""
|
||||
if not experiments:
|
||||
return
|
||||
|
||||
import math
|
||||
import platform
|
||||
from dataclasses import asdict, dataclass
|
||||
from typing import Any, Optional
|
||||
|
||||
# Prepare headers and records for JSON output
|
||||
records = []
|
||||
for experiment in experiments:
|
||||
config = experiment.config
|
||||
results_dict = (
|
||||
experiment.results
|
||||
) # This is a dict: backend -> ExperimentResults
|
||||
|
||||
# Process each backend result
|
||||
for backend, results in results_dict.items():
|
||||
# Skip backends that were not run (NaN results)
|
||||
if math.isnan(results.fwd_time):
|
||||
continue
|
||||
|
||||
# Extract data from experiment
|
||||
test_name = f"{backend}_{config.attn_type}_"
|
||||
input_config = f"shape: {config.shape}, dtype: {config.dtype}"
|
||||
|
||||
# Determine mode based on backward pass
|
||||
mode = "training" if config.calculate_bwd_time else "inference"
|
||||
|
||||
# Extract dtype
|
||||
dtype = (
|
||||
str(config.dtype).split(".")[1]
|
||||
if "." in str(config.dtype)
|
||||
else str(config.dtype)
|
||||
)
|
||||
|
||||
# Determine device
|
||||
device = "cuda"
|
||||
|
||||
# Get device architecture
|
||||
device_arch = (
|
||||
torch.cuda.get_device_name(0)
|
||||
if device == "cuda"
|
||||
else platform.processor()
|
||||
if device == "cpu"
|
||||
else "unknown"
|
||||
)
|
||||
|
||||
# Create dataclasses for JSON structure
|
||||
@dataclass
|
||||
class BenchmarkInfo:
|
||||
name: str
|
||||
mode: Optional[str]
|
||||
dtype: str
|
||||
extra_info: dict[str, Any]
|
||||
|
||||
@dataclass
|
||||
class ModelInfo:
|
||||
name: str
|
||||
type: str
|
||||
origins: list[str]
|
||||
extra_info: dict[str, Any]
|
||||
|
||||
@dataclass
|
||||
class MetricInfo:
|
||||
name: str
|
||||
unit: str
|
||||
benchmark_values: list[float]
|
||||
target_value: Optional[float]
|
||||
|
||||
@dataclass
|
||||
class BenchmarkRecord:
|
||||
benchmark: BenchmarkInfo
|
||||
model: ModelInfo
|
||||
metric: MetricInfo
|
||||
|
||||
# Benchmark extra info
|
||||
benchmark_extra_info = {
|
||||
"input_config": input_config,
|
||||
"device": device,
|
||||
"arch": device_arch,
|
||||
"operator_name": backend,
|
||||
"attn_type": config.attn_type,
|
||||
"shape": str(config.shape),
|
||||
"max_autotune": config.max_autotune,
|
||||
}
|
||||
# Add record for forward latency
|
||||
record_fwd_latency = BenchmarkRecord(
|
||||
benchmark=BenchmarkInfo(
|
||||
name=benchmark_name,
|
||||
mode=mode,
|
||||
dtype=dtype,
|
||||
extra_info=benchmark_extra_info,
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name + str(config.shape),
|
||||
type="attention-benchmark",
|
||||
origins=["pytorch"],
|
||||
extra_info={
|
||||
"operator_name": backend,
|
||||
"attn_type": config.attn_type,
|
||||
},
|
||||
),
|
||||
metric=MetricInfo(
|
||||
name="forward latency",
|
||||
unit="us",
|
||||
benchmark_values=[results.fwd_time],
|
||||
target_value=None,
|
||||
),
|
||||
)
|
||||
records.append(asdict(record_fwd_latency))
|
||||
|
||||
# Add record for forward memory bandwidth (if available)
|
||||
if config.cal_bandwidth:
|
||||
record_fwd_bandwidth = BenchmarkRecord(
|
||||
benchmark=BenchmarkInfo(
|
||||
name=benchmark_name,
|
||||
mode=mode,
|
||||
dtype=dtype,
|
||||
extra_info=benchmark_extra_info,
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name + str(config.shape),
|
||||
type="attention-benchmark",
|
||||
origins=["pytorch"],
|
||||
extra_info={
|
||||
"operator_name": backend,
|
||||
},
|
||||
),
|
||||
metric=MetricInfo(
|
||||
name="memory bandwidth",
|
||||
unit="TB/s",
|
||||
benchmark_values=[calculate_bandwidth(config, results, "fwd")],
|
||||
target_value=None,
|
||||
),
|
||||
)
|
||||
records.append(asdict(record_fwd_bandwidth))
|
||||
|
||||
# Add record for forward TFLOPS (if available)
|
||||
if config.cal_bandwidth:
|
||||
record_fwd_tflops = BenchmarkRecord(
|
||||
benchmark=BenchmarkInfo(
|
||||
name=benchmark_name,
|
||||
mode=mode,
|
||||
dtype=dtype,
|
||||
extra_info=benchmark_extra_info,
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name + str(config.shape),
|
||||
type="attention-benchmark",
|
||||
origins=["pytorch"],
|
||||
extra_info={
|
||||
"operator_name": backend,
|
||||
},
|
||||
),
|
||||
metric=MetricInfo(
|
||||
name="tflops",
|
||||
unit="TFLOPS/s",
|
||||
benchmark_values=[calculate_tflops(config, results)],
|
||||
target_value=None,
|
||||
),
|
||||
)
|
||||
records.append(asdict(record_fwd_tflops))
|
||||
|
||||
# Add record for backward latency (if available and not NaN)
|
||||
if (
|
||||
config.calculate_bwd_time
|
||||
and results.bwd_time is not None
|
||||
and not math.isnan(results.bwd_time)
|
||||
):
|
||||
record_bwd_latency = BenchmarkRecord(
|
||||
benchmark=BenchmarkInfo(
|
||||
name=benchmark_name,
|
||||
mode=mode,
|
||||
dtype=dtype,
|
||||
extra_info=benchmark_extra_info,
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name + str(config.shape),
|
||||
type="attention-benchmark",
|
||||
origins=["pytorch"],
|
||||
extra_info={
|
||||
"operator_name": backend,
|
||||
},
|
||||
),
|
||||
metric=MetricInfo(
|
||||
name="backward latency",
|
||||
unit="us",
|
||||
benchmark_values=[results.bwd_time],
|
||||
target_value=None,
|
||||
),
|
||||
)
|
||||
records.append(asdict(record_bwd_latency))
|
||||
|
||||
# Write all records to the output file
|
||||
with open(output_file, "w", encoding="utf-8") as f:
|
||||
json.dump(records, f, indent=2)
|
||||
|
||||
|
||||
def main(
|
||||
dynamic: bool = False,
|
||||
calculate_bwd: bool = False,
|
||||
dtype: DtypeString = "bfloat16",
|
||||
b: list[int] | None = None,
|
||||
nh: list[str] | None = None,
|
||||
s: list[int] | None = None,
|
||||
d: list[int] | None = None,
|
||||
mods: list[AttentionType] | None = None,
|
||||
backend: list[Backend] | None = None,
|
||||
max_autotune: bool = False,
|
||||
decoding: bool = False,
|
||||
kv_size: Optional[list[int]] = None,
|
||||
throughput: bool = True,
|
||||
save_path: Optional[str] = None,
|
||||
output_json_for_dashboard: Optional[str] = None,
|
||||
benchmark_name: str = "PyTorch operator microbenchmark",
|
||||
) -> None:
|
||||
"""Run sweep over sizes and score mods for flex attention.
|
||||
|
||||
Usage Examples:
|
||||
# Use a yml config file
|
||||
python score_mod.py --config basic_config.yaml
|
||||
|
||||
# Use a json config file
|
||||
python score_mod.py --config my_config.json
|
||||
|
||||
# Generate a config template
|
||||
python score_mod.py --print-config json > my_config.json # For a json config
|
||||
python score_mod.py --print-config yaml > my_config.yaml # For a yaml config
|
||||
|
||||
# Override config with CLI args
|
||||
python score_mod.py --config my_config.json -dtype float16 --max-autotune
|
||||
|
||||
# Pure CLI usage
|
||||
python score_mod.py -b 4 8 -s 1024 2048 -mods causal alibi --backend efficient
|
||||
|
||||
Args:
|
||||
dynamic: Runs a dynamic shapes version of compiled flex attention
|
||||
calculate_bwd: Calculate backward pass times
|
||||
dtype: Data type for tensors (bfloat16, float16, float32)
|
||||
b: Batch sizes to benchmark
|
||||
nh: Number of query and key/value heads in format "Hq,Hkv"
|
||||
s: Sequence lengths to benchmark
|
||||
d: Head dimensions to benchmark
|
||||
mods: Score modifications: noop, causal, rel, head_bias, alibi, sliding_window, document_mask, prefix_lm, softcap
|
||||
backend: Backends for attention computation: math, efficient, cudnn, fav2, fav3, fakv, og-eager
|
||||
max_autotune: Turn on max-autotune optimization
|
||||
decoding: Benchmark decoding mode (query sequence length = 1)
|
||||
kv_size: Key/value cache size in MiB (ignores batch size if specified)
|
||||
throughput: Calculate kernel memory bandwidth & computational throughput (always True)
|
||||
save_path: Path to save the results CSV file
|
||||
output_json_for_dashboard: Path to save results in JSON format for PyTorch OSS dashboard
|
||||
benchmark_name: Name of the benchmark for dashboard output
|
||||
"""
|
||||
# Convert dtype string to torch dtype (if not already converted)
|
||||
import torch
|
||||
|
||||
if isinstance(dtype, str):
|
||||
dtype = getattr(torch, dtype)
|
||||
|
||||
# Always calculate throughput
|
||||
throughput = True
|
||||
print("Backend: ", backend)
|
||||
def main(args):
|
||||
seed = 123
|
||||
np.random.seed(seed)
|
||||
torch.manual_seed(seed)
|
||||
results = []
|
||||
for experiment_count, config in enumerate(
|
||||
tqdm(
|
||||
generate_experiment_configs(
|
||||
calculate_bwd,
|
||||
dtype,
|
||||
b,
|
||||
nh,
|
||||
s,
|
||||
d,
|
||||
mods,
|
||||
decoding,
|
||||
kv_size,
|
||||
throughput,
|
||||
backend,
|
||||
max_autotune,
|
||||
)
|
||||
),
|
||||
start=1,
|
||||
for config in tqdm(
|
||||
generate_experiment_configs(
|
||||
args.calculate_bwd,
|
||||
args.dtype,
|
||||
args.b,
|
||||
args.nh,
|
||||
args.s,
|
||||
args.d,
|
||||
args.mods,
|
||||
args.decoding,
|
||||
args.kv_size,
|
||||
args.throughput,
|
||||
args.backend,
|
||||
)
|
||||
):
|
||||
results.append(
|
||||
Experiment(
|
||||
config,
|
||||
run_single_experiment(
|
||||
config,
|
||||
dynamic=dynamic,
|
||||
dynamic=args.dynamic,
|
||||
max_autotune=args.max_autotune,
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
# Periodic memory cleanup every 50 experiments
|
||||
if experiment_count % 50 == 0:
|
||||
cleanup_memory()
|
||||
print_results(results, args.save_path)
|
||||
|
||||
print_results(results, save_path)
|
||||
|
||||
# Output JSON for dashboard if requested
|
||||
if output_json_for_dashboard:
|
||||
_output_json_for_dashboard(results, output_json_for_dashboard, benchmark_name)
|
||||
def heads_input_type(s):
    """argparse ``type=`` converter: turn 'Hq,Hkv' into the tuple ``(Hq, Hkv)``.

    Raises:
        argparse.ArgumentTypeError: If ``s`` is not exactly two
            comma-separated integers.
    """
    try:
        counts = [int(field) for field in s.split(",")]
        # Unpacking enforces "exactly two values".
        q_heads, kv_heads = counts
    except Exception as exc:
        raise argparse.ArgumentTypeError("Heads must be Hq,Hkv") from exc
    return q_heads, kv_heads
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -1501,12 +1130,6 @@ if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run sweep over sizes and score mods for flex attention"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
help="Path to JSON config file. CLI args override config file values.",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dynamic",
|
||||
action="store_true",
|
||||
@ -1576,49 +1199,8 @@ Ignores -b batch size and calculate batch size from kv size instead when specifi
|
||||
default=["efficient"],
|
||||
help="Backend to use for attention computation",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-json-for-dashboard",
|
||||
type=str,
|
||||
help="Path to save results in JSON format for PyTorch OSS dashboard",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--benchmark-name",
|
||||
type=str,
|
||||
help="Name of the benchmark for dashboard output",
|
||||
default="PyTorch operator microbenchmark",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--print-config",
|
||||
type=str,
|
||||
choices=["json", "yaml"],
|
||||
help="Print a default config template in JSON or YAML format and exit",
|
||||
default=None,
|
||||
)
|
||||
# Parse arguments
|
||||
args = parser.parse_args()
|
||||
args.dtype = getattr(torch, args.dtype)
|
||||
|
||||
# Handle --print-config
|
||||
if args.print_config:
|
||||
print_default_config(args.print_config)
|
||||
sys.exit(0)
|
||||
|
||||
# Load and merge config if provided
|
||||
if args.config:
|
||||
config = load_config_file(args.config)
|
||||
|
||||
# Merge config with CLI args (CLI args take precedence)
|
||||
json_args = argparse.Namespace()
|
||||
json_args.__dict__ = config
|
||||
args = parser.parse_args(namespace=json_args)
|
||||
|
||||
# Convert dtype string to torch dtype (only if it's still a string)
|
||||
if isinstance(args.dtype, str):
|
||||
args.dtype = getattr(torch, args.dtype)
|
||||
|
||||
# Remove config and print_config from args before passing to main
|
||||
args_dict = vars(args)
|
||||
args_dict.pop("config", None)
|
||||
args_dict.pop("print_config", None)
|
||||
|
||||
main(**args_dict)
|
||||
main(args)
|
||||
|
||||
@ -482,7 +482,6 @@ inductor_core_resources = [
|
||||
"torch/csrc/inductor/aoti_torch/oss_proxy_executor.cpp",
|
||||
"torch/csrc/inductor/inductor_ops.cpp",
|
||||
"torch/csrc/jit/serialization/pickle.cpp",
|
||||
"torch/csrc/shim_common.cpp",
|
||||
]
|
||||
|
||||
libtorch_core_sources = sorted(
|
||||
|
||||
@ -556,26 +556,3 @@ inline SymBool sym_ge(const SymInt& a, const SymInt& b) {
|
||||
}
|
||||
|
||||
} // namespace c10
|
||||
|
||||
#include <limits>
|
||||
|
||||
namespace std {
|
||||
|
||||
template <>
|
||||
class numeric_limits<c10::SymInt> {
|
||||
public:
|
||||
static constexpr bool is_specialized = true;
|
||||
|
||||
static constexpr int64_t max() noexcept {
|
||||
return std::numeric_limits<int64_t>::max();
|
||||
}
|
||||
|
||||
static constexpr int64_t min() noexcept {
|
||||
return std::numeric_limits<int64_t>::min();
|
||||
}
|
||||
|
||||
static constexpr bool is_signed = true;
|
||||
static constexpr bool is_integer = true;
|
||||
};
|
||||
|
||||
} // namespace std
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
// Implementation of special math functions for Metal
|
||||
// Implementation of special math functions for Metal
|
||||
#pragma once
|
||||
#include <c10/metal/expm1f.h>
|
||||
#include <c10/metal/igamma.h>
|
||||
@ -624,64 +624,6 @@ inline T spherical_bessel_j0(T x) {
|
||||
return static_cast<T>(::metal::sin(x) / x);
|
||||
}
|
||||
|
||||
// logaddexp for scalar floating-point types: log(exp(a) + exp(b)),
// evaluated in float as max(a, b) + log1p(exp(-|a - b|)).
template <typename T>
inline ::metal::enable_if_t<is_scalar_floating_point_v<T>, T> logaddexp(
    T a,
    T b) {
  const float lhs = static_cast<float>(a);
  const float rhs = static_cast<float>(b);

  // Two equal infinities: return the infinity directly, because the
  // difference lhs - rhs below would otherwise be NaN.
  if (::metal::isinf(lhs) && lhs == rhs) {
    return static_cast<T>(lhs);
  }

  const float larger = ::metal::max(lhs, rhs);
  return static_cast<T>(
      larger + ::c10::metal::log1p(::metal::exp(-::metal::abs(lhs - rhs))));
}
|
||||
|
||||
// The function is ported from mlx
|
||||
template <typename T>
|
||||
inline ::metal::enable_if_t<is_complex_v<T>, T> logaddexp(T a, T b) {
|
||||
if (::metal::isnan(a.x) || ::metal::isnan(a.y) || ::metal::isnan(b.x) ||
|
||||
::metal::isnan(b.y)) {
|
||||
return T(NAN, NAN);
|
||||
}
|
||||
|
||||
T maxval = a.x > b.x ? a : b;
|
||||
T minval = a.x < b.x ? a : b;
|
||||
constexpr auto inf = ::metal::numeric_limits<T>::infinity().x;
|
||||
|
||||
if (minval.x == -inf || maxval.x == inf) {
|
||||
return maxval;
|
||||
}
|
||||
|
||||
float2 maxval_ = static_cast<float2>(maxval);
|
||||
float2 minval_ = static_cast<float2>(minval);
|
||||
float m = ::metal::exp(minval_.x - maxval_.x);
|
||||
float2 dexp{
|
||||
m * ::metal::cos(minval_.y - maxval_.y),
|
||||
m * ::metal::sin(minval_.y - maxval_.y),
|
||||
};
|
||||
return static_cast<T>(maxval_ + ::c10::metal::log1p(dexp));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline T logaddexp2(T a, T b) {
|
||||
constexpr auto log_2 = float(0.693147180559945309417232121458176);
|
||||
constexpr auto inv_log_2 = float(1) / log_2;
|
||||
float a0 = static_cast<float>(a);
|
||||
float b0 = static_cast<float>(b);
|
||||
if (::metal::isinf(a0) && a0 == b0) {
|
||||
return static_cast<T>(a0);
|
||||
} else {
|
||||
float m0 = ::metal::max(a0, b0);
|
||||
return static_cast<T>(
|
||||
m0 +
|
||||
::c10::metal::log1p(::metal::pow(float(2), -::metal::abs(a0 - b0))) *
|
||||
inv_log_2);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline float xlog1py(T x, T y) {
|
||||
if (::metal::isnan(y)) {
|
||||
|
||||
@ -322,24 +322,6 @@ inline float log1p(float x) {
|
||||
return rc;
|
||||
}
|
||||
|
||||
// The function is ported from mlx
|
||||
inline float2 log1p(float2 in) {
|
||||
float x = in.x;
|
||||
float y = in.y;
|
||||
float zabs = ::metal::precise::sqrt(x * x + y * y);
|
||||
float theta = ::metal::atan2(y, x + 1);
|
||||
if (zabs < 0.5f) {
|
||||
float r = x * (2 + x) + y * y;
|
||||
if (r == 0) { // handle underflow
|
||||
return {x, theta};
|
||||
}
|
||||
return {0.5f * log1p(r), theta};
|
||||
} else {
|
||||
auto z0 = ::metal::sqrt((x + 1) * (x + 1) + y * y);
|
||||
return {::metal::log(z0), theta};
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2 = T1>
|
||||
struct pair {
|
||||
T1 first;
|
||||
|
||||
@ -34,7 +34,7 @@ struct MemEvent {
|
||||
bool overlaps(const MemBlock& a, const MemBlock& b) {
|
||||
// two blocks dont overlap if
|
||||
// |---a--------|--------------b--------|
|
||||
// start_a end_a <= start_b end_b
|
||||
// strat_a end_a <= start_b end_b
|
||||
return !(
|
||||
(a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
|
||||
}
|
||||
|
||||
@ -33,7 +33,7 @@ struct bitset final {
|
||||
constexpr bitset() noexcept = default;
|
||||
constexpr bitset(const bitset&) noexcept = default;
|
||||
constexpr bitset(bitset&&) noexcept = default;
|
||||
// there is an issue for gcc 5.3.0 when define default function as constexpr
|
||||
// there is an issure for gcc 5.3.0 when define default function as constexpr
|
||||
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68754.
|
||||
bitset& operator=(const bitset&) noexcept = default;
|
||||
bitset& operator=(bitset&&) noexcept = default;
|
||||
|
||||
@ -123,8 +123,6 @@ class DeviceCachingAllocator {
|
||||
ska::flat_hash_map<xpu::XPUStream, std::deque<std::pair<sycl::event, Block*>>>
|
||||
xpu_events;
|
||||
DeviceIndex device_index;
|
||||
size_t allowed_memory_maximum = 0;
|
||||
bool set_fraction = false;
|
||||
|
||||
size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) {
|
||||
if (!src || src->allocated || src->event_count > 0 ||
|
||||
@ -247,12 +245,6 @@ class DeviceCachingAllocator {
|
||||
if (isRetry) {
|
||||
stats.num_alloc_retries += 1;
|
||||
}
|
||||
if (set_fraction &&
|
||||
stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current +
|
||||
size >
|
||||
allowed_memory_maximum) {
|
||||
return false;
|
||||
}
|
||||
void* ptr = sycl::aligned_alloc_device(
|
||||
kDeviceAlignment,
|
||||
size,
|
||||
@ -443,11 +435,6 @@ class DeviceCachingAllocator {
|
||||
device_free =
|
||||
raw_device.get_info<sycl::ext::intel::info::device::free_memory>();
|
||||
}
|
||||
std::string allowed_info;
|
||||
if (set_fraction) {
|
||||
allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
|
||||
}
|
||||
|
||||
auto allocated_bytes =
|
||||
stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
|
||||
.current;
|
||||
@ -472,9 +459,7 @@ class DeviceCachingAllocator {
|
||||
format_size(device_total),
|
||||
" of which ",
|
||||
format_size(device_free),
|
||||
" is free. ",
|
||||
allowed_info,
|
||||
"Of the allocated memory ",
|
||||
" is free. Of the allocated memory ",
|
||||
format_size(allocated_bytes),
|
||||
" is allocated by PyTorch, and ",
|
||||
format_size(reserved_bytes - allocated_bytes),
|
||||
@ -553,25 +538,6 @@ class DeviceCachingAllocator {
|
||||
stats.requested_bytes[statType].reset_peak();
|
||||
}
|
||||
}
|
||||
|
||||
double getMemoryFraction() {
|
||||
if (!set_fraction) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
c10::xpu::DeviceProp device_prop;
|
||||
c10::xpu::get_device_properties(&device_prop, device_index);
|
||||
return static_cast<double>(allowed_memory_maximum) /
|
||||
static_cast<double>(device_prop.global_mem_size);
|
||||
}
|
||||
|
||||
void setMemoryFraction(double fraction) {
|
||||
c10::xpu::DeviceProp device_prop;
|
||||
c10::xpu::get_device_properties(&device_prop, device_index);
|
||||
auto device_total = device_prop.global_mem_size;
|
||||
allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
|
||||
set_fraction = true;
|
||||
}
|
||||
};
|
||||
|
||||
static void local_raw_delete(void* ptr);
|
||||
@ -734,21 +700,6 @@ class XPUAllocator : public DeviceAllocator {
|
||||
assertValidDevice(device);
|
||||
device_allocators[device]->resetAccumulatedStats();
|
||||
}
|
||||
|
||||
double getMemoryFraction(DeviceIndex device) {
|
||||
assertValidDevice(device);
|
||||
return device_allocators[device]->getMemoryFraction();
|
||||
}
|
||||
|
||||
void setMemoryFraction(double fraction, DeviceIndex device) {
|
||||
assertValidDevice(device);
|
||||
TORCH_CHECK_VALUE(
|
||||
0 < fraction && fraction <= 1,
|
||||
"invalid fraction:",
|
||||
fraction,
|
||||
". Please set within (0, 1].");
|
||||
device_allocators[device]->setMemoryFraction(fraction);
|
||||
}
|
||||
};
|
||||
|
||||
static XPUAllocator allocator;
|
||||
@ -793,14 +744,6 @@ void recordStream(const DataPtr& dataPtr, XPUStream stream) {
|
||||
return allocator.recordStream(dataPtr, stream);
|
||||
}
|
||||
|
||||
double getMemoryFraction(DeviceIndex device) {
|
||||
return allocator.getMemoryFraction(device);
|
||||
}
|
||||
|
||||
void setMemoryFraction(double fraction, DeviceIndex device) {
|
||||
return allocator.setMemoryFraction(fraction, device);
|
||||
}
|
||||
|
||||
REGISTER_ALLOCATOR(kXPU, &allocator)
|
||||
|
||||
} // namespace c10::xpu::XPUCachingAllocator
|
||||
|
||||
@ -25,8 +25,4 @@ C10_XPU_API void raw_delete(void* ptr);
|
||||
|
||||
C10_XPU_API void recordStream(const DataPtr& dataPtr, XPUStream stream);
|
||||
|
||||
C10_XPU_API double getMemoryFraction(DeviceIndex device);
|
||||
|
||||
C10_XPU_API void setMemoryFraction(double fraction, DeviceIndex device);
|
||||
|
||||
} // namespace c10::xpu::XPUCachingAllocator
|
||||
|
||||
@ -1358,15 +1358,9 @@ if(BUILD_TEST)
|
||||
)
|
||||
else()
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/lazy ${CMAKE_BINARY_DIR}/test_lazy)
|
||||
# NativeRT is disabled
|
||||
# add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
|
||||
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_abi_check ${CMAKE_BINARY_DIR}/test_aoti_abi_check)
|
||||
if(BUILD_AOT_INDUCTOR_TEST)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_inference ${CMAKE_BINARY_DIR}/test_aoti_inference)
|
||||
endif()
|
||||
|
||||
if(USE_DISTRIBUTED)
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
|
||||
if(NOT WIN32)
|
||||
@ -1384,6 +1378,16 @@ if(BUILD_TEST)
|
||||
${CMAKE_BINARY_DIR}/test_mobile_nnc
|
||||
)
|
||||
endif()
|
||||
add_subdirectory(${TORCH_ROOT}/test/cpp/lazy
|
||||
${CMAKE_BINARY_DIR}/test_lazy)
|
||||
endif()
|
||||
if(BUILD_AOT_INDUCTOR_TEST)
|
||||
add_subdirectory(
|
||||
${TORCH_ROOT}/test/cpp/aoti_abi_check
|
||||
${CMAKE_BINARY_DIR}/test_aoti_abi_check)
|
||||
add_subdirectory(
|
||||
${TORCH_ROOT}/test/cpp/aoti_inference
|
||||
${CMAKE_BINARY_DIR}/test_aoti_inference)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
@ -38,7 +38,7 @@ uint32_t crc32_combine (uint32_t crcA, uint32_t crcB, size_t lengthB);
|
||||
|
||||
/// compute CRC32 (bitwise algorithm)
|
||||
uint32_t crc32_bitwise (const void* data, size_t length, uint32_t previousCrc32 = 0);
|
||||
/// compute CRC32 (half-byte algorithm)
|
||||
/// compute CRC32 (half-byte algoritm)
|
||||
uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32 = 0);
|
||||
|
||||
#ifdef CRC32_USE_LOOKUP_TABLE_BYTE
|
||||
@ -96,7 +96,7 @@ uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previo
|
||||
#define __BIG_ENDIAN 4321
|
||||
#endif
|
||||
|
||||
// define endianness and some integer data types
|
||||
// define endianess and some integer data types
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
// Windows always little endian
|
||||
#define __BYTE_ORDER __LITTLE_ENDIAN
|
||||
@ -168,7 +168,7 @@ namespace
|
||||
/// zlib's CRC32 polynomial
|
||||
const uint32_t Polynomial = 0xEDB88320;
|
||||
|
||||
/// swap endianness
|
||||
/// swap endianess
|
||||
static inline uint32_t swap(uint32_t x)
|
||||
{
|
||||
#if defined(__GNUC__) || defined(__clang__)
|
||||
@ -229,7 +229,7 @@ uint32_t crc32_bitwise(const void* data, size_t length, uint32_t previousCrc32)
|
||||
}
|
||||
|
||||
|
||||
/// compute CRC32 (half-byte algorithm)
|
||||
/// compute CRC32 (half-byte algoritm)
|
||||
uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32)
|
||||
{
|
||||
uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF
|
||||
@ -662,7 +662,7 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB)
|
||||
// - if you append length(B) zeros to A and call it A' (think of it as AAAA000)
|
||||
// and prepend length(A) zeros to B and call it B' (think of it as 0000BBB)
|
||||
// then exists a C' = A' ^ B'
|
||||
// - remember: if you XOR something with zero, it remains unchanged: X ^ 0 = X
|
||||
// - remember: if you XOR someting with zero, it remains unchanged: X ^ 0 = X
|
||||
// - that means C' = A concat B so that crc(A concat B) = crc(C') = crc(A') ^ crc(B')
|
||||
// - the trick is to compute crc(A') based on crc(A)
|
||||
// and crc(B') based on crc(B)
|
||||
|
||||
@ -76,7 +76,7 @@ typedef struct mz_zip_archive mz_zip_archive;
|
||||
// 2) Writing with 1-pass sequential access
|
||||
// -> We must take care not to require updating values that have already
|
||||
// been written. We place the variable-length index at the end and do
|
||||
// not put any index into the header to fulfill this constraint.
|
||||
// not put any indicies into the header to fulfill this constraint.
|
||||
|
||||
// The model.json, which contains all the metadata information,
|
||||
// should be written as the last file. One reason is that the size of tensor
|
||||
|
||||
@ -519,7 +519,7 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoadWithAllocator) {
|
||||
std::tie(data_ptr, size) = reader.getRecord("key1", &overrideAllocator);
|
||||
EXPECT_EQ(overrideAllocator.getAllocatedBytes(), kBytes1);
|
||||
EXPECT_EQ(baseAllocator.getAllocatedBytes(), allocBytes);
|
||||
// allocate with base allocator
|
||||
// allcoate with base allocator
|
||||
std::tie(data_ptr, size) = reader.getRecord("key1");
|
||||
EXPECT_EQ(overrideAllocator.getAllocatedBytes(), kBytes1);
|
||||
EXPECT_EQ(baseAllocator.getAllocatedBytes(), allocBytes + kBytes1);
|
||||
|
||||
@ -383,7 +383,7 @@ function(torch_compile_options libname)
|
||||
-Wno-strict-aliasing
|
||||
)
|
||||
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
|
||||
list(APPEND private_compile_options -Wredundant-move -Wno-interference-size)
|
||||
list(APPEND private_compile_options -Wredundant-move)
|
||||
endif()
|
||||
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
list(APPEND private_compile_options -Wextra-semi -Wmove)
|
||||
|
||||
@ -14,7 +14,7 @@ Combining, these building blocks form a research and
|
||||
production ready C++ library for tensor computation and dynamic neural
|
||||
networks with strong emphasis on GPU acceleration as well as fast CPU
|
||||
performance. It is currently in use at Facebook in research and
|
||||
production; we are looking forward to welcoming more users of the PyTorch C++ API.
|
||||
production; we are looking forward to welcome more users of the PyTorch C++ API.
|
||||
|
||||
.. warning::
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user